In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
module_path = os.path.abspath(os.path.join('..')) # or the path to your source code
sys.path.insert(0, module_path)

In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re


In [4]:
from nlp_project.config import INTERIM_DATA_DIR, PROCESSED_DATA_DIR
from nlp_project.plots import sb_bar_plot,plotly_ex_barplot 
from nlp_project.dataset import preprocess

# PRE-PROCESSING

In [None]:
dataset = pd.read_csv(PROCESSED_DATA_DIR / 'cell_phones_reviews.csv')
dataset.head()

ESPLORIAMO IL DATASET

In [None]:
dataset.shape

In [None]:
dataset.info()

In [None]:
dataset.isnull().sum()

Rimozione righe con valori null per prezzo e reviewText

In [None]:
dataset.dropna(subset=['price', 'reviewText'], inplace=True)

In [None]:
# Accepted price format: a dollar sign, one or more digits, a dot, one or more digits
# (e.g. "$12.99"). Any other value in the 'price' column is treated as malformed.
regex = re.compile(r'^\$\d+\.\d+$')

# Keep only the rows whose 'price' matches the regex. The explicit .copy() makes the
# filtered frame independent of the original one, so the column assignments performed
# in later cells do not trigger pandas' SettingWithCopyWarning.
dataset = dataset[dataset['price'].apply(lambda x: bool(regex.match(x)))].copy()

In [None]:
dataset.reset_index(drop=True, inplace=True)
dataset.head()

In [None]:
# Strip the dollar sign from the 'price' column, then cast the remaining text to float
price_without_symbol = dataset['price'].str.replace('$', '', regex=False)
dataset['price'] = price_without_symbol.astype(float)

In [None]:
print(dataset.shape)
dataset.isnull().sum()

Quanti cellulari diversi sono rimasti dopo aver rimosso le reviews con prezzo o testo mancante, oppure con prezzo in formato non valido?

In [None]:
len(dataset['asin'].unique())

In [None]:
reviews_count_per_product = dataset.groupby('asin').size().reset_index(name='counts').sort_values(by='counts', ascending=False)

In [None]:
# Attach one title to every ASIN. De-duplicating on 'asin' alone (rather than on the
# whole (asin, title) pair) guarantees a single lookup row per product even when the
# same ASIN appears with several title variants or with a missing title; otherwise the
# left merge would repeat that product's row together with its review count.
# The title kept is the first one found for the ASIN in `dataset`.
product_titles = dataset[['asin', 'title']].drop_duplicates(subset='asin')
reviews_count_per_product = reviews_count_per_product.merge(
    product_titles,
    on='asin',
    how='left'
)

In [None]:
reviews_count_per_product

In [None]:
less_100_reviews = reviews_count_per_product[reviews_count_per_product['counts'] < 100]['asin']

In [None]:
plotly_ex_barplot(data=reviews_count_per_product[:50], x='title', y='counts', color='counts', height=400)


In [None]:
reviews_count_per_product = reviews_count_per_product[reviews_count_per_product['counts'] >= 100]

In [None]:
reviews_count_per_product.to_csv(INTERIM_DATA_DIR / 'reviews_count_per_product.csv', index=False, sep=';')

In [None]:
reviews_count_per_product[reviews_count_per_product["title"] ==
"Samsung Galaxy S3 Mini GT-i8190 GSM Unlocked International Version White - NO WARRANTY"]["asin"]

Numero di reviews relative ai prodotti con meno di 100 reviews, che verranno rimosse nella cella successiva

In [None]:
len(dataset[dataset['asin'].isin(less_100_reviews)])

In [None]:
# Remove every review whose product is listed in `less_100_reviews` (products with
# fewer than 100 reviews). .copy() detaches the result from the frame it was sliced
# from, so the columns added and dropped in later cells do not trigger pandas'
# SettingWithCopyWarning.
dataset = dataset[~dataset['asin'].isin(less_100_reviews)].copy()
dataset

Data Preprocessing

In [None]:
# Fetch the NLTK resources this notebook relies on. The notes below map each resource
# to the call that presumably needs it (verify against the installed NLTK version).
import nltk
nltk.download('punkt')  # tokenizer models; presumably for nltk.word_tokenize
nltk.download('wordnet')  # presumably backs WordNetLemmatizer
nltk.download('stopwords')  # presumably backs stopwords.words('english')
nltk.download('averaged_perceptron_tagger')  # tagger model; presumably for nltk.pos_tag
nltk.download('tagsets')  # NOTE(review): no visible cell appears to use this; confirm it is needed
nltk.download('universal_tagset')  # presumably for pos_tag(..., tagset='universal')
nltk.download('punkt_tab')  # presumably the tokenizer data expected by newer NLTK releases (TODO confirm)
nltk.download('averaged_perceptron_tagger_eng')  # presumably the tagger data expected by newer NLTK releases (TODO confirm)

In [None]:
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# Per risultati più consistenti con langdetect
DetectorFactory.seed = 0

# Funzione per rilevare la lingua
def detect_language(text):
    """Return the language label that langdetect assigns to ``text``.

    Parameters
    ----------
    text : str
        Text to classify. Non-string values (e.g. NaN or None from a missing
        cell) are accepted and reported as ``"unknown"``.

    Returns
    -------
    str
        The value returned by ``langdetect.detect``, or ``"unknown"`` when the
        input is not a string, is blank, or langdetect raises
        ``LangDetectException``.
    """
    # Missing values and blank strings carry nothing to classify. Answering here
    # keeps a single non-string cell from aborting a whole Series.apply run.
    if not isinstance(text, str) or not text.strip():
        return "unknown"
    try:
        return detect(text)
    except LangDetectException:
        # langdetect could not produce a label for this text
        return "unknown"

In [None]:
%%time
# Rileva la lingua di ogni recensione
dataset['language'] = dataset['reviewText'].apply(detect_language)

In [None]:
# Drop the reviews labelled 'es' or 'pt' by detect_language; every other label,
# including "unknown", is kept. .copy() detaches the result from the frame it was
# sliced from, so the columns added and dropped in later cells do not trigger
# pandas' SettingWithCopyWarning.
dataset = dataset[~dataset['language'].isin(['es', 'pt'])].copy()

In [None]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [None]:
stop_words

In [None]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [None]:
# NOTE: superseded draft of preprocess(), kept commented out. It removes stop words and
# lemmatizes BEFORE POS-tagging, whereas the active definition in the next cell tags first.
# from tqdm import tqdm
# from afinn import Afinn

# afinn = Afinn()
# def preprocess(df, column):
#     preprocessed_text = []
#     tagged_text = []
#     for row in tqdm(df[column], total = len(df[column])):
#         afinn_scores = []
#         text_cleaned = re.sub(r'[^\w\s]', '', row) #cleaning
#         words = nltk.word_tokenize(text_cleaned) #tokenization
#         words = [w.lower() for w in words]
#         preprocessed_words = [lemmatizer.lemmatize(w) for w in words if not w in stop_words] #stopwords removal and lemmatization
#         preprocessed_text.append(' '.join(preprocessed_words))
#         tagged_words = nltk.pos_tag(preprocessed_words, tagset='universal') #POS-tagging
#         #afinn scores
#         for word, tag in tagged_words:
#             score = afinn.score(word)
#             afinn_scores.append((word, tag, score))
#         tagged_text.append(afinn_scores)

#     return preprocessed_text, tagged_text

In [None]:
from tqdm import tqdm
from afinn import Afinn

afinn = Afinn()
def preprocess(df, column):
    """Clean, tokenize, POS-tag and lemmatize a text column, scoring each kept token with AFINN.

    NOTE: this definition shadows the ``preprocess`` imported from
    ``nlp_project.dataset`` earlier in the notebook.

    For every row of ``df[column]`` the text is stripped of every character that is
    neither a word character nor whitespace, tokenized with ``nltk.word_tokenize``,
    lower-cased and POS-tagged with the universal tagset. Tokens found in the
    module-level ``stop_words`` are then dropped and the remaining ones are lemmatized
    with the module-level ``lemmatizer``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column.
    column : str
        Name of the column to process. Non-string values (e.g. NaN) are treated
        as empty text, so both outputs always have one entry per row.

    Returns
    -------
    tuple[list[str], list[list[tuple]]]
        ``preprocessed_text``: one space-joined string of lemmas per row.
        ``tagged_text``: one list per row of ``(lemma, tag, afinn_score)`` tuples,
        where ``tag`` is the tag given to the lower-cased token before lemmatization.
    """
    # Build the lookup once: set membership is constant-time, whereas the original
    # list was scanned for every single token.
    stop_set = set(stop_words)

    preprocessed_text = []
    tagged_text = []
    for row in tqdm(df[column], total=len(df[column])):
        # A missing value would make re.sub raise; handle it as empty text instead.
        text = row if isinstance(row, str) else ''

        # Cleaning: remove every character that is not alphanumeric/underscore or whitespace.
        # NOTE(review): apostrophes are removed too, so a contraction such as "don't"
        # becomes "dont" before the stop-word check; confirm that is intended.
        text_cleaned = re.sub(r'[^\w\s]', '', text)

        # Tokenization followed by lower-casing
        words = [w.lower() for w in nltk.word_tokenize(text_cleaned)]

        # POS-tagging runs on the full token sequence, before stop words are removed
        tagged_words = nltk.pos_tag(words, tagset='universal')

        # Stop-word removal and lemmatization (the tag is carried along unchanged)
        preprocessed_words = [
            (lemmatizer.lemmatize(w), tag)
            for w, tag in tagged_words
            if w not in stop_set
        ]
        preprocessed_text.append(' '.join(w for w, _ in preprocessed_words))

        # AFINN score of every kept lemma
        tagged_text.append(
            [(word, tag, afinn.score(word)) for word, tag in preprocessed_words]
        )

    return preprocessed_text, tagged_text

In [None]:
dataset['preprocessed_text'], dataset['tagged_text'] = preprocess(dataset, 'reviewText')

In [None]:
dataset.head()

In [None]:
#dataset['reviewText'].iloc[1000]

In [None]:
dataset.drop(columns=['feature', 'language'], inplace=True)

In [None]:
dataset.to_csv(PROCESSED_DATA_DIR / 'preprocessed_dataset.csv', index=False, sep=';')

In [None]:
# Number of reviews per brand as a two-column frame: 'brand' and 'count'
brand_frequencies = dataset['brand'].value_counts()
cell_phones_brand_counts = brand_frequencies.rename_axis('brand').reset_index(name='count')

# Horizontal bar chart with counts on the x axis and brand names on the y axis.
# NOTE(review): the title says "top 10" but every brand is passed to the plot;
# confirm whether sb_bar_plot truncates the data itself.
sb_bar_plot(
    x=cell_phones_brand_counts['count'],
    y=cell_phones_brand_counts['brand'],
    orient='h',
    title='top 10 brand',
    xlabel='Count',
    ylabel='Brand',
)


In [None]:
cell_phones_brand_counts