In [None]:
import requests
import gzip
import shutil
import pandas as pd
import pycld2
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Gzipped JSON dump of the Goodreads "fantasy & paranormal" reviews.
url = (
    "https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/goodreads/byGenre/"
    "goodreads_reviews_fantasy_paranormal.json.gz"
)

In [None]:
local_filename = 'your_data.json.gz'

# Download the gzipped file, streaming it to disk chunk by chunk instead of
# holding the whole body in memory through `response.content`.
with requests.get(url, stream=True, timeout=60) as response:
    # Fail loudly on HTTP errors so an error page is never saved as the dataset.
    response.raise_for_status()

    # Save the gzipped content to a local file
    with open(local_filename, 'wb') as f:
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)

print(f'The file {local_filename} has been downloaded.')

In [None]:
# Load the file written by the download cell. Reusing `local_filename` keeps
# both cells pointing at the same file instead of a hard-coded absolute
# '/content/...' path that only matches when the working directory is /content.
reviews = pd.read_json(local_filename, lines=True, compression="gzip")

# Preprocessing

In [None]:
# Keep only the rows that have a value in every column.
data = reviews.dropna(axis=0, how="any")

In [None]:
#Add the "sentiment" column
def sentiment(vote):
  """Return "NEG" for a rating below 3 and "POS" for any other rating."""
  return "NEG" if vote < 3 else "POS"

# Label every review by passing its rating through sentiment().
data["sentiment"] = data["rating"].map(sentiment)

In [None]:
# Function to detect the language of a text
def detect_language(text):
    """Return the language code pycld2 reports for ``text``.

    Args:
        text: The string to classify.

    Returns:
        The fourth field of the first vector returned by
        ``pycld2.detect(text, returnVectors=True)`` (this is the value the
        notebook compares against ``'en'``), or ``'unknown'`` when detection
        raises or returns no vectors.
    """
    try:
        _, _, _, vectors = pycld2.detect(text, returnVectors=True)
        return vectors[0][3]
    except Exception:
        # Best effort: anything pycld2 cannot handle is labelled 'unknown'.
        # Catching Exception rather than using a bare `except:` lets
        # KeyboardInterrupt/SystemExit propagate, so a long-running
        # .apply(detect_language) can still be interrupted.
        return 'unknown'

In [None]:
# Apply language detection to the "review_text" column and create a new column 'language'
data['language'] = data['review_text'].apply(detect_language)

# Keep only the English rows and drop the helper 'language' column in a single
# chained expression. This builds a new frame instead of dropping in place on a
# filtered slice of `data`, which can trigger SettingWithCopyWarning.
data_filtered = data.loc[data['language'] == 'en'].drop(columns='language')

# Now, 'data_filtered' contains only English strings in the "review_text" column


In [None]:
# Narrow the frame to the review text, its rating and its sentiment label.
df = data_filtered.loc[:, ["review_text", "rating", "sentiment"]]

# Undersampling

In [None]:
# Random Under Sampling (RUS)
SAMPLES_PER_CLASS = 50000   # target number of reviews kept per sentiment class
RANDOM_STATE = 42           # fixed seed so the balanced sample is reproducible

# Never ask for more rows than the smallest class holds (sampling without
# replacement would raise), while still keeping both classes the same size.
n_per_class = min(SAMPLES_PER_CLASS, int(df['sentiment'].value_counts().min()))

# GroupBy.sample draws from each class directly and keeps the 'sentiment'
# column in the result, so it does not depend on groupby.apply handing the
# grouping column to a callback.
df_bal = (
    df.groupby('sentiment')
      .sample(n=n_per_class, random_state=RANDOM_STATE)
      .reset_index(drop=True)
)

In [None]:
# Report the row counts before and after under-sampling.
print(f'Before sampling: {len(df)} obs\nAfter sampling: {len(df_bal)} obs')

## Text cleaning

In [None]:
import string
import re

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Resources used by the lemmatizer, the stop-word filter and word_tokenize.
# Recent NLTK releases load the tokenizer data from 'punkt_tab' rather than
# 'punkt', so both are requested to keep word_tokenize working across versions.
for resource in ('wordnet', 'stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)


In [None]:
# Single WordNetLemmatizer instance shared by every lemmatizer_function call.
lemmatizer = WordNetLemmatizer()

# Strip every run of digits from the text.
def remove_numbers(text_to_preprocess):
    """Return ``text_to_preprocess`` with each run of digits deleted."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text_to_preprocess)

# Translation table that deletes every character of string.punctuation;
# built once instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily filled cache of the stop-word list (see _stopword_set).
_STOPWORD_CACHE = None


# function to remove all the punctuation marks from the text
def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character removed.

    Args:
        text: The string to clean. For backward compatibility, a sequence
            whose first element is the string (the 1-tuple the pipeline used
            to pass because of a stray trailing comma) is still accepted.

    Returns:
        The text without punctuation characters.
    """
    if not isinstance(text, str):
        text = text[0]
    return text.translate(_PUNCTUATION_TABLE)


def _stopword_set():
    """Load ``stopwords.words()`` once and return it as a frozenset."""
    # NOTE(review): words() is called without a language, exactly as before;
    # presumably that covers every language NLTK ships. Confirm whether only
    # English stop words were intended.
    global _STOPWORD_CACHE
    if _STOPWORD_CACHE is None:
        _STOPWORD_CACHE = frozenset(stopwords.words())
    return _STOPWORD_CACHE


# function to remove all the stopwords from the text
def remove_stopwords(text):
    """Return the whitespace-separated words of ``text`` that are not stop words.

    Each kept word is prefixed with a single space, so a non-empty result
    starts with a space. The stop-word list is loaded once and looked up as a
    set instead of being reloaded for every word.
    """
    known_stopwords = _stopword_set()
    return ''.join(' ' + item for item in text.split() if item not in known_stopwords)


# function to remove extra whitespaces from the text
def remove_extra_whitespace(text):
    """Collapse whitespace runs to single spaces and trim both ends."""
    return " ".join(text.split())


# function to tokenize the text into words
def tokenizer(text):
    """Return the tokens produced by ``word_tokenize`` for ``text``."""
    return word_tokenize(text)


# function to lemmatize the tokenized words
def lemmatizer_function(tokenized_text):
    """Lemmatize each token and return them as one string.

    Each lemma is prefixed with a single space, so a non-empty result starts
    with a space.
    """
    return ''.join(' ' + lemmatizer.lemmatize(token) for token in tokenized_text)


# function to preprocess the text by lowercasing, removing numbers, punctuation, stopwords, extra whitespaces and lemmatizing
def preprocess_text(text):
    """Run the full cleaning pipeline on one review.

    Args:
        text: The raw review string.

    Returns:
        The lowercased text with digits, punctuation and stop words removed,
        whitespace normalised, and every remaining token lemmatized (each
        lemma preceded by a space).
    """
    text = text.lower()
    # No trailing comma here: the value must stay a string, not a 1-tuple.
    no_nums = remove_numbers(text)
    no_punct = remove_punctuation(no_nums)
    no_stopw = remove_stopwords(no_punct)
    no_whtspace = remove_extra_whitespace(no_stopw)
    tokenized = tokenizer(no_whtspace)
    lemmatized = lemmatizer_function(tokenized)
    return lemmatized

In [None]:
def preprocess_loader(dataframe):
    """Return ``dataframe`` with a 'preprocessed_text' column added.

    Args:
        dataframe: A frame with a 'review_text' column.

    Returns:
        A new DataFrame holding the original columns plus 'preprocessed_text',
        the result of ``preprocess_text`` on each review. The input frame is
        left untouched, so passing a slice of a larger frame does not write
        into that slice.
    """
    # tqdm.pandas() used to be called here, but it only registers
    # .progress_apply, which this function never used.
    return dataframe.assign(
        preprocessed_text=dataframe['review_text'].apply(preprocess_text)
    )

In [None]:
from multiprocessing.pool import ThreadPool as Pool
from threading import Lock
import numpy as np

if __name__ == '__main__':
    train_ds = df_bal

    N_WORKERS = 16      # threads in the pool
    CHUNK_SIZE = 1000   # rows handed to a worker at a time

    # Split into row-slices of CHUNK_SIZE rather than 100000 pieces, which
    # produced one tiny DataFrame per row. An empty frame still yields one
    # (empty) chunk so pd.concat below has something to concatenate.
    df_split = [train_ds.iloc[start:start + CHUNK_SIZE]
                for start in range(0, len(train_ds), CHUNK_SIZE)] or [train_ds]

    # The context manager tears the pool down even if a worker raises. The
    # results are consumed inside the block because leaving it terminates the pool.
    with Pool(N_WORKERS) as pool:
        # The progress bar advances once per chunk, so its total is the chunk count.
        results = list(tqdm(pool.imap(preprocess_loader, df_split),
                            total=len(df_split)))

    # imap yields chunks in submission order, so the row order is preserved.
    reviews = pd.concat(results)

In [None]:
# Check for null values after text cleaning
reviews.loc[reviews['preprocessed_text'].isnull()]

In [None]:
# Discard any row that still has a missing value in some column.
reviews = reviews.dropna(axis=0, how='any')

In [None]:
# Check the class balance
score_grp = (
    reviews
    .groupby(by='sentiment')['rating']
    .count()
)
score_grp

In [None]:
# Reset the DataFrame index
reviews = reviews.reset_index(drop=True)
# Save the dataset
reviews.to_csv("final.csv")

# Exploratory Analysis

## Before sampling

In [None]:
# Number of reviews per rating value in `df`.
sns.countplot(data=df, x='rating').set_title('Distribution of Ratings')
plt.show()

In [None]:
# Number of reviews per sentiment label in `df`.
sns.countplot(data=df, x='sentiment').set_title('Distribution of Sentiments')
plt.show()

## After sampling

### Sentiment Distribution

In [None]:
sentiment_counts = reviews['sentiment'].value_counts()

# Choose each bar's colour from its label so NEG is always red and POS always
# green, whatever order value_counts() returns the classes in. A purely
# positional ['green', 'red'] list can colour NEG green when it comes first.
sentiment_palette = {'NEG': 'red', 'POS': 'green'}
colors = [sentiment_palette.get(label, 'gray') for label in sentiment_counts.index]
plt.figure(figsize=(8, 6))
sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette=colors)
plt.title('Sentiment Distribution')
plt.xlabel('Sentiment')
plt.ylabel('Number of Review')
plt.show()

### Rating Distribution

In [None]:
# Review counts per rating, split and coloured by sentiment label.
plt.figure(figsize=(10, 6))
sns.countplot(
    data=reviews,
    x='rating',
    hue='sentiment',
    palette={'NEG': 'red', 'POS': 'green'},
)
plt.gca().set(title='Rating Distribution', xlabel='Rating', ylabel='Number of Review')
plt.legend(title='Sentiment')
plt.show()

### Average Length of Reviews by Rating

In [None]:
# Length in characters of each cleaned review.
reviews['char_length'] = reviews['preprocessed_text'].str.len()

avg_char_length_by_rating = reviews.groupby('rating')['char_length'].mean().reset_index()

# Colour each bar from its own rating, using the same rule as sentiment()
# (rating < 3 is NEG/red, anything else POS/green). A fixed six-colour list
# assumed every rating from 0 to 5 was present and mis-coloured the bars
# whenever a rating value was missing from the sample.
col = ['red' if rating < 3 else 'green'
       for rating in avg_char_length_by_rating['rating']]

plt.figure(figsize=(10, 6))
plt.bar(avg_char_length_by_rating['rating'], avg_char_length_by_rating['char_length'], color=col)
plt.title('Average Length of Reviews by Rating')
plt.xlabel('Rating')
plt.ylabel('Average Length of Reviews')
plt.show()

### Bigram Analysis

In [None]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.collocations import *
# Bigram association measures, and a finder fed with each cleaned review
# split on whitespace.
bigram_measures = nltk.collocations.BigramAssocMeasures()
bigram_finder = BigramCollocationFinder.from_documents(
    [review.split() for review in reviews['preprocessed_text']]
)

In [None]:
# Discard bigrams seen fewer than 10 times.
bigram_finder.apply_freq_filter(min_freq=10)

In [None]:
# (bigram, count) pairs that survived the frequency filter.
bigram_freq = list(bigram_finder.ngram_fd.items())
# Preview only the first entries; echoing the whole list floods the cell output.
bigram_freq[:20]

In [None]:
# Table of bigram keyphrases indexed by phrase, most frequent first.
df_freq = (
    pd.DataFrame(
        [(' '.join(ngram), freq) for ngram, freq in bigram_freq],
        columns=['keyphrase', 'count'],
    )
    .sort_values(by='count', ascending=False)
    .set_index('keyphrase')
)
df_freq

In [None]:
# Horizontal bar chart of the 20 most frequent keyphrases, longest bar on top.
top_phrases = df_freq.head(20).sort_values(by='count')
top_phrases.plot(kind='barh')
plt.gca().set(title='Trending keyphrases', ylabel='phrase', xlabel='count')
plt.legend().set_visible(False)
plt.show()

### Trigram Analysis

In [None]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.collocations import *
# Instantiate the measures object, as the bigram cell does with
# BigramAssocMeasures(); previously the class itself was bound here because
# the call parentheses were missing.
trigram_measures = nltk.collocations.TrigramAssocMeasures()
# Trigram finder fed with each cleaned review split on whitespace.
trigram_finder = TrigramCollocationFinder.from_documents([d.split() for d in reviews['preprocessed_text']])

In [None]:
# Discard trigrams seen fewer than 10 times.
trigram_finder.apply_freq_filter(min_freq=10)

In [None]:
# (trigram, count) pairs that survived the frequency filter.
trigram_freq = list(trigram_finder.ngram_fd.items())
# Preview only the first entries; echoing the whole list floods the cell output.
trigram_freq[:20]

In [None]:
# Table of trigram keyphrases indexed by phrase, most frequent first.
df_freq = (
    pd.DataFrame(
        [(' '.join(ngram), freq) for ngram, freq in trigram_freq],
        columns=['keyphrase', 'count'],
    )
    .sort_values(by='count', ascending=False)
    .set_index('keyphrase')
)
df_freq

In [None]:
# Horizontal bar chart of the 20 most frequent keyphrases, longest bar on top.
top_phrases = df_freq.head(20).sort_values(by='count')
top_phrases.plot(kind='barh')
plt.gca().set(title='Trending keyphrases', ylabel='phrase', xlabel='count')
plt.legend().set_visible(False)
plt.show()

# Text Representation

In [None]:
# Series of cleaned review texts fed to both vectorisers.
text_preprocessed = reviews.loc[:, 'preprocessed_text']

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer      #-- Tf-Idf
from gensim.models.doc2vec import Doc2Vec, TaggedDocument        #-- Doc2Vec
import joblib

### Tf-Idf

In [None]:
# TF-IDF over 1- to 3-grams, capped at 1000 features.
vectorizer_tfidf = TfidfVectorizer(max_features=1000, ngram_range=(1, 3))

# Fit on the cleaned texts (cast to unicode), then densify the result.
text_tfidf = vectorizer_tfidf.fit_transform(text_preprocessed.astype('U'))
text_tfidf = text_tfidf.toarray()

In [None]:
# Show the dimensions of the dense TF-IDF array.
print(text_tfidf.shape)

In [None]:
# Persist the dense TF-IDF array to disk.
joblib.dump(value=text_tfidf, filename='processed_tfidf.save')

## Doc2Vec

In [None]:
# Wrap every cleaned review as a TaggedDocument tagged with its position.
tagged_reviews = [
    TaggedDocument(words=doc.split(), tags=[str(position)])
    for position, doc in enumerate(text_preprocessed)
]

In [None]:
# Configure the Doc2Vec model: 1000-dimensional vectors, window of 5,
# min_count=1, 4 worker threads, 50 epochs.
# NOTE(review): no seed is passed, so results presumably differ between runs - confirm.
d2v = Doc2Vec(vector_size=1000, window=5, min_count=1, workers=4, epochs=50)
# Build the vocabulary from the tagged reviews before training.
d2v.build_vocab(tagged_reviews)
# Train on the same corpus, using the example count and epoch count held by the model.
d2v.train(tagged_reviews, total_examples=d2v.corpus_count, epochs=d2v.epochs)

In [None]:
# Generate the embeddings for the data
text_d2v = [
    d2v.infer_vector(review.split())
    for review in text_preprocessed
]

In [None]:
# Number of inferred vectors (one per entry of text_preprocessed).
len(text_d2v)

In [None]:
# Persist the list of inferred Doc2Vec vectors to disk.
joblib.dump(value=text_d2v, filename='processed_d2v.save')