# B2W-Reviews01

More information: https://github.com/americanas-tech/b2w-reviews01/blob/main/README.md

## Configuration

In [None]:
# Optional: when the data lives in a Google Drive folder, uncomment the two
# lines below to mount the drive in Colab.
# from google.colab import drive
# drive.mount('/content/drive')

# Path prefix placed directly in front of the dataset file name when it is
# loaded, so a non-empty value should end with a path separator.
folder = ''

In [None]:
import time
import joblib
import requests

from io import BytesIO

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from wordcloud import WordCloud

In [None]:
import nltk
from nltk import word_tokenize

# Fetch the 'punkt' resource into the local NLTK data directory
# (presumably the tokenizer data word_tokenize relies on -- confirm).
nltk.download('punkt')

In [None]:
# Install the transformers and sentencepiece packages into the notebook
# runtime; -q keeps pip's output quiet.
!pip install transformers sentencepiece -q

In [None]:
from transformers import pipeline

# Hugging Face model id; the same id is used to fetch both the model weights
# and the matching tokenizer.
model_path = "citizenlab/twitter-xlm-roberta-base-sentiment-finetunned"

# Text-classification pipeline used later to label each review.
sentiment_classifier = pipeline(
    task="text-classification",
    model=model_path,
    tokenizer=model_path,
)

In [None]:
# Install spaCy, then download the pt_core_news_lg model package so that
# spacy.load('pt_core_news_lg') can find it; -q keeps the output quiet.
!pip install spacy -q
!python -m spacy download pt_core_news_lg -q

In [None]:
import spacy

# Load the pt_core_news_lg spaCy pipeline (the package downloaded above).
nlp_sp = spacy.load('pt_core_news_lg')

# Default stop-word collection of the loaded pipeline's language; used below
# to discard tokens during noun extraction.
stop_words_pt = nlp_sp.Defaults.stop_words

In [None]:
# Load the reviews dataset; `folder` is prepended as-is to the file name.
df = pd.read_csv(folder + 'B2W-Reviews01.csv')

# Name of the text column processed in the sections below.
column = 'review_text'
# Tag inserted into the names of the CSV files written by later cells.
file_suffix = 'Text'

## Exploratory data analysis

In [None]:
# List the column labels of the loaded dataset.
df.columns

In [None]:
# Print a per-column overview (dtype and non-null count) of the dataset.
df.info()

In [None]:
# Descriptive statistics using pandas' default column selection.
df.describe()

In [None]:
# Show the first two rows as a sample of the raw records.
df.head(2)

In [None]:
# Restrict the summary to the two numeric columns of interest.
df_numeric = df[['overall_rating', 'reviewer_birth_year']]

# Print each aggregate in turn; the statistic name doubles as the DataFrame
# method that computes it.
for stat_name in ('mean', 'median', 'min', 'max'):
    stat_values = getattr(df_numeric, stat_name)()
    print(f'numeric columns {stat_name}: \n{stat_values}\n')

In [None]:
# Peek at the first values of the submission date column.
df['submission_date'].head()

In [None]:
# Report how many distinct values each identifier/category column holds.
# Series.nunique() ignores missing values, so a column containing NaN is not
# credited with an extra "user", "brand" or "category".
for label, col in (
    ('users', 'reviewer_id'),
    ('products', 'product_id'),
    ('brands', 'product_brand'),
    ('categories level 1', 'site_category_lv1'),
    ('categories level 2', 'site_category_lv2'),
):
    print(f'number of {label}:', df[col].nunique())

In [None]:
# Number of reviews per reviewer state.
df.groupby('reviewer_state')['reviewer_state'].count()

In [None]:
# Number of reviews per reviewer gender.
df.groupby('reviewer_gender')['reviewer_gender'].count()

In [None]:
# Number of reviews per value of the "recommend to a friend" answer.
df.groupby('recommend_to_a_friend')['recommend_to_a_friend'].count()

## Plotting data

In [None]:
# Reviews per state, computed once and drawn twice: as a bar chart and as a
# histogram (with KDE) of the per-state counts.
state_counts = df.groupby('reviewer_state')['reviewer_state'].count()

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
bar_ax, hist_ax = axes
state_counts.plot.bar(ax=bar_ax)
sns.histplot(state_counts, kde=True, ax=hist_ax)
plt.show()

In [None]:
# One pie chart per categorical column, side by side: share of reviews by
# gender (left) and by recommendation answer (right).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
for pie_ax, col in zip(axes, ('reviewer_gender', 'recommend_to_a_friend')):
    df.groupby(col)[col].count().plot.pie(ax=pie_ax)
plt.show()

In [None]:
# Distribution of the overall rating: box plot on the left, histogram on the
# right.
ratings = df['overall_rating']

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
box_ax, hist_ax = axes
sns.boxplot(ratings, ax=box_ax)
sns.histplot(ratings, ax=hist_ax)
plt.show()

In [None]:
# Birth-year distribution: the raw column first, then only the values above
# 1900 as a box plot and as a histogram with KDE.
birth_years = df['reviewer_birth_year']
after_1900 = birth_years[birth_years > 1900]

fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
sns.boxplot(birth_years, ax=axes[0])
sns.boxplot(after_1900, ax=axes[1])
sns.histplot(after_1900, kde=True, ax=axes[2])
plt.show()

In [None]:
# Histogram (with KDE) of how many reviews each brand / category received.
# Each panel is described by its column and any extra histplot options; only
# the brand panel uses a fixed bin width.
panels = (
    ('product_brand', {'binwidth': 3}),
    ('site_category_lv2', {}),
    ('site_category_lv1', {}),
)

fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
for panel_ax, (col, extra_opts) in zip(axes, panels):
    sns.histplot(df.groupby(col)[col].count(), kde=True, ax=panel_ax, **extra_opts)
plt.show()

## Processing user reviews

### Using Bag-of-Words

#### spaCy Aspect Extraction

In [None]:
# Extract, for every review that has text, the nouns it mentions (surface form
# and lemma) and store them as space-separated strings in two new columns.

# Keep only the rows whose text column is present.
df_nouns = df[df[column].notna()].copy()

review_nouns = []
review_lemmas = []

# nlp_sp.pipe processes the texts as a stream and yields one Doc per input, in
# input order, which avoids rewriting a whole DataFrame row per review.
# astype(str) guards against cells that were not parsed as strings.
texts = df_nouns[column].astype(str)
for j, doc in enumerate(nlp_sp.pipe(texts), start=1):
    # A token counts as an aspect candidate when it is tagged as a noun, has a
    # non-empty lemma and is not a stop word.
    nouns = [
        w for w in doc
        if w.pos_ == 'NOUN' and w.lemma_ and w.text.lower() not in stop_words_pt
    ]
    review_nouns.append(" ".join(w.text for w in nouns))
    review_lemmas.append(" ".join(w.lemma_ for w in nouns))

    # Progress report every 10,000 reviews.
    if j % 10000 == 0:
        print('Interaction:', j)

df_nouns['review_nouns'] = review_nouns
df_nouns['review_lemmas'] = review_lemmas

# Build the output path the same way the input path is built (plain prefix),
# so an empty `folder` writes next to the notebook instead of to '/'.
df_nouns.to_csv(folder + 'B2W-Reviews01' + file_suffix + 'Nouns.csv')

### Sentiment Analysis

In [None]:
# Run the sentiment classifier over every review text and store the predicted
# label and score in two new columns, checkpointing the result to CSV.
start = time.time()

# Build the output path the same way the input path is built (plain prefix),
# so an empty `folder` writes next to the notebook instead of to '/'.
sentiments_path = folder + 'B2W-Reviews01' + file_suffix + 'Sentiments.csv'

df_sent = df_nouns.copy()
# Rows whose classification fails keep these empty placeholders.
df_sent['review_sent_label'] = ''
df_sent['review_sent_score'] = ''

for j, (i, text) in enumerate(df_sent[column].items(), start=1):
    try:
        # The pipeline returns a list of predictions; take the first one.
        s = sentiment_classifier(text)[0]
    except Exception as err:
        # Best effort: report the failing row and continue. Catching Exception
        # (not a bare except) lets Ctrl-C / kernel interrupts stop the loop.
        # NOTE(review): very long reviews may be what ends up here -- confirm.
        print("Error: ", i, err)
    else:
        # Scalar writes touch only the two result cells of this row.
        df_sent.at[i, 'review_sent_label'] = s['label']
        df_sent.at[i, 'review_sent_score'] = s['score']

    # Every 10,000 reviews: report progress and save a checkpoint so a long
    # run can be inspected (or salvaged) before it finishes.
    if j % 10000 == 0:
        print('Interaction:', j, 'Elapsed time:', (time.time() - start))
        df_sent.to_csv(sentiments_path)

df_sent.to_csv(sentiments_path)
print('Elapsed time:', (time.time() - start))

### End