# Spotify App Reviews

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import balanced_accuracy_score, confusion_matrix

# Wordcloud
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS
from PIL import Image

# Text preprocessing
from string import punctuation
from textblob import TextBlob, Word
import nltk
nltk.download('omw-1.4')
import contractions
import re
import unidecode

import warnings

In [None]:
# Loading the data: read the reviews CSV from the working directory into a
# DataFrame and display its first rows.
df = pd.read_csv("reviews.csv")
df.head()

In [None]:
# Column names, dtypes and non-null counts of the raw data.
df.info()

In [None]:
# Summary statistics of the raw data (pandas' default column selection).
df.describe()

### Cleaning the data

In [None]:
class Clean:
    """Thin wrapper around a DataFrame offering a few clean-up helpers.

    The helpers return a new ``pandas.DataFrame`` (not a ``Clean``); the
    wrapped frame is never modified.
    """

    def __init__(self, data):
        """Store the frame to clean.

        Args:
            data: The ``pandas.DataFrame`` the helpers operate on.
        """
        self.data = data

    def drop_cols(self, columns_list):
        """Return the data without the given columns.

        Args:
            columns_list: Column label, or list of labels, to remove.

        Returns:
            A new DataFrame lacking those columns.

        Raises:
            KeyError: If a label is not a column of the data.
        """
        return self.data.drop(columns=columns_list)

    def drop_by_value(self, data_column, value):
        """Return the rows whose entry in a column differs from ``value``.

        Args:
            data_column: Either the column itself (a Series aligned with the
                data, e.g. ``clean['Rating']``) or the column label as a
                string.
            value: Rows whose entry equals this value are dropped.

        Returns:
            An independent copy of the remaining rows, so that columns can be
            added to the result without touching the wrapped frame.
        """
        # Accept a column label as well as a ready-made Series.
        if isinstance(data_column, str):
            data_column = self.data[data_column]
        return self.data[data_column != value].copy()

    def __getitem__(self, item):
        """Delegate ``clean[item]`` to the wrapped DataFrame."""
        return self.data[item]

In [None]:
# Discard the columns that play no part in the rest of the notebook.
unused_columns = ['Time_submitted', 'Total_thumbsup', 'Reply']
df = Clean(df).drop_cols(unused_columns)

In [None]:
# A rating of 3 is neither positive nor negative, so those reviews are removed.
cleaner = Clean(df)
df = cleaner.drop_by_value(cleaner['Rating'], 3)

In [None]:
# Binary target derived from the rating: 4 or 5 -> 'Positive', otherwise
# (1 or 2, since 3 was removed) -> 'Negative'.
is_positive = df['Rating'] >= 4
df['NPS'] = np.where(is_positive, 'Positive', 'Negative')
df.head()

## Exploratory Analysis (EDA)

In [None]:
# Number of characters of each review.
# .str.len() is the vectorised form of apply(len); a missing review yields
# NaN instead of raising TypeError.
df['Review_chrs_num'] = df['Review'].str.len()

sns.set_theme(font='Times New Roman')
# The x-range is capped at 520 characters; longer reviews fall outside the plot.
df['Review_chrs_num'].hist(bins='auto', range=(0, 520), figsize=(12, 8), color='#81b71a')
plt.ylabel("Count", fontsize=15)
plt.xlabel("Number of characters", fontsize=15)
plt.title("Review characters distribution", fontsize=18)
plt.show()

In [None]:
# Number of whitespace-separated words of each review.
# Vectorised form of apply(lambda x: len(x.split())); a missing review yields
# NaN instead of raising.
df['Review_words_num'] = df['Review'].str.split().str.len()

# The x-range is capped at 110 words; longer reviews fall outside the plot.
df['Review_words_num'].hist(bins='auto', range=(0, 110), figsize=(12, 8), color='#81b71a')
plt.ylabel("Count", fontsize=15)
plt.xlabel("Number of words", fontsize=15)
plt.title("Review words distribution", fontsize=18)
plt.show()

In [None]:
# Top 5 largest reviews: full rows of the five reviews with the most
# characters, longest first.
df.sort_values(by='Review_chrs_num', ascending=False).head(5)

In [None]:
# Texts of the longest reviews as a plain list, longest first.
df_copy = df.copy()
by_length_desc = df_copy.sort_values('Review_chrs_num', ascending=False)
largest = by_length_desc['Review'].tolist()
largest[:5]

In [None]:
# Top 5 shortest reviews: full rows of the five reviews with the fewest
# characters, shortest first.
df.sort_values(by='Review_chrs_num', ascending=True).head(5)

In [None]:
# Texts of the shortest reviews as a plain list, shortest first.
by_length_asc = df_copy.sort_values('Review_chrs_num', ascending=True)
smallest = by_length_asc['Review'].tolist()
smallest[:5]

In [None]:
# Word cloud of all reviews, using the Spotify logo image both as the mask
# and as the source of the word colours.
text = " ".join(df['Review'])
stopwords = set(STOPWORDS)

spotify_mask = np.array(Image.open('Spotify-logo.jpg'))
spotify_colormap = ImageColorGenerator(spotify_mask)

# Build the cloud from the concatenated review text.
cloud_options = {
    'mask': spotify_mask,
    'stopwords': stopwords,
    'background_color': 'white',
}
wc = WordCloud(**cloud_options).generate(text)

# Recolour the words from the logo image and render without axes.
recoloured = wc.recolor(color_func=spotify_colormap)
plt.imshow(recoloured, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Number of positive and negative reviews.
nps_counts = df['NPS'].value_counts()
print(nps_counts)

# value_counts() orders the classes by frequency, so the colours are looked up
# by label rather than by position: 'Positive' is always green and 'Negative'
# always red, whichever class is larger.
nps_colors = {'Positive': '#81b71a', 'Negative': '#cc0000'}
bar_colors = [nps_colors[label] for label in nps_counts.index]

nps_counts.plot(kind='bar', figsize=(12, 8), color=bar_colors)
plt.xlabel("NPS", fontsize=15)
plt.ylabel("Count", fontsize=15)
plt.title("Counting the positive and negative reviews")
plt.show()

## Splitting the data

In [None]:
# Report how many rows are exact duplicates, then keep only the first
# occurrence of each.
duplicate_share = df.duplicated().mean()
print(f"Size of dataframe with duplicated data: {df.shape}")
print(f"Percent of duplicated data: {duplicate_share:.2%}")

df.drop_duplicates(keep='first', inplace=True)

In [None]:
# Dataframe only with Review (model input) and NPS (target).
df = df[['Review', 'NPS']]
# head(0) displays the column headers only, with no rows.
df.head(0)

In [None]:
# Hold out 20% of the reviews for testing, stratified on the target so that
# both parts keep the same Positive/Negative proportions.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    stratify=df['NPS'],
    random_state=42,
)
del df  # the full frame is not referenced again below

X_train, y_train = df_train['Review'], df_train['NPS']
X_test, y_test = df_test['Review'], df_test['NPS']
# Placeholders only: no separate validation split is created here.
X_val = None
y_val = None

print(f"Train size: {df_train.shape}")
print(f"Test size: {df_test.shape}")

## Data Preprocessing

In [None]:
def text_preprocessor(text):
    """Normalise a raw review into lower-case, lemmatised plain text.

    Steps, in order: accent removal, lower-casing, contraction expansion, URL
    removal, punctuation removal, whitespace collapsing, spelling correction
    and lemmatisation.

    Contractions and URLs are handled *before* punctuation is stripped: once
    apostrophes, dots and slashes are gone, "we're" has already become "were"
    and a URL can no longer be told apart from an ordinary word.

    Args:
        text: The review text (``str``).

    Returns:
        The cleaned text as a single space-separated string.
    """
    # Transliterate to ASCII first so typographic apostrophes become plain
    # ones before the contractions are expanded.
    text = unidecode.unidecode(text)

    text = text.lower()

    # Expand contractions while the apostrophes are still present.
    text = contractions.fix(text)

    # Remove URLs. The scheme / "www." must start a word, so ordinary words
    # that merely contain "www", "ftp", "ssh" or "http" are left alone.
    text = re.sub(r'\b(?:https?|ftp|ssh)://\S+|\bwww\.\S+', '', text)

    # Remove ASCII punctuation in a single pass.
    text = text.translate(str.maketrans('', '', punctuation))

    # Collapse runs of whitespace.
    text = re.sub(r'\s+', ' ', text)

    # Fix typos (TextBlob's spelling correction).
    text = str(TextBlob(text).correct())

    # Lemmatise word by word; split() also drops leading/trailing blanks.
    text = ' '.join(Word(w).lemmatize() for w in text.split())

    return text

# This is how the text would be cleaned, as is usually done for NLP (but it turned out to be far too slow when running TextBlob)

In [None]:
# Count vectors: the vocabulary is learned from the training reviews only and
# then applied unchanged to the test reviews.
cv = CountVectorizer()
X_train_vectorized = cv.fit_transform(X_train)
X_test_vectorized = cv.transform(X_test)

vocabulary = cv.get_feature_names_out()
print(f"Count vector: {vocabulary}")
print(f"Count vector size: {vocabulary.shape}")

In [None]:
# Word counts of the test reviews as a sparse DataFrame, one column per
# vocabulary word.
test_counts = cv.transform(X_test)
transformed = pd.DataFrame.sparse.from_spmatrix(
    test_counts,
    columns=cv.get_feature_names_out(),
)

# Keep only the words that occur at least once in the test reviews.
occurs_in_test = (transformed != 0).any()
transformed = transformed.loc[:, occurs_in_test]
transformed.head(3)

## Training, Validation and Hyperparameters selection

### Logistic Regression

In [None]:
# Bag-of-words followed by logistic regression.
logreg_clf = LogisticRegression(random_state=42, max_iter=200)
logreg_pipe = make_pipeline(CountVectorizer(), logreg_clf)

# Parameters of the classifier step only (the vectorizer is sliced off).
logreg_pipe[1:].get_params()

In [None]:
# NOTE: this silences every warning category for the rest of the session,
# including the warnings GridSearchCV emits when a candidate fails to fit.
warnings.filterwarnings("ignore")

# Regularisation strengths 1e-3 ... 1e3, one per decade.
c_values = 10.0 ** np.arange(-3, 4)

# Only penalty/solver pairs that LogisticRegression accepts are searched.
# The default solver (lbfgs) supports 'l2' but rejects 'l1', so 'l1' is paired
# with liblinear. 'elasticnet' is left out because it needs solver='saga' plus
# an l1_ratio, and the unpenalised model is left out because C has no effect
# on it and its spelling ('none' vs None) depends on the scikit-learn version.
# Invalid pairs would only produce failed fits scored as NaN.
param_grid = [
    {"logisticregression__penalty": ['l2'],
     "logisticregression__C": c_values},
    {"logisticregression__penalty": ['l1'],
     "logisticregression__solver": ['liblinear'],
     "logisticregression__C": c_values},
]

# 5-fold cross-validated search, scored with balanced accuracy.
logreg_gs = GridSearchCV(logreg_pipe, param_grid, cv=5, scoring='balanced_accuracy')
logreg_gs.fit(X_train, y_train)

print("Best parameters:", logreg_gs.best_params_)
pd.DataFrame(data=logreg_gs.cv_results_).sort_values(by='rank_test_score').head()

In [None]:
# Final logistic regression with fixed hyperparameters, fitted on the whole
# training set.
logreg_model = make_pipeline(
    CountVectorizer(),
    LogisticRegression(penalty='l2', C=0.1, max_iter=200, random_state=42),
)
logreg_model.fit(X_train, y_train)

logreg_train_bac = balanced_accuracy_score(y_train, logreg_model.predict(X_train))
print(f"Train balanced accuracy: {logreg_train_bac:.2%}")
# The validation figure is the best mean CV score found by the grid search.
print(f"Validation balanced accuracy (5-fold): {logreg_gs.best_score_:.2%}")

### SVM with RBF kernel

In [None]:
# Bag-of-words followed by an RBF-kernel SVM (solver capped at 200 iterations).
svm_clf = SVC(kernel='rbf', random_state=42, max_iter=200)
svm_pipe = make_pipeline(CountVectorizer(), svm_clf)

# Parameters of the classifier step only (the vectorizer is sliced off).
svm_pipe[1:].get_params()

In [None]:
# C and gamma are both searched over 1e-3 ... 1e3, one value per decade.
decade_exponents = np.arange(-3, 3.01, 1)
param_grid = {
    'svc__C': 10.0**decade_exponents,
    'svc__gamma': 10.0**decade_exponents,
}

# 5-fold cross-validated search, scored with balanced accuracy.
svm_gs = GridSearchCV(svm_pipe, param_grid, cv=5, scoring='balanced_accuracy')
svm_gs.fit(X_train, y_train)

print("Best parameters:", svm_gs.best_params_)
svm_cv_table = pd.DataFrame(data=svm_gs.cv_results_)
svm_cv_table.sort_values(by='rank_test_score').head()

In [None]:
# Mean CV score of every (C, gamma) candidate on log-log axes; the best
# candidate is marked with an open black square.
svm_results = svm_gs.cv_results_
svm_best = svm_gs.best_params_

plt.scatter(
    svm_results['param_svc__C'],
    svm_results['param_svc__gamma'],
    c=svm_results['mean_test_score'],
    cmap='jet',
)
plt.plot(svm_best['svc__C'], svm_best['svc__gamma'], 'ks', mfc='none', markersize=10)
plt.xscale('log')
plt.yscale('log')
plt.xlabel('C')
plt.ylabel('gamma')
plt.colorbar()
plt.show()

In [None]:
# Final RBF SVM with fixed hyperparameters, fitted on the whole training set.
svm_model = make_pipeline(
    CountVectorizer(),
    SVC(kernel='rbf', C=10, gamma=0.01, max_iter=200, random_state=42),
)
svm_model.fit(X_train, y_train)

svm_train_bac = balanced_accuracy_score(y_train, svm_model.predict(X_train))
print(f"Train balanced accuracy: {svm_train_bac:.2%}")
# The validation figure is the best mean CV score found by the grid search.
print(f"Validation balanced accuracy (5-fold): {svm_gs.best_score_:.2%}")

### Decision Tree

In [None]:
# Bag-of-words followed by a single decision tree.
tree_clf = DecisionTreeClassifier(random_state=42)
tree_pipe = make_pipeline(CountVectorizer(), tree_clf)

# Parameters of the classifier step only (the vectorizer is sliced off).
tree_pipe[1:].get_params()

In [None]:
# Searched: split criterion, maximum depth (5..14) and cost-complexity pruning
# strength (0.001..0.004). The remaining tree parameters (min_samples_split,
# min_samples_leaf, max_leaf_nodes, max_features, min_impurity_decrease) are
# not part of the search.
tree_step = 'decisiontreeclassifier__'
param_grid = {
    tree_step + 'criterion': ['log_loss', 'entropy', 'gini'],
    tree_step + 'max_depth': np.arange(5, 15),
    tree_step + 'ccp_alpha': np.arange(1, 5) / 1000,
}

# 5-fold cross-validated search, scored with balanced accuracy.
tree_gs = GridSearchCV(tree_pipe, param_grid, cv=5, scoring='balanced_accuracy')
tree_gs.fit(X_train, y_train)

print("Best parameters:", tree_gs.best_params_)
tree_cv_table = pd.DataFrame(data=tree_gs.cv_results_)
tree_cv_table.sort_values(by='rank_test_score').head()

In [None]:
# Final decision tree with fixed hyperparameters, fitted on the whole
# training set.
tree_model = make_pipeline(
    CountVectorizer(),
    DecisionTreeClassifier(
        criterion='log_loss',
        max_depth=14,
        ccp_alpha=0.001,
        random_state=42,
    ),
)
tree_model.fit(X_train, y_train)

tree_train_bac = balanced_accuracy_score(y_train, tree_model.predict(X_train))
print(f"Train balanced accuracy: {tree_train_bac:.2%}")
# The validation figure is the best mean CV score found by the grid search.
print(f"Validation balanced accuracy (5-fold): {tree_gs.best_score_:.2%}")

### Random Forest

In [None]:
# Bag-of-words followed by a random forest.
rf_clf = RandomForestClassifier(random_state=42)
rf_pipe = make_pipeline(CountVectorizer(), rf_clf)

# Parameters of the classifier step only (the vectorizer is sliced off).
rf_pipe[1:].get_params()

In [None]:
# Searched: split criterion, maximum depth (10..14) and cost-complexity
# pruning strength (0.001..0.004). The remaining forest parameters
# (n_estimators, max_features, min_samples_leaf, min_samples_split, bootstrap)
# are not part of the search.
rf_step = 'randomforestclassifier__'
param_grid = {
    rf_step + 'criterion': ['gini', 'entropy', 'log_loss'],
    rf_step + 'max_depth': np.arange(10, 15),
    rf_step + 'ccp_alpha': np.arange(1, 5) / 1000,
}

# 5-fold cross-validated search, scored with balanced accuracy.
rf_gs = GridSearchCV(rf_pipe, param_grid, cv=5, scoring='balanced_accuracy')
rf_gs.fit(X_train, y_train)

print("Best parameters:", rf_gs.best_params_)
rf_cv_table = pd.DataFrame(data=rf_gs.cv_results_)
rf_cv_table.sort_values(by='rank_test_score').head()

In [None]:
# Final random forest with fixed hyperparameters, fitted on the whole
# training set.
rf_model = make_pipeline(
    CountVectorizer(),
    RandomForestClassifier(
        criterion='entropy',
        max_depth=14,
        ccp_alpha=0.001,
        random_state=42,
    ),
)
rf_model.fit(X_train, y_train)

rf_train_bac = balanced_accuracy_score(y_train, rf_model.predict(X_train))
print(f"Train balanced accuracy: {rf_train_bac:.2%}")
# The validation figure is the best mean CV score found by the grid search.
print(f"Validation balanced accuracy (5-fold): {rf_gs.best_score_:.2%}")

### Gradient Boosting

In [None]:
# Bag-of-words followed by gradient-boosted trees.
gb_clf = GradientBoostingClassifier(random_state=42)
gb_pipe = make_pipeline(CountVectorizer(), gb_clf)

# Parameters of the classifier step only (the vectorizer is sliced off).
gb_pipe[1:].get_params()

In [None]:
# NOTE: this silences every warning category for the rest of the session,
# including the warnings GridSearchCV emits when a candidate fails to fit.
warnings.filterwarnings("ignore")

# Searched: learning rate, split criterion and cost-complexity pruning
# strength (0.001..0.004). The remaining parameters (max_depth, loss,
# n_estimators, min_samples_split, min_samples_leaf, min_weight_fraction_leaf,
# max_features, max_leaf_nodes, min_impurity_decrease) are not part of the
# search.
#
# 'mae' is not searched: recent scikit-learn releases accept only
# 'friedman_mse' and 'squared_error' as GradientBoostingClassifier criteria,
# so every 'mae' candidate would fail to fit and be scored NaN.
param_grid = {
    'gradientboostingclassifier__learning_rate': [0.01, 0.1, 1],
    'gradientboostingclassifier__criterion': ['friedman_mse', 'squared_error'],
    'gradientboostingclassifier__ccp_alpha': np.arange(1, 5) / 1000,
}

# 5-fold cross-validated search, scored with balanced accuracy.
gb_gs = GridSearchCV(gb_pipe, param_grid, cv=5, scoring='balanced_accuracy')
gb_gs.fit(X_train, y_train)

print("Best params:", gb_gs.best_params_)
pd.DataFrame(data=gb_gs.cv_results_).sort_values(by='rank_test_score').head()

In [None]:
# Final gradient boosting model with fixed hyperparameters, fitted on the
# whole training set.
gb_model = make_pipeline(
    CountVectorizer(),
    GradientBoostingClassifier(
        learning_rate=0.1,
        criterion='friedman_mse',
        ccp_alpha=0.001,
        random_state=42,
    ),
)
gb_model.fit(X_train, y_train)

gb_train_bac = balanced_accuracy_score(y_train, gb_model.predict(X_train))
print(f"Train balanced accuracy: {gb_train_bac:.2%}")
# The validation figure is the best mean CV score found by the grid search.
print(f"Validation balanced accuracy (5-fold): {gb_gs.best_score_:.2%}")

## Result Analysis


- Modelo escolhido: Regressão Logística

**Retreinamento**: Como, na validação cruzada k-fold, utilizamos refit=True no GridSearchCV para otimizar os hiperparâmetros e depois treinamos o modelo com os hiperparâmetros já otimizados no conjunto de treinamento completo, não há necessidade de fazer um retreinamento. Sendo assim, continuarei com os próximos tópicos a seguir.

In [None]:
# Score the logistic regression model on the held-out test reviews.
y_pred = logreg_model.predict(X_test)
test_bac = balanced_accuracy_score(y_test, y_pred)

print(f"Test balanced accuracy: {test_bac:.2%}")

In [None]:
# Confusion matrix of the logistic regression on the test set, indexed as
# [true label, predicted label]. confusion_matrix orders the labels
# alphabetically, so index 0 is 'Negative' and index 1 is 'Positive'.
cfn_matrix = confusion_matrix(y_test, logreg_model.predict(X_test))
print("Confusion Matrix: \n", cfn_matrix)

true_neg, false_pos = cfn_matrix[0, 0], cfn_matrix[0, 1]
false_neg, true_pos = cfn_matrix[1, 0], cfn_matrix[1, 1]
actual_neg = true_neg + false_pos
actual_pos = false_neg + true_pos

# Per-class rates, overall accuracy and balanced accuracy (mean of TNR and TPR).
tnr = true_neg / actual_neg
fpr = false_pos / actual_neg
fnr = false_neg / actual_pos
tpr = true_pos / actual_pos
acc = (true_neg + true_pos) / cfn_matrix.sum()
bac = (tnr + tpr) / 2

print(f"\nTrue negative rate: {tnr: >6.2%}")
print(f"False positive rate: {fpr: >6.2%}")
print(f"False negative rate: {fnr: >6.2%}")
print(f"True positive rate: {tpr: >6.2%}")
print(f"Accuracy: {acc: >6.2%}")
print(f"Balanced accuracy: {bac: >6.2%}")

In [None]:
# Top 10 words by importance, where importance is the absolute value of the
# word's logistic regression coefficient.
fitted_vectorizer = logreg_model[0]
fitted_logreg = logreg_model[1]

feature_names = fitted_vectorizer.get_feature_names_out()
feature_imp = np.abs(fitted_logreg.coef_[0])

imp_df = pd.DataFrame({'words': feature_names, 'coef': feature_imp})
imp_df.sort_values(by='coef', ascending=False).head(10)

In [None]:
# Test reviews side by side with their true and predicted labels, plus a
# flag telling whether the prediction was right.
comparison = pd.DataFrame({'Review': X_test, 'y_test': y_test, 'y_pred': y_pred})
comparison['is_Correct'] = comparison['y_test'].eq(comparison['y_pred'])

comparison.head(20)

In [None]:
# First five test reviews that the model classified incorrectly.
misclassified = ~comparison['is_Correct']
wrong_pred = comparison[misclassified]

wrong_pred.head()

## Comentários e discussão

a) Principais pontos aprendidos:
- Aproveitei os passos iniciais para criar uma classe de limpeza do dataset, para relembrar esses conceitos que às vezes não exercitamos;
- Primeira vez criando uma WordCloud (tentei fazer com o formato do logo do Spotify mas não sei se deu muito certo);
- Utilização do CountVectorizer pela primeira vez;
- Utilização de pipelines em conjunto com o gridsearch e acurácia balanceada;
- Não sabia se, na parte de maiores comentários, o trabalho se referia ao número de palavras ou de caracteres dos comentários;
- Aqui vai um agradecimento ao Nicolas, que sempre ajuda bastante nas dúvidas e é bem prestativo.

b) Principais limitações na abordagem utilizada:
- O pré-processamento dos Reviews poderia ser mais elaborado (como na função que deixei exposta no notebook), utilizando mais técnicas de NLP, fazendo com que as predições fossem mais adequadas;
- O modelo, devido também ao pré-processamento, aparenta estar muito sujeito a overfitting, dependendo do modelo que utilizamos e de como otimizamos os hiperparâmetros.