In [2]:
import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /usr/share/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /usr/share/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /usr/share/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /usr/share/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /usr/share/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /usr/share/nltk_data...


True

In [3]:
import numpy as np 
import pandas as pd
import nltk
from nltk.corpus import stopwords
from textblob import Word
from textblob import TextBlob
# Libraries for visualization:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected = True)
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings("ignore")

<a id="1"></a>

# **LOAD and CHECK DATA**

In [4]:
spotify=pd.read_csv("../input/spotify-app-reviews-2022/reviews.csv",usecols=["Review","Rating"])
data=spotify.copy()
data.head()

Unnamed: 0,Review,Rating
0,"Great music service, the audio is high quality...",5
1,Please ignore previous negative rating. This a...,5
2,"This pop-up ""Get the best Spotify experience o...",4
3,Really buggy and terrible to use as of recently,1
4,Dear Spotify why do I get songs that I didn't ...,1


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61594 entries, 0 to 61593
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  61594 non-null  object
 1   Rating  61594 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 962.5+ KB


<a id="2"></a>

# **BASIC EDA & VISUALIZATION**

In [6]:
fig=px.histogram(data,x="Rating")
fig.show()

In [7]:
# Check for missing values:
data.isnull().sum()

Review    0
Rating    0
dtype: int64

In [8]:
data["Rating"].replace(1, value="negatif",inplace=True)
data["Rating"].replace(2, value="negatif",inplace=True)
data["Rating"].replace(3, value="neutral",inplace=True)
data["Rating"].replace(4, value="positive",inplace=True)
data["Rating"].replace(5, value="positive",inplace=True)
data.head()

Unnamed: 0,Review,Rating
0,"Great music service, the audio is high quality...",positive
1,Please ignore previous negative rating. This a...,positive
2,"This pop-up ""Get the best Spotify experience o...",positive
3,Really buggy and terrible to use as of recently,negatif
4,Dear Spotify why do I get songs that I didn't ...,negatif


In [9]:
fig=px.histogram(data,x="Rating")
fig.show()

<a id="3"></a>

# **TEXT PREPROCESSING**

<a id="4"></a>

# **Removing Punctuation**

In [10]:
import re
data["Review"]=data["Review"].apply(lambda x: re.sub('[^a-z A-Z 0-9-]+', '', x))
data.head(2)

Unnamed: 0,Review,Rating
0,Great music service the audio is high quality ...,positive
1,Please ignore previous negative rating This ap...,positive


<a id="5"></a>

# **Remove URL and Tags**

In [11]:
data["Review"]=data["Review"].apply(lambda x: re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '' , str(x)))

<a id="6"></a>

# **Lower Casing**

In [12]:
data["Review"]=data["Review"].apply(lambda x: " ".join(x.lower() for x in x.split()))
data.head(2)

Unnamed: 0,Review,Rating
0,great music service the audio is high quality ...,positive
1,please ignore previous negative rating this ap...,positive


<a id="7"></a>

# **Removing Numbers**

In [13]:
data["Review"]=data["Review"].str.replace("\d","")

<a id="8"></a>

# **Removing Extra Spaces**

In [14]:
data["Review"]=data["Review"].apply(lambda x: " ".join(x.split()))

<a id="9"></a>

# **Stop Words**

In [15]:
import nltk
from nltk.corpus import stopwords
sw=stopwords.words("english")
data["Review"]=data["Review"].apply(lambda x: " ".join(x for x in x.split() if x not in sw))
data.head()

Unnamed: 0,Review,Rating
0,great music service audio high quality app eas...,positive
1,please ignore previous negative rating app sup...,positive
2,pop-up get best spotify experience android ann...,positive
3,really buggy terrible use recently,negatif
4,dear spotify get songs didnt put playlist shuf...,negatif


<a id="10"></a>

# **Lemmatization**

In [16]:
from textblob import Word
nltk.download("wordnet")
data["Review"]=data["Review"].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
data.head()

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,Review,Rating
0,great music service audio high quality app eas...,positive
1,please ignore previous negative rating app sup...,positive
2,pop-up get best spotify experience android ann...,positive
3,really buggy terrible use recently,negatif
4,dear spotify get song didnt put playlist shuff...,negatif


In [17]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61594 entries, 0 to 61593
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  61594 non-null  object
 1   Rating  61594 non-null  object
dtypes: object(2)
memory usage: 962.5+ KB


<a id="11"></a>

# **FEATURE ENGINEERING**

In [18]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(data["Review"],data["Rating"],random_state=1)
print("X_train shape",X_train.shape)
print("y_train shape",y_train.shape)
print("X_test shape",X_test.shape)
print("y_test shape",y_test.shape)

X_train shape (46195,)
y_train shape (46195,)
X_test shape (15399,)
y_test shape (15399,)


In [19]:
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
y_train=encoder.fit_transform(y_train)
y_test=encoder.fit_transform(y_test)
print("y Train: ",y_train[0:10])
print("y Test: ",y_test[0:10])

y Train:  [0 1 2 0 2 1 1 2 0 1]
y Test:  [2 1 1 2 0 2 0 0 0 2]


<a id="12"></a>

# **1) COUNT VECTORS**

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

In [21]:
count_vectorizer=CountVectorizer()
count_vectorizer.fit(X_train)
X_train_count=count_vectorizer.transform(X_train)
X_test_count=count_vectorizer.transform(X_test)

In [22]:
# Lets see:
#count_vectorizer.get_feature_names()[0:10]

<a id="13"></a>

# **2) TF - IDF**

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

<a id="14"></a>

# **Word Level TF - IDF**

In [24]:
tfIdf_word_level=TfidfVectorizer()
tfIdf_word_level.fit(X_train)
X_train_word_level=tfIdf_word_level.transform(X_train)
X_test_word_level=tfIdf_word_level.transform(X_test)

In [25]:
#tfIdf_word_level.get_feature_names()[0:10]

<a id="15"></a>

# **n - Gram Level TF IDF**

In [26]:
tfIdf_Ngram=TfidfVectorizer(ngram_range=(2,3))
tfIdf_Ngram.fit(X_train)
X_train_Ngram=tfIdf_Ngram.transform(X_train)
X_test_Ngram=tfIdf_Ngram.transform(X_test)

In [27]:
#tfIdf_Ngram.get_feature_names()[0:10]

<a id="16"></a>

# **Character Level TF - IDF**

In [28]:
tfIdf_char=TfidfVectorizer(analyzer="char",ngram_range=(2,3))
tfIdf_char.fit(X_train)
X_train_char=tfIdf_char.transform(X_train)
X_test_char=tfIdf_char.transform(X_test)

In [29]:
#tfIdf_char.get_feature_names()

<a id="17"></a>

# **MODEL & RESULTS**

<a id="18"></a>

## **LOGISTIC REGRESSION**

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
logistic=LogisticRegression()

In [31]:
logistic_model_count=logistic.fit(X_train_count,y_train)
log_cross_val_count=cross_val_score(logistic_model_count,X_test_count,y_test,cv=10).mean()
print("Cross Validation of Count Vector in Logistic Regression: ",log_cross_val_count)

Cross Validation of Count Vector in Logistic Regression:  0.7584905445431762


In [32]:
logistic_model_word=logistic.fit(X_train_word_level,y_train)
log_cross_val_word=cross_val_score(logistic_model_word,X_test_word_level,y_test,cv=10).mean()
print("Cross Validation of Word Level in Logistic Regression: ",log_cross_val_word)

Cross Validation of Word Level in Logistic Regression:  0.7789465667535843


In [33]:
logistic_model_Ngram=logistic.fit(X_train_Ngram,y_train)
log_cross_val_Ngram=cross_val_score(logistic_model_word,X_test_Ngram,y_test,cv=10).mean()
print("Cross Validation of N-Gram in Logistic Regression: ",log_cross_val_Ngram)

Cross Validation of N-Gram in Logistic Regression:  0.7445931748563328


In [34]:
logistic_model_char=logistic.fit(X_train_char,y_train)
log_cross_val_char=cross_val_score(logistic_model_char,X_test_char,y_test,cv=10).mean()
print("Cross Validation of Character Level in Logistic Regression: ",log_cross_val_char)

Cross Validation of Character Level in Logistic Regression:  0.7728426284566636


<a id="19"></a>

# **Logistic Regression Model Results**

In [35]:
logistic_results=pd.DataFrame({"Values": [log_cross_val_count,log_cross_val_word,log_cross_val_Ngram,log_cross_val_char],
                              "Function":["Count Vector","Word Level TF-IDF","N-Gram","Character Level TF-IDF"]})
fig=make_subplots(rows=1,cols=1)
fig.add_trace(go.Bar(x=[round(i,5) for i in logistic_results["Values"]],
                    y=logistic_results["Function"],
                    text=[round(i,5) for i in logistic_results["Values"]],orientation="h",textposition="inside",name="Values",
                    marker=dict(color=["lightcoral","palegreen","deepskyblue","lightpink"],line_color="beige",line_width=1.5)),row=1,col=1)
fig.update_layout(title={"text": "Logistic Regression",
                        "y":0.9,
                        "x":0.5,
                        "xanchor":"center",
                        "yanchor":"top"},
                 template="plotly_white")
fig.update_xaxes(row=1,col=1)
iplot(fig)

<a id="20"></a>

# **K-NEAREST NEIGHBORS CLASSIFIER**

In [36]:
from sklearn.neighbors import KNeighborsClassifier
knn=KNeighborsClassifier()

In [37]:
knn_model_count=knn.fit(X_train_count,y_train)
knn_cross_val_count=cross_val_score(knn_model_count,X_test_count,y_test,cv=10).mean()
print("Cross Validation of Count Vector in KNN Classifier: ",knn_cross_val_count)

Cross Validation of Count Vector in KNN Classifier:  0.6373796022041637


In [38]:
knn_model_word=knn.fit(X_train_word_level,y_train)
knn_cross_val_word=cross_val_score(knn_model_word,X_test_word_level,y_test,cv=10).mean()
print("Cross Validation of Word Level in KNN Classifier: ",knn_cross_val_word)

Cross Validation of Word Level in KNN Classifier:  0.6900449355712513


In [39]:
knn_model_Ngram=knn.fit(X_train_Ngram,y_train)
knn_cross_val_Ngram=cross_val_score(knn_model_Ngram,X_test_Ngram,y_test,cv=10).mean()
print("Cross Validation of N-Gram in KNN Classifier: ",knn_cross_val_Ngram)

Cross Validation of N-Gram in KNN Classifier:  0.48243398901293644


In [40]:
knn_model_char=knn.fit(X_train_char,y_train)
knn_cross_val_char=cross_val_score(knn_model_char,X_test_char,y_test,cv=10).mean()
print("Cross Validation of Character Level in KNN Classifier: ",knn_cross_val_char)

Cross Validation of Character Level in KNN Classifier:  0.7021888897327495


<a id="21"></a>

# **K-Nearest Neighbor Results**

In [41]:
knn_results=pd.DataFrame({"Values": [knn_cross_val_count,knn_cross_val_word,knn_cross_val_Ngram,knn_cross_val_char],
                              "Function":["Count Vector","Word Level TF-IDF","N-Gram","Character Level TF-IDF"]})
fig=make_subplots(rows=1,cols=1)
fig.add_trace(go.Bar(x=[round(i,5) for i in knn_results["Values"]],
                    y=knn_results["Function"],
                    text=[round(i,5) for i in knn_results["Values"]],orientation="h",textposition="inside",name="Values",
                    marker=dict(color=["lightcoral","palegreen","deepskyblue","lightpink"],line_color="beige",line_width=1.5)),row=1,col=1)
fig.update_layout(title={"text": "K-Nearest Neighbors ",
                        "y":0.9,
                        "x":0.5,
                        "xanchor":"center",
                        "yanchor":"top"},
                 template="plotly_white")
fig.update_xaxes(row=1,col=1)
iplot(fig)

<a id="22"></a>

# **XG BOOST CLASSIFIER**

In [42]:
from xgboost import XGBClassifier
xgb=XGBClassifier()

In [43]:
xgb_model_count=xgb.fit(X_train_count,y_train)
xgb_cross_val_count=cross_val_score(xgb_model_count,X_test_count,y_test,cv=10).mean()
print("Cross Validation of Count Vector in XG BOOST Classifier: ",xgb_cross_val_count)

Cross Validation of Count Vector in XG BOOST Classifier:  0.7667379728783238


In [44]:
xgb_model_word=xgb.fit(X_train_word_level,y_train)
xgb_cross_val_word=cross_val_score(xgb_model_word,X_test_word_level,y_test,cv=10).mean()
print("Cross Validation of Word Level in XG BOOST Classifier: ",xgb_cross_val_word)

Cross Validation of Word Level in XG BOOST Classifier:  0.7625821287224797


In [45]:
xgb_model_Ngram=xgb.fit(X_train_Ngram,y_train)
xgb_cross_val_Ngram=cross_val_score(xgb_model_Ngram,X_test_Ngram,y_test,cv=10).mean()
print("Cross Validation of N-Gram in XG BOOST Classifier: ",xgb_cross_val_Ngram)

Cross Validation of N-Gram in XG BOOST Classifier:  0.6871877083280592


In [46]:
xgb_model_char=xgb.fit(X_train_char,y_train)
xgb_cross_val_char=cross_val_score(xgb_model_char,X_test_char,y_test,cv=10).mean()
print("Cross Validation of Character Level in XG BOOST Classifier: ",xgb_cross_val_char)

Cross Validation of Character Level in XG BOOST Classifier:  0.7614781060395096


<a id="23"></a>

# **XG Boost Classifier Results**

In [47]:
xgb_results=pd.DataFrame({"Values": [xgb_cross_val_count,xgb_cross_val_word,xgb_cross_val_Ngram,xgb_cross_val_char],
                              "Function":["Count Vector","Word Level TF-IDF","N-Gram","Character Level TF-IDF"]})
fig=make_subplots(rows=1,cols=1)
fig.add_trace(go.Bar(x=[round(i,5) for i in xgb_results["Values"]],
                    y=knn_results["Function"],
                    text=[round(i,5) for i in xgb_results["Values"]],orientation="h",textposition="inside",name="Values",
                    marker=dict(color=["lightcoral","palegreen","deepskyblue","lightpink"],line_color="beige",line_width=1.5)),row=1,col=1)
fig.update_layout(title={"text": "XG BOOST ",
                        "y":0.9,
                        "x":0.5,
                        "xanchor":"center",
                        "yanchor":"top"},
                 template="plotly_white")
fig.update_xaxes(row=1,col=1)
iplot(fig)