In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from wordcloud import WordCloud

In [2]:
# One-time setup: uncomment to run the NLTK downloader if the corpora needed by the
# tokenizer are missing (presumably stopwords / WordNet — confirm against
# custom_tokenizer_function).
#nltk.download()

In [3]:
# Load the raw review export; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='reviews.csv')

In [5]:
# Column dtypes and non-null counts for the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61594 entries, 0 to 61593
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Time_submitted  61594 non-null  object
 1   Review          61594 non-null  object
 2   Rating          61594 non-null  int64 
 3   Total_thumbsup  61594 non-null  int64 
 4   Reply           216 non-null    object
dtypes: int64(2), object(3)
memory usage: 2.3+ MB


In [6]:
# Peek at the first rows of the raw data.
df.head()

Unnamed: 0,Time_submitted,Review,Rating,Total_thumbsup,Reply
0,2022-07-09,"Great music service, the audio is high quality...",5,2,
1,2022-07-09,Please ignore previous negative rating. This a...,5,1,
2,2022-07-09,"This pop-up ""Get the best Spotify experience o...",4,0,
3,2022-07-09,Really buggy and terrible to use as of recently,1,1,
4,2022-07-09,Dear Spotify why do I get songs that I didn't ...,1,1,


In [7]:
# Missing values per column.
df.isna().sum()

Time_submitted        0
Review                0
Rating                0
Total_thumbsup        0
Reply             61378
dtype: int64

In [8]:
# (rows, columns) of the raw frame.
df.shape

(61594, 5)

In [9]:
# Drop the 'Reply' column: only 216 of 61,594 rows have a value, so it is almost
# entirely missing. Reassigning (instead of inplace=True) together with
# errors='ignore' keeps this cell idempotent — re-running it no longer raises
# KeyError once the column is already gone.
df = df.drop(columns='Reply', errors='ignore')

In [10]:
# Confirm the 'Reply' column is gone.
df.head(2)

Unnamed: 0,Time_submitted,Review,Rating,Total_thumbsup
0,2022-07-09,"Great music service, the audio is high quality...",5,2
1,2022-07-09,Please ignore previous negative rating. This a...,5,1


In [11]:
# Share of reviews per star rating (normalize=True returns proportions, not counts).
df['Rating'].value_counts(normalize=True)

5    0.358720
1    0.286603
4    0.127318
2    0.115563
3    0.111797
Name: Rating, dtype: float64

In [12]:
# Work on an independent copy so the cleaned frame `df` stays untouched.
df1 = df.copy(deep=True)

In [13]:
# Binarise the star rating: 4 or more stars -> 1, anything lower -> 0.
df1['Sentiment'] = np.where(df1['Rating'].ge(4), 1, 0)

In [14]:
# Spot-check the new label against the original ratings.
df1.head(10)

Unnamed: 0,Time_submitted,Review,Rating,Total_thumbsup,Sentiment
0,2022-07-09,"Great music service, the audio is high quality...",5,2,1
1,2022-07-09,Please ignore previous negative rating. This a...,5,1,1
2,2022-07-09,"This pop-up ""Get the best Spotify experience o...",4,0,1
3,2022-07-09,Really buggy and terrible to use as of recently,1,1,0
4,2022-07-09,Dear Spotify why do I get songs that I didn't ...,1,1,0
5,2022-07-09,The player controls sometimes disappear for no...,3,7,0
6,2022-07-09,I love the selection and the lyrics are provid...,5,0,1
7,2022-07-09,Still extremely slow when changing storage to ...,3,16,0
8,2022-07-09,It's a great app and the best mp3 music app I ...,5,0,1
9,2022-07-09,"I'm deleting this app, for the following reaso...",1,318,0


In [15]:
# Remove the raw star rating from the working frame. Reassigning (instead of
# inplace=True) together with errors='ignore' keeps this cell idempotent —
# re-running it no longer raises KeyError once the column is already gone.
df1 = df1.drop(columns='Rating', errors='ignore')

In [16]:
# Confirm 'Rating' has been removed and 'Sentiment' is present.
df1.head(2)

Unnamed: 0,Time_submitted,Review,Total_thumbsup,Sentiment
0,2022-07-09,"Great music service, the audio is high quality...",2,1
1,2022-07-09,Please ignore previous negative rating. This a...,1,1


In [19]:
# Class balance of the binary target.
df1['Sentiment'].value_counts()

0    31657
1    29937
Name: Sentiment, dtype: int64

### Data Transformation

In [20]:
# Model input (review text) and binary target.
x, y = df1['Review'], df1['Sentiment']

In [23]:
from custom_tokenizer_function import CustomTokenizer

In [25]:
# Instantiate the project-local tokenizer; its text_data_cleaning method is passed
# to TfidfVectorizer as the tokenizer callable.
token = CustomTokenizer()

### Feature Engineering (TF-IDF)

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [27]:
# TF-IDF features, tokenised by the project's own text-cleaning routine.
tfidf = TfidfVectorizer(
    tokenizer=token.text_data_cleaning,
)

### Train/Test Split

In [28]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the reviews for testing, stratified on the Sentiment label,
# with a fixed seed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

In [29]:
# Sizes of the training and test splits.
x_train.shape, x_test.shape

((49275,), (12319,))

### Model

In [30]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

In [31]:
# Linear SVM classifier. random_state is pinned (same seed as the train/test split)
# so that any data shuffling done by the solver is repeatable across
# Restart & Run All.
classifier = LinearSVC(random_state=0)

In [32]:
# Chain vectorisation and classification so raw text can be passed straight to fit/predict.
pipeline = Pipeline(
    steps=[
        ('tfidf', tfidf),
        ('clf', classifier),
    ]
)

In [33]:
# Fit the vectoriser and the classifier on the training split only.
pipeline.fit(x_train,y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<bound method CustomTokenizer.text_data_cleaning of <custom_tokenizer_function.CustomTokenizer object at 0x000001E49B8C4AF0>>)),
                ('clf', LinearSVC())])

## Check Model Performance

In [34]:
# Predictions on the held-out test split.
y_pred = pipeline.predict(x_test)

In [35]:
# Predictions on the training split, used to compare train vs. test performance.
y_pred_train = pipeline.predict(x_train)

In [36]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

### Train

In [37]:
# Confusion matrix on the training split.
confusion_matrix(y_true=y_train, y_pred=y_pred_train)

array([[23654,  1671],
       [ 2703, 21247]], dtype=int64)

In [38]:
# Overall accuracy on the training split.
accuracy_score(y_true=y_train, y_pred=y_pred_train)

0.9112328767123288

In [39]:
# Per-class precision / recall / F1 on the training split.
print(classification_report(y_true=y_train, y_pred=y_pred_train))

              precision    recall  f1-score   support

           0       0.90      0.93      0.92     25325
           1       0.93      0.89      0.91     23950

    accuracy                           0.91     49275
   macro avg       0.91      0.91      0.91     49275
weighted avg       0.91      0.91      0.91     49275



### Test

In [40]:
# Confusion matrix on the held-out test split.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[5581,  751],
       [1040, 4947]], dtype=int64)

In [41]:
# Overall accuracy on the held-out test split.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8546148226317071

In [42]:
# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.84      0.88      0.86      6332
           1       0.87      0.83      0.85      5987

    accuracy                           0.85     12319
   macro avg       0.86      0.85      0.85     12319
weighted avg       0.86      0.85      0.85     12319



In [43]:
# Sanity-check the fitted pipeline on a single hand-written review.
prediction = pipeline.predict(["Alexa is bad"])

# predict() returns one label per input text, so index the single element
# explicitly rather than relying on the truth value of an array comparison
# (which would raise as soon as more than one text is passed).
if prediction[0] == 1:
    print("Result: This review is positive")
else:
    print("Result: This review is negative")

Result: This review is negative


In [44]:
# Persist the fitted pipeline (vectoriser + classifier) to disk.
# NOTE(review): the pipeline holds a bound method of
# custom_tokenizer_function.CustomTokenizer, so that module presumably has to be
# importable wherever this file is loaded — confirm.
import joblib
joblib.dump(pipeline,'c1_SentimentAnalysis_Model_Pipeline.pkl')

['c1_SentimentAnalysis_Model_Pipeline.pkl']