<a href="https://colab.research.google.com/github/marioxgonzalez/CoinBase_Reviews/blob/main/Coinbase_Review_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Data preparation

In [None]:
!pip install textblob



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
nltk.download('vader_lexicon')

from sklearn.metrics import accuracy_score,classification_report, confusion_matrix

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
# Load the raw reviews export.
# NOTE(review): the path assumes Google Drive is already mounted at /content/drive — confirm
# on a fresh runtime.
df = pd.read_csv("/content/drive/MyDrive/CoinBase_Reviews/reviews_cb.csv")
# This count is taken before any filtering below.
print('Number of training reviews: {:,}\n'.format(df.shape[0]))

# Clean in one non-mutating chain. Calling dropna(..., inplace=True) on a frame that was just
# produced by column selection can raise SettingWithCopyWarning, and in-place edits make the
# cell harder to reason about when it is re-run.
df = (
    df[['reviewId','content','score','at']]
    .dropna(subset=['content','score'])            # a review needs both text and a rating
    .loc[lambda frame: frame.content != "👍"]      # drop reviews that are only a thumbs-up emoji
    .rename(columns={"content": "Review"})
    .reset_index()                                 # keeps the old row labels as an 'index' column
)
# Work on a separate copy so `df` stays available as the cleaned source frame.
text = df.copy()
text['Review'] = text['Review'].astype(str)

def get_sentiment(score):
  """Map a star rating to a coarse sentiment label.

  Ratings of 4 and above are labelled "Positive"; ratings of 3 and below are
  labelled "Negative". A value that satisfies neither comparison (for example
  a fractional rating between 3 and 4, or NaN) yields None.
  """
  label = None
  if score >= 4:
    label = "Positive"
  elif score <= 3:
    label = "Negative"
  return label


# Derive the reference label for every review from its star rating.
text['Class'] = text.score.apply(get_sentiment)

Number of training reviews: 129,681



In [None]:
# Preview the first two prepared rows.
text.head(2)

Unnamed: 0,index,reviewId,Review,score,at,Class
0,0,gp:AOqpTOGVaBRFk70o8DKCdhFzZi0JCPLEPDDDJ3CKuck...,Cant access my assets on the app keeps saying ...,1,2022-03-24 19:23:48,Negative
1,1,gp:AOqpTOFIgh-VJEbHs1Gka0ZTveGsbm_SuiriezZl3yA...,app works and it's a great exchange if you hod...,5,2022-03-24 18:44:50,Positive


In [None]:
# Class balance of the rating-derived labels.
print(text['Class'].value_counts())  # author's note: need to drop NA (value_counts() omits missing labels by default)

Positive    76001
Negative    53549
Name: Class, dtype: int64


In [None]:
# Column dtypes and non-null counts of the prepared frame.
text.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129550 entries, 0 to 129549
Data columns (total 6 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   index     129550 non-null  int64 
 1   reviewId  129550 non-null  object
 2   Review    129550 non-null  object
 3   score     129550 non-null  int64 
 4   at        129550 non-null  object
 5   Class     129550 non-null  object
dtypes: int64(2), object(4)
memory usage: 5.9+ MB


## **VADER Sentiment**

In [None]:
# Single shared VADER analyser instance, reused by every scoring call in the notebook.
analyser = SentimentIntensityAnalyzer()

def sentiment_scores(text):
    """Print a piece of text followed by the polarity scores VADER assigns to it.

    The text is left-aligned and padded with '-' up to a minimum width of 40
    characters, then the score dictionary is appended. Nothing is returned.

    Note: inside this function the parameter `text` shadows the notebook-level
    DataFrame of the same name.
    """
    polarity = analyser.polarity_scores(text)
    report_line = "{:-<40} {}".format(text, str(polarity))
    print(report_line)

In [None]:
# Try the helper on the first review in the frame.
sentiment_scores(text['Review'].iloc[0])

Cant access my assets on the app keeps saying unable to connect. Unistalled and reinstalled still having problems.. {'neg': 0.082, 'neu': 0.918, 'pos': 0.0, 'compound': -0.1326}


In [None]:
# Score every review with VADER; each cell of 'Scores' holds the full polarity dictionary.
text['Scores'] = text['Review'].apply(analyser.polarity_scores)
# Pull the single 'compound' value out of each dictionary into its own numeric column.
text['Compound'] = text['Scores'].map(lambda polarity: polarity['compound'])
text.head()

Unnamed: 0,index,reviewId,Review,score,at,Class,Scores,Compound
0,0,gp:AOqpTOGVaBRFk70o8DKCdhFzZi0JCPLEPDDDJ3CKuck...,Cant access my assets on the app keeps saying ...,1,2022-03-24 19:23:48,Negative,"{'neg': 0.082, 'neu': 0.918, 'pos': 0.0, 'comp...",-0.1326
1,1,gp:AOqpTOFIgh-VJEbHs1Gka0ZTveGsbm_SuiriezZl3yA...,app works and it's a great exchange if you hod...,5,2022-03-24 18:44:50,Positive,"{'neg': 0.0, 'neu': 0.745, 'pos': 0.255, 'comp...",0.6249
2,2,gp:AOqpTOHW8-h8WM8HvltxhQitwvwTYYTMX5bQ_MGUrR_...,It's not very user friendly. It takes forever ...,2,2022-03-24 18:44:36,Negative,"{'neg': 0.121, 'neu': 0.771, 'pos': 0.107, 'co...",-0.0859
3,3,gp:AOqpTOFFFfPx3hgLsDBNqsp47DFck1sUpjNtp5PyOHh...,Nice 👍,3,2022-03-24 18:08:15,Negative,"{'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound...",0.4215
4,4,gp:AOqpTOETDaY3eiHa50nRwTbIboL0YDEMXjn8EufvGJj...,"works pretty good, but have now transfered my ...",5,2022-03-24 18:02:23,Positive,"{'neg': 0.0, 'neu': 0.712, 'pos': 0.288, 'comp...",0.4678


In [None]:
def sentiment(score, threshold=0.15):
    """Turn a VADER compound score into a binary sentiment label.

    Parameters
    ----------
    score : float
        Compound polarity score of a review.
    threshold : float, optional
        Minimum score that counts as positive. Defaults to 0.15, the cut-off
        this notebook has always used, so existing single-argument calls
        (including `Series.apply(sentiment)`) behave exactly as before.

    Returns
    -------
    str
        'Positive' when `score >= threshold`, otherwise 'Negative'. A NaN
        score fails the comparison and is therefore labelled 'Negative'.
    """
    if score >= threshold:
        return 'Positive'
    return 'Negative'

In [None]:
# Label every review from its compound score, then preview the first ten rows
# so the VADER label ('Sentiment') can be compared with the rating label ('Class').
text['Sentiment'] = text['Compound'].apply(sentiment)
text.head(10)

Unnamed: 0,index,reviewId,Review,score,at,Class,Scores,Compound,Sentiment
0,0,gp:AOqpTOGVaBRFk70o8DKCdhFzZi0JCPLEPDDDJ3CKuck...,Cant access my assets on the app keeps saying ...,1,2022-03-24 19:23:48,Negative,"{'neg': 0.082, 'neu': 0.918, 'pos': 0.0, 'comp...",-0.1326,Negative
1,1,gp:AOqpTOFIgh-VJEbHs1Gka0ZTveGsbm_SuiriezZl3yA...,app works and it's a great exchange if you hod...,5,2022-03-24 18:44:50,Positive,"{'neg': 0.0, 'neu': 0.745, 'pos': 0.255, 'comp...",0.6249,Positive
2,2,gp:AOqpTOHW8-h8WM8HvltxhQitwvwTYYTMX5bQ_MGUrR_...,It's not very user friendly. It takes forever ...,2,2022-03-24 18:44:36,Negative,"{'neg': 0.121, 'neu': 0.771, 'pos': 0.107, 'co...",-0.0859,Negative
3,3,gp:AOqpTOFFFfPx3hgLsDBNqsp47DFck1sUpjNtp5PyOHh...,Nice 👍,3,2022-03-24 18:08:15,Negative,"{'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound...",0.4215,Positive
4,4,gp:AOqpTOETDaY3eiHa50nRwTbIboL0YDEMXjn8EufvGJj...,"works pretty good, but have now transfered my ...",5,2022-03-24 18:02:23,Positive,"{'neg': 0.0, 'neu': 0.712, 'pos': 0.288, 'comp...",0.4678,Positive
5,5,gp:AOqpTOH1KQ8sMNALLZ6thALflAAOuKSLSBUj5xH_qQj...,For almost a month I have been unable to fully...,2,2022-03-24 17:41:30,Negative,"{'neg': 0.046, 'neu': 0.838, 'pos': 0.116, 'co...",0.736,Positive
6,6,gp:AOqpTOGgCbNHwRDj06r4YwVybVOlHvLWTMg30gt_Gu0...,Cool,5,2022-03-24 17:38:21,Positive,"{'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound...",0.3182,Positive
7,7,gp:AOqpTOE3jymV_bQVuF9H3tqfpi07GcuUO8alMRdYYH6...,they make it so easy,5,2022-03-24 17:35:52,Positive,"{'neg': 0.0, 'neu': 0.517, 'pos': 0.483, 'comp...",0.5777,Positive
8,8,gp:AOqpTOEn9yJBFKaCy_jkN07zFsOMEKgVgucCPESRB_B...,Can't log in to my account for 3months after s...,1,2022-03-24 17:05:57,Negative,"{'neg': 0.041, 'neu': 0.607, 'pos': 0.351, 'co...",0.9231,Positive
9,9,gp:AOqpTOGQrinMFtuXzM87mfieU75S8xrRFnqb-Rk5upG...,I like everything about Coinbase for the most ...,1,2022-03-24 16:48:40,Negative,"{'neg': 0.125, 'neu': 0.813, 'pos': 0.062, 'co...",-0.6326,Negative


In [None]:
# Distribution of the VADER-derived labels.
text['Sentiment'].value_counts()

Positive    76865
Negative    52685
Name: Sentiment, dtype: int64

In [None]:
# Fraction of reviews whose VADER label matches the rating-derived label.
accuracy_score(text['Class'],text['Sentiment'])

0.774203010420687

In [None]:
# Per-class precision, recall and F1 of the VADER labels, with the rating labels as reference.
print(classification_report(text['Class'],text['Sentiment']))

              precision    recall  f1-score   support

    Negative       0.73      0.72      0.72     53549
    Positive       0.80      0.81      0.81     76001

    accuracy                           0.77    129550
   macro avg       0.77      0.77      0.77    129550
weighted avg       0.77      0.77      0.77    129550



In [None]:
# Confusion matrix: rows are the rating-derived classes, columns are the VADER labels.
print(confusion_matrix(text['Class'],text['Sentiment']))

[[37687 15862]
 [13938 62063]]


In [None]:
#Importing Essentials
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# Features are the raw review strings; targets are the rating-derived labels.
X, y = text['Review'], text['Class']
# Hold out 20% of the reviews for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# Bag-of-words features: English stop words removed, unigrams only, ignoring terms that
# appear in more than 80% of the documents or in fewer than 4 of them.
vect = CountVectorizer(stop_words='english', ngram_range=(1, 1), max_df=.80, min_df=4)
# The vocabulary is learned from the training split only, then reused for the test split.
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

#Accuracy of each candidate model on the held-out split
# The estimators keep their original notebook-level names so later cells can still use them.
NB = MultinomialNB()
# max_iter is raised above the default of 100 because the solver stopped at the iteration
# limit on these count features. NOTE(review): increase further if the warning persists.
LR = LogisticRegression(max_iter=1000)
SVM = LinearSVC()
KNN = KNeighborsClassifier(n_neighbors = 3)

# (heading printed above the results, estimator), in the original evaluation order
models = [
    ('Naive Bayes', NB),
    ('Logistic Regression', LR),
    ('Support Vector Machine', SVM),
    ('K Nearest Neighbors (NN = 3)', KNN),
]

# Fit, predict and report every model the same way. After the loop y_pred holds the
# predictions of the last model evaluated (KNN), as it did when the steps were written out
# one by one.
for heading, model in models:
    model.fit(X_train_dtm, y_train)
    y_pred = model.predict(X_test_dtm)
    print('\n' + heading)
    print('Accuracy Score: ',metrics.accuracy_score(y_test,y_pred)*100,'%',sep='')
    print('Confusion Matrix: ',metrics.confusion_matrix(y_test,y_pred), sep = '\n')

#Naive Bayes Analysis
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
# get_feature_names_out() and fall back to the old name only on older installs.
if hasattr(vect, 'get_feature_names_out'):
    tokens_words = vect.get_feature_names_out()
else:
    tokens_words = vect.get_feature_names()
print('\nAnalysis')
print('No. of tokens: ',len(tokens_words))
# feature_count_ has one row per class, ordered like NB.classes_; look the rows up by label
# rather than assuming their position.
counts = NB.feature_count_
class_rows = {label: row for row, label in enumerate(NB.classes_)}
df_table = {'Token':tokens_words,
            'Negative': counts[class_rows['Negative'],:],
            'Positive': counts[class_rows['Positive'],:]}
tokens = pd.DataFrame(df_table, columns= ['Token','Positive','Negative'])
# A token counts as "positive" when it was seen more often in positive than in negative reviews.
positives = len(tokens[tokens['Positive']>tokens['Negative']])
print('No. of positive tokens: ',positives)
# The remainder also includes tokens with equal counts in both classes.
print('No. of negative tokens: ',len(tokens_words)-positives)
#Check positivity/negativity of specific tokens
token_search = ['awesome']
print('\nSearch Results for token/s:',token_search)
print(tokens.loc[tokens['Token'].isin(token_search)])
# Use Naive Bayes' own predictions for the error analysis: the shared y_pred variable is
# overwritten by every model fitted after NB, so it no longer holds NB's output here.
nb_pred = NB.predict(X_test_dtm)
#Analyse False Negatives (Actual: Positive; Predicted: Negative)
print(X_test[(y_test == 'Positive') & (nb_pred == 'Negative')])
#Analyse False Positives (Actual: Negative; Predicted: Positive)
print(X_test[(y_test == 'Negative') & (nb_pred == 'Positive')])



Naive Bayes
Accuracy Score: 87.63797761482053%
Confusion Matrix: 
[[ 8737  1947]
 [ 1256 13970]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



Logistic Regression
Accuracy Score: 88.68776534156696%
Confusion Matrix: 
[[ 8912  1772]
 [ 1159 14067]]





Support Vector Machine
Accuracy Score: 88.13585488228483%
Confusion Matrix: 
[[ 8842  1842]
 [ 1232 13994]]

K Nearest Neighbors (NN = 3)
Accuracy Score: 80.96873793901969%
Confusion Matrix: 
[[ 6738  3946]
 [  985 14241]]

Analysis
No. of tokens:  7126
No. of positive tokens:  1463
No. of negative tokens:  5663

Search Results for token/s: ['awesome']
       Token  Positive  Negative
726  awesome    1963.0      65.0
30169     A little concerned with the latest update, and...
124902                      Won't seem to let me buy in GBP
128557    Perfect app. No bugs, nothing hard to use or f...
102844    You should add an open ended history time lock...
111191                Transactions  for bit coins take long
                                ...                        
4773      The update messed it up. Keep saying internet ...
8354                              Meri app open nahi ho Rai
125152    I have a hard time with following the graph al...
92890     App very well done but widge

