In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf


import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import collections
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Read the Amazon musical-instrument reviews CSV into a DataFrame.
df = pd.read_csv(
    "../input/amazon-music-reviews/Musical_instruments_reviews.csv"
)

In [None]:
# Preview the first five rows of the raw reviews.
df.head(n=5)

In [None]:
# Column dtypes and non-null counts of the freshly loaded frame.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum(axis=0)

In [None]:
# Fraction of missing values in each column.
df.isnull().mean(axis=0)

Only about 0.2% of the reviewer names are missing, so we drop those rows entirely.

Only about 0.068% of the review texts are missing, so we drop those rows entirely.


In [None]:
# Row labels of the reviews that have no reviewer name.
missing_target_rows = df.index[df['reviewerName'].isna()]
# Remove those rows and renumber the index from 0.
df = df.drop(index=missing_target_rows).reset_index(drop=True)

In [None]:
# Row labels of the reviews that have no review text.
missing_target_rows = df.index[df['reviewText'].isna()]
# Remove those rows and renumber the index from 0.
df = df.drop(index=missing_target_rows).reset_index(drop=True)

In [None]:
# Re-check dtypes and non-null counts after the rows with missing values were dropped.
df.info()

Let's find the number of unique values for each text (object-typed) feature.

In [None]:
# Map every object-typed column to its number of distinct values
# (a missing value, if present, is counted as one distinct value).
df.select_dtypes('object').nunique(dropna=False).to_dict()

We have to categorize the reviews based on the common sentiments expressed by the reviewers.

In [None]:
# Second handle on the cleaned reviews; later cells build the modelling table from it.
# NOTE(review): pd.DataFrame(df) is not an explicit deep copy. Whether `data`
# shares state with `df` presumably depends on the pandas version — TODO confirm.
# A later cell drops 'common_text' from `data`, although that column is only
# ever added to `df`, so check that cell before switching to df.copy().
data=pd.DataFrame(df)

# as we are going to change the values of overall to boolean we copied the dataset to another one

**We will preprocess common_rev dataset to:**
1. convert text to lower case
2. remove all non-word characters
3. remove all punctuations

In [None]:
# Review body and summary joined with a single space into one text column.
df['common_text'] = df['reviewText'].add(' ').add(df['summary'])

In [None]:
# Working table with just the combined text and the star rating.
# .copy() makes it an independent frame, so assigning to its columns later
# does not write into a slice of `df` (which triggers SettingWithCopyWarning).
common_rev = df[['common_text', 'overall']].copy()

In [None]:
# First five rows: combined text next to the original star rating.
common_rev.head(n=5)

**Here we mark good reviews as 1 and bad reviews as 0,**
**i.e. if the rating is < 3 the label is 0, otherwise (rating >= 3) it is 1**

In [None]:
# Binarise the star rating: 3 stars and above -> 1 (good), below 3 -> 0 (bad).
# The label is derived from the untouched df['overall'] ratings and attached
# with .assign(): re-running the cell gives the same labels (recomputing from
# the already-binarised column would turn every label into 0), and nothing is
# written into a slice of `df`.
# NOTE(review): a 3-star review counts as good here; confirm this is the intended cutoff.
common_rev = common_rev.assign(overall=np.where(df['overall'] >= 3, 1, 0))


In [None]:
# First five rows after the rating was replaced by the 0/1 label.
common_rev.head(n=5)

Splitting the good and bad reviews

In [None]:

# Texts of the reviews labelled 1 (good) and 0 (bad) respectively.
good_words = common_rev.loc[common_rev['overall'].eq(1), 'common_text']
bad_words = common_rev.loc[common_rev['overall'].eq(0), 'common_text']

**Here we will use Word Cloud which is a data visualization technique used to represent text data in which the size of each word indicates its frequency or importance. Significant textual data points can be highlighted using a word cloud. Word clouds are widely used for analyzing data from social network websites.**

In [None]:
from wordcloud import STOPWORDS, WordCloud

# Words excluded from the cloud (the package's built-in English stop words).
stopwords = set(STOPWORDS)

# One long lowercase string built from all good reviews: each review is cast
# to str, split on whitespace, lowercased token by token, re-joined with
# single spaces and followed by one trailing space.
comment_words = "".join(
    " ".join(token.lower() for token in str(review).split()) + " "
    for review in good_words
)

wordcloud = WordCloud(
    width=800,
    height=800,
    background_color='black',
    stopwords=stopwords,
    min_font_size=10,
).generate(comment_words)

# Render the cloud as a large, axis-free image.
plt.figure(figsize=(20, 20), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()

In [None]:
# How many reviews carry each original star rating.
print('Total ratings per rating:', '\n', data['overall'].value_counts())

# Number of distinct instrument ids (asin) and total number of review rows.
print('Number of unique instruments:', data['asin'].nunique(dropna=False))
print('Number of rows:', len(data))

**We only require 3 columns to work with for Sentiment analysis**
so from the dataset data that we created earlier we will drop all columns, except reviewText, reviewTime, and the overall sentiment 0 or 1


In [None]:
# Append the summary to the review body. The explicit ' ' separator keeps the
# last word of the review from fusing with the first word of the summary, and
# fillna('') keeps the review text when a summary is missing (otherwise the
# whole cell would become NaN).
data['reviewText'] = data['reviewText'] + ' ' + data['summary'].fillna('')
# The summary is now part of reviewText, so the separate column is dropped.
data = data.drop(columns=['summary'])

In [None]:
# First five rows after the summary was merged into reviewText.
data.head(n=5)

In [None]:
# Keep only the columns needed for modelling (the sentiment label is joined on
# in a later cell). Selecting them by name, rather than dropping everything
# else, does not depend on `data` carrying a 'common_text' column: that column
# is only ever added to `df`, so dropping it from `data` can raise KeyError.
data2 = data[['reviewText', 'reviewTime']].copy()

In [None]:
# First five rows of the reduced modelling table.
data2.head(n=5)

In [None]:
# Binary sentiment label, attached to the modelling table by matching row index.
add = common_rev.loc[:, "overall"]
data2 = data2.join(add, how="left")

In [None]:
# Give the joined label column a name that says what it holds.
data2 = data2.rename({"overall": "overall_sentiment"}, axis="columns")


In [None]:
# First five rows of the final table: text, review date and sentiment label.
data2.head(n=5)

**Developing the SENTIMENT CLASSIFIER**

In [None]:
# Make sure every review is a plain string before it reaches the vectorizers.
data2['reviewText'] = data2['reviewText'].astype(str)
# Features: the review texts. Target: the 0/1 sentiment label.
X, y = data2['reviewText'], data2['overall_sentiment']


In [None]:
# Number of review texts available as features.
X.shape


In [None]:
# Number of sentiment labels; should match the number of review texts.
y.shape

splitting the dataset so that there is a training set and a test set.


In [None]:
# Hold out 20% of the reviews for evaluation; the fixed seed keeps the split
# repeatable. (train_test_split is already imported in the first cell.)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


Using CountVectorizer to build a vector of word (n-gram) counts for each review. We import CountVectorizer, fit it on the training data only, and then use the fitted vocabulary to transform both the training and the test data.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Bag-of-n-grams counts (unigrams to trigrams).
# max_df must be the float 1.0 ("appears in at most 100% of documents"): the
# integer 1 means "appears in at most ONE document", which throws away every
# term that occurs in more than one review.
# min_df=1 keeps every term seen at least once; it has the same effect as the
# previous integer 0, which recent scikit-learn versions reject as invalid.
cv = CountVectorizer(min_df=1, max_df=1.0, binary=False, ngram_range=(1, 3))

# Learn the vocabulary on the training texts only, then reuse it for the test texts.
ctmTr = cv.fit_transform(X_train)
X_test_ctm = cv.transform(X_test)

print('ctmTr:', ctmTr.shape)
# The test matrix is X_test_ctm; the previous name X_test_dtm was never defined.
print('X_test_ctm:', X_test_ctm.shape)

Also using TfidfVectorizer to develop a vector of all the words in the string

In [None]:
# TF-IDF weighted n-grams (unigrams to trigrams).
# max_df must be the float 1.0 ("appears in at most 100% of documents"): the
# integer 1 means "appears in at most ONE document", which throws away every
# term that occurs in more than one review.
# min_df=1 keeps every term seen at least once; it has the same effect as the
# previous integer 0, which recent scikit-learn versions reject as invalid.
tv = TfidfVectorizer(min_df=1, max_df=1.0, use_idf=True, ngram_range=(1, 3))

# Learn vocabulary and IDF weights on the training texts only, then reuse them for the test texts.
tfidf_x_train = tv.fit_transform(X_train)
tfidf_x_test = tv.transform(X_test)

print('tfidf_x_train:', tfidf_x_train.shape)
print('tfidf_x_test:', tfidf_x_test.shape)

**Now fitting the models, Prediction and Calculating accuracy of the models**

**Using LOGISTIC REGRESSION**

In [None]:
from sklearn.linear_model import LogisticRegression

# One estimator per feature representation. Calling fit() twice on the same
# instance discards the first fit, so both sets of predictions would come from
# the TF-IDF model and the "ctm" score would not measure the count-vector model.
lr = LogisticRegression(random_state=0)

#fit
ctm = LogisticRegression(random_state=0).fit(ctmTr, y_train)
tfidf = lr.fit(tfidf_x_train, y_train)

#predict: each model is scored on the feature matrix it was trained for
ctm_predict = ctm.predict(X_test_ctm)
tfidf_predict = tfidf.predict(tfidf_x_test)

#accuracy
lr_ctm = accuracy_score(y_test, ctm_predict)
lr_tfidf = accuracy_score(y_test, tfidf_predict)

print('lr ctm accuracy:', lr_ctm)
print('lr tfidf accuracy:', lr_tfidf)

Using Logistic Regression we achieve **95.6% accuracy**, which is quite good!

In [None]:
#random forest
# One estimator per feature representation. Calling fit() twice on the same
# instance discards the first fit, so both sets of predictions would come from
# the TF-IDF model and the "ctm" score would not measure the count-vector model.
rf = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0)

#fit
ctm = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0).fit(ctmTr, y_train)
tfidf = rf.fit(tfidf_x_train, y_train)

#predict: each model is scored on the feature matrix it was trained for
ctm_predict = ctm.predict(X_test_ctm)
tfidf_predict = tfidf.predict(tfidf_x_test)

#accuracy
rf_ctm = accuracy_score(y_test, ctm_predict)
rf_tfidf = accuracy_score(y_test, tfidf_predict)

print('Random Forest ctm accuracy:', rf_ctm)
print('Random Forest tfidf accuracy:', rf_tfidf)


Using Random Forest we achieve **95.6% accuracy**, which is quite good!

**Using Support Vector Classification**

In [None]:
#Linear SVC
# One estimator per feature representation. Calling fit() twice on the same
# instance discards the first fit, so both sets of predictions would come from
# the TF-IDF model and the "ctm" score would not measure the count-vector model.
# random_state is fixed so repeated runs give the same model, matching the
# seeded estimators used in the other model cells.
ls = LinearSVC(random_state=0)

#fit
ctm = LinearSVC(random_state=0).fit(ctmTr, y_train)
tfidf = ls.fit(tfidf_x_train, y_train)

#predict: each model is scored on the feature matrix it was trained for
ctm_predict = ctm.predict(X_test_ctm)
tfidf_predict = tfidf.predict(tfidf_x_test)

#accuracy
ls_ctm = accuracy_score(y_test, ctm_predict)
ls_tfidf = accuracy_score(y_test, tfidf_predict)

print('Support Vector Classification ctm accuracy:', ls_ctm)
print('Support Vector Classification tfidf accuracy:', ls_tfidf)

Using Support Vector Classification we achieve **95.7% accuracy**, which is better!


**Using Naive Bayes**

In [None]:
# One estimator per feature representation. Calling fit() twice on the same
# instance discards the first fit, so both sets of predictions would come from
# the TF-IDF model and the "ctm" score would not measure the count-vector model.
nb = MultinomialNB()

#fit
ctm = MultinomialNB().fit(ctmTr, y_train)
tfidf = nb.fit(tfidf_x_train, y_train)

#predict: each model is scored on the feature matrix it was trained for
ctm_predict = ctm.predict(X_test_ctm)
tfidf_predict = tfidf.predict(tfidf_x_test)

#accuracy
nb_ctm = accuracy_score(y_test, ctm_predict)
nb_tfidf = accuracy_score(y_test, tfidf_predict)

print('Naive Baiyes ctm accuracy:', nb_ctm)
print('Naive Baiyes tfidf accuracy:', nb_tfidf)

Using Naive Bayes we get an almost identical result, **95.6% accuracy**