# Import Libraries


In [32]:
#Basic libraries
import re
import pandas as pd 
import numpy as np 


# nltk
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords  #stopwords
from nltk import word_tokenize,sent_tokenize # tokenizing
from nltk.stem import PorterStemmer,LancasterStemmer  
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer 

#Metrics libraries

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score, confusion_matrix
from sklearn.metrics import classification_report


#Visualization libraries
import matplotlib.pyplot as plt 
from matplotlib import rcParams
import seaborn as sns
from textblob import TextBlob
from plotly import tools
import plotly.graph_objs as go
from plotly.offline import iplot
%matplotlib inline

#Keras

from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.layers import Dense , Input , LSTM , Embedding, Dropout , Activation, GRU, Flatten
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model, Sequential
from keras.layers import Convolution1D
from keras import initializers, regularizers, constraints, optimizers, layers

#Ignore warnings
import warnings

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Importing The Dataset

In [2]:
raw_reviews = pd.read_csv('Musical_instruments_reviews.csv', engine ="python")
## print shape of dataset with rows and columns and information 
print ("The shape of the  data is (row, column):"+ str(raw_reviews.shape))
print (raw_reviews.info())

The shape of the  data is (row, column):(10261, 9)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10261 entries, 0 to 10260
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   reviewerID      10261 non-null  object 
 1   asin            10261 non-null  object 
 2   reviewerName    10234 non-null  object 
 3   helpful         10261 non-null  object 
 4   reviewText      10254 non-null  object 
 5   overall         10261 non-null  float64
 6   summary         10261 non-null  object 
 7   unixReviewTime  10261 non-null  int64  
 8   reviewTime      10261 non-null  object 
dtypes: float64(1), int64(1), object(7)
memory usage: 721.6+ KB
None


In [3]:
raw_reviews.head(5)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]","Not much to write about here, but it does exac...",5.0,good,1393545600,"02 28, 2014"
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",The product does exactly as it should and is q...,5.0,Jake,1363392000,"03 16, 2013"
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",The primary job of this device is to block the...,5.0,It Does The Job Well,1377648000,"08 28, 2013"
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",Nice windscreen protects my MXL mic and preven...,5.0,GOOD WINDSCREEN FOR THE MONEY,1392336000,"02 14, 2014"
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",This pop filter is great. It looks and perform...,5.0,No more pops when I record my vocals.,1392940800,"02 21, 2014"


Concatenating review text and summary


In [4]:
raw_reviews['reviews']=raw_reviews['reviewText']+raw_reviews['summary']
raw_reviews=raw_reviews.drop(['reviewText', 'summary'], axis=1)
raw_reviews.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,overall,unixReviewTime,reviewTime,reviews
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]",5.0,1393545600,"02 28, 2014","Not much to write about here, but it does exac..."
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",5.0,1363392000,"03 16, 2013",The product does exactly as it should and is q...
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",5.0,1377648000,"08 28, 2013",The primary job of this device is to block the...
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",5.0,1392336000,"02 14, 2014",Nice windscreen protects my MXL mic and preven...
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",5.0,1392940800,"02 21, 2014",This pop filter is great. It looks and perform...


Creating a dataframe with the reviews and the corresponding rating

In [5]:
df=raw_reviews[['reviews','overall']]
df.head()

Unnamed: 0,reviews,overall
0,"Not much to write about here, but it does exac...",5.0
1,The product does exactly as it should and is q...,5.0
2,The primary job of this device is to block the...,5.0
3,Nice windscreen protects my MXL mic and preven...,5.0
4,This pop filter is great. It looks and perform...,5.0


Removing all the records with rating 3

In [6]:
df= df[df['overall']!=3]
df.shape

(9489, 2)

# Preprocessing and cleaning

Creating a copy

In [7]:
process_reviews=df.copy()

In [8]:
stop_words=set(nltk.corpus.stopwords.words('english'))

 Handling NaN values

In [9]:
process_reviews.isnull().sum()

reviews    7
overall    0
dtype: int64

In [10]:
process_reviews=process_reviews.dropna()

In [11]:
process_reviews.isnull().sum()

reviews    0
overall    0
dtype: int64

Creating 'sentiment' column

In [12]:
process_reviews['overall'].value_counts()

5.0    6932
4.0    2083
2.0     250
1.0     217
Name: overall, dtype: int64

In [13]:
process_reviews['overall'] = process_reviews['overall'].astype(float)

In [14]:
ratings = process_reviews['overall'].unique().tolist()
ratings

[5.0, 4.0, 2.0, 1.0]

In [15]:
process_reviews['overall'].value_counts()

5.0    6932
4.0    2083
2.0     250
1.0     217
Name: overall, dtype: int64

In [16]:
process_reviews['overall'] = process_reviews['overall'].apply(lambda x : 1 if x>3 else 0)

In [17]:
process_reviews['overall'].value_counts()

1    9015
0     467
Name: overall, dtype: int64

In this step, following operations are performed on the review text

Removing website links

Removing html tags

Decontracting(expanding from the original form)

Removing the words with numeric digits

Removing non-word characters

Converting to lower case

Removing stop words

Performing Lemmatization


In [18]:
def decontract(text):
    text = re.sub(r"won\'t", "will not", text)
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    return text

In [19]:
lemmatizer = WordNetLemmatizer()
def preprocess_text(review):
    review = re.sub(r"http\S+", "", review)            
     
    review = decontract(review)                        
    review = re.sub("\S*\d\S*", "", review).strip()     
    review = re.sub('[^A-Za-z]+', ' ', review)          
    review = review.lower()                             
    review = [word for word in review.split(" ") if not word in stop_words]
    review = [lemmatizer.lemmatize(token, "v") for token in review] #Lemmatization
    review = " ".join(review)
    review.strip()
    return review

In [20]:
process_reviews['reviews'] = process_reviews['reviews'].apply(lambda x: preprocess_text(str(x)))

In [21]:
X = []
sentences = list(process_reviews['reviews'])
for sen in sentences:
    X.append((sen))

In [22]:
X[2]

'primary job device block breath would otherwise produce pop sound allow voice pass noticeable reduction volume high frequencies double cloth filter block pop let voice coloration metal clamp mount attach mike stand secure enough keep attach goose neck need little coax stay put job well'

In [23]:
y = np.array(process_reviews['overall'])


In [24]:
y[2]

1


# Train-test split(80:20)

In [25]:
train_df, test_df = train_test_split(process_reviews, test_size = 0.2, random_state = 42)
print("Training data size : ", train_df.shape)
print("Test data size : ", test_df.shape)

Training data size :  (7585, 2)
Test data size :  (1897, 2)


# Model Building¶


In [26]:
top_words = 6000
tokenizer = Tokenizer(num_words=top_words)
tokenizer.fit_on_texts(train_df['reviews'])
list_tokenized_train = tokenizer.texts_to_sequences(train_df['reviews'])

max_review_length = 130
X_train = pad_sequences(list_tokenized_train, maxlen=max_review_length)
y_train = train_df['overall']

In [27]:
embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(top_words+1, embedding_vecor_length, input_length=max_review_length))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 130, 32)           192032    
                                                                 
 lstm (LSTM)                 (None, 128)               82432     
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 274,593
Trainable params: 274,593
Non-trainable params: 0
_________________________________________________________________


In [28]:
model.fit(X_train,y_train, epochs=3, batch_size=64, validation_split=0.2)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f72703ea6d0>

In [29]:
list_tokenized_test = tokenizer.texts_to_sequences(test_df['reviews'])
X_test = pad_sequences(list_tokenized_test, maxlen=max_review_length)
y_test = test_df['overall']
y_test = test_df['overall']
prediction = model.predict(X_test)
y_pred = (prediction > 0.5)



In [37]:
print("Accuracy of the model : ", accuracy_score(y_pred, y_test))
print('Confusion matrix:')
print(confusion_matrix(y_test,y_pred))
print('Classification report:')
print(classification_report(y_test, y_pred))

Accuracy of the model :  0.9499209277807064
Confusion matrix:
[[  16   81]
 [  14 1786]]
Classification report:
              precision    recall  f1-score   support

           0       0.53      0.16      0.25        97
           1       0.96      0.99      0.97      1800

    accuracy                           0.95      1897
   macro avg       0.74      0.58      0.61      1897
weighted avg       0.93      0.95      0.94      1897

