<a href="https://colab.research.google.com/github/mattyjue/instrument_review_text_classification/blob/main/instrument_review_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Overview

The goal of this project is to use nlp and text classification to classify instrument reviews from Amazon. Dataset can be found at https://www.kaggle.com/eswarchandt/amazon-music-reviews?select=Musical_instruments_reviews.csv

# Imports

In [1]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import spacy
import nltk
import pandas as pd




# Load data

In [2]:
df=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Musical_instruments_reviews.csv')

In [3]:
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]","Not much to write about here, but it does exac...",5.0,good,1393545600,"02 28, 2014"
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",The product does exactly as it should and is q...,5.0,Jake,1363392000,"03 16, 2013"
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",The primary job of this device is to block the...,5.0,It Does The Job Well,1377648000,"08 28, 2013"
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",Nice windscreen protects my MXL mic and preven...,5.0,GOOD WINDSCREEN FOR THE MONEY,1392336000,"02 14, 2014"
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",This pop filter is great. It looks and perform...,5.0,No more pops when I record my vocals.,1392940800,"02 21, 2014"


# Data cleaning and EDA

These columns won't be useful for the classification.

In [4]:
df=df.drop(columns=['reviewerID', 'asin', 'unixReviewTime', 'helpful', 'summary', 'reviewTime', 'reviewerName'], errors='ignore')

In [5]:
df.isna().sum()

reviewText    7
overall       0
dtype: int64

In [6]:
df.shape

(10261, 2)

There aren't very many nans so we'll just drop all nans.

In [7]:
df=df.dropna()

In [8]:
df.shape


(10254, 2)

In [9]:
df.head()

Unnamed: 0,reviewText,overall
0,"Not much to write about here, but it does exac...",5.0
1,The product does exactly as it should and is q...,5.0
2,The primary job of this device is to block the...,5.0
3,Nice windscreen protects my MXL mic and preven...,5.0
4,This pop filter is great. It looks and perform...,5.0


In [10]:
og_df= df.copy()


# Normalize and Vectorize text

In [11]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## This is just prep work for the preprocess function

In [12]:
df['reviewText'][0]

"Not much to write about here, but it does exactly what it's supposed to. filters out the pop sounds. now my recordings are much more crisp. it is one of the lowest prices pop filters on amazon so might as well buy it, they honestly work the same despite their pricing,"

In [13]:
  lemmatizer = WordNetLemmatizer()
  stemmer = SnowballStemmer('english')
  preprocessed = []

  tokenized = word_tokenize(df['reviewText'][0])

  cleaned = [stemmer.stem(lemmatizer.lemmatize(token.lower())) 
            for token in tokenized 
            if token.lower() not in stopwords.words('english') 
            if token.isalpha()]

  print(' '.join(cleaned))

much write exact suppos filter pop sound record much crisp one lowest price pop filter amazon might well buy honest work despit price


## Making the actual preprocess function

In [14]:
def preprocess(text): 
  lemmatizer = WordNetLemmatizer()
  stemmer = SnowballStemmer('english')
  preprocessed = []

  tokenized = word_tokenize(text)

  cleaned = [stemmer.stem(lemmatizer.lemmatize(token.lower())) 
            for token in tokenized 
            if token.lower() not in stopwords.words('english') 
            if token.isalpha()]
  return ' '.join(cleaned)

In [15]:
df['reviewText']=df['reviewText'].apply(preprocess)
df.head()

Unnamed: 0,reviewText,overall
0,much write exact suppos filter pop sound recor...,5.0
1,product exact quit realiz doubl screen arriv e...,5.0
2,primari job devic block breath would otherwis ...,5.0
3,nice windscreen protect mxl mic prevent pop th...,5.0
4,pop filter great look perform like studio filt...,5.0


## Vectorizing

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer 

vectorizer = TfidfVectorizer(
    max_df=0.5, min_df=2, use_idf=True, norm=u'l2', smooth_idf=True)

X = vectorizer.fit_transform(df["reviewText"])

tfidf_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
sentences = pd.concat([tfidf_df, df[["reviewText", "overall"]]], axis=1)

## making the new_df

In [17]:
new_df=pd.concat(
    (
        df,
        tfidf_df,
    ),
    axis=1,
)

In [18]:
new_df.shape

(10261, 6570)

In [19]:
# to-do: fix nan issue

new_df=new_df.dropna()

## Create rating categories

In [20]:
def rating_cat(rating):
  if (rating == 1) or (rating==2):
    return 'negative'
  elif (rating == 3) or (rating==4):
    return 'neutral'
  else:
    return 'positive'

In [21]:
new_df['overall']=new_df['overall'].apply(rating_cat)

In [22]:
new_df.overall.value_counts(normalize=True)

positive    0.676198
neutral     0.278228
negative    0.045574
Name: overall, dtype: float64

In [23]:
og_df['overall'].value_counts(normalize=True)

5.0    0.676029
4.0    0.203140
3.0    0.075288
2.0    0.024381
1.0    0.021162
Name: overall, dtype: float64

In [24]:
new_df=new_df.drop(columns=['reviewText'])

In [25]:
new_df.head()

Unnamed: 0,overall,aa,aaa,ab,abalon,abehring,abil,abit,abl,ableton,abnorm,abras,abroad,absolut,absorb,absurd,abund,abus,ac,accent,accentu,accept,acces,accesori,access,accessori,accid,accident,acclim,accommod,accomod,accompani,accomplish,accord,account,accoust,accumul,accur,accuraci,accustom,...,yeti,yield,yike,ymmv,yo,yoga,yoke,york,yorker,youll,young,younger,youngster,your,yout,youth,youtub,ypg,yr,yt,yucki,yup,zager,zakk,zap,zappa,zep,zero,zildjian,zillion,zinc,zing,zingi,zip,ziploc,ziplock,zipper,zone,zoom,zt
0,positive,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,positive,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,positive,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,positive,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.174555,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,positive,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Split train and test data

## Also fixing over sampling issue

In [26]:
X=new_df.drop(columns='overall')
y=new_df['overall']

# Fixing over Sampling
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)

# checking to make sure classes are balanced
y_list= list(y)
print(y_list.count('neutral'))
print(y_list.count('positive'))
print(y_list.count('negative'))


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42, stratify=y)



6929
6929
6929


# Running the Models

In [27]:
pipeline = Pipeline([
    ('clf', RandomForestClassifier(n_estimators=200, max_depth=3)),
])
pipeline.fit(X_train, y_train)

predictions = pipeline.predict(X_test)
print(classification_report(y_test, predictions))

confusion_mat=confusion_matrix(y_test, predictions)
confusion_df = pd.DataFrame(
    confusion_mat,
    index=["actual_Negative", "actual_Neutral", "actual_Positive"],
    columns=["pred_Negative", "pred_Neutral", "pred_Positive"])
display(confusion_df)

              precision    recall  f1-score   support

    negative       0.62      0.51      0.56      1386
     neutral       0.53      0.23      0.32      1386
    positive       0.46      0.80      0.58      1386

    accuracy                           0.51      4158
   macro avg       0.54      0.51      0.49      4158
weighted avg       0.54      0.51      0.49      4158



Unnamed: 0,pred_Negative,pred_Neutral,pred_Positive
actual_Negative,709,143,534
actual_Neutral,287,317,782
actual_Positive,143,136,1107


In [28]:
pipeline = Pipeline([
    ('clf', XGBClassifier()),
])
pipeline.fit(X_train, y_train)

predictions = pipeline.predict(X_test)
print(classification_report(y_test, predictions))

confusion_mat=confusion_matrix(y_test, predictions)
confusion_df = pd.DataFrame(
    confusion_mat,
    index=["actual_Negative", "actual_Neutral", "actual_Positive"],
    columns=["pred_Negative", "pred_Neutral", "pred_Positive"])
display(confusion_df)

              precision    recall  f1-score   support

    negative       0.85      0.71      0.78      1386
     neutral       0.67      0.50      0.57      1386
    positive       0.56      0.79      0.66      1386

    accuracy                           0.67      4158
   macro avg       0.69      0.67      0.67      4158
weighted avg       0.69      0.67      0.67      4158



Unnamed: 0,pred_Negative,pred_Neutral,pred_Positive
actual_Negative,986,119,281
actual_Neutral,111,697,578
actual_Positive,59,232,1095


In [29]:
pipeline = Pipeline([
    ('clf', KNeighborsClassifier()),
])

pipeline.fit(X_train, y_train)

predictions = pipeline.predict(X_test)
print(classification_report(y_test, predictions))

confusion_mat=confusion_matrix(y_test, predictions)
confusion_df = pd.DataFrame(
    confusion_mat,
    index=["actual_Negative", "actual_Neutral", "actual_Positive"],
    columns=["pred_Negative", "pred_Neutral", "pred_Positive"])
display(confusion_df)

              precision    recall  f1-score   support

    negative       0.64      1.00      0.78      1386
     neutral       0.60      0.86      0.71      1386
    positive       1.00      0.01      0.03      1386

    accuracy                           0.62      4158
   macro avg       0.75      0.62      0.50      4158
weighted avg       0.75      0.62      0.50      4158



Unnamed: 0,pred_Negative,pred_Neutral,pred_Positive
actual_Negative,1386,0,0
actual_Neutral,198,1188,0
actual_Positive,588,779,19


In [30]:
pipeline = Pipeline([
    ('clf', LogisticRegression()),
])

pipeline.fit(X_train, y_train)

predictions = pipeline.predict(X_test)
print(classification_report(y_test, predictions))

confusion_mat=confusion_matrix(y_test, predictions)
confusion_df = pd.DataFrame(
    confusion_mat,
    index=["actual_Negative", "actual_Neutral", "actual_Positive"],
    columns=["pred_Negative", "pred_Neutral", "pred_Positive"])
display(confusion_df)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


              precision    recall  f1-score   support

    negative       0.85      0.97      0.91      1386
     neutral       0.67      0.68      0.67      1386
    positive       0.68      0.57      0.62      1386

    accuracy                           0.74      4158
   macro avg       0.73      0.74      0.73      4158
weighted avg       0.73      0.74      0.73      4158



Unnamed: 0,pred_Negative,pred_Neutral,pred_Positive
actual_Negative,1348,11,27
actual_Neutral,99,938,349
actual_Positive,136,461,789


# Model Summaries

* Random Forest accuracy: 50%
* XGBClassifier accuracy: 66%
* KNeighborsClassifier: 62%
* LogisticRegression: 73%


In [31]:
print(list(y_test).count('positive'))
print(list(y_test).count('neutral'))
print(list(y_test).count('negative'))


1386
1386
1386
