In [13]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy's global random number generator for reproducible results
from numpy.random import seed
seed(1)

# TensorFlow has its own random number generator. The seeding function is
# exposed as `tensorflow.set_random_seed` in TensorFlow 1.x and as
# `tensorflow.random.set_seed` in TensorFlow 2.x, so resolve whichever one
# exists instead of failing with ImportError on newer installs. The
# `set_random_seed` name stays bound so code that expects it keeps working.
import tensorflow as tf
set_random_seed = getattr(tf, 'set_random_seed', None) or tf.random.set_seed
set_random_seed(2)

In [2]:
# Load the review data from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='Musical_instruments_reviews.csv')

# Show the first rows as the cell output
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]","Not much to write about here, but it does exac...",5.0,good,1393545600,"02 28, 2014"
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",The product does exactly as it should and is q...,5.0,Jake,1363392000,"03 16, 2013"
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",The primary job of this device is to block the...,5.0,It Does The Job Well,1377648000,"08 28, 2013"
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",Nice windscreen protects my MXL mic and preven...,5.0,GOOD WINDSCREEN FOR THE MONEY,1392336000,"02 14, 2014"
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",This pop filter is great. It looks and perform...,5.0,No more pops when I record my vocals.,1392940800,"02 21, 2014"


In [3]:
# Keep only the two columns used below: the review text and its star rating
Feature_Columns = ['reviewText', 'overall']
new_df = df[Feature_Columns]

# Plain-text preview of the reduced frame
preview_rows = new_df.head()
print(preview_rows)

                                          reviewText  overall
0  Not much to write about here, but it does exac...      5.0
1  The product does exactly as it should and is q...      5.0
2  The primary job of this device is to block the...      5.0
3  Nice windscreen protects my MXL mic and preven...      5.0
4  This pop filter is great. It looks and perform...      5.0


In [4]:
# Count the missing values in every column of new_df
nan_values = new_df.isna().sum()

# Iterate over (column name, count) pairs instead of using nan_values[0] /
# nan_values[1]: integer keys on a string-labelled Series are treated as
# positions only through a deprecated fallback, and the loop also covers
# any number of columns.
for column_name, nan_count in nan_values.items():
    print(f'There are {nan_count} NaN values in {column_name}.')

There are 7 NaN values in reviewText.
There are 0 NaN values in overall.


In [5]:
# Discard every row that has a missing value in any of the kept columns
new_df = new_df.dropna(axis=0, how='any')

In [6]:
# 0 = bad review
# 1 = good review

# convert to binary target
def to_binary(df):
    """Collapse the 1-5 star ratings in ``df['overall']`` into a binary target.

    Ratings 1, 2 and 3 become 0 (bad review); ratings 4 and 5 become 1
    (good review). Values that are not keys of the mapping are left unchanged.

    The conversion is skipped when the column already holds only 0/1 values,
    so re-running the cell does not remap the positive class (1) to 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``overall`` column. The column is overwritten on this
        same object.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    replacements = {1: 0,
                    2: 0,
                    3: 0,
                    4: 1,
                    5: 1}

    # Explicit "already converted" check. Summing the unique values (the
    # previous guard) breaks when a NaN is present or when the values happen
    # to sum to 1 or less.
    already_binary = df['overall'].isin([0, 1]).all()
    if not already_binary:
        # Assign the result back to the column rather than calling
        # replace(..., inplace=True) on df['overall']: the chained in-place
        # form does not update df when pandas Copy-on-Write is active.
        df['overall'] = df['overall'].replace(replacements)
    return df

# Replace the star ratings with the 0/1 target
new_df = to_binary(new_df)

# Number of reviews per class, indexed by the class label
target_numbers = new_df['overall'].value_counts()

# Print the count of bad reviews (0) first, then good reviews (1)
for class_label in (0, 1):
    print(target_numbers[class_label])

1239
9015


In [7]:
# Undersample the positive class: draw as many positive reviews as there are
# negative ones (fixed random_state so the same subset is drawn on every run)
positive_sample = new_df.loc[new_df['overall'] == 1].sample(n=target_numbers[0], random_state=0)
negative_reviews = new_df.loc[new_df['overall'] == 0]

# Stack positives and negatives, then shuffle all rows with sample(frac=1).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# No random_state is passed to the shuffle, so its order comes from NumPy's
# global random state.
balanced_df = pd.concat([positive_sample, negative_reviews]).sample(frac=1)

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer. English stop words are dropped, and the token
# pattern only matches runs of word characters that are not digits, so
# numbers never become features.
count = CountVectorizer(
    token_pattern=r'\b[^\d\W]+\b',
    stop_words="english",
)

# Learn the vocabulary from the balanced reviews and build the feature matrix
review_texts = balanced_df['reviewText']
X = count.fit_transform(review_texts)

# Binary target aligned with the rows of X
y = balanced_df['overall']

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# stop annoying warnings
# NOTE: this filter is global, so it also hides any warning raised while the
# grid search is fitting.
import warnings
warnings.filterwarnings("ignore")

# Create logistic regression.
# The solver is set explicitly because the grid below includes the 'l1'
# penalty: 'liblinear' supports both 'l1' and 'l2', whereas the default
# solver of recent scikit-learn releases ('lbfgs') rejects 'l1', which would
# make half of the candidates fail to fit.
logistic = LogisticRegression(solver='liblinear')

# Create range of candidate penalty hyperparameter values
penalty = ['l1', 'l2']

# Create range of candidate regularization hyperparameter values:
# 10 values spaced evenly on a log scale from 10**0 to 10**4
C = np.logspace(0, 4, 10)

# Create dictionary hyperparameter candidates
hyperparameters = dict(C=C, penalty=penalty)

# Create grid search with 5-fold cross-validation over every (C, penalty) pair
gridsearch = GridSearchCV(logistic, hyperparameters, cv=5, verbose=0)

# Fit grid search on the balanced bag-of-words features
model = gridsearch.fit(X, y)

In [10]:
# Parameters of the estimator selected by the grid search
best_params = model.best_estimator_.get_params()

print('Best Penalty:', best_params['penalty'])
print('Best C:', best_params['C'])

Best Penalty: l2
Best C: 1.0


In [15]:
from sklearn.metrics import accuracy_score

# Vectorize the full (unbalanced) dataset with the vocabulary already fitted
# on the balanced subset.
# NOTE(review): balanced_df was sampled from new_df, so this evaluation set
# includes the rows the model was trained on.
X_original = count.transform(new_df['reviewText'])
y_original = new_df['overall']

# Predict a class for every review
y_pred = model.predict(X_original)

# Share of correct predictions, as a percentage with one decimal
accuracy_pct = round(accuracy_score(y_original, y_pred)*100,1)
print(f'Accuracy: {accuracy_pct} %')

Accuracy: 78.2 %


In [19]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true labels against predictions on the full dataset
matrix = confusion_matrix(y_true=y_original, y_pred=y_pred)

# Header on one line, matrix on the following lines
print('Confusion Matrix:', matrix, sep='\n')

Confusion Matrix:
[[1221   18]
 [2218 6797]]


- 1221 bad reviews classified as bad
- 6797 good reviews classified as good
- 18 bad reviews classified as good
- 2218 good reviews classified as bad

In [21]:
# make prediction in particular
text = 'i like it'

# Vectorize the single review with the fitted vocabulary. A separate name is
# used so `text` keeps holding the raw string.
text_features = count.transform([text])

y_labels = ['NEGATIVE', 'POSITIVE']

# predict() returns an array with one entry per input row. Take the first
# entry before converting to int: calling int() on the array itself relies on
# an array-to-scalar conversion that NumPy has deprecated.
predicted_class = int(model.predict(text_features)[0])
result = y_labels[predicted_class]

print(f'The predicted value is: {result}')




The predicted value is: POSITIVE


# Conclusion

Although there are better ways of classifying text, I wanted to keep things as simple as possible in order to understand how to work with text.

The accuracy achieved on the original dataset was 78.2 %, which I think is acceptable given the simplicity of this code.

The model tends to classify good reviews as bad (2218 good reviews were classified as bad, while only 18 bad reviews were classified as good). This is because we lost data when taking just a part of the original dataset in order to balance it.
