In [1]:
import re
import string
import numpy as np
import html
import pandas as pd
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer # Used for stemming
from sklearn.model_selection import train_test_split # Train-test split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV

In [2]:
# Create pandas dataframe 'amazon_df' from 'amazon.csv' file
amazon_df = pd.read_csv("amazon.csv")

In [3]:
# Take a look at the support of each score in the dataframe
amazon_df['Score'].value_counts()

5    29775
4     8384
1     5021
3     3902
2     2918
Name: Score, dtype: int64

## Text Preprocessing

In [4]:
# Text-cleaning function
def clean_and_reform_data(text):
    text = re.sub('[^A-Za-z0-9]+', ' ', text) # remove punctuation marks
    text = text.lower() # make lowercase
    text = html.unescape(text) # unescape html characters
    return text

# Apply the text-cleaning function to the 'Text' column of the dataframe
amazon_df['CleanText'] = amazon_df['Text'].apply(clean_and_reform_data)

In [5]:
# Remove stopwords
stop_words = set(stopwords.words('english'))
amazon_df['WithoutStop'] = amazon_df['CleanText'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

In [6]:
# Use English stemmer
stemmer = SnowballStemmer("english")

# Apply stemming
amazon_df['StemmedText'] = amazon_df['WithoutStop'].apply(lambda x: ' '.join([stemmer.stem(word) for word in str(x).split()]))
amazon_df.head() # Take a look at the altered dataframe

Unnamed: 0,Text,Score,CleanText,WithoutStop,StemmedText
0,The description and photo on this product need...,3,the description and photo on this product need...,description photo product needs changed indica...,descript photo product need chang indic produc...
1,This was a great book!!!! It is well thought t...,5,this was a great book it is well thought throu...,great book well thought easily imagine events ...,great book well thought easili imagin event ha...
2,"I am a first year teacher, teaching 5th grade....",5,i am a first year teacher teaching 5th grade i...,first year teacher teaching 5th grade special ...,first year teacher teach 5th grade special rea...
3,I got the book at my bookfair at school lookin...,5,i got the book at my bookfair at school lookin...,got book bookfair school looking something sum...,got book bookfair school look someth summer tu...
4,Hi! I'm Martine Redman and I created this puzz...,5,hi i m martine redman and i created this puzzl...,hi martine redman created puzzle briarpatch us...,hi martin redman creat puzzl briarpatch use gr...


In [7]:
# Splitting each text into a list of words
texts_splitted = []
for text in amazon_df['StemmedText']:
    temp = text.split()
    texts_splitted.append(temp)

## Create a Word Embeddings model

In [8]:
# Creating the Word2Vec model
word2vec_model = Word2Vec(sentences = texts_splitted, vector_size = 150, workers = 6) # take the texts_splitted

In [9]:
# Create the word-text vectors
def create_doc_vectors(data_tokens, word2vec_model):
    features = []
    for tokens in data_tokens:
        zero_vector = np.zeros(word2vec_model.vector_size)
        vectors = []
        for token in tokens:
            if token in word2vec_model.wv:
                vectors.append(word2vec_model.wv[token])
        if vectors:
            vectors = np.asarray(vectors)
            avg_vec = vectors.mean(axis = 0)
            features.append(avg_vec)
        else:
            features.append(zero_vector)
    return features

vectorized_doc = create_doc_vectors(texts_splitted, word2vec_model) # vectorized texts

## Method 1: Multi-label classification with 5 labels

In [10]:
# Define training and testing data
# Testing data is the 20% of the overall data, and training the 80%
X_train, X_test, y_train, y_test = train_test_split(vectorized_doc, amazon_df['Score'], test_size=0.2, stratify=amazon_df['Score'], random_state=42)

In [11]:
# Using GridSearchCV to run our RandomForest model for different combinations of parameters
# and find the best combination of parameters, based on acccuracy score

# Parameters to be tested
param_grid = {
    'n_estimators': [100, 200, 250],
    'max_depth': [20, 40, 80, None]
}

rfc = RandomForestClassifier(random_state=42, n_jobs=-1)
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)
CV_rfc.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(n_jobs=-1, random_state=42),
             param_grid={'max_depth': [20, 40, 80, None],
                         'n_estimators': [100, 200, 250]})

In [12]:
# Function for displaying the results of GridSearchCV
def display(results):
    print(f'Best parameters are: {results.best_params_}')
    print("\n")
    mean_score = results.cv_results_['mean_test_score']
    std_score = results.cv_results_['std_test_score']
    params = results.cv_results_['params']
    for mean, std, params in zip(mean_score, std_score, params):
        print(f'{round(mean, 3)} + or -{round(std, 3)} for the {params}')

display(CV_rfc)

Best parameters are: {'max_depth': 40, 'n_estimators': 200}


0.621 + or -0.001 for the {'max_depth': 20, 'n_estimators': 100}
0.622 + or -0.001 for the {'max_depth': 20, 'n_estimators': 200}
0.621 + or -0.001 for the {'max_depth': 20, 'n_estimators': 250}
0.622 + or -0.002 for the {'max_depth': 40, 'n_estimators': 100}
0.622 + or -0.001 for the {'max_depth': 40, 'n_estimators': 200}
0.622 + or -0.002 for the {'max_depth': 40, 'n_estimators': 250}
0.621 + or -0.002 for the {'max_depth': 80, 'n_estimators': 100}
0.622 + or -0.002 for the {'max_depth': 80, 'n_estimators': 200}
0.622 + or -0.001 for the {'max_depth': 80, 'n_estimators': 250}
0.621 + or -0.002 for the {'max_depth': None, 'n_estimators': 100}
0.622 + or -0.002 for the {'max_depth': None, 'n_estimators': 200}
0.622 + or -0.001 for the {'max_depth': None, 'n_estimators': 250}


In [13]:
# Create a RandomForest model with the best combination of parameters
clf = RandomForestClassifier(n_estimators=CV_rfc.best_params_['n_estimators'],
                             max_depth=CV_rfc.best_params_['max_depth'], random_state=42, n_jobs=-1)
# Fit the training data
clf.fit(X_train, y_train)

RandomForestClassifier(max_depth=40, n_estimators=200, n_jobs=-1,
                       random_state=42)

In [14]:
# Predict the score of the testing data
y_pred = clf.predict(X_test)

# View the classification report for testing data and predictions
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.66      0.30      0.41      1004
           2       1.00      0.02      0.03       584
           3       0.65      0.02      0.04       780
           4       0.48      0.04      0.08      1677
           5       0.63      0.99      0.77      5955

    accuracy                           0.63     10000
   macro avg       0.69      0.27      0.27     10000
weighted avg       0.63      0.63      0.52     10000



In [15]:
# View accuracy score
print(accuracy_score(y_test, y_pred)*100)

63.080000000000005


## Method 2: Trying to classify the positivity of Text 

In [16]:
# Encode each score into 0, 1, or 2, indicating rating positivity/negativity
def encodeLabel(score):
    if score == 5 or score == 4: # If the score is generally positive
        return 2 # Give the score 2 for positive
    if score == 3: # If the score is generally neutral
        return 1 # Give the score 1 for neutral
    return 0 # Give the score 0 for negative

# Give the labels a name
labels = ["Negative", "Neutral", "Positive"]

# Encode the scores to positivity labels
amazon_df["EncodedScore"] = amazon_df["Score"].apply(encodeLabel)

amazon_df.head() # Take a look at the dataframe with the new labels

Unnamed: 0,Text,Score,CleanText,WithoutStop,StemmedText,EncodedScore
0,The description and photo on this product need...,3,the description and photo on this product need...,description photo product needs changed indica...,descript photo product need chang indic produc...,1
1,This was a great book!!!! It is well thought t...,5,this was a great book it is well thought throu...,great book well thought easily imagine events ...,great book well thought easili imagin event ha...,2
2,"I am a first year teacher, teaching 5th grade....",5,i am a first year teacher teaching 5th grade i...,first year teacher teaching 5th grade special ...,first year teacher teach 5th grade special rea...,2
3,I got the book at my bookfair at school lookin...,5,i got the book at my bookfair at school lookin...,got book bookfair school looking something sum...,got book bookfair school look someth summer tu...,2
4,Hi! I'm Martine Redman and I created this puzz...,5,hi i m martine redman and i created this puzzl...,hi martine redman created puzzle briarpatch us...,hi martin redman creat puzzl briarpatch use gr...,2


In [17]:
# Define training and testing data
# Testing data is the 20% of the overall data, and training the 80%
x_train, x_test, y_train, y_test = train_test_split(vectorized_doc, amazon_df['EncodedScore'], test_size=0.2, stratify=amazon_df['EncodedScore'], random_state=42)

In [18]:
# Create a RandomForest model with the same parameters as before
rfc = RandomForestClassifier(n_estimators=CV_rfc.best_params_['n_estimators'],
                             max_depth=CV_rfc.best_params_['max_depth'], random_state=42, n_jobs=-1)
# Fit the training data
rfc.fit(x_train, y_train)

RandomForestClassifier(max_depth=40, n_estimators=200, n_jobs=-1,
                       random_state=42)

In [19]:
# Predict the score of the testing data
y_pred = rfc.predict(x_test)

# View the classification report for test data and predictions
print(classification_report(y_test, y_pred, target_names = labels)) # Print a classification report

              precision    recall  f1-score   support

    Negative       0.78      0.25      0.38      1588
     Neutral       0.93      0.02      0.03       780
    Positive       0.80      0.99      0.89      7632

    accuracy                           0.80     10000
   macro avg       0.84      0.42      0.43     10000
weighted avg       0.81      0.80      0.74     10000



In [20]:
# View accuracy score
print(accuracy_score(y_test, y_pred)*100)

79.82000000000001
