In [1]:
# import required libraries and methods from them

from platform import python_version

import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

import re

from bs4 import BeautifulSoup

import contractions

import gensim
import gensim.downloader as api

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Perceptron
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mrinalkadam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mrinalkadam/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mrinalkadam/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
# check the python version being used by the jupyter notebook

python_version()

'3.8.5'

In [4]:
# read the input dataset into a dataframe

df = pd.read_csv("data.tsv", sep='\t', quoting=3)
df

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,37000337,R3DT59XH7HXR9K,B00303FI0G,529320574,Arthur Court Paper Towel Holder,Kitchen,5,0,0,N,Y,Beautiful. Looks great on counter,Beautiful. Looks great on counter.,2015-08-31
1,US,15272914,R1LFS11BNASSU8,B00JCZKZN6,274237558,Olde Thompson Bavaria Glass Salt and Pepper Mi...,Kitchen,5,0,1,N,Y,Awesome & Self-ness,I personally have 5 days sets and have also bo...,2015-08-31
2,US,36137863,R296RT05AG0AF6,B00JLIKA5C,544675303,Progressive International PL8 Professional Man...,Kitchen,5,0,0,N,Y,Fabulous and worth every penny,Fabulous and worth every penny. Used for clean...,2015-08-31
3,US,43311049,R3V37XDZ7ZCI3L,B000GBNB8G,491599489,Zyliss Jumbo Garlic Press,Kitchen,5,0,1,N,Y,Five Stars,A must if you love garlic on tomato marinara s...,2015-08-31
4,US,13763148,R14GU232NQFYX2,B00VJ5KX9S,353790155,"1 X Premier Pizza Cutter - Stainless Steel 14""...",Kitchen,5,0,0,N,Y,Better than sex,Worth every penny! Buy one now and be a pizza ...,2015-08-31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4880461,US,51094108,R22DLC2P26MUMR,B00004SBGS,732420532,Le Creuset Enameled Cast-Iron 6-3/4-Quart Oval...,Kitchen,4,30,41,N,N,Not as sturdy as you'd think.,"After a month of heavy use, primarily as a chi...",2000-04-28
4880462,US,50562512,R1N6KLTENLQOMT,B00004SBIA,261705371,Le Creuset Enameled Cast-Iron 2-Quart Heart Ca...,Kitchen,5,84,92,N,N,A Sweetheart of A Pan,I've used my Le Creuset enameled cast iron coo...,2000-04-28
4880463,US,52469742,R10TW4QXDV8KJC,B00004SPEF,191184892,Krups 358-70 La Glaciere Ice Cream Maker,Kitchen,4,55,60,N,N,Ice Cream Like a Dream,"According to my wife, this is \\""the best birt...",2000-04-28
4880464,US,51865238,R41RL2U1FSQ4V,B00004RHR6,912491903,Hoffritz Stainless-Steel Manual Can Opener,Kitchen,4,30,42,N,N,Opens anything and everything,Hoffritz has a name of producing a trendy and ...,2000-04-24


# 1. Dataset Generation 

In [5]:
# keep only reviews and ratings columns

df = df[["review_body","star_rating"]]
df

Unnamed: 0,review_body,star_rating
0,Beautiful. Looks great on counter.,5
1,I personally have 5 days sets and have also bo...,5
2,Fabulous and worth every penny. Used for clean...,5
3,A must if you love garlic on tomato marinara s...,5
4,Worth every penny! Buy one now and be a pizza ...,5
...,...,...
4880461,"After a month of heavy use, primarily as a chi...",4
4880462,I've used my Le Creuset enameled cast iron coo...,5
4880463,"According to my wife, this is \\""the best birt...",4
4880464,Hoffritz has a name of producing a trendy and ...,4


In [6]:
# find out the number of reviews falling under each distinct rating 

df['star_rating'].value_counts()

5    3128564
4     732471
1     427306
3     349929
2     242196
Name: star_rating, dtype: int64

In [7]:
# check for null values in the reviews column

df['review_body'].isnull().sum()

243

In [8]:
# check for null values in the ratings column

df['star_rating'].isnull().sum()

0

In [9]:
# drop null value records from the dataframe

df.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)


In [10]:
# find out records with star ratings 1,2,3,4 and 5 and select 50000 records randomly per each rating score

df_1 = df[df['star_rating']==1].sample(n=50000, random_state=100)
df_2 = df[df['star_rating']==2].sample(n=50000, random_state=100)
df_3 = df[df['star_rating']==3].sample(n=50000, random_state=100)
df_4 = df[df['star_rating']==4].sample(n=50000, random_state=100)
df_5 = df[df['star_rating']==5].sample(n=50000, random_state=100)

# concat the above records together to get a sample of 250000 reviews

df = pd.concat([df_1,df_2,df_3,df_4,df_5]).reset_index()

# shuffle the dataset

df = df.sample(frac=1).reset_index()
df.drop(['index','level_0'],axis=1,inplace=True)
df

Unnamed: 0,review_body,star_rating
0,Not pleased with the &#34;threads&#34; it crea...,2
1,fills one 16.9 ounce water bottle with two tr...,4
2,We got a similar waffle iron from Betty Crocke...,3
3,I have Lock N Locks and I hate keeping up with...,2
4,I bought this at the beginning of 9/13 even th...,1
...,...,...
249995,As with all coffee 'grinders' that are actuall...,2
249996,Love the design of this shaker and have ordere...,2
249997,This mold is made of plastic and is a bit flim...,4
249998,I really love this yogurt maker. I made very t...,5


In [11]:
# find out the number of reviews falling under distinct ratings now

print("Positive, Negative, Neutral Reviews Count:")
print(df[((df['star_rating']==4.0) | (df['star_rating']==5.0))]['star_rating'].count(),",",df[((df['star_rating']==1.0) | (df['star_rating']==2.0))]['star_rating'].count(),",",df[df['star_rating']==3.0]['star_rating'].count())

Positive, Negative, Neutral Reviews Count:
100000 , 100000 , 50000


In [12]:
# label reviews falling under ratings 4 and 5 as 1(positive class), under ratings 1 and 2 as 2(negative class), and under rating 3 as 3(neutral class)

df['class'] = np.where(((df['star_rating']==4) | (df['star_rating']==5)),1,0)
df['class'] = np.where(((df['star_rating']==1) | (df['star_rating']==2)),2,df['class'])
df['class'] = np.where((df['star_rating']==3),3,df['class'])
df

Unnamed: 0,review_body,star_rating,class
0,Not pleased with the &#34;threads&#34; it crea...,2,2
1,fills one 16.9 ounce water bottle with two tr...,4,1
2,We got a similar waffle iron from Betty Crocke...,3,3
3,I have Lock N Locks and I hate keeping up with...,2,2
4,I bought this at the beginning of 9/13 even th...,1,2
...,...,...,...
249995,As with all coffee 'grinders' that are actuall...,2,2
249996,Love the design of this shaker and have ordere...,2,2
249997,This mold is made of plastic and is a bit flim...,4,1
249998,I really love this yogurt maker. I made very t...,5,1


In [13]:
# drop the rating column once you have the label('class') column

df.drop(['star_rating'],axis=1,inplace=True)
df

Unnamed: 0,review_body,class
0,Not pleased with the &#34;threads&#34; it crea...,2
1,fills one 16.9 ounce water bottle with two tr...,1
2,We got a similar waffle iron from Betty Crocke...,3
3,I have Lock N Locks and I hate keeping up with...,2
4,I bought this at the beginning of 9/13 even th...,2
...,...,...
249995,As with all coffee 'grinders' that are actuall...,2
249996,Love the design of this shaker and have ordere...,2
249997,This mold is made of plastic and is a bit flim...,1
249998,I really love this yogurt maker. I made very t...,1


In [14]:
# make a copy of the original data frame(without any data cleaning)

df_uncleaned = df.copy(deep = True)
df_uncleaned

Unnamed: 0,review_body,class
0,Not pleased with the &#34;threads&#34; it crea...,2
1,fills one 16.9 ounce water bottle with two tr...,1
2,We got a similar waffle iron from Betty Crocke...,3
3,I have Lock N Locks and I hate keeping up with...,2
4,I bought this at the beginning of 9/13 even th...,2
...,...,...
249995,As with all coffee 'grinders' that are actuall...,2
249996,Love the design of this shaker and have ordere...,2
249997,This mold is made of plastic and is a bit flim...,1
249998,I really love this yogurt maker. I made very t...,1


# 2. Word Embedding

# (a)

In [23]:
# load the google news word2vec model 

wv = api.load('word2vec-google-news-300')

In [24]:
# find out the vectors for different words using the above model

vec_King = wv['King']
vec_Man = wv['Man']
vec_Woman = wv['Woman']
vec_Queen = wv['Queen']

vec_1 = vec_King-vec_Man+vec_Woman
vec_2 = vec_Queen

# find out the similarity of the vectors using 'most_similar' function

print(wv.most_similar(positive=['King','Woman'], negative=['Man'], topn=1))
print('\n')

# find out the similarity of the vectors using cosine similarity 

cosine_similarity = np.dot(vec_1,vec_2)/(np.linalg.norm(vec_1)* np.linalg.norm(vec_2))
print("Semantic(Cosine) similarity between the two vectors is:",cosine_similarity)

[('Queen', 0.4929388165473938)]


Semantic(Cosine) similarity between the two vectors is: 0.44240144


In [25]:
# find out the similarity of the words

print('%r\t%r\t%.2f' % (w1, w2, wv.similarity('excellent', 'outstanding')))

'excellent'	'outstanding'	0.56


# (b)

In [26]:
##### REMOVE FROM COMMENT LATER

words = [row.split(' ') for row in df['review_body']]

# train your own word2vec model

model = gensim.models.Word2Vec(words, min_count=10,size=300,workers=3, window=11, sg=1)

# summarize the loaded model

print(model)

In [27]:
# save model

model.save('model.bin')

# load saved model

final_model = gensim.models.Word2Vec.load('model.bin')
print(final_model)

Word2Vec(vocab=34607, size=300, alpha=0.025)


In [28]:
vec_King = final_model['King']
vec_Man = final_model['Man']
vec_Woman = final_model['Woman']
vec_Queen = final_model['Queen']

vec_1 = vec_King-vec_Man+vec_Woman
vec_2 = vec_Queen

# find out the similarity of the vectors using 'most_similar' function

print(final_model.most_similar(positive=['King','Woman'], negative=['Man'], topn=1))
print('\n')

# find out the similarity of the vectors using cosine similarity 

cosine_similarity = np.dot(vec_1,vec_2)/(np.linalg.norm(vec_1)* np.linalg.norm(vec_2))
print("Semantic(Cosine) similarity between the two vectors is:",cosine_similarity)

[('Arthur', 0.5806456208229065)]


Semantic(Cosine) similarity between the two vectors is: 0.4015107


  vec_King = final_model['King']
  vec_Man = final_model['Man']
  vec_Woman = final_model['Woman']
  vec_Queen = final_model['Queen']
  print(final_model.most_similar(positive=['King','Woman'], negative=['Man'], topn=1))


In [29]:
# find out the similarity of the words

print('%r\t%r\t%.2f' % (w1, w2, final_model.similarity('excellent', 'outstanding')))

'excellent'	'outstanding'	0.67


  print('%r\t%r\t%.2f' % (w1, w2, final_model.similarity('excellent', 'outstanding')))


# Comments about this question

As seen from above,the vectors generated by our word2vec model are able to encode semantic similarities better between the words 'excellent ' and 'outstanding'(Word2Vec-0.67, Google-0.56). However the google model does slighlty better when it comes to the case of 'King - Man + Woman' and 'Queen'(Google-0.44, Word2Vec-0.40). Also it can be noted that the most similar word predicted to 'King - Man + Woman' is 'Queen' by the Google model but 'Arthur' by our model. This might likely be because the Google model has a larger word vocabulary and contains more common words. Also, since  we have taken these parameters for our Word2Vec model -(min_count=10,size=300,workers=3, window=11) , it isn't as refined as it could be potentially, thus leading to slightly low results in some cases.

# 3. Simple models

# Data Cleaning

In [18]:
# convert the reviews column to string type

df['review_body'] = df['review_body'].astype(str)

# convert the reviews column to lower case

df['review_body'] = df['review_body'].str.lower()

# using BeautifulSoup, remove HTML tags from the reviews column

# function to remove HTML tags
def remove_html(string):
  
    # parse through html content
    bs = BeautifulSoup(string, "html.parser")
  
    for text in bs(['style', 'script']):
        # remove the tags
        text.decompose()
  
    # return data by retrieving the tag content
    return ' '.join(bs.stripped_strings)

# apply the remove_html function to the reviews column

df['review_body']=df['review_body'].apply(lambda x : remove_html(x))

# using RegEx, remove URLs from the reviews column

# function to remove URLS
def remove_url(string):
    result = re.sub(r'^https?:\/\/.*[\r\n]*',r' ', string, flags=re.MULTILINE)
    return result

# apply the remove_url function to the reviews column

df['review_body']=df['review_body'].apply(lambda x : remove_url(x))

# using RegEx, remove the characters apart from alphabets and single apostrophe(required for contractions later) from the reviews column and replace them with a single space 

df['review_body'] = df['review_body'].replace(r"[^a-zA-Z' ]\s?"," ",regex=True)

# replace the single apostrophe with no space

df['review_body'] = df['review_body'].replace("'","",regex=True)

# using RegEx, remove the extra spaces between words from the reviews column

df['review_body'] = df['review_body'].replace('\s+', ' ', regex=True)

# using the contractions library, perform contractions on the reviews 

df['review_body'] = df['review_body'].apply(lambda x: [contractions.fix(word) for word in x.split()])
df['review_body'] = [' '.join(map(str, d)) for d in df['review_body']]

df



Unnamed: 0,review_body,class
0,i assumed there were four chargers when i boug...,2
1,my son likes to cook hes especially good with ...,1
2,shipped fast good price they were way huger th...,1
3,containers are great but the lids are very thi...,3
4,item was received broken i returned it and ask...,2
...,...,...
249995,the locks come off easily and they are hard to...,3
249996,i was bummed the carafe is slightly too wide a...,2
249997,I have had this kettle for just over one month...,2
249998,the idea and color of the balloons is enticing...,2


# Pre-processing

In [19]:
# remove all general stop words from the reviews column

stop_words = stopwords.words('english')
df['review_body'] = df['review_body'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

# perform lemmatization with POS tagging 

whitespace_tokenizer = nltk.tokenize.WhitespaceTokenizer()
wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()

# funtion to return a POS form of a word
def pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    pos_tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dictionary = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dictionary.get(pos_tag, wordnet.NOUN)

# function to lemmatize the text
def lemmatize_text(string):
    return [wordnet_lemmatizer.lemmatize(w,pos(w)) for w in whitespace_tokenizer.tokenize(string)]

df['review_body'] = df['review_body'].apply(lemmatize_text)
df['review_body'] = [' '.join(map(str, l)) for l in df['review_body']]

df

Unnamed: 0,review_body,class
0,assume four charger bought item pretty bought ...,2
1,son like cook he especially good grill burger ...,1
2,ship fast good price way huger expect,1
3,container great lid thin break easily one use,3
4,item receive broken return ask replacement shi...,2
...,...,...
249995,lock come easily hard clean top,3
249996,bum carafe slightly wide bit short metal struc...,2
249997,I kettle one month leak water leak seal bottom...,2
249998,idea color balloon entice order package child ...,2


In [20]:
# Subtract target class values by 1 so that it becomes easier later on while comparison 

df['class'] = df['class']-1
df

Unnamed: 0,review_body,class
0,assume four charger bought item pretty bought ...,1
1,son like cook he especially good grill burger ...,0
2,ship fast good price way huger expect,0
3,container great lid thin break easily one use,2
4,item receive broken return ask replacement shi...,1
...,...,...
249995,lock come easily hard clean top,2
249996,bum carafe slightly wide bit short metal struc...,1
249997,I kettle one month leak water leak seal bottom...,1
249998,idea color balloon entice order package child ...,1


In [21]:
# function to find the average of vectors as your input feature 

def find_average_of_vectors(review,model_used):

    sentence_words = review.split(" ")
    
    sentence_vectors = []    
    for word in sentence_words:
        try:
            sentence_vectors.append(model_used[word])
        except:
            continue
    
    if len(sentence_vectors)!=0:
        return (np.mean(sentence_vectors,axis=0)).flatten()
    else:
        return np.zeros((300,))

# Binary

In [22]:
# make a copy of the original data frame(with data cleaning)

df_org_3 = df.copy(deep=True)
df_org_3

Unnamed: 0,review_body,class
0,assume four charger bought item pretty bought ...,1
1,son like cook he especially good grill burger ...,0
2,ship fast good price way huger expect,0
3,container great lid thin break easily one use,2
4,item receive broken return ask replacement shi...,1
...,...,...
249995,lock come easily hard clean top,2
249996,bum carafe slightly wide bit short metal struc...,1
249997,I kettle one month leak water leak seal bottom...,1
249998,idea color balloon entice order package child ...,1


In [23]:
# find input feature for google model

df_org_3['avg_input_features_1'] = df_org_3['review_body'].apply(lambda x: find_average_of_vectors(x,wv))
df_org_3    

Unnamed: 0,review_body,class,avg_input_features_1
0,assume four charger bought item pretty bought ...,1,"[0.04277208, -0.03597005, -0.062435575, 0.1046..."
1,son like cook he especially good grill burger ...,0,"[-0.004893621, 0.029286703, -0.01199023, 0.162..."
2,ship fast good price way huger expect,0,"[0.1432408, 0.08569336, -0.048673358, 0.078264..."
3,container great lid thin break easily one use,2,"[0.056274414, 0.10064697, -0.0005340576, 0.056..."
4,item receive broken return ask replacement shi...,1,"[0.043584187, -0.013412476, -0.116475426, 0.06..."
...,...,...,...
249995,lock come easily hard clean top,2,"[0.03120931, 0.07987467, 0.03741455, 0.0357869..."
249996,bum carafe slightly wide bit short metal struc...,1,"[-0.001551011, 0.026309744, -0.06418026, 0.125..."
249997,I kettle one month leak water leak seal bottom...,1,"[0.0027923584, 0.092679344, -0.03684489, 0.028..."
249998,idea color balloon entice order package child ...,1,"[0.047094908, 0.011726828, 0.00012925093, 0.09..."


In [24]:
# find input feature for our model

df_org_3['avg_input_features_2'] = df_org_3['review_body'].apply(lambda x: find_average_of_vectors(x,final_model))
df_org_3

  sentence_vectors.append(model_used[word])


Unnamed: 0,review_body,class,avg_input_features_1,avg_input_features_2
0,assume four charger bought item pretty bought ...,1,"[0.04277208, -0.03597005, -0.062435575, 0.1046...","[0.017703589, -0.11186184, -0.0030522645, -0.0..."
1,son like cook he especially good grill burger ...,0,"[-0.004893621, 0.029286703, -0.01199023, 0.162...","[0.120273024, -0.14361034, 0.046780374, -0.138..."
2,ship fast good price way huger expect,0,"[0.1432408, 0.08569336, -0.048673358, 0.078264...","[-0.049596105, -0.018341891, 0.13302507, -0.17..."
3,container great lid thin break easily one use,2,"[0.056274414, 0.10064697, -0.0005340576, 0.056...","[0.030435072, -0.15327847, 0.11309578, -0.1425..."
4,item receive broken return ask replacement shi...,1,"[0.043584187, -0.013412476, -0.116475426, 0.06...","[0.08915458, -0.22801971, -0.028520422, -0.263..."
...,...,...,...,...
249995,lock come easily hard clean top,2,"[0.03120931, 0.07987467, 0.03741455, 0.0357869...","[0.015699785, -0.12990652, 0.21889718, -0.1027..."
249996,bum carafe slightly wide bit short metal struc...,1,"[-0.001551011, 0.026309744, -0.06418026, 0.125...","[0.015504825, -0.031771064, 0.1092756, -0.0557..."
249997,I kettle one month leak water leak seal bottom...,1,"[0.0027923584, 0.092679344, -0.03684489, 0.028...","[0.020719932, -0.090553395, 0.13070571, -0.027..."
249998,idea color balloon entice order package child ...,1,"[0.047094908, 0.011726828, 0.00012925093, 0.09...","[0.066825956, -0.17564225, 0.05628306, -0.0763..."


In [25]:
# binary classification dataframe

df_binary = df_org_3[((df_org_3['class'] == 0) | (df_org_3['class'] == 1))]
df_binary

Unnamed: 0,review_body,class,avg_input_features_1,avg_input_features_2
0,assume four charger bought item pretty bought ...,1,"[0.04277208, -0.03597005, -0.062435575, 0.1046...","[0.017703589, -0.11186184, -0.0030522645, -0.0..."
1,son like cook he especially good grill burger ...,0,"[-0.004893621, 0.029286703, -0.01199023, 0.162...","[0.120273024, -0.14361034, 0.046780374, -0.138..."
2,ship fast good price way huger expect,0,"[0.1432408, 0.08569336, -0.048673358, 0.078264...","[-0.049596105, -0.018341891, 0.13302507, -0.17..."
4,item receive broken return ask replacement shi...,1,"[0.043584187, -0.013412476, -0.116475426, 0.06...","[0.08915458, -0.22801971, -0.028520422, -0.263..."
5,experience issue one cup fill make sure filter...,0,"[0.0077209473, -0.015841166, -0.04876624, 0.11...","[0.0042549637, -0.026836593, 0.14918885, -0.08..."
...,...,...,...,...
249993,toaster oven fine especially since paid amazon...,1,"[0.03401947, 0.05153087, -0.0007176717, 0.0253...","[0.050901376, -0.11194899, 0.12081799, -0.0080..."
249996,bum carafe slightly wide bit short metal struc...,1,"[-0.001551011, 0.026309744, -0.06418026, 0.125...","[0.015504825, -0.031771064, 0.1092756, -0.0557..."
249997,I kettle one month leak water leak seal bottom...,1,"[0.0027923584, 0.092679344, -0.03684489, 0.028...","[0.020719932, -0.090553395, 0.13070571, -0.027..."
249998,idea color balloon entice order package child ...,1,"[0.047094908, 0.011726828, 0.00012925093, 0.09...","[0.066825956, -0.17564225, 0.05628306, -0.0763..."


# Google Model

In [30]:
x = df_binary['avg_input_features_1']
y = df_binary['class']

# Split the dataset into 80% training dataset and 20% testing dataset

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=100)

x_train = x_train.tolist()
x_test = x_test.tolist()

In [31]:
# train a Perceptron model on the training dataset

perceptron = Perceptron(n_jobs=-1, random_state=100)
perceptron.fit(x_train,y_train)

# predict the labels of test values

y_test_pred = perceptron.predict(x_test)

# find the accuracy of the Perceptron model on the test set

print("-----------------------Test-----------------------")
print('\n')
print("Accuracy of Perceptron Model:",accuracy_score(y_test, y_test_pred))

-----------------------Test-----------------------


Accuracy of Perceptron Model: 0.710925


In [32]:
# standardize the features using StandardScaler

scalar = StandardScaler()
x_train_std = scalar.fit_transform(x_train)
x_test_std = scalar.transform(x_test)

# train an SVM model on the training dataset

lin_svc = LinearSVC(random_state=100)
lin_svc.fit(x_train_std,y_train)

# predict the labels of test values

y_test_pred = lin_svc.predict(x_test_std)

# find the accuracy of the SVM model on the test set

print("-----------------------Test-----------------------")
print('\n')
print("Accuracy of SVM Model:",accuracy_score(y_test, y_test_pred))

-----------------------Test-----------------------


Accuracy of SVM Model: 0.819275




# Our model

In [33]:
x = df_binary['avg_input_features_2']
y = df_binary['class']

# Split the dataset into 80% training dataset and 20% testing dataset

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=100)

x_train = x_train.tolist()
x_test = x_test.tolist()

In [34]:
# train a Perceptron model on the training dataset

perceptron = Perceptron(n_jobs=-1, random_state=100)
perceptron.fit(x_train,y_train)

# predict the labels of test values

y_test_pred = perceptron.predict(x_test)

# find the accuracy of the Perceptron model on the test set

print("-----------------------Test-----------------------")
print('\n')
print("Accuracy of Perceptron Model:",accuracy_score(y_test, y_test_pred))

-----------------------Test-----------------------


Accuracy of Perceptron Model: 0.811125


In [35]:
# standardize the features using StandardScaler

scalar = StandardScaler()
x_train_std = scalar.fit_transform(x_train)
x_test_std = scalar.transform(x_test)

# train an SVM model on the training dataset

lin_svc = LinearSVC(random_state=100)
lin_svc.fit(x_train_std,y_train)

# predict the labels of test values

y_test_pred = lin_svc.predict(x_test_std)

# find the accuracy of the SVM model on the test set

print("-----------------------Test-----------------------")
print('\n')
print("Accuracy of SVM Model:",accuracy_score(y_test, y_test_pred))

-----------------------Test-----------------------


Accuracy of SVM Model: 0.85065




# Comments about this question

In [33]:
d = {'Model': ['Perceptron', 'SVM', 'Perceptron', 'SVM', 'Perceptron', 'SVM'], 
     'Word2Vec Features/Other Features': ['Google News', 'Google News', 'Amazon Reviews(Our)', 'Amazon Reviews(Our)', 'TF-IDF', 'TF-IDF'],
     'Accuracy': ['0.71', '0.82', '0.81', '0.85', '0.85', '0.81']}

df_results_part_3 = pd.DataFrame(data=d)
df_results_part_3

Unnamed: 0,Model,Word2Vec Features/Other Features,Accuracy
0,Perceptron,Google News,0.71
1,SVM,Google News,0.82
2,Perceptron,Amazon Reviews(Our),0.81
3,SVM,Amazon Reviews(Our),0.85
4,Perceptron,TF-IDF,0.85
5,SVM,TF-IDF,0.81


It can be seen from the above table that the TF-IDF feature types give us the best accuracy for the perceptron model, followed by the our trained Word2Vec and then Google Word2Vec. However for the SVM model, the best accuracy is given by our trained Word2Vec, follwed by Google Word2Vec and then TF-IDF. This unstable order shows us that different features work for different models the best and there is no 'one glove fits all' / 'free lunch theorem' concept in the real world. Trying out different features and then choosing what works the best for that model(good feature selection) should be our optimal solution.

# 4. Feedforward Neural Networks

In [21]:
# find out if GPU available

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [33]:
# set hyperparameters for all the models

EPOCHS = 100
BATCH_SIZE = 20
LEARNING_RATE = 0.001

In [34]:
## train data
class trainData(Dataset):
    
    def __init__(self, x_data, y_data):
        self.x_data = x_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.x_data)

## test data    
class testData(Dataset):
    
    def __init__(self, X_data):
        self.X_data = X_data
        
    def __getitem__(self, index):
        return self.X_data[index]
        
    def __len__ (self):
        return len(self.X_data)

# (a)

# Binary

In [40]:
# set parameters 

input_size = 300
hidden_1_size = 50
hidden_2_size = 10
output_size = 1

In [41]:
# model for binary classification 

class binary_classification(nn.Module):
    def __init__(self):
        super(binary_classification, self).__init__()

        self.layer_1 = nn.Linear(input_size, hidden_1_size) 
        self.layer_2 = nn.Linear(hidden_1_size, hidden_2_size)
        self.layer_out = nn.Linear(hidden_2_size, output_size) 
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(hidden_1_size)
        self.batchnorm2 = nn.BatchNorm1d(hidden_2_size)
        
    def forward(self, x):
        x = self.relu(self.layer_1(x))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.layer_out(x)
        
        return x

In [42]:
# print model

model = binary_classification()
print(model)

binary_classification(
  (layer_1): Linear(in_features=300, out_features=50, bias=True)
  (layer_2): Linear(in_features=50, out_features=10, bias=True)
  (layer_out): Linear(in_features=10, out_features=1, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
  (batchnorm1): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [43]:
# define loss function and optimizer

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [44]:
# function to find the accuracy of the binary model

def binary_acc(y_pred, y_test):
    y_pred_tag = torch.round(torch.sigmoid(y_pred))

    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = torch.round(acc * 100)
    
    return acc

In [45]:
# function to train the binary model and print results(loss & accuracy per epoch)

def train_model_binary():
    model.train()
    for e in range(1, EPOCHS+1):
        epoch_loss = 0
        epoch_acc = 0

        for x_batch, y_batch in train_loader:
            x_batch, y_batch = x_batch, y_batch
            optimizer.zero_grad()

            y_pred = model(x_batch)

            loss = criterion(y_pred, y_batch.unsqueeze(1))
            acc = binary_acc(y_pred, y_batch.unsqueeze(1))

            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            epoch_acc += acc.item()


        print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')

In [46]:
def test_model_binary(y_test):
    model.eval()
    
    y_pred_list = []

    with torch.no_grad():
        for x_batch in test_loader:
            y_test_pred = model(x_batch)
            y_pred_list.append(y_test_pred)
    
    y_pred_list = torch.FloatTensor(y_pred_list)
    y_test = torch.FloatTensor(y_test.tolist())

    accuracy = binary_acc(y_pred_list, y_test)
    print("Accuracy:",accuracy.item())

# Google model

In [36]:
x = df_binary['avg_input_features_1']
y = df_binary['class']

# Split the dataset into 80% training dataset and 20% testing dataset

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=100)

In [37]:
## train data
train_data = trainData(torch.FloatTensor(x_train.tolist()), 
                       torch.FloatTensor(y_train))
## test data    
test_data = testData(torch.FloatTensor(x_test.tolist()))

In [38]:
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=1)

In [48]:
train_model_binary()

Epoch 001: | Loss: 0.42925 | Acc: 80.427
Epoch 002: | Loss: 0.41289 | Acc: 81.357
Epoch 003: | Loss: 0.40518 | Acc: 81.923
Epoch 004: | Loss: 0.40056 | Acc: 82.179
Epoch 005: | Loss: 0.39944 | Acc: 82.272
Epoch 006: | Loss: 0.39628 | Acc: 82.422
Epoch 007: | Loss: 0.39473 | Acc: 82.412
Epoch 008: | Loss: 0.39098 | Acc: 82.772
Epoch 009: | Loss: 0.38844 | Acc: 82.825
Epoch 010: | Loss: 0.38683 | Acc: 82.814
Epoch 011: | Loss: 0.38560 | Acc: 83.001
Epoch 012: | Loss: 0.38540 | Acc: 83.044
Epoch 013: | Loss: 0.38458 | Acc: 83.044
Epoch 014: | Loss: 0.38339 | Acc: 83.141
Epoch 015: | Loss: 0.38325 | Acc: 83.186
Epoch 016: | Loss: 0.38196 | Acc: 83.244
Epoch 017: | Loss: 0.38092 | Acc: 83.311
Epoch 018: | Loss: 0.37976 | Acc: 83.289
Epoch 019: | Loss: 0.37805 | Acc: 83.453
Epoch 020: | Loss: 0.37860 | Acc: 83.415
Epoch 021: | Loss: 0.37704 | Acc: 83.559
Epoch 022: | Loss: 0.37740 | Acc: 83.449
Epoch 023: | Loss: 0.37587 | Acc: 83.499
Epoch 024: | Loss: 0.37558 | Acc: 83.574
Epoch 025: | Los

In [40]:
test_model_binary(y_test)

Accuracy: 85.0


# Our model

In [41]:
x = df_binary['avg_input_features_2']
y = df_binary['class']

# Split the dataset into 80% training dataset and 20% testing dataset

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=100)

In [42]:
## train data
train_data = trainData(torch.FloatTensor(x_train.tolist()), 
                       torch.FloatTensor(y_train))
## test data    
test_data = testData(torch.FloatTensor(x_test.tolist()))

In [43]:
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=1)

In [52]:
train_model_binary()

Epoch 001: | Loss: 0.39232 | Acc: 82.813
Epoch 002: | Loss: 0.36632 | Acc: 84.182
Epoch 003: | Loss: 0.35946 | Acc: 84.547
Epoch 004: | Loss: 0.35653 | Acc: 84.755
Epoch 005: | Loss: 0.35471 | Acc: 84.800
Epoch 006: | Loss: 0.35266 | Acc: 84.886
Epoch 007: | Loss: 0.34956 | Acc: 85.044
Epoch 008: | Loss: 0.34865 | Acc: 85.131
Epoch 009: | Loss: 0.34677 | Acc: 85.244
Epoch 010: | Loss: 0.34363 | Acc: 85.446
Epoch 011: | Loss: 0.34300 | Acc: 85.409
Epoch 012: | Loss: 0.34189 | Acc: 85.467
Epoch 013: | Loss: 0.34211 | Acc: 85.509
Epoch 014: | Loss: 0.34012 | Acc: 85.634
Epoch 015: | Loss: 0.33893 | Acc: 85.728
Epoch 016: | Loss: 0.33678 | Acc: 85.840
Epoch 017: | Loss: 0.33812 | Acc: 85.809
Epoch 018: | Loss: 0.33775 | Acc: 85.824
Epoch 019: | Loss: 0.33789 | Acc: 85.814
Epoch 020: | Loss: 0.33505 | Acc: 85.876
Epoch 021: | Loss: 0.33324 | Acc: 85.933
Epoch 022: | Loss: 0.33596 | Acc: 85.903
Epoch 023: | Loss: 0.33278 | Acc: 86.058
Epoch 024: | Loss: 0.33759 | Acc: 85.791
Epoch 025: | Los

In [45]:
test_model_binary(y_test)

Accuracy: 87.0


# Ternary

In [46]:
# ternary classification dataframe

df_ternary = df_org_3.copy(deep=True)
df_ternary

Unnamed: 0,review_body,class,avg_input_features_1,avg_input_features_2
0,send back unhappy wth quality guage s sheet us...,1,"[-0.028214889, 0.054062814, 0.022171944, 0.065...","[0.016383082, -0.11017842, 0.07979045, -0.1103..."
1,bought bottle week lid crack right rim boght p...,1,"[0.009401504, 0.04494009, -0.01879862, 0.04671...","[0.0027814035, -0.1517246, 0.039110575, -0.084..."
2,good overall instruction could use improvement...,2,"[-0.025609551, 0.035386518, -0.03870993, 0.123...","[-0.019544542, -0.09348309, 0.091074795, -0.08..."
3,beautiful color unexpectedly large,0,"[0.051719666, 0.07980347, -0.05140686, 0.07983...","[-0.0066354196, -0.07669535, 0.14650348, 0.051..."
4,puzzle review look fine bought mug use clean t...,2,"[0.0021718915, 0.036595784, -0.015984524, 0.04...","[0.049097426, -0.15116577, 0.034840178, -0.083..."
...,...,...,...,...
249995,love little skinny spatula use stovetop cookin...,0,"[0.032534514, 0.028369326, 0.016048547, 0.0922...","[0.07504659, -0.11023586, 0.102279335, -0.0310..."
249996,cheap leaky creaky sure pump handle break soon...,1,"[0.019851685, 0.074625395, -0.054214478, 0.047...","[-0.007991508, -0.23522964, 0.17086153, -0.027..."
249997,good price awesome product buy constantly rest...,0,"[0.027029855, -0.028424945, -0.025542123, 0.14...","[-0.01619439, -0.05732434, 0.008259937, -0.110..."
249998,machine little loud make great cup coffee,0,"[0.0011160715, -0.0034005302, -0.033098493, 0....","[0.0053175413, -0.09154149, 0.12706958, -0.087..."


In [83]:
# set parameters 

input_size = 300
hidden_1_size = 50
hidden_2_size = 10
output_size = 3

In [58]:
# model for ternary classification 

class ternary_classification(nn.Module):
    def __init__(self):
        super(ternary_classification, self).__init__()
        # Number of input features is 300.
        self.layer_1 = nn.Linear(input_size, hidden_1_size) 
        self.layer_2 = nn.Linear(hidden_1_size, hidden_2_size)
        self.layer_out = nn.Linear(hidden_2_size, output_size) 
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(hidden_1_size)
        self.batchnorm2 = nn.BatchNorm1d(hidden_2_size)
        
    def forward(self, x):
        x = self.relu(self.layer_1(x))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.layer_out(x)
        
        return x

In [84]:
# print model

model = ternary_classification()
print(model)

ternary_classification(
  (layer_1): Linear(in_features=300, out_features=50, bias=True)
  (layer_2): Linear(in_features=50, out_features=10, bias=True)
  (layer_out): Linear(in_features=10, out_features=3, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
  (batchnorm1): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [60]:
# define loss function and optimizer

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [61]:
# function to find the accuracy of the ternary model

def ternary_acc(y_pred, y_test):
    y_pred_softmax = torch.log_softmax(y_pred, dim = 1)
    y_pred_tags = torch.argmax(y_pred_softmax, dim = 1)
    
    correct_pred = (y_pred_tags == y_test).float()
    acc = correct_pred.sum() / len(correct_pred)
    
    acc = torch.round(acc * 100)
    
    return acc

In [62]:
# function to train the ternary model and print results(loss & accuracy per epoch)

def train_model_ternary():
    model.train()
    for e in range(1, EPOCHS+1):
        epoch_loss = 0
        epoch_acc = 0

        for x_batch, y_batch in train_loader:
            x_batch, y_batch = x_batch, y_batch
            optimizer.zero_grad()

            y_pred = model(x_batch)

            loss = criterion(y_pred, y_batch.type(torch.LongTensor))
            acc = ternary_acc(y_pred, y_batch.type(torch.LongTensor))

            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            epoch_acc += acc.item()


        print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')

In [63]:
def test_model_ternary(y_test):
    model.eval()
    
    y_pred_list = []

    with torch.no_grad():
        for x_batch in test_loader:
            y_test_pred = model(x_batch)
            y_pred_list.extend(y_test_pred.tolist())
    
    y_pred_list = torch.FloatTensor(y_pred_list)
    y_test = torch.FloatTensor(y_test.tolist())

    accuracy = ternary_acc(y_pred_list, y_test)
    print("Accuracy:",accuracy.item())

# Google model

In [54]:
x = df_ternary['avg_input_features_1']
y = df_ternary['class']

# Split the dataset into 80% training dataset and 20% testing dataset

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=100)

In [55]:
## train data
train_data = trainData(torch.FloatTensor(x_train.tolist()), 
                       torch.FloatTensor(y_train))
## test data    
test_data = testData(torch.FloatTensor(x_test.tolist()))

In [56]:
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=1)

In [63]:
train_model_ternary()

Epoch 001: | Loss: 0.81074 | Acc: 64.684
Epoch 002: | Loss: 0.79038 | Acc: 65.774
Epoch 003: | Loss: 0.78350 | Acc: 66.063
Epoch 004: | Loss: 0.77975 | Acc: 66.197
Epoch 005: | Loss: 0.77602 | Acc: 66.421
Epoch 006: | Loss: 0.77350 | Acc: 66.501
Epoch 007: | Loss: 0.77049 | Acc: 66.802
Epoch 008: | Loss: 0.76746 | Acc: 66.855
Epoch 009: | Loss: 0.76838 | Acc: 66.906
Epoch 010: | Loss: 0.76734 | Acc: 66.832
Epoch 011: | Loss: 0.76561 | Acc: 66.938
Epoch 012: | Loss: 0.76345 | Acc: 67.022
Epoch 013: | Loss: 0.76225 | Acc: 67.181
Epoch 014: | Loss: 0.76194 | Acc: 67.082
Epoch 015: | Loss: 0.76024 | Acc: 67.168
Epoch 016: | Loss: 0.76047 | Acc: 67.124
Epoch 017: | Loss: 0.75881 | Acc: 67.260
Epoch 018: | Loss: 0.75817 | Acc: 67.221
Epoch 019: | Loss: 0.75690 | Acc: 67.308
Epoch 020: | Loss: 0.75645 | Acc: 67.327
Epoch 021: | Loss: 0.75610 | Acc: 67.394
Epoch 022: | Loss: 0.75603 | Acc: 67.302
Epoch 023: | Loss: 0.75522 | Acc: 67.371
Epoch 024: | Loss: 0.75484 | Acc: 67.377
Epoch 025: | Los

In [73]:
test_model_ternary(y_test)

Accuracy: 68.0


# Our model

In [74]:
x = df_ternary['avg_input_features_2']
y = df_ternary['class']

# Split the dataset into 80% training dataset and 20% testing dataset

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=100)

In [75]:
## train data
train_data = trainData(torch.FloatTensor(x_train.tolist()), 
                       torch.FloatTensor(y_train))
## test data    
test_data = testData(torch.FloatTensor(x_test.tolist()))

In [76]:
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=1)

In [67]:
train_model_ternary()

Epoch 001: | Loss: 0.77657 | Acc: 66.731
Epoch 002: | Loss: 0.74225 | Acc: 68.403
Epoch 003: | Loss: 0.73529 | Acc: 68.714
Epoch 004: | Loss: 0.73158 | Acc: 68.862
Epoch 005: | Loss: 0.72965 | Acc: 69.052
Epoch 006: | Loss: 0.72626 | Acc: 69.173
Epoch 007: | Loss: 0.72531 | Acc: 69.286
Epoch 008: | Loss: 0.72245 | Acc: 69.313
Epoch 009: | Loss: 0.72162 | Acc: 69.436
Epoch 010: | Loss: 0.72103 | Acc: 69.347
Epoch 011: | Loss: 0.71931 | Acc: 69.397
Epoch 012: | Loss: 0.71833 | Acc: 69.371
Epoch 013: | Loss: 0.71758 | Acc: 69.464
Epoch 014: | Loss: 0.71764 | Acc: 69.540
Epoch 015: | Loss: 0.71575 | Acc: 69.572
Epoch 016: | Loss: 0.71370 | Acc: 69.653
Epoch 017: | Loss: 0.71389 | Acc: 69.635
Epoch 018: | Loss: 0.71227 | Acc: 69.692
Epoch 019: | Loss: 0.71149 | Acc: 69.790
Epoch 020: | Loss: 0.71154 | Acc: 69.733
Epoch 021: | Loss: 0.71211 | Acc: 69.715
Epoch 022: | Loss: 0.70957 | Acc: 69.901
Epoch 023: | Loss: 0.70945 | Acc: 69.834
Epoch 024: | Loss: 0.70905 | Acc: 69.852
Epoch 025: | Los

In [78]:
test_model_ternary(y_test)

Accuracy: 71.0


# Comments about this question

In [34]:
d = {'Model': ['FNN', 'FNN', 'FNN', 'FNN'], 
     'Word2Vec Model': ['Google News', 'Amazon Reviews(Our)', 'Google News', 'Amazon Reviews(Our)'],
     'Classification Type': ['Binary', 'Binary', 'Ternary', 'Ternary',],
     'Input Features Type': ['Average' , 'Average', 'Average', 'Average'],
     'Accuracy': ['0.85', '0.87', '0.68', '0.71']}

df_results_part_4_a = pd.DataFrame(data=d)
df_results_part_4_a

Unnamed: 0,Model,Word2Vec Model,Classification Type,Input Features Type,Accuracy
0,FNN,Google News,Binary,Average,0.85
1,FNN,Amazon Reviews(Our),Binary,Average,0.87
2,FNN,Google News,Ternary,Average,0.68
3,FNN,Amazon Reviews(Our),Ternary,Average,0.71


# (b)

In [26]:
# function to pad a list with a specific number of zeroes

def pad_or_truncate(some_list, target_len):
    return some_list[:target_len] + [0]*(target_len - len(some_list))

In [27]:
# function to concatenate vectors of first ten words as your input feature 

def concatenate_vectors(review,model_used):

    sentence_words = review.split(" ")
    
    sentence_vectors = []    
    
    for i,word in enumerate(sentence_words):
        if i < 10:
            try:
                sentence_vectors.append(model_used[word])
            except:
                continue
    
    flattened_sentence_vector = np.array(sentence_vectors).flatten()
    
    if len(sentence_vectors)!=0:
        if len(flattened_sentence_vector) != 3000:
            flattened_sentence_vector = pad_or_truncate(list(flattened_sentence_vector),3000)
                
        return flattened_sentence_vector

    else:
        return np.zeros(3000,)

In [28]:
# find input feature for google model

df_org_3['concat_input_features_1'] = df_org_3['review_body'].apply(lambda x: concatenate_vectors(x,wv))
df_org_3    

Unnamed: 0,review_body,class,avg_input_features_1,avg_input_features_2,concat_input_features_1
0,assume four charger bought item pretty bought ...,1,"[0.04277208, -0.03597005, -0.062435575, 0.1046...","[0.017703589, -0.11186184, -0.0030522645, -0.0...","[0.06640625, -0.103027344, -0.08251953, 0.1079..."
1,son like cook he especially good grill burger ...,0,"[-0.004893621, 0.029286703, -0.01199023, 0.162...","[0.120273024, -0.14361034, 0.046780374, -0.138...","[0.107910156, -0.030029297, 0.033203125, -0.16..."
2,ship fast good price way huger expect,0,"[0.1432408, 0.08569336, -0.048673358, 0.078264...","[-0.049596105, -0.018341891, 0.13302507, -0.17...","[0.27929688, 0.29101562, -0.21386719, -0.14648..."
3,container great lid thin break easily one use,2,"[0.056274414, 0.10064697, -0.0005340576, 0.056...","[0.030435072, -0.15327847, 0.11309578, -0.1425...","[0.048095703, 0.31640625, 0.17773438, -0.06982..."
4,item receive broken return ask replacement shi...,1,"[0.043584187, -0.013412476, -0.116475426, 0.06...","[0.08915458, -0.22801971, -0.028520422, -0.263...","[0.024291992, 0.010803223, -0.107421875, 0.302..."
...,...,...,...,...,...
249995,lock come easily hard clean top,2,"[0.03120931, 0.07987467, 0.03741455, 0.0357869...","[0.015699785, -0.12990652, 0.21889718, -0.1027...","[0.017944336, 0.19335938, -0.06298828, 0.02429..."
249996,bum carafe slightly wide bit short metal struc...,1,"[-0.001551011, 0.026309744, -0.06418026, 0.125...","[0.015504825, -0.031771064, 0.1092756, -0.0557...","[0.10546875, -0.20117188, -0.13964844, 0.32226..."
249997,I kettle one month leak water leak seal bottom...,1,"[0.0027923584, 0.092679344, -0.03684489, 0.028...","[0.020719932, -0.090553395, 0.13070571, -0.027...","[0.07910156, -0.0050354004, 0.111816406, 0.212..."
249998,idea color balloon entice order package child ...,1,"[0.047094908, 0.011726828, 0.00012925093, 0.09...","[0.066825956, -0.17564225, 0.05628306, -0.0763...","[0.067871094, 0.011657715, 0.033691406, 0.2207..."


In [29]:
# find input feature for our model

df_org_3['concat_input_features_2'] = df_org_3['review_body'].apply(lambda x: concatenate_vectors(x,final_model))
df_org_3    

  sentence_vectors.append(model_used[word])


Unnamed: 0,review_body,class,avg_input_features_1,avg_input_features_2,concat_input_features_1,concat_input_features_2
0,assume four charger bought item pretty bought ...,1,"[0.04277208, -0.03597005, -0.062435575, 0.1046...","[0.017703589, -0.11186184, -0.0030522645, -0.0...","[0.06640625, -0.103027344, -0.08251953, 0.1079...","[0.18149155, -0.23886244, -0.0827184, 0.060127..."
1,son like cook he especially good grill burger ...,0,"[-0.004893621, 0.029286703, -0.01199023, 0.162...","[0.120273024, -0.14361034, 0.046780374, -0.138...","[0.107910156, -0.030029297, 0.033203125, -0.16...","[0.39106262, -0.43970776, -0.014117015, 0.1198..."
2,ship fast good price way huger expect,0,"[0.1432408, 0.08569336, -0.048673358, 0.078264...","[-0.049596105, -0.018341891, 0.13302507, -0.17...","[0.27929688, 0.29101562, -0.21386719, -0.14648...","[-0.07464662, -0.21261097, -0.26036084, -0.465..."
3,container great lid thin break easily one use,2,"[0.056274414, 0.10064697, -0.0005340576, 0.056...","[0.030435072, -0.15327847, 0.11309578, -0.1425...","[0.048095703, 0.31640625, 0.17773438, -0.06982...","[-0.044359308, -0.092595585, 0.07619203, -0.14..."
4,item receive broken return ask replacement shi...,1,"[0.043584187, -0.013412476, -0.116475426, 0.06...","[0.08915458, -0.22801971, -0.028520422, -0.263...","[0.024291992, 0.010803223, -0.107421875, 0.302...","[0.102800496, -0.12086469, -0.14640297, 0.0537..."
...,...,...,...,...,...,...
249995,lock come easily hard clean top,2,"[0.03120931, 0.07987467, 0.03741455, 0.0357869...","[0.015699785, -0.12990652, 0.21889718, -0.1027...","[0.017944336, 0.19335938, -0.06298828, 0.02429...","[0.24208477, -0.24096622, 0.30787, -0.2916415,..."
249996,bum carafe slightly wide bit short metal struc...,1,"[-0.001551011, 0.026309744, -0.06418026, 0.125...","[0.015504825, -0.031771064, 0.1092756, -0.0557...","[0.10546875, -0.20117188, -0.13964844, 0.32226...","[0.14765103, -0.15398727, 0.014575721, -0.1541..."
249997,I kettle one month leak water leak seal bottom...,1,"[0.0027923584, 0.092679344, -0.03684489, 0.028...","[0.020719932, -0.090553395, 0.13070571, -0.027...","[0.07910156, -0.0050354004, 0.111816406, 0.212...","[0.2060052, -0.18501587, -0.0031185225, -0.029..."
249998,idea color balloon entice order package child ...,1,"[0.047094908, 0.011726828, 0.00012925093, 0.09...","[0.066825956, -0.17564225, 0.05628306, -0.0763...","[0.067871094, 0.011657715, 0.033691406, 0.2207...","[0.07186723, -0.11819719, -0.024285497, -0.130..."


# Binary

In [30]:
# binary classification dataframe

df_binary = df_org_3[((df_org_3['class'] == 0) | (df_org_3['class'] == 1))]
df_binary

Unnamed: 0,review_body,class,avg_input_features_1,avg_input_features_2,concat_input_features_1,concat_input_features_2
0,assume four charger bought item pretty bought ...,1,"[0.04277208, -0.03597005, -0.062435575, 0.1046...","[0.017703589, -0.11186184, -0.0030522645, -0.0...","[0.06640625, -0.103027344, -0.08251953, 0.1079...","[0.18149155, -0.23886244, -0.0827184, 0.060127..."
1,son like cook he especially good grill burger ...,0,"[-0.004893621, 0.029286703, -0.01199023, 0.162...","[0.120273024, -0.14361034, 0.046780374, -0.138...","[0.107910156, -0.030029297, 0.033203125, -0.16...","[0.39106262, -0.43970776, -0.014117015, 0.1198..."
2,ship fast good price way huger expect,0,"[0.1432408, 0.08569336, -0.048673358, 0.078264...","[-0.049596105, -0.018341891, 0.13302507, -0.17...","[0.27929688, 0.29101562, -0.21386719, -0.14648...","[-0.07464662, -0.21261097, -0.26036084, -0.465..."
4,item receive broken return ask replacement shi...,1,"[0.043584187, -0.013412476, -0.116475426, 0.06...","[0.08915458, -0.22801971, -0.028520422, -0.263...","[0.024291992, 0.010803223, -0.107421875, 0.302...","[0.102800496, -0.12086469, -0.14640297, 0.0537..."
5,experience issue one cup fill make sure filter...,0,"[0.0077209473, -0.015841166, -0.04876624, 0.11...","[0.0042549637, -0.026836593, 0.14918885, -0.08...","[0.037841797, -0.060058594, -0.05810547, -0.15...","[0.15096039, 0.03984432, 0.08405365, -0.053545..."
...,...,...,...,...,...,...
249993,toaster oven fine especially since paid amazon...,1,"[0.03401947, 0.05153087, -0.0007176717, 0.0253...","[0.050901376, -0.11194899, 0.12081799, -0.0080...","[0.14453125, -0.07421875, -0.043945312, 0.2382...","[0.28356823, 0.13480736, -0.103595145, 0.34340..."
249996,bum carafe slightly wide bit short metal struc...,1,"[-0.001551011, 0.026309744, -0.06418026, 0.125...","[0.015504825, -0.031771064, 0.1092756, -0.0557...","[0.10546875, -0.20117188, -0.13964844, 0.32226...","[0.14765103, -0.15398727, 0.014575721, -0.1541..."
249997,I kettle one month leak water leak seal bottom...,1,"[0.0027923584, 0.092679344, -0.03684489, 0.028...","[0.020719932, -0.090553395, 0.13070571, -0.027...","[0.07910156, -0.0050354004, 0.111816406, 0.212...","[0.2060052, -0.18501587, -0.0031185225, -0.029..."
249998,idea color balloon entice order package child ...,1,"[0.047094908, 0.011726828, 0.00012925093, 0.09...","[0.066825956, -0.17564225, 0.05628306, -0.0763...","[0.067871094, 0.011657715, 0.033691406, 0.2207...","[0.07186723, -0.11819719, -0.024285497, -0.130..."


In [47]:
# set parameters 

input_size = 3000
hidden_1_size = 50
hidden_2_size = 10
output_size = 1

# Google model 

In [85]:
x = df_binary['concat_input_features_1']
y = df_binary['class']

# Split the dataset into 80% training dataset and 20% testing dataset

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=100)

In [86]:
## train data
train_data = trainData(torch.FloatTensor(x_train.tolist()), 
                       torch.FloatTensor(y_train))
## test data    
test_data = testData(torch.FloatTensor(x_test.tolist()))

In [87]:
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=1)

In [48]:
# print model

model = binary_classification()
print(model)

binary_classification(
  (layer_1): Linear(in_features=3000, out_features=50, bias=True)
  (layer_2): Linear(in_features=50, out_features=10, bias=True)
  (layer_out): Linear(in_features=10, out_features=1, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
  (batchnorm1): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [49]:
# define loss function and optimizer

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [79]:
train_model_binary()

Epoch 001: | Loss: 0.51775 | Acc: 74.389
Epoch 002: | Loss: 0.48779 | Acc: 76.416
Epoch 003: | Loss: 0.46733 | Acc: 77.754
Epoch 004: | Loss: 0.45035 | Acc: 78.726
Epoch 005: | Loss: 0.43386 | Acc: 79.877
Epoch 006: | Loss: 0.41914 | Acc: 80.656
Epoch 007: | Loss: 0.40263 | Acc: 81.550
Epoch 008: | Loss: 0.39206 | Acc: 82.135
Epoch 009: | Loss: 0.37960 | Acc: 82.766
Epoch 010: | Loss: 0.36853 | Acc: 83.453
Epoch 011: | Loss: 0.35869 | Acc: 83.838
Epoch 012: | Loss: 0.34956 | Acc: 84.293
Epoch 013: | Loss: 0.34203 | Acc: 84.739
Epoch 014: | Loss: 0.33421 | Acc: 85.152
Epoch 015: | Loss: 0.32706 | Acc: 85.552
Epoch 016: | Loss: 0.31937 | Acc: 85.869
Epoch 017: | Loss: 0.31509 | Acc: 86.163
Epoch 018: | Loss: 0.30826 | Acc: 86.475
Epoch 019: | Loss: 0.30178 | Acc: 86.912
Epoch 020: | Loss: 0.29440 | Acc: 87.188
Epoch 021: | Loss: 0.29068 | Acc: 87.469
Epoch 022: | Loss: 0.28397 | Acc: 87.698
Epoch 023: | Loss: 0.27952 | Acc: 88.007
Epoch 024: | Loss: 0.27510 | Acc: 88.255
Epoch 025: | Los

In [91]:
test_model_binary(y_test)

Accuracy: 73.0


# Our model

In [31]:
x = df_binary['concat_input_features_2']
y = df_binary['class']

# Split the dataset into 80% training dataset and 20% testing dataset

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=100)

In [35]:
## train data
train_data = trainData(torch.FloatTensor(x_train.tolist()), 
                       torch.FloatTensor(y_train))
## test data    
test_data = testData(torch.FloatTensor(x_test.tolist()))

In [36]:
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=1)

In [83]:
train_model_binary()

Epoch 001: | Loss: 0.52392 | Acc: 74.279
Epoch 002: | Loss: 0.46477 | Acc: 77.858
Epoch 003: | Loss: 0.45286 | Acc: 78.440
Epoch 004: | Loss: 0.44242 | Acc: 79.154
Epoch 005: | Loss: 0.42896 | Acc: 79.806
Epoch 006: | Loss: 0.41631 | Acc: 80.604
Epoch 007: | Loss: 0.40414 | Acc: 81.266
Epoch 008: | Loss: 0.39191 | Acc: 81.942
Epoch 009: | Loss: 0.37769 | Acc: 82.774
Epoch 010: | Loss: 0.36850 | Acc: 83.203
Epoch 011: | Loss: 0.35949 | Acc: 83.771
Epoch 012: | Loss: 0.34855 | Acc: 84.371
Epoch 013: | Loss: 0.33820 | Acc: 84.924
Epoch 014: | Loss: 0.33224 | Acc: 85.205
Epoch 015: | Loss: 0.32233 | Acc: 85.711
Epoch 016: | Loss: 0.31424 | Acc: 86.075
Epoch 017: | Loss: 0.30655 | Acc: 86.567
Epoch 018: | Loss: 0.30080 | Acc: 86.804
Epoch 019: | Loss: 0.29432 | Acc: 87.156
Epoch 020: | Loss: 0.28855 | Acc: 87.446
Epoch 021: | Loss: 0.28392 | Acc: 87.642
Epoch 022: | Loss: 0.27764 | Acc: 87.976
Epoch 023: | Loss: 0.27157 | Acc: 88.361
Epoch 024: | Loss: 0.26821 | Acc: 88.508
Epoch 025: | Los

In [51]:
test_model_binary(y_test)

Accuracy: 75.0


# Ternary

In [52]:
# ternary classification dataframe

df_ternary = df_org_3.copy(deep=True)
df_ternary

Unnamed: 0,review_body,class,avg_input_features_1,avg_input_features_2,concat_input_features_1,concat_input_features_2
0,assume four charger bought item pretty bought ...,1,"[0.04277208, -0.03597005, -0.062435575, 0.1046...","[0.017703589, -0.11186184, -0.0030522645, -0.0...","[0.06640625, -0.103027344, -0.08251953, 0.1079...","[0.18149155, -0.23886244, -0.0827184, 0.060127..."
1,son like cook he especially good grill burger ...,0,"[-0.004893621, 0.029286703, -0.01199023, 0.162...","[0.120273024, -0.14361034, 0.046780374, -0.138...","[0.107910156, -0.030029297, 0.033203125, -0.16...","[0.39106262, -0.43970776, -0.014117015, 0.1198..."
2,ship fast good price way huger expect,0,"[0.1432408, 0.08569336, -0.048673358, 0.078264...","[-0.049596105, -0.018341891, 0.13302507, -0.17...","[0.27929688, 0.29101562, -0.21386719, -0.14648...","[-0.07464662, -0.21261097, -0.26036084, -0.465..."
3,container great lid thin break easily one use,2,"[0.056274414, 0.10064697, -0.0005340576, 0.056...","[0.030435072, -0.15327847, 0.11309578, -0.1425...","[0.048095703, 0.31640625, 0.17773438, -0.06982...","[-0.044359308, -0.092595585, 0.07619203, -0.14..."
4,item receive broken return ask replacement shi...,1,"[0.043584187, -0.013412476, -0.116475426, 0.06...","[0.08915458, -0.22801971, -0.028520422, -0.263...","[0.024291992, 0.010803223, -0.107421875, 0.302...","[0.102800496, -0.12086469, -0.14640297, 0.0537..."
...,...,...,...,...,...,...
249995,lock come easily hard clean top,2,"[0.03120931, 0.07987467, 0.03741455, 0.0357869...","[0.015699785, -0.12990652, 0.21889718, -0.1027...","[0.017944336, 0.19335938, -0.06298828, 0.02429...","[0.24208477, -0.24096622, 0.30787, -0.2916415,..."
249996,bum carafe slightly wide bit short metal struc...,1,"[-0.001551011, 0.026309744, -0.06418026, 0.125...","[0.015504825, -0.031771064, 0.1092756, -0.0557...","[0.10546875, -0.20117188, -0.13964844, 0.32226...","[0.14765103, -0.15398727, 0.014575721, -0.1541..."
249997,I kettle one month leak water leak seal bottom...,1,"[0.0027923584, 0.092679344, -0.03684489, 0.028...","[0.020719932, -0.090553395, 0.13070571, -0.027...","[0.07910156, -0.0050354004, 0.111816406, 0.212...","[0.2060052, -0.18501587, -0.0031185225, -0.029..."
249998,idea color balloon entice order package child ...,1,"[0.047094908, 0.011726828, 0.00012925093, 0.09...","[0.066825956, -0.17564225, 0.05628306, -0.0763...","[0.067871094, 0.011657715, 0.033691406, 0.2207...","[0.07186723, -0.11819719, -0.024285497, -0.130..."


In [68]:
# set parameters 

input_size = 3000
hidden_1_size = 50
hidden_2_size = 10
output_size = 3

# Google model

In [69]:
x = df_ternary['concat_input_features_1']
y = df_ternary['class']

# Split the dataset into 80% training dataset and 20% testing dataset

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=100)

In [70]:
## train data
train_data = trainData(torch.FloatTensor(x_train.tolist()), 
                       torch.FloatTensor(y_train))
## test data    
test_data = testData(torch.FloatTensor(x_test.tolist()))

In [71]:
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=1)

In [72]:
# print model

model = ternary_classification()
print(model)

ternary_classification(
  (layer_1): Linear(in_features=3000, out_features=50, bias=True)
  (layer_2): Linear(in_features=50, out_features=10, bias=True)
  (layer_out): Linear(in_features=10, out_features=3, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
  (batchnorm1): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [73]:
# define loss function and optimizer

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [91]:
train_model_ternary()

Epoch 001: | Loss: 0.89409 | Acc: 59.612
Epoch 002: | Loss: 0.86054 | Acc: 61.697
Epoch 003: | Loss: 0.84039 | Acc: 62.864
Epoch 004: | Loss: 0.82043 | Acc: 64.089
Epoch 005: | Loss: 0.80407 | Acc: 64.951
Epoch 006: | Loss: 0.78953 | Acc: 65.698
Epoch 007: | Loss: 0.77770 | Acc: 66.365
Epoch 008: | Loss: 0.76451 | Acc: 67.052
Epoch 009: | Loss: 0.75307 | Acc: 67.719
Epoch 010: | Loss: 0.73976 | Acc: 68.380
Epoch 011: | Loss: 0.73238 | Acc: 68.749
Epoch 012: | Loss: 0.72255 | Acc: 69.247
Epoch 013: | Loss: 0.71186 | Acc: 69.825
Epoch 014: | Loss: 0.70388 | Acc: 70.320
Epoch 015: | Loss: 0.69564 | Acc: 70.564
Epoch 016: | Loss: 0.68788 | Acc: 71.011
Epoch 017: | Loss: 0.68257 | Acc: 71.261
Epoch 018: | Loss: 0.67523 | Acc: 71.591
Epoch 019: | Loss: 0.66889 | Acc: 71.825
Epoch 020: | Loss: 0.66310 | Acc: 72.161
Epoch 021: | Loss: 0.65745 | Acc: 72.427
Epoch 022: | Loss: 0.65151 | Acc: 72.799
Epoch 023: | Loss: 0.64737 | Acc: 72.997
Epoch 024: | Loss: 0.64228 | Acc: 73.146
Epoch 025: | Los

In [75]:
test_model_ternary(y_test)

Accuracy: 57.0


# Our model

In [76]:
x = df_ternary['concat_input_features_2']
y = df_ternary['class']

# Split the dataset into 80% training dataset and 20% testing dataset

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=100)

In [77]:
## train data
train_data = trainData(torch.FloatTensor(x_train.tolist()), 
                       torch.FloatTensor(y_train))
## test data    
test_data = testData(torch.FloatTensor(x_test.tolist()))

In [78]:
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=1)

In [95]:
train_model_ternary()

Epoch 001: | Loss: 0.90813 | Acc: 59.358
Epoch 002: | Loss: 0.84278 | Acc: 62.880
Epoch 003: | Loss: 0.82552 | Acc: 63.622
Epoch 004: | Loss: 0.81350 | Acc: 64.262
Epoch 005: | Loss: 0.80305 | Acc: 64.778
Epoch 006: | Loss: 0.79276 | Acc: 65.254
Epoch 007: | Loss: 0.78232 | Acc: 65.812
Epoch 008: | Loss: 0.77178 | Acc: 66.382
Epoch 009: | Loss: 0.76321 | Acc: 66.742
Epoch 010: | Loss: 0.75476 | Acc: 67.319
Epoch 011: | Loss: 0.74597 | Acc: 67.665
Epoch 012: | Loss: 0.73657 | Acc: 68.159
Epoch 013: | Loss: 0.72833 | Acc: 68.512
Epoch 014: | Loss: 0.72178 | Acc: 68.912
Epoch 015: | Loss: 0.71326 | Acc: 69.347
Epoch 016: | Loss: 0.70599 | Acc: 69.620
Epoch 017: | Loss: 0.69941 | Acc: 70.033
Epoch 018: | Loss: 0.69192 | Acc: 70.278
Epoch 019: | Loss: 0.68419 | Acc: 70.690
Epoch 020: | Loss: 0.67823 | Acc: 71.061
Epoch 021: | Loss: 0.67399 | Acc: 71.223
Epoch 022: | Loss: 0.66797 | Acc: 71.364
Epoch 023: | Loss: 0.66216 | Acc: 71.787
Epoch 024: | Loss: 0.65866 | Acc: 71.964
Epoch 025: | Los

In [80]:
test_model_ternary(y_test)

Accuracy: 59.0


# Comments about this question

In [35]:
d = {'Model': ['FNN', 'FNN', 'FNN', 'FNN'], 
     'Word2Vec Model': ['Google News', 'Amazon Reviews(Our)', 'Google News', 'Amazon Reviews(Our)'],
     'Classification Type': ['Binary', 'Binary', 'Ternary', 'Ternary',],
     'Input Features Type': ['Concat_first_10' , 'Concat_first_10', 'Concat_first_10', 'Concat_first_10'],
     'Accuracy': ['0.73', '0.75', '0.57', '0.59']}

df_results_part_4_b = pd.DataFrame(data=d)
df_results_part_4_b

Unnamed: 0,Model,Word2Vec Model,Classification Type,Input Features Type,Accuracy
0,FNN,Google News,Binary,Concat_first_10,0.73
1,FNN,Amazon Reviews(Our),Binary,Concat_first_10,0.75
2,FNN,Google News,Ternary,Concat_first_10,0.57
3,FNN,Amazon Reviews(Our),Ternary,Concat_first_10,0.59


# Comments

In [38]:
df_results_part_3

Unnamed: 0,Model,Word2Vec Features/Other Features,Accuracy
0,Perceptron,Google News,0.71
1,SVM,Google News,0.82
2,Perceptron,Amazon Reviews(Our),0.81
3,SVM,Amazon Reviews(Our),0.85
4,Perceptron,TF-IDF,0.85
5,SVM,TF-IDF,0.81


In [36]:
df_results_part_4_a

Unnamed: 0,Model,Word2Vec Model,Classification Type,Input Features Type,Accuracy
0,FNN,Google News,Binary,Average,0.85
1,FNN,Amazon Reviews(Our),Binary,Average,0.87
2,FNN,Google News,Ternary,Average,0.68
3,FNN,Amazon Reviews(Our),Ternary,Average,0.71


In [37]:
df_results_part_4_b

Unnamed: 0,Model,Word2Vec Model,Classification Type,Input Features Type,Accuracy
0,FNN,Google News,Binary,Concat_first_10,0.73
1,FNN,Amazon Reviews(Our),Binary,Concat_first_10,0.75
2,FNN,Google News,Ternary,Concat_first_10,0.57
3,FNN,Amazon Reviews(Our),Ternary,Concat_first_10,0.59


It can be seen from the above tables that for binary classification(as mentioned in the question pdf note), the FNN model(input features - Average Word2Vec vectors) works better or comparable(in some cases) than both the Perceptron and the SVM model for Google News/Amazon Reviews(Our)/TF-IDF Word2Vec features. However the FNN model(input features - Concat(first 10) vectors) performs poorly than both the Perceptron and the SVM model for Google News/Amaxon Reviews(Our)/TF-IDF Word2Vec features.
This shows that the average vectors is a better input feature type selection here than concatenating the first 10 vectors. Also the feedforward MLP model is stronger and slightly more accurate here at binary classification if average vectors are considered. This is so since we get a lot of hyperparameter and design paramter tuning flexibility in Neural Network models(epochs,batch_size,learning_rate,activation functions(linear/non-linear:relu),loss,optimizer,etc.) that can help us achieve possibly a higher accuracy.