In [114]:
import numpy as np
import pandas as pd

import re

import warnings
warnings.simplefilter('ignore')

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier,ExtraTreeClassifier
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier,VotingClassifier,BaggingClassifier,ExtraTreesClassifier,GradientBoostingClassifier,BaggingClassifier,ExtraTreesClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

import xgboost as xgb
import lightgbm as lgb


<h3>Importing data

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
game_overview = pd.read_csv('game_overview.csv')

In [3]:
print(train.shape)
print(test.shape)
print(game_overview.shape)

(17494, 5)
(8045, 4)
(64, 5)


In [4]:
train.head(2)

Unnamed: 0,review_id,title,year,user_review,user_suggestion
0,1,Spooky's Jump Scare Mansion,2016.0,I'm scared and hearing creepy voices. So I'll...,1
1,2,Spooky's Jump Scare Mansion,2016.0,"Best game, more better than Sam Pepper's YouTu...",1


In [5]:
game_overview.head(2) 

Unnamed: 0,title,developer,publisher,tags,overview
0,Spooky's Jump Scare Mansion,Lag Studios,Lag Studios,"['Horror', 'Free to Play', 'Cute', 'First-Pers...",Can you survive 1000 rooms of cute terror? Or ...
1,Sakura Clicker,Winged Cloud,Winged Cloud,"['Nudity', 'Anime', 'Free to Play', 'Mature', ...",The latest entry in the Sakura series is more ...


In [6]:
test.head(2)

Unnamed: 0,review_id,title,year,user_review
0,1603,Counter-Strike: Global Offensive,2015.0,"Nice graphics, new maps, weapons and models. B..."
1,1604,Counter-Strike: Global Offensive,2018.0,I would not recommend getting into this at its...


In [7]:
#converting year from float to string
#filling nan values and 2014(by chance)

train['year'] = train['year'].fillna(2014.0).astype(int).astype(str)
test['year'] = test['year'].fillna(2014.0).astype(int).astype(str)

In [8]:
train['title'].nunique(), test['title'].nunique()

(44, 20)

In [9]:
#checking out number of common games in train and test list

game_tr = train['title'].unique().tolist()
game_te = test['title'].unique().tolist()

common = [game for game in game_te if game in game_tr]
print("there are {} common titles between train and test dataset".format(len(common)))

there are 0 common titles between train and test dataset


<h3> Preprocessing developer, publisher and title of game_overview

In [10]:
dev = [e.strip().replace(',','').replace('.','').replace('-','_').replace(' ','_').lower() for e in game_overview['developer'].values]
pub = [e.strip().replace(',','').replace('.','').replace('-','_').replace(' ','_').lower() for e in game_overview['publisher'].values]

game_overview.drop(['developer','publisher'], axis=1, inplace=True)

game_overview[['developer']] = pd.DataFrame(dev)
game_overview[['publisher']] = pd.DataFrame(pub)

In [11]:
def preprocess_title(data):
    preprocessed_title=[]
    for row in data:
        row = re.sub(r"\'s", " is", row)
        row = re.sub('[^A-Za-z0-9]+', ' ', row)
        row = row.replace('&','')
        row = row.replace(',','') 
        row = row.replace(' ','_')
        row = row.replace('-','_')
        row = row.replace('!','_')

        preprocessed_title.append(row.lower().strip())
        
    return preprocessed_title

In [12]:
train['preprocessed_title'] = preprocess_title(train['title'].values)
test['preprocessed_title'] = preprocess_title(test['title'].values)
game_overview['preprocessed_title'] = preprocess_title(game_overview['title'].values)

In [13]:
train.drop(['title'], axis=1, inplace=True)
test.drop(['title'], axis=1, inplace=True)
game_overview.drop(['title'], axis=1, inplace=True)

In [14]:
game_overview.head(2)

Unnamed: 0,tags,overview,developer,publisher,preprocessed_title
0,"['Horror', 'Free to Play', 'Cute', 'First-Pers...",Can you survive 1000 rooms of cute terror? Or ...,lag_studios,lag_studios,spooky_is_jump_scare_mansion
1,"['Nudity', 'Anime', 'Free to Play', 'Mature', ...",The latest entry in the Sakura series is more ...,winged_cloud,winged_cloud,sakura_clicker


<h3> Handling tags

In [15]:
print(game_overview['tags'][0])
print(game_overview['tags'][1])

['Horror', 'Free to Play', 'Cute', 'First-Person', 'Singleplayer', 'Psychological Horror', 'Indie', 'Adventure', 'Dark', 'Funny', 'Atmospheric', 'Action', 'Walking Simulator', 'Survival', 'Survival Horror', 'Anime', 'Gore', 'Comedy', 'Multiplayer', 'Illuminati']
['Nudity', 'Anime', 'Free to Play', 'Mature', 'Sexual Content', 'Clicker', 'Female Protagonist', 'Singleplayer', 'Casual', 'Indie', 'Fantasy', 'NSFW', 'Memes', 'Funny', '2D', 'RPG', 'Story Rich', 'Adventure', 'Dating Sim', 'Illuminati']


In [16]:
#extracting list from the string

from ast import literal_eval

game_overview['tags'] = game_overview['tags'].apply(literal_eval)

In [17]:
#game_overview['tags'].apply(len).value_counts()

In [18]:
#extracting all tags 
final=[]
for l in game_overview['tags']:
    final.extend(l)
tags = list(set(final))
print("no of tags:",len(tags))


no of tags: 161


In [19]:
l=[]
for row in game_overview['tags']:
    b=np.zeros(len(tags))
    for ele in row:
        b[tags.index(ele)] =1 
    l.append(b)

In [20]:
tags_df = pd.DataFrame(l, columns=tags)

In [21]:
tags_df.head(2)

Unnamed: 0,Team-Based,Sports,Local Co-Op,Turn-Based Strategy,MMORPG,Fast-Paced,City Builder,Dungeons & Dragons,Sandbox,Mod,...,Family Friendly,Exploration,Zombies,Dating Sim,Multiplayer,Flight,Masterpiece,Dragons,2D Fighter,NSFW
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [22]:
print(game_overview.shape)
print(tags_df.shape)

(64, 5)
(64, 161)


In [23]:
game_overview = pd.concat([game_overview,tags_df],axis=1)

In [24]:
game_overview['no_of_tags'] = game_overview['tags'].apply(len).tolist()
game_overview.drop('tags', axis=1, inplace=True)

In [25]:
game_overview.head(2)

Unnamed: 0,overview,developer,publisher,preprocessed_title,Team-Based,Sports,Local Co-Op,Turn-Based Strategy,MMORPG,Fast-Paced,...,Exploration,Zombies,Dating Sim,Multiplayer,Flight,Masterpiece,Dragons,2D Fighter,NSFW,no_of_tags
0,Can you survive 1000 rooms of cute terror? Or ...,lag_studios,lag_studios,spooky_is_jump_scare_mansion,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,20
1,The latest entry in the Sakura series is more ...,winged_cloud,winged_cloud,sakura_clicker,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,20


In [26]:
train.head(2)

Unnamed: 0,review_id,year,user_review,user_suggestion,preprocessed_title
0,1,2016,I'm scared and hearing creepy voices. So I'll...,1,spooky_is_jump_scare_mansion
1,2,2016,"Best game, more better than Sam Pepper's YouTu...",1,spooky_is_jump_scare_mansion


In [27]:
data_tr = train.merge(game_overview, on ='preprocessed_title')
data_te = test.merge(game_overview, on ='preprocessed_title')
print(data_tr.shape)
print(data_te.shape)
#data.head(2)

(17494, 170)
(8045, 169)


<h3> Pre-processing

In [28]:
import re

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [29]:
# https://gist.github.com/sebleier/554280
# we are removing the words from the stop words list: 'no', 'nor', 'not'
stopwords= ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't"]

<h5> train data

In [30]:
from tqdm import tqdm
preprocessed_reviews = []
# tqdm is for printing the status bar
for sentance in tqdm(data_tr['user_review'].values):
    sent = decontracted(sentance)
    sent = sent.replace('\\r', ' ')
    sent = sent.replace('\\"', ' ')
    sent = sent.replace('\\n', ' ')
    sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
    # https://gist.github.com/sebleier/554280
    sent = ' '.join(e for e in sent.split() if e.lower() not in stopwords)
    preprocessed_reviews.append(sent.lower().strip())

data_tr['preprocessed_reviews'] = preprocessed_reviews
data_tr.drop(['user_review'],axis=1,inplace=True ) 

100%|██████████| 17494/17494 [00:12<00:00, 1374.48it/s]


In [31]:
from tqdm import tqdm
preprocessed_overview = []
# tqdm is for printing the status bar
for sentance in tqdm(data_tr['overview'].values):
    sent = decontracted(sentance)
    sent = sent.replace('\\r', ' ')
    sent = sent.replace('\\"', ' ')
    sent = sent.replace('\\n', ' ') 
    sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
    # https://gist.github.com/sebleier/554280
    sent = ' '.join(e for e in sent.split() if e.lower() not in stopwords)
    preprocessed_overview.append(sent.lower().strip())
    
    
data_tr['preprocessed_overview'] = preprocessed_overview
data_tr.drop(['overview'],axis=1,inplace=True ) 

100%|██████████| 17494/17494 [00:18<00:00, 968.39it/s] 


In [32]:
#finding #words in preprocessed_reviews
l=[]
for sent in preprocessed_reviews:
    count=0
    for word in sent:
        count+=1
    l.append(count)
    
data_tr[['#words_in_review']] = pd.DataFrame(l)

In [33]:
#finding #words in preprocessed_overviews
l=[]
for sent in preprocessed_overview:
    count=0
    for word in sent:
        count+=1
    l.append(count)
    
data_tr[['#words_in_overview']] = pd.DataFrame(l)

<h4>test data

In [34]:
from tqdm import tqdm
preprocessed_reviews = []
# tqdm is for printing the status bar
for sentance in tqdm(data_te['user_review'].values):
    sent = decontracted(sentance)
    sent = sent.replace('\\r', ' ')
    sent = sent.replace('\\"', ' ')
    sent = sent.replace('\\n', ' ')
    sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
    # https://gist.github.com/sebleier/554280
    sent = ' '.join(e for e in sent.split() if e.lower() not in stopwords)
    preprocessed_reviews.append(sent.lower().strip())

data_te['preprocessed_reviews'] = preprocessed_reviews
data_te.drop(['user_review'],axis=1,inplace=True ) 

100%|██████████| 8045/8045 [00:03<00:00, 2349.98it/s]


In [35]:
from tqdm import tqdm
preprocessed_overview = []
# tqdm is for printing the status bar
for sentance in tqdm(data_te['overview'].values):
    sent = decontracted(sentance)
    sent = sent.replace('\\r', ' ')
    sent = sent.replace('\\"', ' ')
    sent = sent.replace('\\n', ' ') 
    sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
    # https://gist.github.com/sebleier/554280
    sent = ' '.join(e for e in sent.split() if e.lower() not in stopwords)
    preprocessed_overview.append(sent.lower().strip())
    
    
data_te['preprocessed_overview'] = preprocessed_overview
data_te.drop(['overview'],axis=1,inplace=True ) 

100%|██████████| 8045/8045 [00:06<00:00, 1296.90it/s]


In [36]:
#finding #words in preprocessed_reviews
l=[]
for sent in preprocessed_reviews:
    count=0
    for word in sent:
        count+=1
    l.append(count)
    
data_te[['#words_in_review']] = pd.DataFrame(l)

In [37]:
#finding #words in preprocessed_overviews
l=[]
for sent in preprocessed_overview:
    count=0
    for word in sent:
        count+=1
    l.append(count)
    
data_te[['#words_in_overview']] = pd.DataFrame(l)

In [38]:
#data_tr.head(2)

<h3>SentimentIntensityAnalyzer

In [39]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [40]:
nltk.downloader.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/namitagarwal/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [41]:
#citation: https://www.programcreek.com/python/example/100005/nltk.sentiment.vader.SentimentIntensityAnalyzer

sentiments=[]
sid = SentimentIntensityAnalyzer()

for review in tqdm(data_tr['preprocessed_reviews']):
    sentiment = sid.polarity_scores(review)
        
    for k in sorted(sentiment):
        sentiments.append([sentiment['neg'], sentiment['pos'],
                           sentiment['neu'], sentiment['compound']]) 
        
data_tr[['neg', 'pos', 'neu', 'compound']] = pd.DataFrame(sentiments)

100%|██████████| 17494/17494 [00:22<00:00, 770.07it/s] 


In [42]:
sentiments=[]
sid = SentimentIntensityAnalyzer()

for review in tqdm(data_te['preprocessed_reviews']):
    sentiment = sid.polarity_scores(review)
        
    for k in sorted(sentiment):
        sentiments.append([sentiment['neg'], sentiment['pos'],
                           sentiment['neu'], sentiment['compound']]) 
        
data_te[['neg', 'pos', 'neu', 'compound']] = pd.DataFrame(sentiments)

100%|██████████| 8045/8045 [00:10<00:00, 773.91it/s]


In [43]:
data_tr.head(2)

Unnamed: 0,review_id,year,user_suggestion,preprocessed_title,developer,publisher,Team-Based,Sports,Local Co-Op,Turn-Based Strategy,...,NSFW,no_of_tags,preprocessed_reviews,preprocessed_overview,#words_in_review,#words_in_overview,neg,pos,neu,compound
0,1,2016,1,spooky_is_jump_scare_mansion,lag_studios,lag_studios,0.0,0.0,0.0,0.0,...,0.0,20,scared hearing creepy voices pause moment writ...,survive 1000 rooms cute terror break cuteness ...,428,251,0.173,0.196,0.631,0.2516
1,2,2016,1,spooky_is_jump_scare_mansion,lag_studios,lag_studios,0.0,0.0,0.0,0.0,...,0.0,20,best game better sam pepper youtube account 10...,survive 1000 rooms cute terror break cuteness ...,242,251,0.173,0.196,0.631,0.2516


In [44]:
print(data_tr.shape)
print(data_te.shape)

(17494, 176)
(8045, 175)


<h3> Encoding Numerical
    

In [45]:
from sklearn.preprocessing import Normalizer

In [104]:
#neg
scalar = Normalizer()
scalar.fit(data_tr['neg'].values.reshape(1,-1))

neg_train = scalar.transform(data_tr['neg'].values.reshape(1,-1)).reshape(-1,1)
neg_test = scalar.transform(data_te['neg'].values.reshape(1,-1)).reshape(-1,1)

print(neg_train.shape)
print(neg_test.shape)

(17494, 1)
(8045, 1)


In [105]:
#pos
scalar = Normalizer()
scalar.fit(data_tr['pos'].values.reshape(1,-1))

pos_train = scalar.transform(data_tr['pos'].values.reshape(1,-1)).reshape(-1,1)
pos_test = scalar.transform(data_te['pos'].values.reshape(1,-1)).reshape(-1,1)

print(pos_train.shape)
print(pos_test.shape)

(17494, 1)
(8045, 1)


In [106]:
#neu
scalar = Normalizer()
scalar.fit(data_tr['neu'].values.reshape(1,-1))

neu_train = scalar.transform(data_tr['neu'].values.reshape(1,-1)).reshape(-1,1)
neu_test = scalar.transform(data_te['neu'].values.reshape(1,-1)).reshape(-1,1)

print(neu_train.shape)
print(neu_test.shape)

(17494, 1)
(8045, 1)


In [107]:
#compound 
scalar = Normalizer()
scalar.fit(data_tr['compound'].values.reshape(1,-1))

compound_train = scalar.transform(data_tr['compound'].values.reshape(1,-1)).reshape(-1,1)
compound_test = scalar.transform(data_te['compound'].values.reshape(1,-1)).reshape(-1,1)

print(compound_train.shape)
print(compound_test.shape)

(17494, 1)
(8045, 1)


In [108]:
#no_of_tags
scalar = Normalizer()
scalar.fit(data_tr['no_of_tags'].values.reshape(1,-1))

no_of_tags_train = scalar.transform(data_tr['no_of_tags'].values.reshape(1,-1)).reshape(-1,1)
no_of_tags_test = scalar.transform(data_te['no_of_tags'].values.reshape(1,-1)).reshape(-1,1)

print(no_of_tags_train.shape)
print(no_of_tags_test.shape)

(17494, 1)
(8045, 1)


In [109]:
#words_in_review
scalar = Normalizer()
scalar.fit(data_tr['#words_in_review'].values.reshape(1,-1))

words_in_review_train = scalar.transform(data_tr['#words_in_review'].values.reshape(1,-1)).reshape(-1,1)
words_in_review_test = scalar.transform(data_te['#words_in_review'].values.reshape(1,-1)).reshape(-1,1)

print(words_in_review_train.shape)
print(words_in_review_test.shape)

(17494, 1)
(8045, 1)


In [110]:
#words_in_overview
scalar = Normalizer()
scalar.fit(data_tr['#words_in_overview'].values.reshape(1,-1))

words_in_overview_train = scalar.transform(data_tr['#words_in_overview'].values.reshape(1,-1)).reshape(-1,1)
words_in_overview_test = scalar.transform(data_te['#words_in_overview'].values.reshape(1,-1)).reshape(-1,1)

print(words_in_overview_train.shape)
print(words_in_overview_test.shape)

(17494, 1)
(8045, 1)


<h3> Encoding categorical

In [53]:
from sklearn.feature_extraction.text import CountVectorizer

In [54]:
print(data_tr.shape)
print(data_te.shape)

(17494, 176)
(8045, 175)


In [55]:
Y = data_tr['user_suggestion']
data_tr.drop('user_suggestion',axis=1,inplace=True)

In [56]:
data = pd.concat([data_tr,data_te], axis=0)
data.shape

(25539, 175)

In [57]:
data.head(2)

Unnamed: 0,review_id,year,preprocessed_title,developer,publisher,Team-Based,Sports,Local Co-Op,Turn-Based Strategy,MMORPG,...,NSFW,no_of_tags,preprocessed_reviews,preprocessed_overview,#words_in_review,#words_in_overview,neg,pos,neu,compound
0,1,2016,spooky_is_jump_scare_mansion,lag_studios,lag_studios,0.0,0.0,0.0,0.0,0.0,...,0.0,20,scared hearing creepy voices pause moment writ...,survive 1000 rooms cute terror break cuteness ...,428,251,0.173,0.196,0.631,0.2516
1,2,2016,spooky_is_jump_scare_mansion,lag_studios,lag_studios,0.0,0.0,0.0,0.0,0.0,...,0.0,20,best game better sam pepper youtube account 10...,survive 1000 rooms cute terror break cuteness ...,242,251,0.173,0.196,0.631,0.2516


In [58]:
# we use count vectorizer to convert the values into one hot encoded features
vectorizer = CountVectorizer()
vectorizer.fit(data['developer'].values)

developer_one_hot_train = vectorizer.transform(data_tr['developer'].values)
developer_one_hot_test = vectorizer.transform(data_te['developer'].values)

print(developer_one_hot_train.shape)
print(developer_one_hot_test.shape)

(17494, 59)
(8045, 59)


In [59]:
vectorizer = CountVectorizer()
vectorizer.fit(data['publisher'].values)

publisher_one_hot_train = vectorizer.transform(data_tr['publisher'].values)
publisher_one_hot_test = vectorizer.transform(data_te['publisher'].values)

print(publisher_one_hot_train.shape)
print(publisher_one_hot_test.shape) 

(17494, 54)
(8045, 54)


In [60]:
vectorizer = CountVectorizer()
vectorizer.fit(data['preprocessed_title'].values)

title_one_hot_train = vectorizer.transform(data_tr['preprocessed_title'].values)
title_one_hot_test = vectorizer.transform(data_te['preprocessed_title'].values)

print(title_one_hot_train.shape)
print(title_one_hot_test.shape) 

(17494, 64)
(8045, 64)


In [61]:
vectorizer = CountVectorizer()
vectorizer.fit(data['year'].values)

year_one_hot_train = vectorizer.transform(data_tr['year'].values)
year_one_hot_test = vectorizer.transform(data_te['year'].values)

print(year_one_hot_train.shape)
print(year_one_hot_test.shape)

(17494, 8)
(8045, 8)


<h3> Encoding text features

<h4>bow

In [225]:
#tf-idf preprocessed_reviews
#from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = CountVectorizer(min_df=10,ngram_range=(1,2), max_features=5000)
vectorizer.fit(data_tr['preprocessed_reviews']) #fit

reviews_bow_train = vectorizer.transform(data_tr['preprocessed_reviews']) #transform
reviews_bow_test = vectorizer.transform(data_te['preprocessed_reviews']) #transform


print("Shape of matrix after one hot encodig ",reviews_bow_train.shape)
print("Shape of matrix after one hot encodig ",reviews_bow_test.shape)



Shape of matrix after one hot encodig  (17494, 5000)
Shape of matrix after one hot encodig  (8045, 5000)


In [226]:
#tf-idf overview
vectorizer = CountVectorizer(min_df=10,ngram_range=(1,2), max_features=5000)
vectorizer.fit(data_tr['preprocessed_overview']) #fit

overview_bow_train = vectorizer.transform(data_tr['preprocessed_overview']) #transform
overview_bow_test = vectorizer.transform(data_te['preprocessed_overview']) #transform


print("Shape of matrix after one hot encodig ",overview_bow_train.shape)
print("Shape of matrix after one hot encodig ",overview_bow_test.shape)

Shape of matrix after one hot encodig  (17494, 5000)
Shape of matrix after one hot encodig  (8045, 5000)


<h4>tfidf

In [82]:
#tf-idf preprocessed_reviews
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=10,ngram_range=(1,2), max_features=5000)
vectorizer.fit(data_tr['preprocessed_reviews']) #fit

reviews_tfidf_train = vectorizer.transform(data_tr['preprocessed_reviews']) #transform
reviews_tfidf_test = vectorizer.transform(data_te['preprocessed_reviews']) #transform


print("Shape of matrix after one hot encodig ",reviews_tfidf_train.shape)
print("Shape of matrix after one hot encodig ",reviews_tfidf_test.shape)


Shape of matrix after one hot encodig  (17494, 5000)
Shape of matrix after one hot encodig  (8045, 5000)


In [83]:
#tf-idf overview
vectorizer = TfidfVectorizer(min_df=10,ngram_range=(1,2), max_features=5000)
vectorizer.fit(data_tr['preprocessed_overview']) #fit

overview_tfidf_train = vectorizer.transform(data_tr['preprocessed_overview']) #transform
overview_tfidf_test = vectorizer.transform(data_te['preprocessed_overview']) #transform


print("Shape of matrix after one hot encodig ",overview_tfidf_train.shape)
print("Shape of matrix after one hot encodig ",overview_tfidf_test.shape)

Shape of matrix after one hot encodig  (17494, 5000)
Shape of matrix after one hot encodig  (8045, 5000)


<h4> w2v

In [177]:
#loading pre-trained glove vectors
import os

embeddings_index = {}
f = open(os.path.join('glove.6B.300d.txt'), encoding='utf8')
for line in tqdm(f):
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs

400000it [00:38, 10298.31it/s]


In [181]:
glove_words = embeddings_index.keys()

In [186]:
# average Word2Vec for train(essay)
# compute average word2vec for each review.
avg_w2v_vectors_review_train = []; # the avg-w2v for each sentence/review is stored in this list
for sentence in tqdm(data_tr['preprocessed_reviews']): # for each review/sentence
    vector = np.zeros(300) # as word vectors are of zero length
    cnt_words =0; # num of words with a valid vector in the sentence/review
    for word in sentence.split(): # for each word in a review/sentence
        if word in glove_words:
            vector += embeddings_index[word]
            cnt_words += 1
    if cnt_words != 0:
        vector /= cnt_words #calculatig (w2v(word_i))/(number of words in that sentence)
    avg_w2v_vectors_review_train.append(vector) #stores the w2v for all sentences/reviews in the entire dataset

print(len(avg_w2v_vectors_review_train))
print(len(avg_w2v_vectors_review_train[0]))

avg_w2v_vectors_review_train = np.array(avg_w2v_vectors_review_train)
print(avg_w2v_vectors_review_train.shape)

100%|██████████| 17494/17494 [00:05<00:00, 3314.10it/s]

17494
300
(17494, 300)





In [187]:
# average Word2Vec for train(essay)
# compute average word2vec for each review.
avg_w2v_vectors_overview_train = []; # the avg-w2v for each sentence/review is stored in this list
for sentence in tqdm(data_tr['preprocessed_overview']): # for each review/sentence
    vector = np.zeros(300) # as word vectors are of zero length
    cnt_words =0; # num of words with a valid vector in the sentence/review
    for word in sentence.split(): # for each word in a review/sentence
        if word in glove_words:
            vector += embeddings_index[word]
            cnt_words += 1
    if cnt_words != 0:
        vector /= cnt_words #calculatig (w2v(word_i))/(number of words in that sentence)
    avg_w2v_vectors_overview_train.append(vector) #stores the w2v for all sentences/reviews in the entire dataset

print(len(avg_w2v_vectors_overview_train))
print(len(avg_w2v_vectors_overview_train[0]))

avg_w2v_vectors_overview_train = np.array(avg_w2v_vectors_overview_train)
print(avg_w2v_vectors_overview_train.shape)

100%|██████████| 17494/17494 [00:09<00:00, 1803.10it/s]


17494
300
(17494, 300)


In [188]:
# average Word2Vec for test(essay)
# compute average word2vec for each review.
avg_w2v_vectors_review_test = []; # the avg-w2v for each sentence/review is stored in this list
for sentence in tqdm(data_te['preprocessed_reviews']): # for each review/sentence
    vector = np.zeros(300) # as word vectors are of zero length
    cnt_words =0; # num of words with a valid vector in the sentence/review
    for word in sentence.split(): # for each word in a review/sentence
        if word in glove_words:
            vector += embeddings_index[word]
            cnt_words += 1
    if cnt_words != 0:
        vector /= cnt_words #calculatig (w2v(word_i))/(number of words in that sentence)
    avg_w2v_vectors_review_test.append(vector) #stores the w2v for all sentences/reviews in the entire dataset

print(len(avg_w2v_vectors_review_test))
print(len(avg_w2v_vectors_review_test[0]))

avg_w2v_vectors_review_test = np.array(avg_w2v_vectors_review_test)
print(avg_w2v_vectors_review_test.shape)

100%|██████████| 8045/8045 [00:02<00:00, 3640.82it/s]

8045
300
(8045, 300)





In [189]:
# average Word2Vec for test(essay)
# compute average word2vec for each review.
avg_w2v_vectors_overview_test = []; # the avg-w2v for each sentence/review is stored in this list
for sentence in tqdm(data_te['preprocessed_overview']): # for each review/sentence
    vector = np.zeros(300) # as word vectors are of zero length
    cnt_words =0; # num of words with a valid vector in the sentence/review
    for word in sentence.split(): # for each word in a review/sentence
        if word in glove_words:
            vector += embeddings_index[word]
            cnt_words += 1
    if cnt_words != 0:
        vector /= cnt_words #calculatig (w2v(word_i))/(number of words in that sentence)
    avg_w2v_vectors_overview_test.append(vector) #stores the w2v for all sentences/reviews in the entire dataset

print(len(avg_w2v_vectors_overview_test))
print(len(avg_w2v_vectors_overview_test[0]))

avg_w2v_vectors_overview_test = np.array(avg_w2v_vectors_overview_test)
print(avg_w2v_vectors_overview_test.shape)

100%|██████████| 8045/8045 [00:04<00:00, 1609.28it/s]

8045
300
(8045, 300)





<h3> tfidf-w2v

In [209]:
#tfidf-w2v ESSAY
tfidf_model = TfidfVectorizer()
tfidf_model.fit(data_tr['preprocessed_overview'])
# we are converting a dictionary with word as a key, and the idf as a value
dictionary = dict(zip(tfidf_model.get_feature_names(), list(tfidf_model.idf_)))
tfidf_words = set(tfidf_model.get_feature_names())

In [210]:
tfidf_w2v_vectors_overview_train = [] # the avg-w2v for each sentence/review is stored in this list
for sentence in tqdm(data_tr['preprocessed_overview']): # for each review/sentence
    vector = np.zeros(300) # as word vectors are of zero length
    tf_idf_weight =0; # num of words with a valid vector in the sentence/review
    for word in sentence.split(): # for each word in a review/sentence
        if (word in glove_words) and (word in tfidf_words):
            vec = embeddings_index[word] # getting the vector for each word
            # here we are multiplying idf value(dictionary[word]) and the tf value((sentence.count(word)/len(sentence.split())))
            tf_idf = dictionary[word]*(sentence.count(word)/len(sentence.split())) # getting the tfidf value for each word
            vector += (vec * tf_idf) # calculating tfidf weighted w2v
            tf_idf_weight += tf_idf
    if tf_idf_weight != 0:
        vector /= tf_idf_weight
    tfidf_w2v_vectors_overview_train.append(vector)

print(len(tfidf_w2v_vectors_overview_train))
print(len(tfidf_w2v_vectors_overview_train[0]))

tfidf_w2v_vectors_overview_train = np.array(tfidf_w2v_vectors_overview_train)
print(tfidf_w2v_vectors_overview_train.shape)


100%|██████████| 17494/17494 [01:39<00:00, 175.91it/s]

17494
300
(17494, 300)





In [211]:
tfidf_w2v_vectors_overview_test = [] # the avg-w2v for each sentence/review is stored in this list
for sentence in tqdm(data_te['preprocessed_overview']): # for each review/sentence
    vector = np.zeros(300) # as word vectors are of zero length
    tf_idf_weight =0; # num of words with a valid vector in the sentence/review
    for word in sentence.split(): # for each word in a review/sentence
        if (word in glove_words) and (word in tfidf_words):
            vec = embeddings_index[word] # getting the vector for each word
            # here we are multiplying idf value(dictionary[word]) and the tf value((sentence.count(word)/len(sentence.split())))
            tf_idf = dictionary[word]*(sentence.count(word)/len(sentence.split())) # getting the tfidf value for each word
            vector += (vec * tf_idf) # calculating tfidf weighted w2v
            tf_idf_weight += tf_idf
    if tf_idf_weight != 0:
        vector /= tf_idf_weight
    tfidf_w2v_vectors_overview_test.append(vector)

print(len(tfidf_w2v_vectors_overview_test))
print(len(tfidf_w2v_vectors_overview_test[0]))

tfidf_w2v_vectors_overview_test = np.array(tfidf_w2v_vectors_overview_test)
print(tfidf_w2v_vectors_overview_test.shape)



100%|██████████| 8045/8045 [00:32<00:00, 249.44it/s]

8045
300
(8045, 300)





In [212]:
#tfidf-w2v ESSAY
tfidf_model = TfidfVectorizer()
tfidf_model.fit(data_tr['preprocessed_reviews'])
# we are converting a dictionary with word as a key, and the idf as a value
dictionary = dict(zip(tfidf_model.get_feature_names(), list(tfidf_model.idf_)))
tfidf_words = set(tfidf_model.get_feature_names())

In [213]:
tfidf_w2v_vectors_review_train = [] # the avg-w2v for each sentence/review is stored in this list
for sentence in tqdm(data_tr['preprocessed_reviews']): # for each review/sentence
    vector = np.zeros(300) # as word vectors are of zero length
    tf_idf_weight =0; # num of words with a valid vector in the sentence/review
    for word in sentence.split(): # for each word in a review/sentence
        if (word in glove_words) and (word in tfidf_words):
            vec = embeddings_index[word] # getting the vector for each word
            # here we are multiplying idf value(dictionary[word]) and the tf value((sentence.count(word)/len(sentence.split())))
            tf_idf = dictionary[word]*(sentence.count(word)/len(sentence.split())) # getting the tfidf value for each word
            vector += (vec * tf_idf) # calculating tfidf weighted w2v
            tf_idf_weight += tf_idf
    if tf_idf_weight != 0:
        vector /= tf_idf_weight
    tfidf_w2v_vectors_review_train.append(vector)

print(len(tfidf_w2v_vectors_review_train))
print(len(tfidf_w2v_vectors_review_train[0]))

tfidf_w2v_vectors_review_train = np.array(tfidf_w2v_vectors_review_train)
print(tfidf_w2v_vectors_review_train.shape)



100%|██████████| 17494/17494 [00:34<00:00, 501.01it/s]

17494
300
(17494, 300)





In [214]:
tfidf_w2v_vectors_review_test = [] # the avg-w2v for each sentence/review is stored in this list
for sentence in tqdm(data_te['preprocessed_reviews']): # for each review/sentence
    vector = np.zeros(300) # as word vectors are of zero length
    tf_idf_weight =0; # num of words with a valid vector in the sentence/review
    for word in sentence.split(): # for each word in a review/sentence
        if (word in glove_words) and (word in tfidf_words):
            vec = embeddings_index[word] # getting the vector for each word
            # here we are multiplying idf value(dictionary[word]) and the tf value((sentence.count(word)/len(sentence.split())))
            tf_idf = dictionary[word]*(sentence.count(word)/len(sentence.split())) # getting the tfidf value for each word
            vector += (vec * tf_idf) # calculating tfidf weighted w2v
            tf_idf_weight += tf_idf
    if tf_idf_weight != 0:
        vector /= tf_idf_weight
    tfidf_w2v_vectors_review_test.append(vector)

print(len(tfidf_w2v_vectors_review_test))
print(len(tfidf_w2v_vectors_review_test[0]))

tfidf_w2v_vectors_review_test = np.array(tfidf_w2v_vectors_review_test)
print(tfidf_w2v_vectors_review_test.shape)

100%|██████████| 8045/8045 [00:16<00:00, 491.59it/s]

8045
300
(8045, 300)





<h3> Combining encoded data

In [85]:
from scipy.sparse import hstack

In [99]:
#extracting the tag columns to add in the final df
df_tr = data_tr[tags]
df_te = data_te[tags]

In [236]:
final_train = hstack((df_tr,developer_one_hot_train,publisher_one_hot_train,title_one_hot_train,year_one_hot_train,neg_train,pos_train, neu_train,compound_train,no_of_tags_train,words_in_review_train,words_in_overview_train,reviews_tfidf_train,overview_tfidf_train))
final_test = hstack((df_te,developer_one_hot_test,publisher_one_hot_test,title_one_hot_test,year_one_hot_test,neg_test,pos_test, neu_test,compound_test,no_of_tags_test,words_in_review_test,words_in_overview_test,reviews_tfidf_test,overview_tfidf_test))

print(final_train.shape)
print(final_test.shape)


(17494, 10353)
(8045, 10353)


In [237]:
print(Y.shape)

(17494,)


<h3>Modeling

In [228]:
def baseliner(train, y, cv=3, metric='accuracy'):
    """
    Function for baselining Models which return CV Score, Train Score, Valid Score
    """
    print("Baseliner Models\n")
    eval_dict = {}
    models = [lgb.LGBMClassifier(), xgb.XGBClassifier(),  GradientBoostingClassifier(), LogisticRegression(), 
              RandomForestClassifier(), DecisionTreeClassifier(), AdaBoostClassifier(),ExtraTreeClassifier(),ExtraTreesClassifier(),
              KNeighborsClassifier(),BaggingClassifier()
             ]
    print("Model Name \t |   CV")
    print("--" * 50)

    for index, model in enumerate(models, 0):
        model_name = str(model).split("(")[0]
        eval_dict[model_name] = {}

        results = cross_val_score(model, train, y, cv=cv, scoring=metric)
        eval_dict[model_name]['cv'] = results.mean()

        print("%s \t | %.4f \t" % (
            model_name[:12], eval_dict[model_name]['cv']))

In [229]:
baseliner(final_train, Y)

Baseliner Models

Model Name 	 |   CV
----------------------------------------------------------------------------------------------------
LGBMClassifi 	 | 0.6009 	
XGBClassifie 	 | 0.5358 	
GradientBoos 	 | 0.5241 	
LogisticRegr 	 | 0.6429 	
RandomForest 	 | 0.5043 	
DecisionTree 	 | 0.4787 	
AdaBoostClas 	 | 0.5141 	
ExtraTreeCla 	 | 0.4387 	
ExtraTreesCl 	 | 0.5078 	
KNeighborsCl 	 | 0.4819 	
BaggingClass 	 | 0.5009 	


In [301]:
model = LogisticRegression(C=100, class_weight=None,
                                                dual=False, fit_intercept=True,
                                                intercept_scaling=1,
                                                l1_ratio=None, max_iter=200,
                                                multi_class='warn', n_jobs=None,
                                                penalty='l2', random_state=0,
                                                solver='saga', tol=0.01,
                                                verbose=0, warm_start=False)
model.fit(final_train,Y) 

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=200,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='saga', tol=0.01, verbose=0,
                   warm_start=False)

In [302]:
#clf

In [303]:
predictions = model.predict(final_test) 

In [304]:
predictions

array([1, 0, 0, ..., 0, 0, 1])

<h3> Submission

In [305]:
ids= data_te['review_id'].tolist()

In [306]:
sub = pd.DataFrame()
sub['review_id'] = ids
sub['user_suggestion'] = predictions

In [307]:
sub.to_csv("lr_l2_01.csv",index=False)

In [279]:
sub

Unnamed: 0,review_id,user_suggestion
0,1603,0
1,1604,0
2,1605,0
3,1606,0
4,1607,1
...,...,...
8040,25198,1
8041,25199,0
8042,25200,0
8043,25201,1


<h3> Accuracy of 84.7% was received after submission of the file