# Create Data Set by TF-IDF model

First, let's define a function to preprocess frases using stemming and removing stopwords

In [1]:
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
porter_stemmer = PorterStemmer()
en_stops = set(stopwords.words('english'))

#binary search to improve efficency
def binary_search(e,l,inizio,fine):
  if inizio>fine or e<l[inizio] or e>l[fine-1]:
    return False
  else:
    mezzo=int((inizio+fine)/2)
    m=l[mezzo]
    if e==m:
      return True
    elif e>m:
      return binary_search(e,l,mezzo,fine)
    else:
      return binary_search(e,l,inizio,mezzo)

#removing symbols and adding space after them
def prepreprocess(frase):
  s=''
  syms=['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','è','é','à','ò','ì','ù']
  for char in frase:
    if char not in syms:
      s+=' '
    else:
      s+=char
  return s

#preprocess using prepreprocess fase and stemming and removing stopwords
def preprocess(frase,en_words):
  l=[]
  for word in prepreprocess(frase.lower()).split(' '):
    w=word.lower()
    if w not in en_stops and len(w)>2 and binary_search(w,en_words,0,len(en_words)):
        l.append(porter_stemmer.stem(w))
  return repr(' '.join(l))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Let's load now the data frame, saved as a csv file.

In [2]:
import pandas as pd

df=pd.read_csv('dataset/reviews_Video_Games_5.csv')

Now we can preprocess the whole data set using previous function. Preprocess is calculated on the concatenation of summary and review text and result is added in a new column ('text').

In [0]:
import csv

# en_words=sorted(open('/content/drive/My Drive/uni/ml/progetto/en_words.txt', 'r').read().lower().split('\n'))

# df['text']=df['summary']+' '+df['reviewText']
# df['text']=df['text'].apply(lambda x:preprocess(str(x.lower()),en_words))

en_words=sorted(open('/content/drive/My Drive/uni/ml/progetto/en_words.txt', 'r').read().lower().split('\n'))
reader = csv.reader(open('/content/drive/My Drive/uni/ml/progetto/reviews_Video_Games_5.csv','rt'))

new_df=[]
tags=next(reader)
for row in reader:
  row[9]=preprocess(str(row[6]+' '+row[4]),en_words)
  new_df.append(row)

df=pd.DataFrame(new_df, columns=tags)

This is only for the first time. The result of previous code is a new data frame that contains a new field where there is the result of preprocessing. In order to don't repeat it another time (preprocess is too slow) we saved it on a new csv file from which we can start the rest of the computation.

In [0]:
df.to_csv('/content/drive/My Drive/uni/ml/progetto/reviews_Video_Games_5.csv', index=False)

We now need to create the vocabulary and start the counting process. We can use the CountVectorizer to create a vocabulary from all the text in our df['text'] followed by the counts of words in the vocabulary.

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
import re

def get_stop_words(stop_file_path):
  """load stop words """    
  with open(stop_file_path, 'r', encoding="utf-8") as f:
    stopwords = f.readlines()
    stop_set = set(m.strip() for m in stopwords)
  return frozenset(stop_set)

#load a set of stop words
stopwords=get_stop_words('/content/drive/My Drive/uni/ml/progetto/stopwords.txt')

#get the text column 
docs=df['text'].tolist()

#create a vocabulary of words, 
#ignore words that appear in 99,999999% of documents, 
#eliminate stop words
cv=CountVectorizer(stop_words=stopwords)
word_count_vector=cv.fit_transform(docs)

Its now time to compute the IDF values.

In [0]:
# from sklearn.feature_extraction.text import TfidfVectorizer

# tfidf = TfidfVectorizer()
# features = tfidf.fit_transform(df['text'].tolist())

In [0]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

We are now ready to compute TF-IDF and then extract top keywords from the TF-IDF vectors. First, let's separate data set from test set to extract top keyword

In [0]:
import csv
import pandas as pd
import random

ds=[]
ts=[]

reader = csv.reader(open('/content/drive/My Drive/uni/ml/progetto/reviews_Video_Games_5.csv','rt'))

tags=next(reader)
for row in reader:
  if random.randint(1,40)>1:
    ds.append(row)
  else:
    ts.append(row)
    
dataset=pd.DataFrame(ds, columns=tags)
testset=pd.DataFrame(ts, columns=tags)

# get test docs into a list
docs_test=testset['text'].tolist()

The next step is to compute the tf-idf value for a given document in our test set that generates a vector of tf-idf scores. Next, we sort the words in the vector in descending order of tf-idf values and then iterate over to extract the top-n keywords.We are extracting keywords for the first document in our test set.

In [0]:
def sort_coo(coo_matrix):
  tuples = zip(coo_matrix.col, coo_matrix.data)
  return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
  """get the feature names and tf-idf score of top n items"""
  #use only topn items from vector
  sorted_items = sorted_items[:topn] 
  score_vals = []
  feature_vals = []
  # word index and corresponding tf-idf score
  for idx, score in sorted_items:
    #keep track of feature name and its corresponding score
    score_vals.append(round(score, 3))
    feature_vals.append(feature_names[idx])
  #create a tuples of feature,score
  #results = zip(feature_vals,score_vals)
  results= {}
  for idx in range(len(feature_vals)):
    results[feature_vals[idx]]=score_vals[idx]    
  return results

In [0]:
# dizionario={}
# for e in cv.get_feature_names():
#   dizionario[e]=0.0

Now we are ready to compute tf-idf values for words in each row. We create a list which each element is a dictionary that contains the word as a key and the values is the tf-idf value of that word. Each dictionary is reffered to a single review.

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

feature_names=cv.get_feature_names()

i=0
dicts=[]
for doc in df['text'].tolist():
  if(i%10000==0):
    print(i)
  #generate tf-idf for the given document
  tf_idf_vector=tfidf_transformer.transform(cv.transform([doc])) 
  #sort the tf-idf vectors by descending order of scores
  sorted_items=sort_coo(tf_idf_vector.tocoo()) 
  #extract all items from document
  keywords=extract_topn_from_vector(feature_names,sorted_items,len(doc))
  d={}
  for e in cv.get_feature_names():
    if e in keywords:
      d[e]=keywords[e]
  dicts.append(d)
  i+=1

In [0]:
len(cv.get_feature_names())

35681

In order to don't create a data frame with 35681 attributes, we take only the top 100 words most used in all reviews. 

In [0]:
def min_list(l):
  m=l[0]
  for e in l:
    if e<m:
      m=e
  return m

def remove_value(d,v):
  keys=d.keys()
  for e in keys:
    if d[e]==v:
      del d[e]
      break
      
keycount={}
for e in cv.get_feature_names():
  keycount[e]=0
for d in dicts:
  keys=d.keys()
  for k in keys:
    keycount[k]+=1

minv=0
top100={}
for k in keycount.keys():
  if len(top100)<100:
    top100[k]=keycount[k]
    minv=min_list(list(top100.values()))
  else:
    if keycount[k]>minv:
      remove_value(top100,minv)
      top100[k]=keycount[k]
      minv=min_list(list(top100.values()))

In [0]:
# from sklearn.feature_extraction.text import TfidfVectorizer

# # tfidf = TfidfVectorizer()
# # features = tfidf.fit_transform(df['text'].tolist())

# # you only needs to do this once, this is a mapping of index to 
# feature_names=cv.get_feature_names()
 
# # get the document that we want to extract keywords from
# r=random.randint(0,len(ts))
# doc=docs_test[r]
 
# #generate tf-idf for the given document
# tf_idf_vector=tfidf_transformer.transform(cv.transform([doc]))
 
# #sort the tf-idf vectors by descending order of scores
# sorted_items=sort_coo(tf_idf_vector.tocoo())
 
# #extract only the top n; n here is 10
# keywords=extract_topn_from_vector(feature_names,sorted_items,len(doc))
 
# # now print the results
# print("\n=====Doc=====")
# print(r,len(doc.split(' ')),doc)
# print("\n===Keywords===")
# for k in keywords:
#   print(k,keywords[k])
# print(len(keywords))

We are now ready to create the data frame which: first column is the review text, the second one is the utility (calculated as the ratio between usefull rates and total rate and is 1 if this ratio is more equal than 0.7, 0 otherwise), the other columns are tf-idf value of each word in the columns in each review (if the word isn't in the review text value is 0).

In [0]:
import csv
import pandas as pd

reader = csv.reader(open('/content/drive/My Drive/uni/ml/progetto/reviews_Video_Games_5.csv','rt'))

dataframe=[]
tags=['text','utility']

for e in list(top100.keys()):
  tags.append(e)

def utility(a,b):
  if b==0.0: return 0.0
  return a/b

i=0
next(reader)
for row in reader:
  r=[]
  r.append(row[9])
  u=utility(float(eval(row[3])[0]),float(eval(row[3])[1]))
  if u>=0.7:
    r.append(1)
  else:
    r.append(0)
  for e in top100.keys():
    if e in dicts[i].keys():
      r.append(dicts[i][e])
    else:
      r.append(0)
  dataframe.append(r)
  i+=1

This is only for the first time we create the data frame.

In [0]:
DF=pd.DataFrame(dataframe, columns=tags)
DF.to_csv('/content/drive/My Drive/uni/ml/progetto/reviews_Video_Games_5-data_frame.csv', index=False)

# Classification using Naive Bayes

In [3]:
import pandas as pd

DF=pd.read_csv('dataset/reviews_Video_Games_5-data_frame.csv')
DF.head()

Unnamed: 0,text,utility,actual,anoth,awesom,back,bad,best,big,bit,...,time,tri,version,way,weapon,well,work,world,worth,year
0,'pay unlock content think instal game struggl ...,0,0.0,0.058,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,'good ralli game like ralli car get game fun o...,0,0.0,0.0,0.0,0.0,0.0,0.08,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.084,0.0,0.0,0.0
2,'wrong key shipment receiv book instead game s...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,'awesom game crash frequent got version instea...,1,0.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,...,0.073,0.05,0.152,0.0,0.0,0.021,0.024,0.0,0.028,0.054
4,'dirt dirt okay game start play game laptop bo...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
DF.tail()

Unnamed: 0,text,utility,actual,anoth,awesom,back,bad,best,big,bit,...,time,tri,version,way,weapon,well,work,world,worth,year
231775,'rate system seller funni peopl rate seller ri...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
231776,'get bundl includ extra wheel control delux ma...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
231777,'fake bundl packag red show steer wheel retail...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
231778,'look like gouger get packag mine arriv box red',1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
231779,'buy look european version real retail store t...,1,0.088,0.0,0.0,0.0,0.0,0.073,0.0,0.0,...,0.0,0.08,0.194,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
import csv

data=[]
target=[]

reader=csv.reader(open('dataset/reviews_Video_Games_5-data_frame.csv','rt'))

next(reader)
for row in reader:
  target.append(row[1])
  l=[]
  for e in row[2:]:
    l.append(float(e))
  data.append(l)

In [6]:
from sklearn.naive_bayes import GaussianNB

gnb=GaussianNB()
pred=gnb.fit(data,target).predict(data)

In [7]:
print("Number of mislabeled points out of a total %d points : %d" % (len(data),(target != pred).sum()))

Number of mislabeled points out of a total 231780 points : 100682


In [6]:
def compute_tfidf(doc,cv,tfidf_transformer,attributes):
  #generate tf-idf for the given document
  tf_idf_vector=tfidf_transformer.transform(cv.transform([doc])) 
  #sort the tf-idf vectors by descending order of scores
  sorted_items=sort_coo(tf_idf_vector.tocoo())
  #extract all items from document
  keywords=extract_topn_from_vector(feature_names,sorted_items,len(doc))
  dict_doc={}
  for e in attributes:
    if e in keywords:
      dict_doc[e]=keywords[e]
    else:
      dict_doc[e]=0
  return dict_doc

In [29]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import re

def get_stop_words(stop_file_path):
  """load stop words """    
  with open(stop_file_path, 'r', encoding="utf-8") as f:
    stopwords = f.readlines()
    stop_set = set(m.strip() for m in stopwords)
  return frozenset(stop_set)

#load a set of stop words
stopwords=get_stop_words('stopwords.txt')

#get the text column 
docs=df['text'].tolist()

#create a vocabulary of words, 
#ignore words that appear in 99,999999% of documents, 
#eliminate stop words
cv=CountVectorizer(stop_words=stopwords)
word_count_vector=cv.fit_transform(docs)

tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)

  'stop_words.' % sorted(inconsistent))


TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [30]:
summary='Great game of all the time!!!'
reviewText="I've never played this game first, but since the initial install I've fallen in love <3 !!! It's amazing because shooting mode is very easy and level of entertainment is fantastic! I'll play for hour and hour so long!!! Very recommended!!!!"

# summary='hahah'
# reviewText="I like obama because he loves me so much and other people wanna play this game"

en_words=sorted(open('en_words.txt', 'r').read().lower().split('\n'))
doc=preprocess(str(summary+' '+reviewText),en_words)

cv=CountVectorizer(stop_words=stopwords)
# feature_names=cv.get_feature_names()

dict_doc=compute_tfidf(doc,cv,tfidf_transformer,list(DF.columns[2:]))

data_doc=[]
for v in dict_doc.values():
  data_doc.append(float(v))

NotFittedError: CountVectorizer - Vocabulary wasn't fitted.

In [0]:
import numpy as np

gnb_pf=GaussianNB()
gnb_pf.partial_fit(data,target,np.unique(target))
print(gnb_pf.predict([data_doc]))

['0']


# Classification using Neural Network

In [9]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn import datasets

Using TensorFlow backend.


In [10]:
# fix random seed for reproducibility
seed = 7
np.random.seed(seed)

In [11]:
# load dataset
dataframe = pd.read_csv('dataset/reviews_Video_Games_5-data_frame.csv')#, header=None)
dataset = dataframe.values
X = dataset[:,2:].astype(float)
Y = dataset[:,1]

In [12]:
X

array([[0.   , 0.058, 0.   , ..., 0.   , 0.   , 0.   ],
       [0.   , 0.   , 0.   , ..., 0.   , 0.   , 0.   ],
       [0.   , 0.   , 0.   , ..., 0.   , 0.   , 0.   ],
       ...,
       [0.   , 0.   , 0.   , ..., 0.   , 0.   , 0.   ],
       [0.   , 0.   , 0.   , ..., 0.   , 0.   , 0.   ],
       [0.088, 0.   , 0.   , ..., 0.   , 0.   , 0.   ]])

In [13]:
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)

In [14]:
# define baseline model
def baseline_model():
	# create model
	model = Sequential()
	model.add(Dense(8, input_dim=100, activation='relu'))
	model.add(Dense(2, activation='softmax'))
	# Compile model
	model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
	return model

In [15]:
estimator = KerasClassifier(build_fn=baseline_model, epochs=200, batch_size=5, verbose=0)

In [16]:
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

In [17]:
results = cross_val_score(estimator, X, dummy_y, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Baseline: 70.74% (0.21%)
