In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from collections import Counter

#[1] Importing dataset

dataset = pd.read_json(r"C:\Users\Panos\Desktop\Dissert\Code\Sample_Video_Games_5.json", lines=True, encoding='latin-1')
dataset = dataset[['reviewText','overall']]

#[2] Reduce number of classes

ratings = []
for index,entry in enumerate(dataset['overall']):
    if entry == 1.0 or entry == 2.0:
        ratings.append(-1)
    elif entry == 3.0:
        ratings.append(0)
    elif entry == 4.0 or entry == 5.0:
        ratings.append(1)

In [4]:
#[3] Cleaning the text & lemmatization

import multiprocessing
from joblib import Parallel, delayed

# Step - a : Remove blank rows if any.
dataset['reviewText'].dropna(inplace=True)
# Step - b : Change all the text to lower case. This is required as python interprets 'dog' and 'DOG' differently
dataset['reviewText'] = [entry.lower() for entry in dataset['reviewText']]
# Step - c : Tokenization : In this each entry in the corpus will be broken into set of words
dataset['reviewText'] = [word_tokenize(entry) for entry in dataset['reviewText']]
# Step - d : Remove Stop words, Non-Numeric and perfom Word Stemming/Lemmenting.
# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

num_cores = multiprocessing.cpu_count()
#inputs = myList

dataset = Parallel(n_jobs=num_cores)(delayed(my_function(i,parameters) 
                                                        for i in inputs)
        
def lemmatizaion(dataset['reviewText']):
    for index,entry in enumerate(dataset['reviewText']):
        # Declaring Empty List to store the words that follow the rules for this step
        Final_words = []
        # Initializing WordNetLemmatizer()
        word_Lemmatized = WordNetLemmatizer()
        # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
        for word, tag in pos_tag(entry):
            # Below condition is to check for Stop words and consider only alphabets
            if word not in stopwords.words('english') and word.isalpha():
                word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
                Final_words.append(word_Final)
            # The final processed set of words for each iteration will be stored in 'text_final'
            dataset.loc[index,'text_final'] = str(Final_words)
    return(dataset)

In [5]:
#[4] Prepare Train and Test Data sets
            
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(dataset['text_final'],ratings,test_size=0.3)

print(Counter(Train_Y).values()) # counts the elements' frequency

dict_values([49, 15, 6])


In [6]:
#[5] Encoding

Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [7]:
#[6] Word Vectorization
        
Tfidf_vect = TfidfVectorizer(max_features=10000)
Tfidf_vect.fit(dataset['text_final'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

#the vocabulary that it has learned from the corpus
#print(Tfidf_vect.vocabulary_)

# the vectorized data
#print(Train_X_Tfidf)

In [8]:
#[7] Use the Naive Bayes Algorithms to Predict the outcome

# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("-----------------------Naive Bayes------------------------\n")
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)
# Making the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Test_Y, predictions_NB)
print("\n",cm,"\n")
# Printing a classification report of different metrics
from sklearn.metrics import classification_report
my_tags = ['Positive','Neutral','Negative']
print(classification_report(Test_Y, predictions_NB,target_names=my_tags,zero_division = 0))

# Export reports to files for later visualizations
report_NB = classification_report(Test_Y, predictions_NB,target_names=my_tags, output_dict=True)
report_NB_df = pd.DataFrame(report_NB).transpose()
report_NB_df.to_csv(r'NB_report_TFIDFVect_Lemmatization.csv', index = True, float_format="%.3f")

-----------------------Naive Bayes------------------------

Naive Bayes Accuracy Score ->  70.0

 [[ 0  0  5]
 [ 0  0  4]
 [ 0  0 21]] 

              precision    recall  f1-score   support

    Positive       0.00      0.00      0.00         5
     Neutral       0.00      0.00      0.00         4
    Negative       0.70      1.00      0.82        21

    accuracy                           0.70        30
   macro avg       0.23      0.33      0.27        30
weighted avg       0.49      0.70      0.58        30



  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
#[8] Use the Support Vector Machine Algorithms to Predict the outcome

# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("-----------------Support Vector Machine CM------------------\n")
print("Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
cm = confusion_matrix(Test_Y, predictions_SVM)
# Making the confusion matrix
print("\n",cm,"\n")
# Printing a classification report of different metrics
print(classification_report(Test_Y, predictions_SVM,target_names=my_tags))

# Export reports to files for later visualizations
report_SVM = classification_report(Test_Y, predictions_SVM,target_names=my_tags, output_dict=True)
report_SVM_df = pd.DataFrame(report_SVM).transpose()
report_SVM_df.to_csv(r'SVM_report_TFIDFVect_Lemmatization.csv', index = True, float_format="%.3f")

-----------------Support Vector Machine CM------------------

Accuracy Score ->  70.0

 [[ 0  0  5]
 [ 0  0  4]
 [ 0  0 21]] 

              precision    recall  f1-score   support

    Positive       0.00      0.00      0.00         5
     Neutral       0.00      0.00      0.00         4
    Negative       0.70      1.00      0.82        21

    accuracy                           0.70        30
   macro avg       0.23      0.33      0.27        30
weighted avg       0.49      0.70      0.58        30

