## Pre-processing

In [1]:
#https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34

import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from collections import Counter

#import dill

if __name__ == "__main__":
    
        # Reproduce the same result every time if the script is kept consistent otherwise each run will produce different results
        np.random.seed(500)
    
        #[1] Read the data
        Corpus = pd.read_json(r"C:\Users\Panos\Desktop\Dissert\Code\Sample_Video_Games_5.json", lines=True, encoding='latin-1')
        Corpus = Corpus[['reviewText','overall']]
        
        # Print some info
        Corpus.info()
        print(Corpus.overall.value_counts())
        
        #[1.5] Reduce number of classes
        for index,entry in enumerate(Corpus['overall']):
             if entry == 1.0 or entry == 2.0:
                 Corpus.loc[index,'overall_final'] = -1
             elif entry == 3.0:
                 Corpus.loc[index,'overall_final'] = 0
             elif entry == 4.0 or entry == 5.0:
                 Corpus.loc[index,'overall_final'] = 1
                
        #[2] Preprocessing
        
        # Step - a : Remove blank rows if any.
        Corpus['reviewText'].dropna(inplace=True)
        # Step - b : Change all the text to lower case. This is required as python interprets 'dog' and 'DOG' differently
        Corpus['reviewText'] = [entry.lower() for entry in Corpus['reviewText']]
        # Step - c : Tokenization : In this each entry in the corpus will be broken into set of words
        Corpus['reviewText'] = [word_tokenize(entry) for entry in Corpus['reviewText']]
        # Step - d : Remove Stop words, Non-Numeric and perfom Word Stemming/Lemmenting.
        # WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
        tag_map = defaultdict(lambda : wn.NOUN)
        tag_map['J'] = wn.ADJ
        tag_map['V'] = wn.VERB
        tag_map['R'] = wn.ADV
        for index,entry in enumerate(Corpus['reviewText']):
            # Declaring Empty List to store the words that follow the rules for this step
            Final_words = []
            # Initializing WordNetLemmatizer()
            word_Lemmatized = WordNetLemmatizer()
            # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
            for word, tag in pos_tag(entry):
                # Below condition is to check for Stop words and consider only alphabets
                if word not in stopwords.words('english') and word.isalpha():
                    word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
                    Final_words.append(word_Final)
            # The final processed set of words for each iteration will be stored in 'text_final'
            Corpus.loc[index,'text_final'] = str(Final_words)
            
        #Print the first 3 rows
        print(Corpus.iloc[:3])
        print("hey yo")
        
        #dill.dump_session('notebook_env.db')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
reviewText    100 non-null object
overall       100 non-null int64
dtypes: int64(1), object(1)
memory usage: 1.7+ KB
5    53
4    17
1    15
3    10
2     5
Name: overall, dtype: int64
                                          reviewText  overall  overall_final  \
0  [installing, the, game, was, a, struggle, (, b...        1           -1.0   
1  [if, you, like, rally, cars, get, this, game, ...        4            1.0   
2  [1st, shipment, received, a, book, instead, of...        1           -1.0   

                                          text_final  
0  ['instal', 'game', 'struggle', 'game', 'window...  
1  ['like', 'rally', 'car', 'get', 'game', 'orien...  
2  ['shipment', 'receive', 'book', 'instead', 'sh...  
hey yo


In [4]:
        #[3] Prepare Train and Test Data sets
            
        Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'],Corpus['overall_final'],test_size=0.3)
        
        print(Counter(Train_Y).values()) # counts the elements' frequency
        
        #[4] Encoding
        
        Encoder = LabelEncoder()
        Train_Y = Encoder.fit_transform(Train_Y)
        Test_Y = Encoder.fit_transform(Test_Y)
        
        #[5] Word Vectorization
        
        Tfidf_vect = TfidfVectorizer(max_features=10000)
        Test_X_Tfidf = Tfidf_vect.fit_transform(Corpus['text_final'])
        Train_X_Tfidf = Tfidf_vect.transform(Train_X)
        Test_X_Tfidf = Tfidf_vect.transform(Test_X)

dict_values([48, 5, 17])


In [5]:
        #[6] SMOTE (Synthetic Minority Over-Sampling Technique)
        from imblearn.under_sampling import NearMiss, RandomUnderSampler
        
        nm = NearMiss(ratio='not minority',random_state=777, version=1, n_neighbors=1)
        X_nm, y_nm = nm.fit_sample(Train_X_Tfidf, Train_Y)
        
        print(Counter(y_nm).values()) # counts the elements' frequency

dict_values([5, 5, 5])


In [None]:
        the vocabulary that it has learned from the corpus
        print(Tfidf_vect.vocabulary_)
        
        the vectorized data
        print(Train_X_Tfidf)

In [6]:
        #[7] Use the ML Algorithms to Predict the outcome
        
        # fit the training dataset on the NB classifier
        Naive = naive_bayes.MultinomialNB()
        Naive.fit(X_nm,y_nm)
        # predict the labels on validation dataset
        predictions_NB = Naive.predict(Test_X_Tfidf)
        # Use accuracy_score function to get the accuracy
        print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)
        # Making the confusion matrix
        from sklearn.metrics import confusion_matrix
        cm = confusion_matrix(Test_Y, predictions_NB)
        print("-----------------cm------------------")
        print(cm)
        print("-------------------------------------")
        
        #[8] Support Vector Machine
        
        # Classifier - Algorithm - SVM
        # fit the training dataset on the classifier
        SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
        SVM.fit(X_nm,y_nm)
        # predict the labels on validation dataset
        predictions_SVM = SVM.predict(Test_X_Tfidf)
        # Use accuracy_score function to get the accuracy
        print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

Naive Bayes Accuracy Score ->  46.666666666666664
-----------------cm------------------
[[3 0 0]
 [1 2 2]
 [7 6 9]]
-------------------------------------
SVM Accuracy Score ->  36.666666666666664


In [6]:
#A try to parallelize the for loop
# #https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34

# import pandas as pd
# import numpy as np
# from nltk.tokenize import word_tokenize
# from nltk import pos_tag
# from nltk.corpus import stopwords
# from nltk.stem import WordNetLemmatizer
# from sklearn.preprocessing import LabelEncoder
# from collections import defaultdict
# from nltk.corpus import wordnet as wn
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn import model_selection, naive_bayes, svm
# from sklearn.metrics import accuracy_score
# from collections import Counter

# import multiprocessing
# from joblib import Parallel, delayed

# if __name__ == "__main__":
    
#         # Reproduce the same result every time if the script is kept consistent otherwise each run will produce different results
#         np.random.seed(500)
    
#         #[1] Read the data
#         Corpus = pd.read_json(r"C:\Users\Panos\Desktop\Dissert\Code\Sample_Video_Games_5.json", lines=True, encoding='latin-1')
#         Corpus = Corpus[['reviewText','overall']]
        
#         # Print some info
#         Corpus.info()
#         print(Corpus.overall.value_counts())
        
#         #https://medium.com/@mjschillawski/quick-and-easy-parallelization-in-python-32cb9027e490
#         num_cores = multiprocessing.cpu_count()
        
#         processed_list = Parallel(n_jobs=num_cores)(delayed(my_function(i,parameters) 
#                                                         for i in enumerate(Corpus['overall'])
                
#         #[2] Preprocessing
        
#         # Step - a : Remove blank rows if any.
#         Corpus['reviewText'].dropna(inplace=True)
#         # Step - b : Change all the text to lower case. This is required as python interprets 'dog' and 'DOG' differently
#         Corpus['reviewText'] = [entry.lower() for entry in Corpus['reviewText']]
#         # Step - c : Tokenization : In this each entry in the corpus will be broken into set of words
#         Corpus['reviewText'] = [word_tokenize(entry) for entry in Corpus['reviewText']]
#         # Step - d : Remove Stop words, Non-Numeric and perfom Word Stemming/Lemmenting.
#         # WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
#         tag_map = defaultdict(lambda : wn.NOUN)
#         tag_map['J'] = wn.ADJ
#         tag_map['V'] = wn.VERB
#         tag_map['R'] = wn.ADV
#         for index,entry in enumerate(Corpus['reviewText']):
#             # Declaring Empty List to store the words that follow the rules for this step
#             Final_words = []
#             # Initializing WordNetLemmatizer()
#             word_Lemmatized = WordNetLemmatizer()
#             # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
#             for word, tag in pos_tag(entry):
#                 # Below condition is to check for Stop words and consider only alphabets
#                 if word not in stopwords.words('english') and word.isalpha():
#                     word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
#                     Final_words.append(word_Final)
#             # The final processed set of words for each iteration will be stored in 'text_final'
#             Corpus.loc[index,'text_final'] = str(Final_words)
            
#         #Print the first 3 rows
#         print(Corpus.iloc[:3])
#         print("hey yo")
                                                            
# def my_function():                                          
# #[1.5] Reduce number of classes
#         for index,entry in enumerate(Corpus['overall']):
#              if entry == 1.0 or entry == 2.0:
#                  Corpus.loc[index,'overall_final'] = -1
#              elif entry == 3.0:
#                  Corpus.loc[index,'overall_final'] = 0
#              elif entry == 4.0 or entry == 5.0:
#                  Corpus.loc[index,'overall_final'] = 1

SyntaxError: invalid syntax (<ipython-input-6-42206eafcea6>, line 42)