# Text classification with SVM and Naive Bayes on the Amazon Cellphone Review dataset


In [1]:
# text classification with svm and naive bayes
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

# Download every NLTK resource this notebook relies on: the tokenizer models
# (word_tokenize), the WordNet data (WordNetLemmatizer), the English stopword
# list (stopwords.words) and the POS tagger model (pos_tag). Without the last
# two, the preprocessing cell fails with a LookupError on a fresh machine.
# NOTE(review): recent NLTK releases may name some of these resources
# differently (e.g. 'punkt_tab') - confirm against the installed version.
for nltk_resource in ('punkt', 'wordnet', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)
# initialize a random seed so the train/test split below is reproducible
np.random.seed(500)

[nltk_data] Downloading package punkt to /Users/ghost/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ghost/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Load the Amazon review CSV from the working directory and preview
# the first rows to check that the columns were parsed.
Amazon_Reviews = pd.read_csv(
    "amazon.csv",
    encoding='latin-1',
)
preview_rows = Amazon_Reviews.head()
print(preview_rows)

                                       product_title  \
0  Mobile Action MA730 Handset Manager - Bluetoot...   
1  Mobile Action MA730 Handset Manager - Bluetoot...   
2  Mobile Action MA730 Handset Manager - Bluetoot...   
3   USB Data Cable for Sony-Ericsson Z600, Z500, ...   
4   USB Data Cable for Sony-Ericsson Z600, Z500, ...   

                                      review_summary  \
0                                         Don't buy!   
1  Mobile Action Bluetooth Mobile Phone Tool Soft...   
2                                               good   
3                        No instructions included...   
4                                   NOT A DATA CABLE   

                                         review_text  review_score  
0   First of all, the company took my money and s...             1  
1  Great product- tried others and this is a ten ...             5  
2  works real good....a little hard to set up...w...             4  
3   The price was right for this cable ($11.95+$4.

# Text preprocessing


In [None]:

# Remove rows whose review text is missing. dropna must run on the DataFrame
# itself: calling it on a selected column does not remove the rows from the
# frame, so NaN values would reach .lower() below and raise AttributeError.
Amazon_Reviews.dropna(subset=['review_text'], inplace=True)
# Renumber the rows so the index is contiguous again after dropping.
Amazon_Reviews.reset_index(drop=True, inplace=True)
# make all the text lower case
Amazon_Reviews['review_text'] = [entry.lower() for entry in Amazon_Reviews['review_text']]
# tokenization: each review is broken into a list of word tokens
Amazon_Reviews['review_text'] = [word_tokenize(entry) for entry in Amazon_Reviews['review_text']]
# remove stopwords and non-alphabetic tokens, then lemmatize with WordNetLemmatizer.
# The first letter of the POS tag selects the WordNet word class; any tag
# that is not adjective/verb/adverb falls back to noun.
tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
# Build the stopword set and the lemmatizer once instead of once per word /
# per review; a set also makes each membership test constant-time.
stop_words = set(stopwords.words('english'))
word_Lemmatized = WordNetLemmatizer()
text_final = []
for entry in Amazon_Reviews['review_text']:
    Final_words = [
        word_Lemmatized.lemmatize(word, tag_map[tag[0]])
        for word, tag in pos_tag(entry)
        # keep only alphabetic tokens that are not stopwords
        if word not in stop_words and word.isalpha()
    ]
    # the processed words are stored as the string form of the list
    text_final.append(str(Final_words))
# Assign the whole column at once; the list is in row order, so this does not
# depend on the index labels matching enumerate() positions.
Amazon_Reviews['text_final'] = text_final

# Prepare Train and Test Data sets


In [None]:
# The processed review text is the model input and the product title is the
# label to predict; 30% of the rows are held out for testing.
review_features = Amazon_Reviews['text_final']
product_labels = Amazon_Reviews['product_title']
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(
    review_features,
    product_labels,
    test_size=0.3,
)



# Encoding

In [None]:
# Encode the product-title labels as integers.
# The encoder is fitted exactly once, on the labels of both splits, so a given
# integer means the same product title in Train_Y and Test_Y. Refitting on the
# test labels would build a second, different mapping and make the predicted
# and true label ids incomparable.
Encoder = LabelEncoder()
Encoder.fit(np.concatenate([Train_Y, Test_Y]))
Train_Y = Encoder.transform(Train_Y)
Test_Y = Encoder.transform(Test_Y)

# Word Vectorization 

In [None]:
# TF-IDF
# Turn the processed review text into TF-IDF feature vectors, keeping at most
# 5000 vocabulary terms. The vectorizer is fitted on the training text only
# (the previously referenced name `Corpus` is not defined in this notebook),
# so the vocabulary and IDF weights are learned without looking at the test
# reviews; the test split is then transformed with that same fitted vectorizer.
Tfidf_vect = TfidfVectorizer(max_features=5000)
Train_X_Tfidf = Tfidf_vect.fit_transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [None]:
# Uncomment to inspect the vocabulary learned by the vectorizer:
# print(Tfidf_vect.vocabulary_)
# Print the vectorized training data to see what it looks like.
print(Train_X_Tfidf)


# Prediction

In [None]:
# Naive-Bayes Classifier

# Train a multinomial Naive Bayes model on the TF-IDF training features.
Naive = naive_bayes.MultinomialNB().fit(Train_X_Tfidf, Train_Y)
# Predict labels for the held-out reviews.
predictions_NB = Naive.predict(Test_X_Tfidf)
# Accuracy as a percentage of correctly predicted test labels.
nb_accuracy_pct = accuracy_score(y_true=Test_Y, y_pred=predictions_NB) * 100
print("Naive Bayes Accuracy Score: ", nb_accuracy_pct)


In [None]:
# Support Vector Machine Classifier

# Train a linear-kernel SVM on the TF-IDF training features.
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto').fit(Train_X_Tfidf, Train_Y)
# Predict labels for the held-out reviews.
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Accuracy as a percentage of correctly predicted test labels.
svm_accuracy_pct = accuracy_score(y_true=Test_Y, y_pred=predictions_SVM) * 100
print("SVM Accuracy Score: ", svm_accuracy_pct)
