In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, svm
from sklearn.metrics import accuracy_score
import pickle

## Load Dataset

In [2]:
Corpus = pd.read_csv('dataset.csv', encoding="latin-1")
print(Corpus)

                                                   text        label
0      Stuning even for the non-gamer: This sound tr...  __label__2 
1      The best soundtrack ever to anything.: I'm re...  __label__2 
2      Amazing!: This soundtrack is my favorite musi...  __label__2 
3      Excellent Soundtrack: I truly like this sound...  __label__2 
4      Remember, Pull Your Jaw Off The Floor After H...  __label__2 
...                                                 ...          ...
9995   A revelation of life in small town America in...  __label__2 
9996   Great biography of a very interesting journal...  __label__2 
9997   Interesting Subject; Poor Presentation: You'd...  __label__1 
9998   Don't buy: The box looked used and it is obvi...  __label__1 
9999   Beautiful Pen and Fast Delivery.: The pen was...  __label__2 

[10000 rows x 2 columns]


## Data Pre-processing

In [3]:
Corpus['text'].dropna(inplace=True)


In [4]:
Corpus['text'] = [entry.lower() for entry in Corpus['text']]


In [5]:
Corpus['text']= [word_tokenize(entry) for entry in Corpus['text']]


In [6]:
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
print("tag_map", tag_map)

tag_map defaultdict(<function <lambda> at 0x1a1c608c20>, {'J': 'a', 'V': 'v', 'R': 'r'})


In [8]:
for index,entry in enumerate(Corpus['text']):

    Final_words = []

    word_Lemmatized = WordNetLemmatizer()

    for word, tag in pos_tag(entry):
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    Corpus.loc[index,'text_final'] = str(Final_words)
print(Corpus.head())

                                                text        label  \
0  [stuning, even, for, the, non-gamer, :, this, ...  __label__2    
1  [the, best, soundtrack, ever, to, anything, .,...  __label__2    
2  [amazing, !, :, this, soundtrack, is, my, favo...  __label__2    
3  [excellent, soundtrack, :, i, truly, like, thi...  __label__2    
4  [remember, ,, pull, your, jaw, off, the, floor...  __label__2    

                                          text_final  
0  ['stun', 'even', 'sound', 'track', 'beautiful'...  
1  ['best', 'soundtrack', 'ever', 'anything', 're...  
2  ['amaze', 'soundtrack', 'favorite', 'music', '...  
3  ['excellent', 'soundtrack', 'truly', 'like', '...  
4  ['remember', 'pull', 'jaw', 'floor', 'hear', '...  


## Splitting into training and testing

In [9]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'],Corpus['label'],test_size=0.25)

## Word Vectorization ( Tf-Idf )

In [10]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [11]:
print(Corpus['text_final'][0:5])

0    ['stun', 'even', 'sound', 'track', 'beautiful'...
1    ['best', 'soundtrack', 'ever', 'anything', 're...
2    ['amaze', 'soundtrack', 'favorite', 'music', '...
3    ['excellent', 'soundtrack', 'truly', 'like', '...
4    ['remember', 'pull', 'jaw', 'floor', 'hear', '...
Name: text_final, dtype: object


In [12]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_fit = Tfidf_vect.fit(Corpus['text_final'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)


## Saving the vectorizer locally

In [13]:
pickle.dump(Tfidf_fit,open("feature2.pkl","wb"))


## SVM model

In [14]:
model = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto', probability=True)
model.fit(Train_X_Tfidf,Train_Y)
predictions_SVM = model.predict(Test_X_Tfidf)
print("Accuracy: ",accuracy_score(predictions_SVM, Test_Y)*100)

Accuracy:  83.8


## Saving model locally

In [15]:
pickle.dump(model, open("saved_model", 'wb'))

## Testing new sentence

In [16]:
# initializing dataframe. Needs to be done to match with the input parameter of Tfidf_vect
p = pd.DataFrame({"new_text":[]})


In [17]:
def preprocessTesting(sentence):

    sentence = [entry.lower() for entry in sentence]
    sentence = [word_tokenize(entry) for entry in sentence]
    
    for index,entry in enumerate(sentence):

        Final_words = []

        word_Lemmatized = WordNetLemmatizer()

        for word, tag in pos_tag(entry):
            if word not in stopwords.words('english') and word.isalpha():
                word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
                Final_words.append(word_Final)
        p.loc[index,'new_text'] = str(Final_words)
    return p
t = preprocessTesting(["A fascinating insight into the life of modern Japanese teens: I thoroughly enjoyed Rising Sons and Daughters. I don't know of any other book that looks at Japanese society from the point of view of its young people poised as they are between their parents' age-old Japanese culture of restraint and obedience to the will of the community, and their peers' adulation of Western culture. True to form, the 'New Young' of Japan seem to be creating an 'international' blend, as the Ando family demonstrates in this beautifully written book of vignettes of the private lives of members of this family. Steven Wardell is clearly a talented young author, adopted for some of his schooling into this family of four teens, and thus able to view family life in Japan from the inside out. A great read!"])
print(t)
       

                                            new_text
0  ['fascinating', 'insight', 'life', 'modern', '...


## Loading the vectorizer

In [18]:
load_vect = pickle.load(open("feature2.pkl", "rb"))
model = pickle.load(open("saved_model", 'rb'))



In [19]:
sent_tfidf = load_vect.transform(t["new_text"])


## Predicting

In [20]:
confidence = model.predict_proba(sent_tfidf)
print("Confidence Score for Class 1 = ", confidence[0][0])
print("Confidence Score for Class 2 = ", confidence[0][1])

Confidence Score for Class 1 =  0.017474610406846267
Confidence Score for Class 2 =  0.9825253895931537
