# Functions working:
-     loadDataset: load the dataset 
-     preprocessing: convert into lower cases + tokenization + lemmatization
-     split: split dataset into 75% training and 25% testing.
-     initialize_ideal_answer: define ideal_answer
-     encode_y: convert the categorical y labels to numbers (0/1)
-     word_overlap_score: find similarity by comparing Training examples with an ideal answer
-     alignment: returns the features, i.e. the overlap score for each example.
-     get_params: define parameters for the model.
-     set_params: set the parameters of the model.
-     train: train the model
-     predict: performance on the testing dataset.
-     score: finding accuracy
-     save: save the model locally
-     load: load the model from local
-     confidence_score: find the confidence score of a saved model for a sentence.

Note: The dimension of the dataset is 100x2 (it is a matrix). The alignment function is written so that it returns a 2D array with one row per example and a single column. This is useful because the same function can be used to extract features for training examples, testing examples, and a new sentence.

In [26]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, svm
from sklearn.metrics import accuracy_score
import pickle
from collections import defaultdict


In [27]:
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(1)
# NOTE: per the original author, this seeding will be removed in development mode.

In [122]:
class SVMClassifier:
    """SVM classifier whose single feature is word overlap with an ideal answer.

    Typical flow: ``loadDataset`` -> ``preprocessing`` -> ``split`` ->
    ``initialize_ideal_answer`` -> ``encode_y`` -> ``alignment`` ->
    ``set_params`` -> ``train`` -> ``predict`` / ``score`` -> ``save``.
    """

    def __init__(self):
        # Maps the first character of a tag returned by pos_tag to a WordNet
        # part of speech; any other first character falls back to noun.
        self.tag_map = defaultdict(lambda : wn.NOUN)
        self.tag_map['J'] = wn.ADJ
        self.tag_map['V'] = wn.VERB
        self.tag_map['R'] = wn.ADV
        # Token list every example is compared against (set by
        # initialize_ideal_answer).
        self.ideal_answer = None
        # The svm.SVC instance (set by set_params).
        self.model = None
        # expectation number -> [predicted label, decision value]
        # (filled by confidence_score).
        self.score_dictionary = defaultdict(int)

    def loadDataset(self, file):
        """Read the CSV at ``file`` (latin-1 encoded) into a DataFrame."""
        dataset = pd.read_csv(file, encoding="latin-1")
        return dataset

    def preprocessing(self, data):
        """Lower-case, tokenize, filter and lemmatize every entry of ``data``.

        Args:
            data: iterable of strings.

        Returns:
            A list with one list of lemmatized tokens per entry. Stop words
            and tokens that are not purely alphabetic are dropped.
        """
        # Built once up front: fetching the stop-word list for every single
        # token is needlessly slow, and a set gives O(1) membership tests.
        stop_words = set(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()

        preProcessedDataset = []
        for entry in data:
            tokens = word_tokenize(entry.lower())
            Final_words = [
                lemmatizer.lemmatize(word, self.tag_map[tag[0]])
                for word, tag in pos_tag(tokens)
                if word not in stop_words and word.isalpha()
            ]
            preProcessedDataset.append(Final_words)
        return preProcessedDataset

    def split(self, preProcessedDataset, Corpus):
        """Split examples and ``Corpus['label']`` into 75% train / 25% test.

        Returns:
            ``(Train_X, Test_X, Train_Y, Test_Y)``.
        """
        Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(
            preProcessedDataset, Corpus['label'], test_size=0.25
        )
        return Train_X, Test_X, Train_Y, Test_Y

    def initialize_ideal_answer(self, X):
        """Use the first example of ``X`` as the ideal answer and print it."""
        self.ideal_answer = X[0]
        print("ideal answer is = ", self.ideal_answer)

    def encode_y(self, Train_Y, Test_Y):
        """Convert categorical labels to integers with one shared mapping.

        The encoder is fitted once on all labels and then applied to both
        parts. Fitting it separately on each part can assign different
        integers to the same label whenever one part does not contain every
        class, which silently corrupts the reported accuracy.

        Returns:
            ``(Train_Y, Test_Y)`` as integer arrays.
        """
        Encoder = LabelEncoder()
        Encoder.fit(np.concatenate([np.asarray(Train_Y), np.asarray(Test_Y)]))
        return Encoder.transform(Train_Y), Encoder.transform(Test_Y)

    def word_overlap_score(self, Train_X, ideal_answer):
        """Score each example by its word overlap with ``ideal_answer``.

        The score is the fraction of distinct ideal-answer tokens that also
        occur in the example.

        Args:
            Train_X: iterable of token lists.
            ideal_answer: token list to compare against.

        Returns:
            A list of floats, one per example. When ``ideal_answer`` has no
            tokens every score is 0.0 (nothing can overlap).
        """
        ideal_tokens = set(ideal_answer)
        if not ideal_tokens:
            return [0.0 for _ in Train_X]
        total = len(ideal_tokens)
        return [len(ideal_tokens.intersection(example)) / total for example in Train_X]

    # function for extracting features
    def alignment(self, Train_X, ideal_answer):
        """Return the overlap scores of ``Train_X`` as an ``(n, 1)`` array.

        Args:
            Train_X: iterable of token lists (training, testing or new data).
            ideal_answer: token list to compare against; ``None`` means the
                one stored by ``initialize_ideal_answer``.
        """
        if ideal_answer is None:
            ideal_answer = self.ideal_answer
        features = self.word_overlap_score(Train_X, ideal_answer)
        return (np.array(features)).reshape(-1, 1)

    def get_params(self):
        """Return the default ``(C, kernel, degree, gamma, probability)``."""
        C = 1.0
        kernel = 'linear'
        degree = 3
        gamma = 'auto'
        probability = True
        return C, kernel, degree, gamma, probability

    def set_params(self, **params):
        """Create the ``svm.SVC`` model from the given hyper-parameters.

        Recognized keys are ``C``, ``kernel``, ``degree``, ``gamma`` and
        ``probability``; any key that is omitted falls back to the default
        from ``get_params``. Other keys are ignored.

        Returns:
            The newly created model (also stored in ``self.model``).
        """
        C, kernel, degree, gamma, probability = self.get_params()
        self.model = svm.SVC(
            C=params.get('C', C),
            kernel=params.get('kernel', kernel),
            degree=params.get('degree', degree),
            gamma=params.get('gamma', gamma),
            probability=params.get('probability', probability),
        )
        return self.model

    def train(self, trainFeatures, Train_Y):
        """Fit the model on the training features and labels."""
        self.model.fit(trainFeatures, Train_Y)
        print("Triaining complete")

    def predict(self, testFeatures):
        """Return the model's predicted labels for ``testFeatures``."""
        return self.model.predict(testFeatures)

    def score(self, model_predictions, Test_Y):
        """Return the accuracy of ``model_predictions`` in percent."""
        # accuracy_score expects (y_true, y_pred).
        return accuracy_score(Test_Y, model_predictions) * 100

    def save(self, filename):
        """Pickle the current model to ``filename``."""
        # The context manager closes the file even if pickling fails.
        with open(filename, 'wb') as handle:
            pickle.dump(self.model, handle)
        print("Model saved successfully!")

    def load(self, filename):
        """Unpickle and return the model stored in ``filename``.

        ``self.model`` is left untouched.
        """
        # SECURITY: unpickling can execute arbitrary code; only load files
        # that were written by ``save`` and come from a trusted location.
        with open(filename, 'rb') as handle:
            model = pickle.load(handle)
        return model

    def confidence_score(self, sentence, expectation_number):
        """Score ``sentence`` with the saved model of one expectation.

        Args:
            sentence: feature array, as returned by ``alignment``.
            expectation_number: selects the file ``"model<number>"`` to load;
                ``None`` means there is nothing to score.

        Returns:
            ``self.score_dictionary`` with
            ``[predicted label, decision value]`` stored under
            ``expectation_number``, or ``None`` when ``expectation_number``
            is ``None``.
        """
        if expectation_number is None:
            return None
        model = self.load("model" + str(expectation_number))
        self.score_dictionary[expectation_number] = [
            model.predict(sentence)[0],
            model.decision_function(sentence)[0],
        ]
        return self.score_dictionary
  

In [93]:
# Create the classifier for the first expectation and read its dataset from CSV.
expectation1 = SVMClassifier()
Corpus = expectation1.loadDataset('exp1_dataset.csv')
print(Corpus)

                                                text        label
0   Stuning even for the non-gamer: This sound tr...  __label__1 
1   The best soundtrack ever to anything.: I'm re...  __label__2 
2   Amazing!: This soundtrack is my favorite musi...  __label__2 
3   Excellent Soundtrack: I truly like this sound...  __label__1 
4   Remember, Pull Your Jaw Off The Floor After H...  __label__2 
5   an absolute masterpiece: I am quite sure any ...  __label__1 
6   Buyer beware: This is a self-published book, ...  __label__1 
7   Glorious story: I loved Whisper of the wicked...  __label__1 
8   A FIVE STAR BOOK: I just finished reading Whi...  __label__2 
9   Whispers of the Wicked Saints: This was a eas...  __label__1 


In [94]:
# Turn every entry of the 'text' column into a list of lemmatized tokens.
preProcessedDataset = expectation1.preprocessing(Corpus['text'])

In [95]:
# Split into 75% training and 25% testing data; the labels are taken from Corpus['label'].
Train_X, Test_X, Train_Y, Test_Y = expectation1.split(preProcessedDataset, Corpus)


In [96]:
# The first training example becomes the ideal answer.
expectation1.initialize_ideal_answer(Train_X)
Train_Y, Test_Y = expectation1.encode_y(Train_Y, Test_Y)
# Passing None makes alignment use the ideal answer stored on the instance.
features = expectation1.alignment(Train_X, None)

ideal answer is =  ['buyer', 'beware', 'book', 'want', 'know', 'read', 'paragraph', 'star', 'review', 'must', 'write', 'haddon', 'family', 'friend', 'perhaps', 'ca', 'imagine', 'anyone', 'read', 'whole', 'thing', 'spend', 'evening', 'book', 'friend', 'hysteric', 'read', 'bit', 'piece', 'one', 'another', 'definitely', 'bad', 'enough', 'enter', 'kind', 'bad', 'book', 'contest', 'ca', 'believe', 'amazon', 'even', 'sell', 'kind', 'thing', 'maybe', 'offer', 'grade', 'term', 'paper', 'kill', 'mockingbird', 'book', 'quite', 'sure', 'haddon', 'never', 'heard', 'anyway', 'unless', 'mood', 'send', 'book', 'someone', 'joke', 'far', 'far', 'away', 'one']


In [97]:
# Build the SVC from the defaults exposed by get_params instead of repeating
# the same literals, then fit and evaluate on the training features.
C, kernel, degree, gamma, probability = expectation1.get_params()
model = expectation1.set_params(C=C, kernel=kernel, degree=degree, gamma=gamma, probability=probability)
expectation1.train(features, Train_Y)
train_pred = expectation1.predict(features)
print(train_pred)
# Accuracy (in percent) on the data the model was trained on.
print(expectation1.score(train_pred, Train_Y))

Triaining complete
[0 0 0 0 0 0 0]
57.14285714285714


In [98]:
# Extract features for the held-out examples with the stored ideal answer,
# predict their labels, and report the accuracy in percent.
testFeatures = expectation1.alignment(Test_X, None)
model_predictions = expectation1.predict(testFeatures)
print(model_predictions)
accuracy = expectation1.score(model_predictions, Test_Y)
print("Accuracy of the model: ",accuracy)

[0 0 0]
Accuracy of the model:  66.66666666666666


In [99]:
# Pickle the trained model to the local file 'model1'.
expectation1.save('model1')

Model saved successfully!


In [100]:
# Read the pickled model back from 'model1'.
model1 = expectation1.load('model1')

In [101]:
# A new, unseen sentence; it goes through the same preprocessing and feature
# extraction (against expectation1's ideal answer) as the training data.
sentence = ['unfortunately entertain least Awful beyond belief!: I feel I have to write to keep others from wasting their money. This book seems to have been written by a 7th grader with poor grammatical skills for her age! As another reviewer points out, there is a misspelling on the cover, and I believe there is at least one per chapter. For example, it was mentioned twice that she had a "lean" on her house. I was so distracted by the poor writing and weak plot, that I decided to read with a pencil in hand to mark all of the horrible grammar and spelling. Please dont waste your money. I too, believe that the good reviews must have been written by the authors relatives. I will not put much faith in the reviews from now on!']
sent_proc = expectation1.preprocessing(sentence)
sent_features = expectation1.alignment(sent_proc, None)

In [102]:
# Decision-function value of the reloaded model for the new sentence.
confidence_score = model1.decision_function(sent_features)
print(confidence_score)

[-1.]


**Second expectation**

In [123]:
# Create the classifier for the second expectation and read its dataset from CSV.
expectation2 = SVMClassifier()
Corpus = expectation2.loadDataset('exp2_dataset.csv')
print(Corpus)

                                                text        label
0   The Worst!: A complete waste of time. Typogra...  __label__1 
1   Great book: This was a great book,I just coul...  __label__2 
2   Great Read: I thought this book was brilliant...  __label__2 
3   Oh please: I guess you have to be a romance n...  __label__1 
4   Awful beyond belief!: I feel I have to write ...  __label__1 
5   Don't try to fool us with fake reviews.: It's...  __label__1 
6   A romantic zen baseball comedy: When you hear...  __label__2 
7   Fashionable Compression Stockings!: After I h...  __label__2 
8   Jobst UltraSheer Thigh High: Excellent produc...  __label__2 
9   sizes recomended in the size chart are not re...  __label__1 


In [124]:
# Turn every entry of the 'text' column into a list of lemmatized tokens.
preProcessedDataset = expectation2.preprocessing(Corpus['text'])

In [125]:
# Split into 75% training and 25% testing data; the labels are taken from Corpus['label'].
Train_X, Test_X, Train_Y, Test_Y = expectation2.split(preProcessedDataset, Corpus)


In [126]:
# The first training example becomes the ideal answer.
expectation2.initialize_ideal_answer(Train_X)
Train_Y, Test_Y = expectation2.encode_y(Train_Y, Test_Y)
# Passing None makes alignment use the ideal answer stored on the instance.
features = expectation2.alignment(Train_X, None)

ideal answer is =  ['great', 'read', 'think', 'book', 'brilliant', 'yet', 'realistic', 'show', 'error', 'human', 'love', 'fact', 'writer', 'show', 'loving', 'side', 'god', 'revengeful', 'side', 'love', 'twist', 'turn', 'could', 'put', 'also', 'love', 'glass', 'castle']


In [127]:
# Build the SVC from the defaults exposed by get_params instead of repeating
# the same literals, then fit and evaluate on the training features.
C, kernel, degree, gamma, probability = expectation2.get_params()
model = expectation2.set_params(C=C, kernel=kernel, degree=degree, gamma=gamma, probability=probability)
expectation2.train(features, Train_Y)
train_pred = expectation2.predict(features)
print(train_pred)
# Accuracy (in percent) on the data the model was trained on.
print(expectation2.score(train_pred, Train_Y))

Triaining complete
[1 0 0 0 0 0 0]
71.42857142857143


In [128]:
# Extract features for the held-out examples with the stored ideal answer,
# predict their labels, and report the accuracy in percent.
testFeatures = expectation2.alignment(Test_X, None)
model_predictions = expectation2.predict(testFeatures)
print(model_predictions)
accuracy = expectation2.score(model_predictions, Test_Y)
print("Accuracy of the model: ",accuracy)

[0 0 0]
Accuracy of the model:  33.33333333333333


In [129]:
# Pickle the trained model to the local file 'model2'.
expectation2.save('model2')

Model saved successfully!


In [130]:
# Read the pickled model back from 'model2'.
model2 = expectation2.load('model2')

In [131]:
# A new, unseen sentence; it goes through the same preprocessing and feature
# extraction (against expectation2's ideal answer) as the training data.
sentence = ['unfortunately entertain least Awful beyond belief!: I feel I have to write to keep others from wasting their money. This book seems to have been written by a 7th grader with poor grammatical skills for her age! As another reviewer points out, there is a misspelling on the cover, and I believe there is at least one per chapter. For example, it was mentioned twice that she had a "lean" on her house. I was so distracted by the poor writing and weak plot, that I decided to read with a pencil in hand to mark all of the horrible grammar and spelling. Please dont waste your money. I too, believe that the good reviews must have been written by the authors relatives. I will not put much faith in the reviews from now on!']
sent_proc = expectation2.preprocessing(sentence)
sent_features = expectation2.alignment(sent_proc, None)

In [132]:
# Overlap score of the new sentence with expectation2's ideal answer.
print(sent_features)

[[0.125]]


In [135]:
# NOTE(review): this variable is not referenced by the confidence_score call
# in the following cell, which passes the literal 2 instead.
expectation_number = 1

In [138]:
# Load the pickled file "model2" and record its prediction and decision value
# for sent_features (computed above with expectation2's ideal answer).
# NOTE(review): called on expectation1, so the result is stored in
# expectation1.score_dictionary — confirm this is intended.
ans = expectation1.confidence_score(sent_features, 2)
print(ans)



defaultdict(<class 'int'>, {2: [0, -0.9062500010477379]})
