# Functions working:
-     loadDataset: load the dataset 
-     preprocessing: convert to lower case, tokenize, drop stopwords and non-alphabetic tokens, then lemmatize
-     split: split dataset into 75% training and 25% testing.
-     initialize_ideal_answer: define ideal_answer
-     encode_y: convert the categorical y labels to numbers (0/1)
-     word_overlap_score: find similarity by comparing Training examples with an ideal answer
-     alignment: returns features, which includes scores for each training examples.
-     get_params: define parameters for the model.
-     set_params: set the parameters of the model.
-     train: train the model
-     predict: performance on the testing dataset.
-     score: finding accuracy
-     save: save the model locally
-     load: load the model from local
-     predict_probabilities: find confidence scores.

Note: The dataset has dimensions 100x2 (a matrix). The alignment function returns a 2-D NumPy array with one row per example and a single score column, rather than a plain list. This is useful because the same function can be used to extract features for training examples, testing examples, and a new sentence.

In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, svm
from sklearn.metrics import accuracy_score
import pickle

In [2]:
# Seed NumPy's global random number generator so repeated runs of this
# notebook use the same random draws.
np.random.seed(1)
# NOTE(review): the author's original note here was "will remove in the
# development mode", i.e. the fixed seed is meant to be temporary — confirm
# before removing it.

In [3]:
class SVMClassifier:
    """Word-overlap feature extractor plus an SVM classifier for labelled text.

    Every example is reduced to a single feature: the fraction of the ideal
    answer's distinct tokens that also occur in the example. An
    ``sklearn.svm.SVC`` is trained on that one feature.

    Attributes:
        tag_map: maps the first letter of a tag returned by ``pos_tag`` to a
            WordNet part of speech; unknown letters fall back to noun.
        ideal_answer: token list every example is compared against
            (set by ``initialize_ideal_answer``).
        model: the ``svm.SVC`` instance (set by ``set_params``).
        label_encoder: the ``LabelEncoder`` fitted by ``encode_y``.
    """

    def __init__(self):
        self.tag_map = defaultdict(lambda : wn.NOUN)
        self.tag_map['J'] = wn.ADJ
        self.tag_map['V'] = wn.VERB
        self.tag_map['R'] = wn.ADV
        self.ideal_answer = None
        self.model = None
        self.label_encoder = None

    def loadDataset(self, file):
        """Read the latin-1 encoded CSV at ``file`` and return it as a DataFrame."""
        return pd.read_csv(file, encoding="latin-1")

    def preprocessing(self, data):
        """Lower-case, tokenize, filter and lemmatize every text in ``data``.

        Tokens that are English stopwords or are not purely alphabetic are
        dropped; the remaining tokens are lemmatized with the WordNet part of
        speech looked up from their ``pos_tag`` tag.

        Args:
            data: iterable of strings.

        Returns:
            A list containing one list of lemmas per input text.
        """
        # Built once up front so the stopword list is not re-fetched for every
        # token and the lemmatizer is not re-created for every text.
        stop_words = set(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()

        preProcessedDataset = []
        for entry in data:
            tokens = word_tokenize(entry.lower())
            Final_words = [
                lemmatizer.lemmatize(word, self.tag_map[tag[0]])
                for word, tag in pos_tag(tokens)
                if word not in stop_words and word.isalpha()
            ]
            preProcessedDataset.append(Final_words)
        return preProcessedDataset

    def split(self, preProcessedDataset, Corpus, test_size=0.25):
        """Split the examples and ``Corpus['label']`` into train and test parts.

        Args:
            preProcessedDataset: the preprocessed examples (features side).
            Corpus: object whose ``'label'`` entry holds the targets.
            test_size: share of the data held out for testing (default 0.25).

        Returns:
            ``Train_X, Test_X, Train_Y, Test_Y``.
        """
        Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(
            preProcessedDataset, Corpus['label'], test_size=test_size
        )
        return Train_X, Test_X, Train_Y, Test_Y

    def initialize_ideal_answer(self, X):
        """Use the first example of ``X`` as the ideal answer."""
        self.ideal_answer = X[0]

    def encode_y(self, Train_Y, Test_Y):
        """Encode the train and test labels as integers with one shared mapping.

        The encoder is fitted on the training labels only and then applied to
        the test labels. Fitting a second time on the test labels could assign
        different integers to the same label whenever the two sets do not
        contain exactly the same classes.

        Returns:
            ``(encoded_train, encoded_test)``.

        Raises:
            ValueError: raised by ``LabelEncoder.transform`` if ``Test_Y``
                contains a label that never occurs in ``Train_Y``.
        """
        encoder = LabelEncoder()
        encoded_train = encoder.fit_transform(Train_Y)
        encoded_test = encoder.transform(Test_Y)
        # Kept so callers can map predictions back to the original labels.
        self.label_encoder = encoder
        return encoded_train, encoded_test

    def word_overlap_score(self, Train_X, ideal_answer):
        """Score each example by its token overlap with ``ideal_answer``.

        The score is ``|distinct ideal tokens present in example| /
        |distinct ideal tokens|``.

        Args:
            Train_X: iterable of token lists.
            ideal_answer: token list to compare against.

        Returns:
            A list of floats, one per example. When ``ideal_answer`` has no
            tokens every score is 0.0 (instead of dividing by zero).
        """
        ideal_tokens = set(ideal_answer)
        if not ideal_tokens:
            return [0.0 for _ in Train_X]
        total = len(ideal_tokens)
        return [len(ideal_tokens.intersection(example)) / total for example in Train_X]

    #function for extracting features
    def alignment(self, Train_X, ideal_answer=None):
        """Build the feature matrix for ``Train_X``.

        Args:
            Train_X: iterable of token lists (train, test, or new examples).
            ideal_answer: token list to compare against; when ``None`` the
                stored ``self.ideal_answer`` is used.

        Returns:
            A NumPy array of shape ``(len(Train_X), 1)`` holding the overlap
            score of each example.

        Raises:
            ValueError: if no ideal answer was passed and none has been stored.
        """
        if ideal_answer is None:
            ideal_answer = self.ideal_answer
        if ideal_answer is None:
            raise ValueError(
                "No ideal answer available: pass one explicitly or call "
                "initialize_ideal_answer() first."
            )
        features = self.word_overlap_score(Train_X, ideal_answer)
        return np.array(features).reshape(-1, 1)

    def get_params(self):
        """Return the default SVC settings as ``(C, kernel, degree, gamma, probability)``."""
        C = 1.0
        kernel = 'linear'
        degree = 3
        gamma = 'auto'
        probability = True
        return C, kernel, degree, gamma, probability

    def set_params(self, **params):
        """Create ``self.model`` as an ``svm.SVC`` configured from ``params``.

        Recognised keys are ``C``, ``kernel``, ``degree``, ``gamma`` and
        ``probability``. Any key that is omitted takes its default from
        ``get_params``; other keys are ignored.
        """
        C, kernel, degree, gamma, probability = self.get_params()
        self.model = svm.SVC(
            C=params.get('C', C),
            kernel=params.get('kernel', kernel),
            degree=params.get('degree', degree),
            gamma=params.get('gamma', gamma),
            probability=params.get('probability', probability),
        )

    def train(self, trainFeatures, Train_Y):
        """Fit ``self.model`` on the given features and labels."""
        self.model.fit(trainFeatures, Train_Y)
        print("Triaining complete")

    def predict(self, testFeatures):
        """Return the model's predicted labels for ``testFeatures``."""
        return self.model.predict(testFeatures)

    def score(self, model_predictions, Test_Y):
        """Return the accuracy of ``model_predictions`` against ``Test_Y`` as a percentage."""
        # accuracy_score takes (y_true, y_pred); accuracy itself does not
        # depend on the order, so the result is unchanged.
        return accuracy_score(Test_Y, model_predictions) * 100

    def save(self, filename):
        """Pickle ``self.model`` to ``filename``."""
        # The context manager guarantees the file is flushed and closed.
        with open(filename, 'wb') as handle:
            pickle.dump(self.model, handle)
        print("Model saved successfully!")

    def load(self, filename):
        """Unpickle and return the model stored in ``filename``.

        The loaded model is only returned; ``self.model`` is left untouched.
        """
        # SECURITY: unpickling can execute arbitrary code. Only load files
        # that were produced by save() from a trusted source.
        with open(filename, 'rb') as handle:
            return pickle.load(handle)

    def predict_probabilities(self, sentence):
        """Return ``self.model.predict_proba`` for the feature matrix ``sentence``."""
        return self.model.predict_proba(sentence)
  

In [4]:
# Create the classifier wrapper and read the labelled corpus from the CSV file
# 'dataset.csv' (a relative path).
obj = SVMClassifier()
Corpus = obj.loadDataset('dataset.csv')  

In [5]:
# Turn every raw text in the 'text' column into a list of lemmatized tokens.
preProcessedDataset = obj.preprocessing(Corpus['text'])

In [6]:
# Split the tokenized texts and the 'label' column into training and testing
# parts (25% of the data is held out for testing).
Train_X, Test_X, Train_Y, Test_Y = obj.split(preProcessedDataset, Corpus)

In [7]:
# The first training example becomes the "ideal answer" that every example is
# compared against.
obj.initialize_ideal_answer(Train_X)
# Replace the categorical labels with their integer encoding.
Train_Y, Test_Y = obj.encode_y(Train_Y, Test_Y)
# Passing None makes alignment() fall back to obj.ideal_answer. The result has
# one row per training example and a single overlap-score column.
features = obj.alignment(Train_X, None)

In [8]:
# Fetch the default SVC hyper-parameters and build the model from those values,
# so they are defined in one place (get_params) instead of being repeated here.
C, kernel, degree, gamma, probability = obj.get_params()
obj.set_params(C=C, kernel=kernel, degree=degree, gamma=gamma, probability=probability)
# Fit the SVC on the single overlap-score feature.
obj.train(features, Train_Y)

Triaining complete


In [9]:
# Score the held-out examples against the same ideal answer, predict their
# labels, and report the accuracy as a percentage.
testFeatures = obj.alignment(Test_X, None)
model_predictions = obj.predict(testFeatures)
accuracy = obj.score(model_predictions, Test_Y)
print("Accuracy of the model: ",accuracy)

Accuracy of the model:  56.00000000000001


In [10]:
# Pickle the trained model to a file named 'model'.
obj.save('model')

Model saved successfully!


In [11]:
# Reload the pickled model and install it on obj, so the prediction cells below
# actually exercise the saved model (load() only returns it).
# Only unpickle files you created yourself: pickle can execute arbitrary code.
model = obj.load('model')
obj.model = model

In [12]:
# A single unseen text, wrapped in a list because preprocessing() iterates over
# a collection of texts.
sentence = [' More great playing: Larrys work for the Muse label in the late 80s and early 90s found him exploring the rich catalog of jazz standards that he probably grew up playing. Surrounding himself with the best musicians around, Larry sounds relaxed, focused, and totally in control. Listen to Larry take on My Funny Valentine and tell me it does not stand right up there with Chet Baker or Miles. A very fine album that you will enjoy again and again.']
sent_proc = obj.preprocessing(sentence)
# Same feature extraction as for the train/test sets: overlap with
# obj.ideal_answer, returned as a one-row, one-column array.
sent_features = obj.alignment(sent_proc, None)


In [13]:
# Ask the model for per-class probabilities for the new sentence and print the
# first row (the only input).
# NOTE(review): the columns presumably follow obj.model.classes_, so
# "Class 1" / "Class 2" would correspond to encoded labels 0 / 1 — confirm.
confidence = obj.predict_probabilities(sent_features)
print("Confidence Score for Class 1 = ", confidence[0][0])
print("Confidence Score for Class 2 = ", confidence[0][1])

Confidence Score for Class 1 =  0.4393982033303043
Confidence Score for Class 2 =  0.5606017966696957
