In [1]:
## Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import sqlite3    ## SQL Interface
import pickle     ## Used to save your data - Converts objects to byte stream and vice versa

from sklearn.feature_extraction.text import CountVectorizer  ## BOW Model
from sklearn.feature_extraction.text import TfidfVectorizer  ## TFIDF Model

import gensim    ## To build Word2Vec model
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression



In [2]:
from sklearn.naive_bayes import MultinomialNB    #Multinomial Naive - Bayes

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from matplotlib.colors import ListedColormap
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix


In [4]:
conn = sqlite3.connect('final.sqlite')  #Loading the sqlite file for future use
final = pd.read_sql_query("""SELECT * FROM Reviews""", conn)
conn.close()
final.drop(['index'],axis=1,inplace = True)
final.head()

Unnamed: 0,SEM 1 SGPA,SEM 1 KT,SEM 2 SGPA,SEM 2 KT,SEM 3 SGPA,SEM 3 KT,SEM 4 SGPA,SEM 4 KT,SEM 5 SGPA,SEM 5 KT,...,2_hrs_lect,Submissions,5_hrs_lect,5_hrs_pracs,Coaching_classes,Teacher's Feedback,Label,Scocial_Skills,Average pointer,Cleaned_Feedback
0,7.1,0,6.85,0,7.2,1.0,7.3,0.0,7.6,0,...,0,0,0,0,0,Disciplined and hard working,1,0,7.13125,disciplin hard work
1,7.2,0,7.1,0,6.11,0.0,6.67,0.0,7.14,0,...,1,1,0,0,1,Good leadership skills,1,1,7.05375,good leadership skill
2,3.8,4,4.2,2,4.2,1.0,3.8,2.0,4.35,0,...,0,1,0,0,0,Very talkative and Poor attendance,0,0,4.05875,talkat poor attend
3,6.91,1,6.6,2,5.82,1.0,6.1,0.0,7.71,0,...,0,1,0,1,0,Respectful to Authority and Others,1,0,6.505,respect author other
4,3.5,1,3.4,2,4.0,2.0,4.1,2.0,3.63,0,...,0,0,0,0,0,Bunk lectuer,0,0,3.815,bunk lectuer


In [7]:
final.shape

(262, 32)

In [28]:
final.columns

Index(['SEM 1 SGPA', 'SEM 1 KT', 'SEM 2 SGPA', 'SEM 2 KT', 'SEM 3 SGPA',
       'SEM 3 KT', 'SEM 4 SGPA', 'SEM 4 KT', 'SEM 5 SGPA', 'SEM 5 KT',
       'SEM 6 SGPA', 'SEM 6 KT', 'SEM 7 SGPA', 'SEM 7 KT', 'SEM 8 SGPA',
       'Hours_On_Assignment', 'Hours_On_Studies', 'Travel_Time', 'Attendance',
       'Internet_Availability', 'Internet_Speed', 'Mode_Of_Transportation',
       '2_hrs_lect', 'Submissions', '5_hrs_lect', '5_hrs_pracs',
       'Coaching_classes', 'Teacher's Feedback', 'Label', 'Scocial_Skills',
       'Average pointer', 'Cleaned_Feedback'],
      dtype='object')

In [5]:
bow_vect = CountVectorizer()
bow = bow_vect.fit_transform(final["Cleaned_Feedback"].values)

In [6]:
bow.shape

(262, 40)

In [9]:
tf_idf_vect = TfidfVectorizer()
tf_idf = tf_idf_vect.fit_transform(final["Cleaned_Feedback"].values)
tf_idf.shape

(262, 40)

In [10]:
with open('list_of_sent_for_input_to_w2v.pkl','rb') as pickle_file:  #reading the pickle file saved earlier
    list_of_sent = pickle.load(pickle_file)

In [11]:
w2v_model = gensim.models.Word2Vec(list_of_sent, min_count = 5, size = 50, workers = -1) #Creating w2v model

In [12]:
words = list(w2v_model.wv.vocab)
print(len(words))

39


In [13]:
'''
This is a function to calculate average w2v
'''
def cal_avg_w2v(list_of_sent, w2v_model):
    sent_vectors = []   #Creating an empty list
    for sent in list_of_sent:
        sent_vec = np.zeros(50)  #Initializing the sentence vector to 0 
        cnt_words = 0
        for word in sent:
            try:
                vec = w2v_model.wv[word]  #Calculating the word vector using w2v
                sent_vec += vec   #Add the word vector into sentence vector
                cnt_words +=1     #Sum of all the word counts
            except:
                pass
        sent_vec /= cnt_words     #To find the sentence vector
        sent_vectors.append(sent_vec)  #Adding sentence vector to final list
    return sent_vectors

In [14]:
w2v_model = gensim.models.Word2Vec(list_of_sent, min_count = 5, size = 50, workers = 4)

In [15]:
sent_vectors = cal_avg_w2v(list_of_sent, w2v_model)  #calling the above function

  app.launch_new_instance()


In [16]:
sent_vectors[0]

array([  2.30403415e-03,   2.99065219e-03,  -4.07876757e-04,
        -2.60470249e-03,   2.96582204e-03,  -5.68035788e-03,
        -1.27553971e-03,   1.08485044e-03,   6.24432244e-03,
         2.21919455e-03,   8.80717494e-04,  -5.45348429e-04,
        -3.34404642e-04,  -1.84056008e-03,   4.31437821e-04,
        -3.66837489e-03,  -2.84070460e-04,  -1.40320871e-03,
        -4.94369461e-04,   4.34208937e-03,   3.32159669e-03,
         1.28108117e-03,   1.92534660e-03,  -4.71431145e-03,
        -2.38795796e-03,  -8.33113270e-04,  -5.43230609e-03,
         1.28510781e-03,   4.76304915e-03,  -1.79629036e-03,
        -1.59521702e-03,  -1.34277623e-03,  -7.62412131e-04,
         5.45015753e-04,   4.20565453e-03,   3.90751435e-03,
         2.54688707e-03,   4.48280751e-03,   1.32052880e-03,
        -3.03055998e-03,  -8.09190624e-03,  -6.57206401e-04,
        -3.31447880e-03,  -1.49026814e-03,  -1.86165748e-03,
         1.29965624e-03,  -2.51259189e-05,   1.05622504e-03,
        -2.46583282e-03,

In [17]:
with open('sent_vec_avg_w2v.pkl','wb') as pickle_file:  #Saving as pickle file
    pickle.dump(sent_vectors,pickle_file)

In [18]:
with open('sent_vec_avg_w2v.pkl','rb') as pickle_file:  #Loading for future use
    sent_vectors = pickle.load(pickle_file)

In [19]:
sent_vec_array= np.array(sent_vectors)    #Converting the list into numpy array

In [20]:
sent_vec_array = sent_vec_array[~np.isnan(sent_vec_array).any(axis=1)]   #Removing any NaN value if present in the numpy array

In [22]:
'''
This function calculates TF - IDF weighted average W2V
It takes in 4 parameters as follows:
       1. list_of_sent - This is the list of sentences/reviews for which sentence vetors are to be constructed
       2. w2v_model - This is the Word2Vec model which is trained on the working corpus - contains the word vectors
       3. tf_idf - This is the TF-IDF model built using the same reviews/sentences - it is the TF-IDF sparse matrix
       4. tfidf_feat - This is the feature vector constructed from the TF-IDF model
       
       Return Value:
       tfidf_sent_vectors - This is a list of sentence/review vectors constructed by using tfidf weighted average on the word vectors
'''
def cal_tfidf_avg_w2v(list_of_sent, w2v_model, tf_idf,tfidf_feat):
    tfidf_sent_vectors = []
    row = 0
    for sent in list_of_sent:
        sent_vec = np.zeros(50)
        weighted_sum = 0
        for word in sent:
            try:
                vec = w2v_model.wv[word]
                tfidf = tf_idf[row, tfidf_feat.index(word)]
                sent_vec += vec*tfidf
                weighted_sum += tfidf
            except:
                pass
        print(row, weighted_sum)
        sent_vec /= weighted_sum
        tfidf_sent_vectors.append(sent_vec)
        row +=1
    return tfidf_sent_vectors

In [23]:
tf_idf_vect = TfidfVectorizer()
tf_idf = tf_idf_vect.fit_transform(final['Cleaned_Feedback'].values)
tfidf_feat = tf_idf_vect.get_feature_names()
w2v_model = gensim.models.Word2Vec(list_of_sent, min_count = 5, size = 50, workers = -1)

In [24]:
tfidf_sent_vectors = cal_tfidf_avg_w2v(list_of_sent, w2v_model, tf_idf, tfidf_feat)  #calling the above function for 60k points. It prints the row and weighted sum just to check the progress.

0 1.73200075593
1 1.72538741739
2 1.72725051071
3 1.73205080757
4 1.41421356237
5 1.98249498453
6 1.41330405095
7 1.0
8 1.40643857653
9 1.41330405095
10 1.41421356237
11 1.73147774933
12 0
13 1.41330405095
14 1.73135637234
15 1.73200075593
16 1.41421356237
17 1.41421356237
18 1.99996673885
19 1.41330405095
20 1.73147774933
21 1.39275051067
22 1.99996673885
23 1.0
24 1.99226947089
25 1.41421356237
26 1.98249498453
27 1.41421356237
28 0
29 1.73205080757
30 1.99982939022
31 1.72538741739
32 1.99982939022
33 0
34 1.41330405095
35 1.41421356237
36 0
37 1.40643857653
38 1.41421356237
39 1.73200075593
40 1.40643857653
41 1.73205080757
42 1.72538741739
43 1.98249498453
44 1.99996673885
45 1.40643857653
46 1.73135637234
47 1.41330405095
48 1.41421356237
49 1.73200075593
50 1.72538741739
51 1.41421356237
52 1.73200075593
53 1.41421356237
54 1.41421356237
55 1.99226947089
56 1.99996673885
57 1.0
58 1.41421356237
59 1.0
60 1.0
61 1.73205080757
62 0
63 1.98249498453
64 1.41421356237
65 1.4133040509



In [25]:
with open('tfidf_sent_vectors.pkl','wb') as pickle_file:  #Saving as pickle file
    pickle.dump(sent_vectors,pickle_file)

In [26]:
with open('tfidf_sent_vectors.pkl','rb') as pickle_file:  #Loading the pickle file for future use
    sent_vectors = pickle.load(pickle_file)

In [27]:
tfidf_sent_vectors[0]

array([  2.24001398e-03,   3.05507950e-03,  -2.88507691e-04,
        -2.50458530e-03,   2.96255464e-03,  -5.66456518e-03,
        -1.30653611e-03,   1.10560605e-03,   6.22518173e-03,
         2.15100523e-03,   9.32937363e-04,  -5.84131636e-04,
        -2.32312924e-04,  -1.86356233e-03,   4.25988413e-04,
        -3.63399006e-03,  -3.58976974e-04,  -1.42175631e-03,
        -4.74543058e-04,   4.32783022e-03,   3.33544129e-03,
         1.20418174e-03,   1.96441605e-03,  -4.65783456e-03,
        -2.35986955e-03,  -8.27816608e-04,  -5.41105620e-03,
         1.15432328e-03,   4.66646050e-03,  -1.75244841e-03,
        -1.53787827e-03,  -1.29660815e-03,  -7.79387906e-04,
         5.77305110e-04,   4.12708066e-03,   3.84912091e-03,
         2.62544155e-03,   4.45707147e-03,   1.37847900e-03,
        -2.96967882e-03,  -7.99794138e-03,  -6.67480937e-04,
        -3.21507721e-03,  -1.55385273e-03,  -1.83397876e-03,
         1.33445688e-03,  -4.36830829e-05,   1.03907939e-03,
        -2.36499344e-03,

# KNN


In [57]:
X = final.iloc[:,:27].values
X.shape

(262, 27)

In [59]:
a = bow.toarray()
X = np.append(X,a, axis = 1)

In [75]:
X.shape

(262, 67)

In [77]:
Y = pd.DataFrame(X)

In [79]:
import yaml


Y['Label'] = yaml

In [80]:
Y.shape

(262, 68)

In [81]:
Y.dropna(axis = 0, inplace = True)

In [88]:
Y.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,58,59,60,61,62,63,64,65,66,Label
0,7.1,0.0,6.85,0.0,7.2,1.0,7.3,0.0,7.6,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
1,7.2,0.0,7.1,0.0,6.11,0.0,6.67,0.0,7.14,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,3.8,4.0,4.2,2.0,4.2,1.0,3.8,2.0,4.35,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
3,6.91,1.0,6.6,2.0,5.82,1.0,6.1,0.0,7.71,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,3.5,1.0,3.4,2.0,4.0,2.0,4.1,2.0,3.63,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [82]:
Y.shape

(251, 68)

In [92]:
X = Y.iloc[:,:67].values
y = Y['Label'].values

In [94]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, shuffle = False) #Splitting X and Y as 70 % training and 30 % testing with shuffle set to false

In [96]:
#Column Standardization
s = StandardScaler()
X = s.fit_transform(X)
X[:5,:]

array([[ 0.63075355, -0.84681912,  0.48494022, -0.93615748,  0.77631603,
         0.19151807,  0.77955707, -0.64285537,  0.87674287, -0.06324555,
         0.48795271, -0.06324555,  0.62135746,  0.        ,  0.50470883,
         3.07544035, -0.33259505, -0.58066957, -0.31525441,  0.1928473 ,
         0.        , -0.75612208, -0.62804812, -1.7648666 , -0.70288521,
        -0.82056891, -0.35434522, -0.15649216, -0.3099514 , -0.1928473 ,
        -0.20370021, -0.3099514 , -0.14256649, -0.25210974, -0.29424494,
        -0.15649216,  3.96652661, -0.21408721, -0.27794463, -0.22407411,
        -0.30216609, -0.30216609, -0.26953585, -0.36843806,  3.83242743,
        -0.23371318, -0.14256649, -0.15649216, -0.24304676, -0.21408721,
        -0.29424494, -0.30216609, -0.23371318, -0.3099514 , -0.34718254,
        -0.1928473 , -0.27794463, -0.3099514 , -0.20370021, -0.33993463,
        -0.22407411, -0.22407411, -0.06324555, -0.27794463, -0.3099514 ,
        -0.27794463,  3.83242743],
       [ 0.68653

In [97]:
#5 fold CV
neighbors = list(range(1,20,2))
cv_scores = []

for k in neighbors:
    print(k)
    knn = KNeighborsClassifier(n_neighbors = k)
    scores = cross_val_score(knn, X_train, y_train, cv = 5, scoring = 'accuracy')
    cv_scores.append(scores.mean())
    
MSE = [1 - x for x in cv_scores]

optimal_k = neighbors[MSE.index(min(MSE))]
print('\nThe optimal number of neighbors is %d.' % optimal_k)

1
3
5
7
9
11
13
15
17
19

The optimal number of neighbors is 1.


In [99]:
classifier = KNeighborsClassifier(n_neighbors = optimal_k)    ## Change the values of n_neighbors to the optimal value of k
classifier.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')

In [100]:
Y_pred = classifier.predict(X_test)  

In [103]:
acc = accuracy_score(y_test, Y_pred, normalize=True) * float(100)   #Calculating accuracy
acc  

100.0

# Logistic Regression

In [105]:
# Values for the hyperparameter 'C':
tuned_params = [{'C': [0.0001,0.001,0.01,0.1,1,10,100,1000,10000]}]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, shuffle = False)

In [106]:
# Grid Search
model = GridSearchCV(LogisticRegression(), tuned_params, scoring = 'accuracy')
model.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [107]:
print(model.best_estimator_)
print(model.score(X_test, y_test))

LogisticRegression(C=0.0001, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
0.986842105263


In [108]:
clf = LogisticRegression(C = 0.0001)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)*float(100)
acc

98.68421052631578

In [111]:
confusion_matrix(y_test, y_pred).T

array([[25,  1],
       [ 0, 50]], dtype=int64)