### 5.2.1 Validation

In [170]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import pandas as pd

from sklearn.model_selection import train_test_split

In [171]:
raw_data = pd.read_csv('IMDB-Dataset.csv')

In [172]:
# iloc : row, col
review_docs = raw_data.iloc[:, 0]

In [173]:
senti = raw_data.iloc[:, 1]

In [174]:
tfidf = TfidfVectorizer(min_df = 20, max_df = 30000).fit_transform(review_docs)
tfidf

<50000x17890 sparse matrix of type '<class 'numpy.float64'>'
	with 5894889 stored elements in Compressed Sparse Row format>

In [175]:
# train_size=0.8 : want to set 80% of data for training
# shuffle=True : important when dataset has last instances as of same level i.e. when dataset is ordered, doesn't give good chance to verify performance of model, mixes instances
training_docs, testing_docs, training_senti, testing_senti = train_test_split(tfidf, senti, train_size = 0.8, shuffle=True)

# tfidf

In [176]:
training_docs, len(training_senti)

(<40000x17890 sparse matrix of type '<class 'numpy.float64'>'
 	with 4723243 stored elements in Compressed Sparse Row format>,
 40000)

In [177]:
# clf = MultinomialNB().fit(tfidf[:-10], senti[:-10])
clf = MultinomialNB().fit(training_docs, training_senti)

In [178]:
# clf.predict(tfidf[-10:])
predicted_senti = clf.predict(testing_docs)

In [179]:
from sklearn.metrics import accuracy_score

In [180]:
acc = accuracy_score(testing_senti, predicted_senti)

In [182]:
print('Accuracy score is:',acc)

Accuracy score is: 0.8601


### 5.2.2 K-fold Cross Validation

In [52]:
from sklearn.model_selection import KFold

In [74]:
# 5-fold cross validation, dataset to be split in 5 parts, process will repeat 5 times
kf = KFold(n_splits = 5)

# x = [1,2,3,4,5,6]
# labels = [0,0,0,1,1,1]

acc = []

# loop will run 5 times
for train_ids, test_ids in kf.split(tfidf, senti):
    
    train_docs, test_docs = tfidf[train_ids], tfidf[test_ids]
    train_senti, test_senti = senti[train_ids], senti[test_ids]
    
    clf = MultinomialNB().fit(train_docs, train_senti)
    
    predicted_senti = clf.predict(test_docs)
    
    acc.append(accuracy_score(test_senti, predicted_senti))

In [76]:
total_acc = sum(acc)/len(acc)
print('Accuracy score:',total_acc)

Accuracy score: 0.8603


### 5.2.3 Leave-one-out Validation

In [77]:
from sklearn.model_selection import LeaveOneOut

In [93]:
# automatically does N split so n_splits is not set
# we use when we don't have enough data to train the model
loo = LeaveOneOut()

x = [1,2,3,4,5,6,7]
labels = [0,0,0,1,1,1,1]

acc = []

# loop will run N times
# will take too much time when run on 50k dataset
for train_ids, test_ids in loo.split(x):
    
    print('train ids:',train_ids, 'test ids:',test_ids)
    
#     train_docs, test_docs = tfidf[train_ids], tfidf[test_ids]
#     train_senti, test_senti = senti[train_ids], senti[test_ids]
    
#     clf = MultinomialNB().fit(train_docs, train_senti)
    
#     predicted_senti = clf.predict(test_docs)
    
#     acc.append(accuracy_score(test_senti, predicted_senti))

train ids: [1 2 3 4 5 6] test ids: [0]
train ids: [0 2 3 4 5 6] test ids: [1]
train ids: [0 1 3 4 5 6] test ids: [2]
train ids: [0 1 2 4 5 6] test ids: [3]
train ids: [0 1 2 3 5 6] test ids: [4]
train ids: [0 1 2 3 4 6] test ids: [5]
train ids: [0 1 2 3 4 5] test ids: [6]


### 5.2.4 Predictive Accuracy of KNN using KFold

In [204]:
corpus = open('dataset3.txt').read()

In [205]:
docs_list = corpus.split('\n')

In [206]:
docs = []
for i in docs_list:
    if i != '':
        docs.append(i)

In [216]:
x, y = [], []

for doc in docs:
    i, l = doc.split(':')
    x.append(i.strip())
    y.append(l.strip())

In [152]:
from sklearn.feature_extraction.text import CountVectorizer

In [153]:
vec = CountVectorizer()

In [154]:
matrix_x = vec.fit_transform(x)

In [155]:
from sklearn.model_selection import KFold

In [156]:
kf = KFold(n_splits = 3)

In [157]:
from sklearn.neighbors import KNeighborsClassifier

In [158]:
knn = KNeighborsClassifier(n_neighbors=1)

In [159]:
import numpy as np

y = np.array(y)

In [169]:
# apply validation and get split
score = 0

from sklearn.metrics import accuracy_score

for train_ids, test_ids in kf.split(matrix_x):
    train_x, test_x = matrix_x[train_ids], matrix_x[test_ids]
    train_y, test_y = y[train_ids], y[test_ids]
    
    knn.fit(train_x, train_y)
    
    predicted_y = knn.predict(test_x)
    
    score += accuracy_score(test_y, predicted_y, normalize=True)
    
print(score/3)

1.0


### 5.2.5 Precision, Recall and F1-measure

In [183]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [197]:
# switch off normalization for accuracy score
# normalize=False : gives correct number of instances identifies
# normalize=True : correct no. of instances / total no. of instances

accuracy = accuracy_score(testing_senti, predicted_senti, normalize=True)
print('Accuracy score:',accuracy)


# average='macro' : calculate precision for each label separately and avg the two, use when almost equal no. of representation of all the labels in dataset
# average='micro' : average the precision at the instance level for all labels
# average='weighted' : weights are assigned based on the instances

precision = precision_score(testing_senti, predicted_senti, average='weighted')
print('Precision score:',precision)


recall = recall_score(testing_senti, predicted_senti, average='weighted')
print('Recall score:',recall)


f1 = f1_score(testing_senti, predicted_senti, average='weighted')
print('F1 score:',f1)

# average scheme should be same for all the models so we can pick a model with better score

Accuracy score: 0.8601
Precision score: 0.8604386139744583
Recall score: 0.8601
F1 score: 0.8600624745840181


### 5.2.6 Confusion Matrix

In [198]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [404]:
# 4387 correctly classified
# 622 misclassfied as -ve instances, were actually +ve
# 777 wrongly classified as +ve, were actually -ve
# 4214 correctly classified

cm = confusion_matrix(testing_senti, predicted_senti)

In [405]:
cm

array([[4387,  622],
       [ 777, 4214]])

### 5.2.7 Putting it all together

In [222]:
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

In [223]:
kf = KFold(n_splits = 4)
nb = MultinomialNB()
dt = DecisionTreeClassifier(max_depth = 3)

In [224]:
corpus = open('dataset3.txt').read()

In [225]:
docs_list = corpus.split('\n')

In [226]:
docs = []
for i in docs_list:
    if i != '':
        docs.append(i)

In [227]:
x, y = [], []

for doc in docs:
    i, l = doc.split(':')
    x.append(i.strip())
    y.append(l.strip())

In [228]:
from sklearn.feature_extraction.text import CountVectorizer

In [229]:
vec = CountVectorizer()

In [230]:
matrix_x = vec.fit_transform(x)

In [231]:
import numpy as np

y = np.array(y)

In [234]:
f1_nb_score = 0
f1_dt_score = 0

for train_ids, test_ids in kf.split(matrix_x):
    train_x, test_x = matrix_x[train_ids], matrix_x[test_ids]
    train_y, test_y = y[train_ids], y[test_ids]
    
    nb.fit(train_x, train_y)
    dt.fit(train_x, train_y)
    
    predicted_nb_y = nb.predict(test_x)
    predicted_dt_y = dt.predict(test_x)
    
    f1_nb_score += f1_score(test_y, predicted_nb_y, average='micro')
    f1_dt_score += f1_score(test_y, predicted_dt_y, average='micro')
    
print('NB score:',f1_nb_score/4)
print('DT score:',f1_dt_score/4)

# NB has scored better

NB score: 1.0
DT score: 0.875


### 5.2.8 Implementing Clustering Evaluation

In [236]:
corpus = open('dataset2.csv', encoding='latin-1').read()

In [241]:
docs = corpus.split('\n')
docs.remove(docs[0])

In [244]:
from sklearn.feature_extraction.text import CountVectorizer

In [245]:
vec = CountVectorizer()

In [246]:
matrix_x = vec.fit_transform(docs)
matrix_x

<652x2457 sparse matrix of type '<class 'numpy.int64'>'
	with 14788 stored elements in Compressed Sparse Row format>

In [247]:
from sklearn.cluster import KMeans

In [248]:
km = KMeans(n_clusters=2)

In [249]:
km.fit(matrix_x)

KMeans(n_clusters=2)

In [257]:
from sklearn.metrics import davies_bouldin_score, silhouette_score

In [261]:
# input data should be in array so matrix_x.toarray() or numpy array
# labels are the list of attributes that are assigned by classifier
# lower value : objects/docs are close to its centroid, are being clustered well

print(davies_bouldin_score(matrix_x.toarray(), km.labels_))

# range [-1, 1] 
print(silhouette_score(matrix_x.toarray(), km.labels_))

1.538962591186229
0.5333093440770192


### Assignment

In [263]:
corpus = open('sentiDataset.txt').read()

In [285]:
docs_list = corpus.split('\n')

In [340]:
# to get Rating and Reviews in a separate list
ratings = []
reviews = []

for document in docs_list:

    d = document.split('\t')
    ratings.append(d[2])    
    reviews.append(d[3])

In [267]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [352]:
tfidf = TfidfVectorizer(max_features=200, ngram_range=(1,3)).fit_transform(reviews)

In [281]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB

In [357]:
knn1 = KNeighborsClassifier(n_neighbors = 1, weights = 'distance')
knn2 = KNeighborsClassifier(n_neighbors = 2, algorithm = 'auto')

In [358]:
dt1 = DecisionTreeClassifier(max_depth = 2)
dt2 = DecisionTreeClassifier(max_depth = 3)

In [359]:
nb = MultinomialNB()

In [368]:
from sklearn.model_selection import KFold

In [369]:
kf = KFold(n_splits = 5)

In [370]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

In [372]:
import numpy as np
ratings = np.array(ratings)

In [406]:
p_score1, p_score2, p_score3, p_score4, p_score5 = [],[],[],[],[]
r_score1, r_score2, r_score3, r_score4, r_score5 = [],[],[],[],[]
f1_score1, f1_score2, f1_score3, f1_score4, f1_score5 = [],[],[],[],[]

for train_ids, test_ids in kf.split(tfidf, ratings):
    train_reviews, test_reviews = tfidf[train_ids], tfidf[test_ids]
    train_ratings, test_ratings = ratings[train_ids], ratings[test_ids]
    
    k1 = knn1.fit(train_reviews, train_ratings)
    k2 = knn2.fit(train_reviews, train_ratings)
    
    d1 = dt1.fit(train_reviews, train_ratings)
    d2 = dt2.fit(train_reviews, train_ratings)
    
    n = nb.fit(train_reviews, train_ratings)
    
    predicted_ratings1 = k1.predict(test_reviews)
    predicted_ratings2 = k2.predict(test_reviews)
    predicted_ratings3 = d1.predict(test_reviews)
    predicted_ratings4 = d2.predict(test_reviews)
    predicted_ratings5 = n.predict(test_reviews)
    
    p_score1.append(precision_score(test_ratings, predicted_ratings1, average='weighted'))
    r_score1.append(recall_score(test_ratings, predicted_ratings1, average='weighted'))
    f1_score1.append(f1_score(test_ratings, predicted_ratings1, average='micro'))
    cm1 = confusion_matrix(test_ratings, predicted_ratings1)
    
    p_score2.append(precision_score(test_ratings, predicted_ratings2, average='weighted'))
    r_score2.append(recall_score(test_ratings, predicted_ratings2, average='weighted'))
    f1_score2.append(f1_score(test_ratings, predicted_ratings2, average='micro'))
    cm2 = confusion_matrix(test_ratings, predicted_ratings2)
    
    p_score3.append(precision_score(test_ratings, predicted_ratings3, average='weighted'))
    r_score3.append(recall_score(test_ratings, predicted_ratings3, average='weighted'))
    f1_score3.append(f1_score(test_ratings, predicted_ratings3, average='micro'))
    cm3 = confusion_matrix(test_ratings, predicted_ratings3)
    
    p_score4.append(precision_score(test_ratings, predicted_ratings4, average='weighted'))
    r_score4.append(recall_score(test_ratings, predicted_ratings4, average='weighted'))
    f1_score4.append(f1_score(test_ratings, predicted_ratings4, average='micro'))
    cm4 = confusion_matrix(test_ratings, predicted_ratings4)
    
    p_score5.append(precision_score(test_ratings, predicted_ratings5, average='weighted'))
    r_score5.append(recall_score(test_ratings, predicted_ratings5, average='weighted'))
    f1_score5.append(f1_score(test_ratings, predicted_ratings5, average='micro'))
    cm5 = confusion_matrix(test_ratings, predicted_ratings5)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [408]:
print('Scores for knn1 --- ','Precision:',sum(p_score1)/len(p_score1), 'Recall:',sum(r_score1)/len(r_score1),'F1:',sum(f1_score1)/len(f1_score1))
print('Scores for knn2 --- ','Precision:',sum(p_score2)/len(p_score2), 'Recall:',sum(r_score2)/len(r_score2),'F1:',sum(f1_score2)/len(f1_score2))
print('Scores for dt1  --- ','Precision:',sum(p_score3)/len(p_score3), '       Recall:',sum(r_score3)/len(r_score3),'F1:',sum(f1_score3)/len(f1_score3))
print('Scores for dt2  --- ','Precision:',sum(p_score4)/len(p_score4), ' Recall:',sum(r_score4)/len(r_score4),' F1:',sum(f1_score4)/len(f1_score4))
print('Scores for nb   --- ','Precision:',sum(p_score5)/len(p_score5), ' Recall:',sum(r_score5)/len(r_score5),' F1:',sum(f1_score5)/len(f1_score5))

Scores for knn1 ---  Precision: 0.41496297851160635 Recall: 0.40515 F1: 0.40515
Scores for knn2 ---  Precision: 0.43362257399079773 Recall: 0.32415 F1: 0.32415
Scores for dt1  ---  Precision: 0.3059755125        Recall: 0.55275 F1: 0.55275
Scores for dt2  ---  Precision: 0.3138672879089306  Recall: 0.5516  F1: 0.5516
Scores for nb   ---  Precision: 0.3344454708777209  Recall: 0.5529  F1: 0.5529


In [None]:
# nb model is doing better than other models as it has the highest F1 and Recall score.

In [414]:
print('Confusion matrix for knn1:\n',cm1,'\n')
print('Confusion matrix for knn2:\n',cm2,'\n')
print('Confusion matrix for dt1:\n',cm3,'\n')
print('Confusion matrix for dt2:\n',cm4,'\n')
print('Confusion matrix for nb:\n',cm5,'\n')

Confusion matrix for knn1:
 [[  66   27   41  116  150]
 [  34    7   40   84   90]
 [  26   19   41   89  130]
 [  75   31   72  220  335]
 [ 167   70  172  605 1293]] 

Confusion matrix for knn2:
 [[105  36  59 114  86]
 [ 53  20  55  84  43]
 [ 56  29  62  94  64]
 [129  56 110 236 202]
 [305 122 285 758 837]] 

Confusion matrix for dt1:
 [[   0    0    0    0  400]
 [   0    0    0    0  255]
 [   0    0    0    0  305]
 [   0    0    0    0  733]
 [   0    0    0    0 2307]] 

Confusion matrix for dt2:
 [[   0    0    0    0  400]
 [   0    0    0    0  255]
 [   0    0    0    0  305]
 [   0    0    0    0  733]
 [   0    0    0    0 2307]] 

Confusion matrix for nb:
 [[   6    0    0    0  394]
 [   3    0    0    0  252]
 [   0    0    0    0  305]
 [   1    0    0    0  732]
 [   5    0    0    0 2302]] 

