<center>
    <H1> ROCCHIO CLASSIFIER </H1>
    <br>
======================================================================================================================<br>



## STEP 1: IMPORTING LIBRARIES

In [1]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer  
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
import math 
from collections import Counter, defaultdict
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing

## STEP 2: LOADING DATASET

Load the required dataset

In [2]:
#mails_dataset = pd.read_csv('Dataset/trial_spam.csv', encoding = 'latin-1')
mails_dataset = pd.read_csv('Dataset/spam.csv', encoding = 'latin-1')
mails_dataset.head()           #show first 5 rows

Unnamed: 0,sno,v1,v2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 29,Unnamed: 30,Unnamed: 31,Unnamed: 32,Unnamed: 33,Unnamed: 34,Unnamed: 35,Unnamed: 36,Unnamed: 37,Unnamed: 38
0,1,ham,"Go until jurong point, crazy.. Available only ...",,,,,,,,...,,,,,,,,,,
1,2,ham,Ok lar... Joking wif u oni...,,,,,,,,...,,,,,,,,,,
2,3,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,,,,,,...,,,,,,,,,,
3,4,ham,U dun say so early hor... U c already then say...,,,,,,,,...,,,,,,,,,,
4,5,ham,"Nah I don't think he goes to usf, he lives aro...",,,,,,,,...,,,,,,,,,,


## STEP 3: FEATURE SELECTION

Select the relevant features, important for mail classification. We can see that column Unamed are irrelevant for our classifier. Thus, we need to remove them. 

In [3]:
#drop undesirable columns
#drop_list = ['Unnamed: 3', 'Unnamed: 4','Unnamed: 5','Unnamed: 6']
mails_dataset.drop(mails_dataset.columns.difference(['sno','v1','v2']), axis = 1, inplace = True)

mails_dataset.head()

Unnamed: 0,sno,v1,v2
0,1,ham,"Go until jurong point, crazy.. Available only ..."
1,2,ham,Ok lar... Joking wif u oni...
2,3,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,4,ham,U dun say so early hor... U c already then say...
4,5,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
#Rename the columns, to make it easy to read and manipulate
mails_dataset.rename(columns = {'sno': 'DocID', 'v1': 'Label', 'v2': 'Message'}, inplace = True)
mails_dataset.head()

Unnamed: 0,DocID,Label,Message
0,1,ham,"Go until jurong point, crazy.. Available only ..."
1,2,ham,Ok lar... Joking wif u oni...
2,3,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,4,ham,U dun say so early hor... U c already then say...
4,5,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
len(mails_dataset)

5572

In [6]:
mails_dataset['Label'].value_counts()  #count number of each Label

ham     4825
spam     747
Name: Label, dtype: int64

In [7]:
total_mails = mails_dataset.shape[0]            #total number on instances in our dataset
total_mails

5572

## STEP 4: DATA PREPROCESSING

We need to clean our data for further processing. Emails may contain a lot of undesirable characters like punctuation marks, stop words, digits, etc which may not be helpful in detecting the spam email

###  A. Convert to lowercase

In [8]:
#convert the data into lower case
mails_dataset['Message'] =  mails_dataset['Message'].str.lower()
mails_dataset.head()

Unnamed: 0,DocID,Label,Message
0,1,ham,"go until jurong point, crazy.. available only ..."
1,2,ham,ok lar... joking wif u oni...
2,3,spam,free entry in 2 a wkly comp to win fa cup fina...
3,4,ham,u dun say so early hor... u c already then say...
4,5,ham,"nah i don't think he goes to usf, he lives aro..."


### B. Convert categorical values to numbers



In [9]:
'''
    ham : 0
    spam : 1
    
'''
mails_dataset['Label'] = mails_dataset['Label'].map({'ham': 0, 'spam': 1})
mails_dataset.head()

Unnamed: 0,DocID,Label,Message
0,1,0,"go until jurong point, crazy.. available only ..."
1,2,0,ok lar... joking wif u oni...
2,3,1,free entry in 2 a wkly comp to win fa cup fina...
3,4,0,u dun say so early hor... u c already then say...
4,5,0,"nah i don't think he goes to usf, he lives aro..."


### C. Remove digits and punctutations

In [10]:
#remove all digits
mails_dataset['Message'] = mails_dataset['Message'].str.replace('\d+.\d+', '')
mails_dataset.head()

Unnamed: 0,DocID,Label,Message
0,1,0,"go until jurong point, crazy.. available only ..."
1,2,0,ok lar... joking wif u oni...
2,3,1,free entry in 2 a wkly comp to win fa cup fina...
3,4,0,u dun say so early hor... u c already then say...
4,5,0,"nah i don't think he goes to usf, he lives aro..."


In [11]:
'''
     ^   :  Not these characters
     \w  :  Word characters
     \s  :  Space characters

    Replace any character that is not a word character or a space character with nothing/blank.
    
'''
mails_dataset['Message'] = mails_dataset['Message'].str.replace('[^\w\s]', '')
mails_dataset.head()

Unnamed: 0,DocID,Label,Message
0,1,0,go until jurong point crazy available only in ...
1,2,0,ok lar joking wif u oni
2,3,1,free entry in 2 a wkly comp to win fa cup fina...
3,4,0,u dun say so early hor u c already then say
4,5,0,nah i dont think he goes to usf he lives aroun...


In [12]:
sample_mail = mails_dataset.iloc[0]
sample_mail['Message']

'go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat'

### D. Convert all the slang words to corresponding formal words

Slang is the popular informal form of a word or group of words.

In [13]:
#create a dictionary of slang words and their corresponding terms

slang_list = {'u': 'you', 'r': 'are', 'd': "the", 'urs' : 'yours', 'wkly' : 'weekly', 'st' : 'such that', 
              'txt': 'text','comp': 'competition', 'prctc' : 'practice', 'dffrnc': 'difference', 'y': 'why', 
              'f9':'fine', 'tkts': 'tickets', 'csh': 'cash', 'phn': 'phone', 'im': 'i am', 'm': 'am', 
              'spcl': 'special', 'fone': 'phone', 'wks' : 'weeks', 'å': 'a', 'n': 'and', 'wat':'what'}


In [14]:
#replace slang with formal word

sample_mail = mails_dataset.iloc[0]
message = sample_mail['Message']
print(message)

new_message = ' '.join(slang_list[i] if i in slang_list else i for i in message.split())
new_message

go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat


'go until jurong point crazy available only in bugis and great world la e buffet cine there got amore what'

In [15]:
#applying to all rows

def convert_slangs(row):
    message = row['Message']
    new_message = ' '.join(slang_list[i] if i in slang_list else i for i in message.split())
    return new_message

mails_dataset['Message'] = mails_dataset.apply(convert_slangs, axis=1)
mails_dataset.head()

Unnamed: 0,DocID,Label,Message
0,1,0,go until jurong point crazy available only in ...
1,2,0,ok lar joking wif you oni
2,3,1,free entry in 2 a weekly competition to win fa...
3,4,0,you dun say so early hor you c already then say
4,5,0,nah i dont think he goes to usf he lives aroun...


### E. Tokenization

In [16]:
#pick every message and convert it into tokens

def identify_tokens(row):
    message = row['Message']
    tokens = word_tokenize(message)
    token_words = [w for w in tokens if w.isalpha()]
    return token_words

mails_dataset['Tokens'] = mails_dataset.apply(identify_tokens, axis=1)
mails_dataset.head()

Unnamed: 0,DocID,Label,Message,Tokens
0,1,0,go until jurong point crazy available only in ...,"[go, until, jurong, point, crazy, available, o..."
1,2,0,ok lar joking wif you oni,"[ok, lar, joking, wif, you, oni]"
2,3,1,free entry in 2 a weekly competition to win fa...,"[free, entry, in, a, weekly, competition, to, ..."
3,4,0,you dun say so early hor you c already then say,"[you, dun, say, so, early, hor, you, c, alread..."
4,5,0,nah i dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l..."


### F. Stemming / Lemmatization

Both processes reduce the inflectional forms of word into a common base or root. But we are using lemmatization because it takes care of the context, while stemming simply performs crude cutoff...

In [17]:
'''
stemming = PorterStemmer()
sample_mail = mails_dataset.iloc[0]
tokens = sample_mail['Tokens']
stemmed_list = [stemming.stem(word) for word in tokens]
stemmed_list
'''

"\nstemming = PorterStemmer()\nsample_mail = mails_dataset.iloc[0]\ntokens = sample_mail['Tokens']\nstemmed_list = [stemming.stem(word) for word in tokens]\nstemmed_list\n"

In [18]:
lemmatizer = WordNetLemmatizer() 

sample_mail = mails_dataset.iloc[0]
tokens = sample_mail['Tokens']
lemmatize_list = [lemmatizer.lemmatize(word) for word in tokens]
lemmatize_list

['go',
 'until',
 'jurong',
 'point',
 'crazy',
 'available',
 'only',
 'in',
 'bugis',
 'and',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'there',
 'got',
 'amore',
 'what']

In [19]:
def lemmatize_tokens(row):
    tokens = row['Tokens']
    lemmatized_list = [lemmatizer.lemmatize(word) for word in tokens]
    return (lemmatized_list)

mails_dataset['Tokens'] = mails_dataset.apply(lemmatize_tokens, axis=1)
mails_dataset.head()

Unnamed: 0,DocID,Label,Message,Tokens
0,1,0,go until jurong point crazy available only in ...,"[go, until, jurong, point, crazy, available, o..."
1,2,0,ok lar joking wif you oni,"[ok, lar, joking, wif, you, oni]"
2,3,1,free entry in 2 a weekly competition to win fa...,"[free, entry, in, a, weekly, competition, to, ..."
3,4,0,you dun say so early hor you c already then say,"[you, dun, say, so, early, hor, you, c, alread..."
4,5,0,nah i dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, go, to, usf, he, lif..."


### G. Remove stopwords

Stopwords are common words that carry less important meaning than keywords. So, we will remove them.

In [20]:
stop_words = set(stopwords.words('english'))
print(stop_words,)

{'mustn', 'wasn', 'which', 'its', 'below', 'has', 'are', 'should', 'yourselves', 'the', 'some', 'against', 'myself', "you'd", 'a', 'because', 'why', 'his', 'few', 'during', "didn't", 'wouldn', 'same', 'itself', 'only', 'can', 'our', 'nor', 'into', 've', "she's", "mightn't", 's', 'if', 'having', 'doing', "weren't", 'is', 'had', 'and', 'such', 'isn', 'but', 'off', 'to', 'herself', 'not', 'whom', 'will', 'very', 'theirs', 'my', 'with', 'themselves', "shouldn't", 'how', 'before', 'then', 'in', 'that', 'what', 'couldn', "you'll", 'do', 'both', "isn't", 'yours', 'until', 're', 'hasn', 'they', 'between', 'other', 'out', 'just', 'didn', 'hers', 'mightn', 'under', 'these', "wouldn't", 'does', 'were', 'at', 'there', 'on', 'once', 'about', 'weren', 'd', 'over', 'your', 'ourselves', 'aren', 'an', "won't", 'so', 'them', 'll', 'him', 'most', 'than', 'again', 'it', 'won', 'her', 'was', 'down', "mustn't", 'o', 'himself', 'be', 'y', 'ma', 'being', 'for', 'who', 'hadn', 'shan', 'ours', 'by', 'did', 'doe

In [21]:
def remove_stopwords(row):
    tokens = row['Tokens']
    filtered_list = [w for w in tokens if not w in stop_words]
    return (filtered_list)

mails_dataset['Tokens'] = mails_dataset.apply(remove_stopwords , axis=1)
mails_dataset.head()

Unnamed: 0,DocID,Label,Message,Tokens
0,1,0,go until jurong point crazy available only in ...,"[go, jurong, point, crazy, available, bugis, g..."
1,2,0,ok lar joking wif you oni,"[ok, lar, joking, wif, oni]"
2,3,1,free entry in 2 a weekly competition to win fa...,"[free, entry, weekly, competition, win, fa, cu..."
3,4,0,you dun say so early hor you c already then say,"[dun, say, early, hor, c, already, say]"
4,5,0,nah i dont think he goes to usf he lives aroun...,"[nah, dont, think, go, usf, life, around, though]"


### >>>> Now our data is clean and ready for further processing <<<<

## STEP 5: CREATING DOCUMENT TERM MATRIX


### A. Convert each document to a count vector

In [22]:
#list of tokens are combined to create the message

tkn = mails_dataset['Tokens'].iloc[0]
msg = ' '.join(tkn) 
msg

'go jurong point crazy available bugis great world la e buffet cine got amore'

In [23]:
corpus = []                          #list of prepocessed documents, where each term is the text of the document
for row in range(total_mails):
    tkn = mails_dataset['Tokens'].iloc[row]
    msg = ' '.join(tkn)
    corpus.append(msg)

corpus[:5]   #print first 5 instances

['go jurong point crazy available bugis great world la e buffet cine got amore',
 'ok lar joking wif oni',
 'free entry weekly competition win fa cup final ticket may text fa receive entry questionstd text ratetcs apply',
 'dun say early hor c already say',
 'nah dont think go usf life around though']

In [24]:
#document term matrix
vec = CountVectorizer()
X = vec.fit_transform(corpus)
df = pd.DataFrame(X.toarray(), columns = vec.get_feature_names())
df.head()   

Unnamed: 0,aa,aah,aaooooright,aathilove,aathiwhere,ab,abbey,abdomen,abeg,aberdeen,...,åð,åòharry,åòits,åômorrow,åôrents,ìï,ìïll,ûï,ûïharry,ûò
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
vector_dim = len(vec.get_feature_names())   #dimension of each vector
vector_dim

7534

In [26]:
terms = np.array(df.columns.values)
terms

array(['aa', 'aah', 'aaooooright', ..., 'ûï', 'ûïharry', 'ûò'],
      dtype=object)

In [27]:
df_matrix = df.values
df_matrix

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [28]:
df_normalized = preprocessing.normalize(df_matrix, norm='l2')
df_normalized

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [29]:
unit_vectors = pd.DataFrame(data=df_normalized, columns=terms)
unit_vectors.head()

Unnamed: 0,aa,aah,aaooooright,aathilove,aathiwhere,ab,abbey,abdomen,abeg,aberdeen,...,åð,åòharry,åòits,åômorrow,åôrents,ìï,ìïll,ûï,ûïharry,ûò
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
#Create final dtm. column 0 will tell the document number while column 1 will tell the label. 

dtm = pd.concat([mails_dataset,unit_vectors], axis=1, ignore_index=False, sort=False).reset_index(drop=True)
drop_list = ['Message', 'Tokens']
dtm.drop(drop_list, axis = 1, inplace=True)
dtm.head()

Unnamed: 0,DocID,Label,aa,aah,aaooooright,aathilove,aathiwhere,ab,abbey,abdomen,...,åð,åòharry,åòits,åômorrow,åôrents,ìï,ìïll,ûï,ûïharry,ûò
0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### >>>Now every document in the corpus has a vector representation<<<

## STEP 6: CREATING TEST AND TRAIN SETS

We will randomly split our dataset in 80–20 ratio. Where 80% of the total data will be used as training set and rest 20% will be considered as test set. 

In [31]:
X = dtm.drop('Label',axis=1) 
y = dtm['Label']

#random state = 0, will give same split evry time. 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [32]:
test_size, train_size = X_test.shape[0], X_train.shape[0]
print("Number of instance in :\n Training set = ", train_size, "\n Test set = ", test_size)

Number of instance in :
 Training set =  4457 
 Test set =  1115


In [33]:
X_train.head()

Unnamed: 0,DocID,aa,aah,aaooooright,aathilove,aathiwhere,ab,abbey,abdomen,abeg,...,åð,åòharry,åòits,åômorrow,åôrents,ìï,ìïll,ûï,ûïharry,ûò
1114,1115,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3589,3590,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3095,3096,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1012,1013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0
3320,3321,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
#y_train is an array. so in order to concat it with our X variables, we need to convert it into dataframe

labels_train = pd.DataFrame(y_train, columns = ['Label'])
labels_train.head()

Unnamed: 0,Label
1114,0
3589,0
3095,0
1012,0
3320,0


In [35]:
'''
    We have X_train, y_train, X_test, y_test.
    Using these lists and dataframes we will randomly create two non-overlapping datasets 
        1. training set
        2. testing set
'''

#creating training set
train_dtm = pd.concat([X_train, labels_train], axis = 1).reset_index(drop=True)
train_dtm.head()

Unnamed: 0,DocID,aa,aah,aaooooright,aathilove,aathiwhere,ab,abbey,abdomen,abeg,...,åòharry,åòits,åômorrow,åôrents,ìï,ìïll,ûï,ûïharry,ûò,Label
0,1115,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,3590,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,3096,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,1013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0
4,3321,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [36]:
# Our training set is ready, similarly creating test set

labels_test = pd.DataFrame(y_test, columns = ['Label'])
test_dtm = pd.concat([X_test, labels_test], axis = 1).reset_index(drop=True)
test_dtm.head()

Unnamed: 0,DocID,aa,aah,aaooooright,aathilove,aathiwhere,ab,abbey,abdomen,abeg,...,åòharry,åòits,åômorrow,åôrents,ìï,ìïll,ûï,ûïharry,ûò,Label
0,4457,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,691,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,945,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,3769,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,1190,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


## STEP 7: TRAINING CLASSIFIER

Calculate centroids of each class

In [37]:
total_classes = dtm['Label'].nunique()
total_classes      # total number of classes

2

In [38]:
class_instances = np.zeros(total_classes)
for ci in range(total_classes):
    
    for example in range(train_size):
        label = train_dtm['Label'].iloc[example]
        if(label==ci):
            class_instances[ci] = class_instances[ci] + 1     
    

In [39]:
class_instances

array([3876.,  581.])

In [40]:
class_centroids = np.ndarray((total_classes, vector_dim))         #empty array
class_set = {}           #divide the instances into classes

#create set of examples with same class labels and then compute prior prob of each class
for ci in range(total_classes):
    class_set[ci] = train_dtm[train_dtm['Label'] == ci].drop(['DocID','Label'],axis=1)     
    
    #count number of instances in the class
    count_instances = len(class_set[ci])
    
    #mean of each column in the separated dataset
    class_centroids[ci] = class_set[ci].mean(axis = 0)    #axis=0: column wise
    
print(class_centroids)    

[[6.66147806e-05 2.06788273e-04 1.82432090e-04 ... 0.00000000e+00
  0.00000000e+00 4.54907605e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 4.17445138e-04
  4.44404285e-04 0.00000000e+00]]


## STEP 8: TESTING CLASSIFIER

Predicting the output class for the every instance in the test data

### A. Compute distance / similarity

Calculate distance between test data and each example of training data using Euclidean distance.


In [41]:
#train_dtm.iloc[2][1:-1]

In [42]:
'''
    Function to calculate Euclidean distance between two vectors. 
      -  One is test vector(whose class is to be predicted)
      -  Other is given example(whose class is known)

'''

def euclideanDistance(test_case, data, vector_dim):
    distance = 0        
    i =  1                      #starting from position 1, as position 0 contains DocID
    for i in range(vector_dim):          
        distance += pow((test_case[i] - data[i]), 2)          #formula for Euclidean Distance
    return math.sqrt(distance)

In [43]:
d1 = test_dtm.iloc[2]
d2 = train_dtm.iloc[5]
distance = euclideanDistance(d1, class_centroids[0], vector_dim)
distance

945.0004659877225

###  Predict the class


In [44]:
import operator

def predictClass(test_case):
    distances = []
    
    for ci in range(total_classes):
        centroid = class_centroids[ci]
        dist = euclideanDistance(test_case, centroid, vector_dim)
        distances.append((ci, dist))  #store (Class,Distance from test_case)
        
    dis = dict(distances)                                           # key : DocID, value : Distance       
    sorted_distances = sorted(dis.items(), key=operator.itemgetter(1))     #itemgetter(1): sort on values
     
    pclass = sorted_distances[0][0]
    return pclass           

In [45]:
data = train_dtm.iloc[3]
#data

In [46]:
predictClass(data)

0

In [47]:
'''
    Determine the distance of each test case against all classes. Select top k neighbors with 
    minimun distance from test cases and assign the label which has maximum number of nereast neighbors.

'''       
predictions = []                       #to store prediction of each test example\

for row in range(test_size): 
    test_case = test_dtm.iloc[row]

    #predict the class label for each example and append to the predictions list
    predictions.append(predictClass(test_case))

#predictions

<b> Testing is over ! </b>
<br>We have predicted labels for each sample in the test set.

## STEP 8: ACCURACY OF THE CLASSIFIER

Accuracy is the fraction of correct predictions our model out of total predictions. 
Formally, accuracy has the following definition:
<br><br>
<center><b> Accuracy = (Number of correct predictions) / (Number of total predictions) 


In [48]:
#calculate accuracy
predict_labels = np.array(predictions)
actual_labels = np.array(test_dtm['Label'])

test_accuracy = np.sum(predict_labels == actual_labels)/float(test_size) 

print ("******* Test Set Examples ******* : ", test_size)
print ("******* Test Set Accuracy ******* : ", (test_accuracy*100) ,"%") 

******* Test Set Examples ******* :  1115
******* Test Set Accuracy ******* :  85.1121076233184 %
