In [None]:
import pandas as pd 
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

In [None]:
def dataset(path, isTest=False, train_size=500, test_size=200):
    """Load a CSV and return a fixed-size slice of rows from every topic.

    The file is grouped by its 'Topic' column. From each group either the
    leading rows (training split) or the rows immediately following them
    (test split) are taken, and the per-topic slices are stacked into one
    frame with a fresh 0..n-1 index.

    Parameters
    ----------
    path : str or file-like
        Anything accepted by ``pd.read_csv``. The data must contain a
        'Topic' column.
    isTest : bool, default False
        False -> rows ``[0, train_size)`` of each topic.
        True  -> rows ``[train_size, train_size + test_size)`` of each topic.
    train_size : int, default 500
        Number of leading rows per topic reserved for training.
    test_size : int, default 200
        Number of rows per topic, right after the training rows, used for
        testing.

    Returns
    -------
    pandas.DataFrame
        The selected rows with the same columns as the CSV. Topics with
        fewer rows than requested contribute whatever rows fall in range.
    """
    data = pd.read_csv(path)

    if isTest:
        start, stop = train_size, train_size + test_size
    else:
        start, stop = 0, train_size

    # Slice positionally inside each topic; collect the pieces and concatenate
    # once instead of growing a frame inside the loop.
    pieces = [group.iloc[start:stop] for _, group in data.groupby('Topic')]

    if not pieces:
        # pd.concat refuses an empty list; keep the column layout for callers.
        return pd.DataFrame([], columns=data.columns)

    # axis must be passed by keyword: it is keyword-only in pandas >= 2.0.
    return pd.concat(pieces, axis=0, ignore_index=True)

In [None]:
# Training split: load the per-topic training slice, then discard every row
# that has a missing or empty field.
nan_value = float("NaN")
train_data = dataset('./Data/train.csv')
# Empty strings are turned into NaN so that dropna() removes them as well.
train_data = train_data.replace("", nan_value)
train_data = train_data.dropna()
# Re-number rows 0..n-1 so later cells can address documents by position.
train_data = train_data.reset_index(drop=True)
train_data.shape

In [None]:
# Assign every distinct topic label an integer id, and keep the inverse map.
# Series.unique() yields labels in order of first appearance, so the ids are
# identical on every run (iterating a set of strings is not order-stable
# across interpreter sessions).
topic_map = {}
topic_map_reverse = {}
for topic_id, topic in enumerate(train_data['Topic'].unique()):
    topic_map[topic] = topic_id
    topic_map_reverse[topic_id] = topic
total_topic = len(topic_map)


In [None]:
# Build the vocabulary: every distinct processed token found in the training
# bodies gets a column index, numbered in order of first appearance.

# These objects do not depend on the document, so create them once instead of
# once per row.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

vocabulary = {}
for i in range(train_data.shape[0]):
    text = train_data['Body'][i]
    if text is None or str(text) == 'nan' or len(text) == 0:
        continue

    # Tokenize, then drop stop words.
    # NOTE(review): the comparison is case-sensitive; if the stop-word list is
    # all lowercase, capitalised stop words pass through. Confirm whether
    # lowercasing is wanted (it would have to change in every cell at once).
    tokens = [word for word in word_tokenize(text) if word not in stop_words]

    # Lemmatize each token, then stem the lemma.
    tokens = [stemmer.stem(lemmatizer.lemmatize(word)) for word in tokens]

    for word in tokens:
        if word not in vocabulary:
            # The next free index is the current vocabulary size.
            vocabulary[word] = len(vocabulary)

pos = len(vocabulary)
total_vocabulary_size = pos
print(pos)

In [None]:
# Allocate one all-zero row per training document, with one column per
# vocabulary entry. Every row is its own list object.
binray_vectors = [
    [0] * total_vocabulary_size
    for _ in range(train_data.shape[0])
]

In [None]:
# Fill the binary document vectors: column j of row i becomes 1 when the
# processed body of training document i contains vocabulary word j.

# Document-independent helpers, created once rather than once per row.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

for i in range(train_data.shape[0]):
    text = train_data['Body'][i]
    if text is None or str(text) == 'nan' or len(text) == 0:
        continue

    # Same pipeline as the vocabulary cell: tokenize, drop stop words,
    # lemmatize, stem.
    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    tokens = [stemmer.stem(lemmatizer.lemmatize(word)) for word in tokens]

    row = binray_vectors[i]
    for word in tokens:
        # Every training token is expected to be in the vocabulary, which was
        # built from these same documents; a KeyError here means the two
        # cells went out of sync.
        row[vocabulary[word]] = 1

In [None]:
# Held-out split: the test slice of every topic is read from the same CSV as
# the training data, then rows with a missing or empty field are removed.
nan_value = float("NaN")
test_data = dataset('./Data/train.csv', True)
# Empty strings are turned into NaN so that dropna() removes them as well.
test_data = test_data.replace("", nan_value)
test_data = test_data.dropna()
# Re-number rows 0..n-1 so later cells can address documents by position.
test_data = test_data.reset_index(drop=True)
test_data.shape

In [None]:
# k-nearest-neighbour topic classification on binary (presence/absence)
# document vectors, using Hamming distance.

k = 1  # number of neighbours that vote; any positive integer works
if k < 1:
    raise ValueError('k must be a positive integer')

# Document-independent helpers, created once rather than once per row.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Convert the training vectors once so each test document is compared with
# all training rows in a single vectorised operation.
binary_matrix = np.asarray(binray_vectors, dtype=np.int64).reshape(
    len(binray_vectors), total_vocabulary_size
)

miss = 0
evaluated = 0  # documents that actually received a prediction

for l in range(test_data.shape[0]):
    text = test_data['Body'][l]
    if text is None or str(text) == 'nan' or len(text) == 0:
        # Skipped documents are excluded from the accuracy figure instead of
        # being counted as correct.
        continue
    evaluated += 1

    # Same pipeline as training: tokenize, drop stop words, lemmatize, stem.
    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    tokens = [stemmer.stem(lemmatizer.lemmatize(word)) for word in tokens]

    # Words never seen in training have no column and are ignored.
    test_vector = np.zeros(total_vocabulary_size, dtype=np.int64)
    for word in tokens:
        pos = vocabulary.get(word)
        if pos is not None:
            test_vector[pos] = 1

    # Hamming distance: number of positions in which two 0/1 vectors differ.
    hamming_distance = np.count_nonzero(binary_matrix != test_vector, axis=1)

    # A stable sort sends distance ties to the lowest training index, which
    # for k == 1 is exactly what np.argmin returns.
    neighbours = np.argsort(hamming_distance, kind='stable')[:k]
    topic_list = [train_data['Topic'][idx] for idx in neighbours]
    # Majority vote. Iterating the list (not a set) makes vote ties resolve
    # deterministically in favour of the nearest neighbour.
    prediction = max(topic_list, key=topic_list.count)

    if prediction != test_data['Topic'][l]:
        miss += 1
    print('Predicted: ',prediction,' Target: ',test_data['Topic'][l])

correct = evaluated - miss
accuracy = (correct * 100) / evaluated if evaluated else 0.0
print('Miss: ',miss,' Correct: ',correct,' Accuracy: ',accuracy)

In [None]:
# Allocate the bag-of-words count matrix: one all-zero row per training
# document, one column per vocabulary entry. Every row is its own list.
BoWs = [
    [0] * total_vocabulary_size
    for _ in range(train_data.shape[0])
]
print(len(BoWs))

In [None]:
# Fill the bag-of-words matrix: BoWs[i][j] counts how many times vocabulary
# word j occurs in the processed body of training document i.

# Document-independent helpers, created once rather than once per row.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

for i in range(train_data.shape[0]):
    text = train_data['Body'][i]
    if text is None or str(text) == 'nan' or len(text) == 0:
        continue

    # Same pipeline as the vocabulary cell: tokenize, drop stop words,
    # lemmatize, stem.
    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    tokens = [stemmer.stem(lemmatizer.lemmatize(word)) for word in tokens]

    counts = BoWs[i]
    for word in tokens:
        # Every training token is expected to be in the vocabulary, which was
        # built from these same documents.
        counts[vocabulary[word]] += 1
    

In [None]:
# k-nearest-neighbour topic classification on bag-of-words count vectors,
# ranking training documents by squared Euclidean distance.

k = 3  # number of neighbours that vote; any positive integer works
if k < 1:
    raise ValueError('k must be a positive integer')

# Document-independent helpers, created once rather than once per row.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Convert the training counts once; the per-row squared norms are reused for
# every test document.
bow_counts = np.asarray(BoWs, dtype=np.int64).reshape(
    len(BoWs), total_vocabulary_size
)
train_sq_norms = (bow_counts ** 2).sum(axis=1)

miss = 0
evaluated = 0  # documents that actually received a prediction

for l in range(test_data.shape[0]):
    text = test_data['Body'][l]
    if text is None or str(text) == 'nan' or len(text) == 0:
        # Skipped documents are excluded from the accuracy figure instead of
        # being counted as correct.
        continue
    evaluated += 1

    # Same pipeline as training: tokenize, drop stop words, lemmatize, stem.
    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    tokens = [stemmer.stem(lemmatizer.lemmatize(word)) for word in tokens]

    # Words never seen in training have no column and are ignored.
    test_vector = np.zeros(total_vocabulary_size, dtype=np.int64)
    for word in tokens:
        pos = vocabulary.get(word)
        if pos is not None:
            test_vector[pos] += 1

    # Squared distance via ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b, which is
    # exact in integer arithmetic and avoids an (n_docs x vocab) temporary.
    # The square root is omitted because it does not change the ranking.
    euclidean_distance = (
        train_sq_norms
        + test_vector @ test_vector
        - 2 * (bow_counts @ test_vector)
    )

    # A stable sort sends distance ties to the lowest training index, which
    # for k == 1 is exactly what np.argmin returns.
    neighbours = np.argsort(euclidean_distance, kind='stable')[:k]
    topic_list = [train_data['Topic'][idx] for idx in neighbours]
    # Majority vote. Iterating the list (not a set) makes vote ties resolve
    # deterministically in favour of the nearest neighbour.
    prediction = max(topic_list, key=topic_list.count)

    if prediction != test_data['Topic'][l]:
        miss += 1
    print('Predicted: ',prediction,' Target: ',test_data['Topic'][l])

correct = evaluated - miss
accuracy = (correct * 100) / evaluated if evaluated else 0.0
print('Miss: ',miss,' Correct: ',correct,' Accuracy: ',accuracy)

In [None]:
# Derive per-document term weights from the bag-of-words counts.
#
#   total_words[i] : number of counted tokens in document i
#   TF[i][j]       : BoWs[i][j] / total_words[i]
#   IDF[i][j]      : log(total_words[i] / max(BoWs[i][j], 1))
#   TF_IDF[i][j]   : TF[i][j] * IDF[i][j]
#   values[i]      : Euclidean norm of row i of TF_IDF (cosine denominator)
#
# NOTE(review): the "IDF" factor is computed per document from that
# document's own counts, not from how many documents contain the term, so it
# is not the textbook inverse document frequency. The formula is kept as-is
# because the cosine cell weights its query vector the same way; confirm
# whether a corpus-level IDF was intended.

bow_matrix = np.asarray(BoWs, dtype=float).reshape(
    len(BoWs), total_vocabulary_size
)
total_words = bow_matrix.sum(axis=1)
totals_column = total_words[:, np.newaxis]

# A document with no counted tokens keeps an all-zero row rather than
# triggering a division by zero.
TF = np.divide(
    bow_matrix,
    totals_column,
    out=np.zeros_like(bow_matrix),
    where=totals_column > 0,
)

# Absent terms use a denominator of 1. Rows whose total is zero would give
# log(0), so they are left at 0.
ratio = totals_column / np.maximum(bow_matrix, 1)
IDF = np.log(ratio, out=np.zeros_like(ratio), where=ratio > 0)

TF_IDF = TF * IDF

# The norm must be taken over the same vectors that enter the dot product in
# the cosine similarity, i.e. the TF_IDF rows (not the raw counts).
values = np.linalg.norm(TF_IDF, axis=1)
# An all-zero row always has a zero dot product, so dividing by 1 instead of
# 0 gives similarity 0 rather than NaN.
values[values == 0] = 1.0

print(BoWs[0][:10],TF[0][:10],IDF[0][:10],TF_IDF[0][:10])
#print(values)

In [None]:
# k-nearest-neighbour topic classification on the TF_IDF-weighted vectors,
# ranking training documents by cosine similarity (larger = closer).

k = 3  # number of neighbours that vote; any positive integer works
if k < 1:
    raise ValueError('k must be a positive integer')

# Document-independent helpers, created once rather than once per row.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Convert the training weights once. The norms are taken from these same rows
# so the denominator always matches the vectors used in the dot product.
tfidf_matrix = np.asarray(TF_IDF, dtype=float).reshape(
    len(TF_IDF), total_vocabulary_size
)
train_norms = np.linalg.norm(tfidf_matrix, axis=1)

miss = 0
evaluated = 0  # documents that actually received a prediction

for l in range(test_data.shape[0]):
    text = test_data['Body'][l]
    if text is None or str(text) == 'nan' or len(text) == 0:
        # Skipped documents are excluded from the accuracy figure instead of
        # being counted as correct.
        continue
    evaluated += 1

    # Same pipeline as training: tokenize, drop stop words, lemmatize, stem.
    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    tokens = [stemmer.stem(lemmatizer.lemmatize(word)) for word in tokens]

    # Count in-vocabulary words; words never seen in training are ignored.
    counts = np.zeros(total_vocabulary_size, dtype=float)
    for word in tokens:
        pos = vocabulary.get(word)
        if pos is not None:
            counts[pos] += 1
    total_word = counts.sum()

    # Weight the query exactly like the training rows:
    # (count / total) * log(total / count) for present terms, 0 otherwise.
    # The log uses the raw count, not the already-normalised frequency.
    test_vector = np.zeros(total_vocabulary_size, dtype=float)
    present = counts > 0
    if total_word > 0:
        test_vector[present] = (counts[present] / total_word) * np.log(
            total_word / counts[present]
        )
    test_value = np.linalg.norm(test_vector)

    # Cosine similarity = dot product / (norm * norm). Where either norm is
    # zero the similarity is defined as 0 instead of NaN. A query with no
    # known words therefore ties with every training row, and the stable
    # ordering below picks the first k.
    dot_products = tfidf_matrix @ test_vector
    denominators = train_norms * test_value
    cossian_similarity = np.divide(
        dot_products,
        denominators,
        out=np.zeros_like(dot_products),
        where=denominators > 0,
    )

    # Sort by descending similarity. A stable sort sends ties to the lowest
    # training index, which for k == 1 is exactly what np.argmax returns.
    neighbours = np.argsort(-cossian_similarity, kind='stable')[:k]
    topic_list = [train_data['Topic'][idx] for idx in neighbours]
    # Majority vote. Iterating the list (not a set) makes vote ties resolve
    # deterministically in favour of the most similar neighbour.
    prediction = max(topic_list, key=topic_list.count)

    if prediction != test_data['Topic'][l]:
        miss += 1
    print('Predicted: ',prediction,' Target: ',test_data['Topic'][l])

correct = evaluated - miss
accuracy = (correct * 100) / evaluated if evaluated else 0.0
print('Miss: ',miss,' Correct: ',correct,' Accuracy: ',accuracy)