In [None]:
import pandas as pd 
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

In [None]:
def dataset(path, datatype, train_size=50, val_size=20, test_size=50):
    """Load a CSV and return a fixed window of rows from every topic.

    The file is grouped by its 'Topic' column and, for each topic, a
    contiguous slice of that topic's rows (in file order) is kept. The three
    windows are adjacent and never overlap, so the same file can serve as the
    source of the training, validation and test splits.

    Parameters
    ----------
    path : str or path-like
        CSV file passed to ``pd.read_csv``; it must contain a 'Topic' column.
    datatype : int
        Which window to return for each topic:
        ``0`` -> rows ``[0, train_size)``,
        ``1`` -> rows ``[train_size, train_size + val_size)``,
        anything else -> the next ``test_size`` rows after those.
    train_size, val_size, test_size : int, optional
        Number of rows per topic in each window (defaults 50 / 20 / 50).

    Returns
    -------
    pandas.DataFrame
        The selected rows of all topics, topics in sorted order, with a fresh
        0..n-1 index. Empty (but with the CSV's columns) if nothing matches.
    """
    data = pd.read_csv(path)

    if datatype == 0:
        start, stop = 0, train_size
    elif datatype == 1:
        start, stop = train_size, train_size + val_size
    else:
        start = train_size + val_size
        stop = start + test_size

    # Collect the per-topic slices first and concatenate once; concatenating
    # inside the loop copies the accumulated frame on every iteration.
    parts = [group.iloc[start:stop] for _, group in data.groupby('Topic')]
    if not parts:
        return pd.DataFrame([], columns=data.columns)

    # `axis` must be passed by keyword: pandas 2.x rejects positional
    # arguments to concat() other than the list of objects.
    return pd.concat(parts, axis=0, ignore_index=True)

In [None]:
# Training split (datatype 0). Empty strings are converted to NaN so that
# dropna() removes rows with a blank field as well as rows with a missing one.
nan_value = float("NaN")
train_data = (
    dataset('./Data/train.csv', 0)
    .replace("", nan_value)
    .dropna()
    .reset_index(drop=True)  # keep labels 0..n-1 so train_data[col][i] works
)
train_data.shape

In [None]:
# Give every topic an integer id and keep the inverse mapping.
#   topic_map[topic]      -> id
#   topic_map_reverse[id] -> topic
# Series.unique() returns the topics in order of first appearance, so the ids
# are identical on every run. Iterating a set of strings instead can yield a
# different order (and therefore different ids) in each interpreter session.
topic_map = {}
topic_map_reverse = {}
for topic_id, topic in enumerate(train_data['Topic'].unique()):
    topic_map[topic] = topic_id
    topic_map_reverse[topic_id] = topic
total_topic = len(topic_map)
print(total_topic)

In [None]:
# Lazily-filled cache for the NLTK helpers used by text_processing().
_TEXT_TOOLS = {}


def _text_tools():
    """Create the stop-word set, lemmatizer and stemmer once and reuse them."""
    if not _TEXT_TOOLS:
        _TEXT_TOOLS['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_TOOLS['lemmatizer'] = WordNetLemmatizer()
        _TEXT_TOOLS['stemmer'] = PorterStemmer()
    return _TEXT_TOOLS


def text_processing(text):
    """Tokenise a document and normalise its tokens.

    Steps: NLTK word tokenisation, English stop-word removal, WordNet
    lemmatisation, then Porter stemming.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The processed tokens in their original order (duplicates kept).
    """
    tools = _text_tools()
    stop_words = tools['stop_words']
    lemmatize = tools['lemmatizer'].lemmatize
    stem = tools['stemmer'].stem

    # Compare in lower case: the stop-word list is lower-case, so a
    # case-sensitive test lets "The" or "And" through, and the stemmer
    # (which lower-cases by default) would then emit them as "the" / "and".
    kept = [token for token in word_tokenize(text)
            if token.lower() not in stop_words]
    return [stem(lemmatize(token)) for token in kept]

In [None]:
# Build the vocabulary from the training bodies: every distinct processed
# token is given the next free column index, in order of first occurrence.
vocabulary = {}
for row in range(train_data.shape[0]):
    for token in text_processing(train_data['Body'][row]):
        if token not in vocabulary:
            vocabulary[token] = len(vocabulary)
total_vocabulary_size = len(vocabulary)
print(total_vocabulary_size)

In [None]:
# One all-zero row per training document, one column per vocabulary entry.
# Each row is a separate list, so rows can be updated independently later.
binray_vectors = [
    [0] * total_vocabulary_size
    for _ in range(train_data.shape[0])
]

In [None]:
# Bag-of-words count matrix, initialised to zero:
# one row per training document, one column per vocabulary entry.
BoWs = [
    [0 for _ in range(total_vocabulary_size)]
    for _ in range(train_data.shape[0])
]
print(len(BoWs))

In [None]:
# Fill both representations from the training bodies:
#   binray_vectors[d][c] = 1 if token c occurs in document d
#   BoWs[d][c]           = number of occurrences of token c in document d
# Counts are accumulated, so running this cell twice without re-creating
# BoWs doubles them.
for doc_index in range(train_data.shape[0]):
    presence_row = binray_vectors[doc_index]
    count_row = BoWs[doc_index]
    for token in text_processing(train_data['Body'][doc_index]):
        column = vocabulary[token]
        presence_row[column] = 1
        count_row[column] += 1

In [None]:
# TF / "IDF" / TF-IDF features of the training documents, derived from BoWs.
#
#   total_words[d]  number of tokens counted in document d
#   TF[d][c]        BoWs[d][c] / total_words[d]
#   IDF[d][c]       log(total_words[d] / max(BoWs[d][c], 1))
#   TF_IDF[d][c]    TF[d][c] * IDF[d][c]
#   values[d]       Euclidean norm of TF_IDF[d]
#
# NOTE(review): "IDF" is computed from each document's own counts, not from
# how many documents contain the term, so it is not a corpus-level inverse
# document frequency. The formula is kept as it was; confirm it is intended.

bow_matrix = np.asarray(BoWs, dtype=float).reshape(len(BoWs), total_vocabulary_size)
doc_lengths = bow_matrix.sum(axis=1)

# A document with no counted tokens would divide by zero. Treat its length as
# 1 so that all of its weights come out as 0.
safe_lengths = np.where(doc_lengths > 0, doc_lengths, 1.0)[:, np.newaxis]

tf_matrix = bow_matrix / safe_lengths
idf_matrix = np.log(safe_lengths / np.maximum(bow_matrix, 1.0))
tf_idf_matrix = tf_matrix * idf_matrix

total_words = [int(length) for length in doc_lengths]
TF = tf_matrix.tolist()
IDF = idf_matrix.tolist()
TF_IDF = tf_idf_matrix.tolist()

# `values` is the per-document denominator of the cosine similarity, so it has
# to be the norm of the TF-IDF vector that appears in the numerator (it used
# to be the norm of the raw counts). An all-zero vector gets norm 1, which
# makes its similarity 0 instead of NaN.
tf_idf_norms = np.sqrt((tf_idf_matrix ** 2).sum(axis=1))
tf_idf_norms[tf_idf_norms == 0] = 1.0
values = list(tf_idf_norms)

In [None]:
# Validation split (datatype 1), cleaned the same way as the training split:
# empty strings become NaN, incomplete rows are dropped, index is renumbered.
nan_value = float("NaN")
val_data = (
    dataset('./Data/train.csv', 1)
    .replace("", nan_value)
    .dropna()
    .reset_index(drop=True)
)
val_data.shape

In [None]:
# Test split (datatype 2). It is read from the same CSV as the other two
# splits; dataset() selects a different row window per topic for each one.
nan_value = float("NaN")
test_data = (
    dataset('./Data/train.csv', 2)
    .replace("", nan_value)
    .dropna()
    .reset_index(drop=True)
)
test_data.shape

In [None]:
def hamming_distance_fun(k, text):
    """Classify a document by k-nearest-neighbours under Hamming distance.

    The document is encoded as a 0/1 presence vector over `vocabulary`
    (unknown tokens are ignored) and compared with every row of
    `binray_vectors`.

    Parameters
    ----------
    k : int
        Number of neighbours that vote. Any k >= 1 is accepted.
    text : iterable of str
        Processed tokens of the document.

    Returns
    -------
    The most frequent 'Topic' among the k nearest training documents. A tie
    is won by the topic whose neighbour is closest. Returns -1 if k < 1 or
    there are no training vectors.
    """
    if k < 1 or len(binray_vectors) == 0:
        return -1

    test_vector = np.zeros(total_vocabulary_size, dtype=int)
    for word in text:
        pos = vocabulary.get(word)
        if pos is not None:
            test_vector[pos] = 1

    # Number of positions where each training row differs from the query.
    hamming_distance = np.abs(np.asarray(binray_vectors) - test_vector).sum(axis=1)

    # A stable sort keeps equally distant documents in training order, so the
    # neighbour list (and the result for k == 1) is deterministic.
    nearest = np.argsort(hamming_distance, kind='stable')[:k]

    # Dicts preserve insertion order and max() returns the first maximum, so
    # ties go to the topic that was reached first (the nearest one).
    votes = {}
    for index in nearest:
        topic = train_data['Topic'][index]
        votes[topic] = votes.get(topic, 0) + 1
    return max(votes, key=votes.get)

In [None]:
def euclidean_distanc_fun(k, text):
    """Classify a document by k-nearest-neighbours under Euclidean distance.

    The document is encoded as a term-count vector over `vocabulary`
    (unknown tokens are ignored) and compared with every row of `BoWs`.
    Squared distances are used; they rank neighbours exactly like the true
    distances.

    Parameters
    ----------
    k : int
        Number of neighbours that vote. Any k >= 1 is accepted.
    text : iterable of str
        Processed tokens of the document.

    Returns
    -------
    The most frequent 'Topic' among the k nearest training documents. A tie
    is won by the topic whose neighbour is closest. Returns -1 if k < 1 or
    there are no training vectors.
    """
    if k < 1 or len(BoWs) == 0:
        return -1

    test_vector = np.zeros(total_vocabulary_size, dtype=float)
    for word in text:
        pos = vocabulary.get(word)
        if pos is not None:
            test_vector[pos] += 1

    differences = np.asarray(BoWs, dtype=float) - test_vector
    euclidean_distance = (differences ** 2).sum(axis=1)

    # A stable sort keeps equally distant documents in training order, so the
    # neighbour list (and the result for k == 1) is deterministic.
    nearest = np.argsort(euclidean_distance, kind='stable')[:k]

    # Dicts preserve insertion order and max() returns the first maximum, so
    # ties go to the topic that was reached first (the nearest one).
    votes = {}
    for index in nearest:
        topic = train_data['Topic'][index]
        votes[topic] = votes.get(topic, 0) + 1
    return max(votes, key=votes.get)

In [None]:
def cosine_similarity_fun(k, text):
    """Classify a document by k-nearest-neighbours under cosine similarity.

    The document is turned into a weighted term vector over `vocabulary`
    (unknown tokens are ignored) and compared with every row of `TF_IDF`,
    using `values` as the per-row normalisation of the training vectors.

    Parameters
    ----------
    k : int
        Number of neighbours that vote. Any k >= 1 is accepted.
    text : iterable of str
        Processed tokens of the document.

    Returns
    -------
    The most frequent 'Topic' among the k most similar training documents. A
    tie is won by the topic whose neighbour is most similar. Returns -1 when
    no prediction is possible: k < 1, no training vectors, no token of `text`
    in the vocabulary, or a query vector of zero length.
    """
    if k < 1 or len(TF_IDF) == 0:
        return -1

    counts = np.zeros(total_vocabulary_size, dtype=float)
    for word in text:
        pos = vocabulary.get(word)
        if pos is not None:
            counts[pos] += 1

    total_word = counts.sum()
    if total_word == 0:
        # No known token: the term frequencies would be 0 / 0.
        return -1

    # Weight of a present term: tf * log(total_word / tf); absent terms get 0.
    # NOTE(review): the logarithm here uses tf, while the training vectors use
    # log(total_words / count). The query formula is kept as it was; confirm
    # which of the two is intended.
    term_frequency = counts / total_word
    present = term_frequency > 0
    test_vector = np.zeros(total_vocabulary_size, dtype=float)
    test_vector[present] = term_frequency[present] * np.log(total_word / term_frequency[present])

    test_value = np.sqrt((test_vector ** 2).sum())
    if test_value == 0:
        # Zero-length query vector: its cosine with anything is undefined.
        return -1

    # Guard against zero norms so a degenerate training row scores 0, not NaN
    # (a NaN would otherwise be picked by argmax/argsort).
    train_norms = np.asarray(values, dtype=float)
    train_norms = np.where(train_norms > 0, train_norms, 1.0)
    cosine_similarity = (np.asarray(TF_IDF, dtype=float) @ test_vector) / (test_value * train_norms)

    # Most similar first; the stable sort keeps equally similar documents in
    # training order, so the neighbour list is deterministic.
    nearest = np.argsort(-cosine_similarity, kind='stable')[:k]

    # Dicts preserve insertion order and max() returns the first maximum, so
    # ties go to the topic that was reached first (the most similar one).
    votes = {}
    for index in nearest:
        topic = train_data['Topic'][index]
        votes[topic] = votes.get(topic, 0) + 1
    return max(votes, key=votes.get)

In [None]:
# Validation sweep of the k-NN classifier over every (k, distance) pair.
#
# The distance function is looked up by name. The previous `tech is 'hamming'`
# compared object identity, which only works when the interpreter happens to
# share the two string objects (and raises a SyntaxWarning on Python 3.8+).
knn_techniques = {
    'hamming': hamming_distance_fun,
    'euclidean': euclidean_distanc_fun,
    'cosine': cosine_similarity_fun,
}

val_size = val_data.shape[0]
# Tokenise each validation document once instead of once per configuration.
val_tokens = [text_processing(val_data['Body'][l]) for l in range(val_size)]

for k in [1,3,5]:
    for tech, predict in knn_techniques.items():
        miss = 0
        for l in range(val_size):
            prediction = predict(k, val_tokens[l])
            if prediction != val_data['Topic'][l]:
                miss+=1
        # An empty validation set has no defined accuracy.
        accuracy = ((val_size-miss)*100)/val_size if val_size else float('nan')
        print('K:',k,'Technique:',tech,'Miss: ',miss,' Correct: ',val_size-miss,' Accuracy: ',accuracy)


In [None]:
def get_word_prob(alpha):
    """Estimate Naive Bayes parameters from the training data.

    Parameters
    ----------
    alpha : number
        Additive smoothing constant added to every (token, topic) count.

    Returns
    -------
    topic_prob : list
        topic_prob[t] is the fraction of training documents whose topic id
        (per `topic_map`) is t.
    word_prob : list of lists
        word_prob[w][t] = (count of token w in topic t + alpha) /
        (total token count of topic t + alpha * total_vocabulary_size).
    """
    n_docs = train_data.shape[0]
    doc_topic_ids = [topic_map[train_data['Topic'][row]] for row in range(n_docs)]

    # Class priors.
    topic_prob = [
        doc_topic_ids.count(topic_id) / n_docs
        for topic_id in range(total_topic)
    ]

    # Raw token counts per (vocabulary index, topic id).
    word_prob = [[0] * total_topic for _ in range(total_vocabulary_size)]
    for row in range(n_docs):
        topic_id = doc_topic_ids[row]
        for token in text_processing(train_data['Body'][row]):
            word_prob[vocabulary[token]][topic_id] += 1

    # Convert the counts to smoothed conditional probabilities in place.
    tokens_per_topic = np.array(word_prob).sum(axis=0)
    smoothing_mass = alpha * total_vocabulary_size
    for token_counts in word_prob:
        for topic_id in range(total_topic):
            token_counts[topic_id] = (
                (token_counts[topic_id] + alpha)
                / (tokens_per_topic[topic_id] + smoothing_mass)
            )
    return topic_prob, word_prob

In [None]:
# Validation sweep of the Naive Bayes smoothing constant.
#
# Class scores are accumulated as sums of log-probabilities. Multiplying the
# raw probabilities underflows to 0.0 for every class once a document has
# enough tokens, and argmax over all zeros always answers class 0.

val_size = val_data.shape[0]
# The tokens do not depend on alpha, so process each document only once.
val_tokens = [text_processing(val_data['Body'][l]) for l in range(val_size)]

for alpha in [0.01,.1,.5,1,5,10,50,100,500,1000]:

    topic_prob,word_prob = get_word_prob(alpha)
    log_topic_prob = np.log(topic_prob)
    log_word_prob = np.log(word_prob)

    miss = 0
    for l in range(val_size):
        log_prob_class = log_topic_prob.copy()

        for word in val_tokens[l]:
            pos = vocabulary.get(word)
            if pos is not None:  # tokens outside the vocabulary are ignored
                log_prob_class += log_word_prob[pos]

        prediction = topic_map_reverse[int(np.argmax(log_prob_class))]

        if prediction != val_data['Topic'][l]:
            miss+=1

    # An empty validation set has no defined accuracy.
    accuracy = ((val_size-miss)*100)/val_size if val_size else float('nan')
    print('Alpha: ',alpha,'Miss: ',miss,' Correct: ',val_size-miss,' Accuracy: ',accuracy)

In [None]:
# Compare Naive Bayes (alpha = 0.5) with cosine k-NN (k = 5) on the test split.
# The test rows are evaluated in 50 rounds; round r holds the r-th test row of
# every topic. The per-round accuracies are collected in naive_acc / knn_acc.
knn_acc = []
naive_acc = []
topic_prob,word_prob = get_word_prob(0.5)
# Log-probabilities: products of raw probabilities underflow to 0.0 on long
# documents, which makes argmax always answer class 0.
log_topic_prob = np.log(topic_prob)
log_word_prob = np.log(word_prob)

# The grouping is the same in every round, so build it once.
test_groups = [group for _, group in test_data.groupby('Topic')]

for round_index in range(50):
    round_parts = [group.iloc[round_index:round_index + 1] for group in test_groups]
    if round_parts:
        # `axis` must be a keyword argument; pandas 2.x rejects it positionally.
        new_df = pd.concat(round_parts, axis=0, ignore_index=True)
    else:
        new_df = test_data.iloc[0:0]

    round_size = new_df.shape[0]
    if round_size == 0:
        # No topic has a row for this round, so there is nothing to score.
        continue

    # Both classifiers see the same tokens of the same test documents.
    # (Naive Bayes used to read val_data['Body'] here while being scored
    # against new_df['Topic'].)
    round_tokens = [text_processing(new_df['Body'][l]) for l in range(round_size)]

    miss = 0
    for l in range(round_size):
        log_prob_class = log_topic_prob.copy()

        for word in round_tokens[l]:
            pos = vocabulary.get(word)
            if pos is not None:  # tokens outside the vocabulary are ignored
                log_prob_class += log_word_prob[pos]

        prediction = topic_map_reverse[int(np.argmax(log_prob_class))]

        if prediction != new_df['Topic'][l]:
            miss+=1

    print('Naive: ','Miss: ',miss,' Correct: ',round_size-miss,' Accuracy: ',((round_size-miss)*100)/round_size)
    naive_acc.append(((round_size-miss)*100)/round_size)

    miss = 0

    for l in range(round_size):
        prediction = cosine_similarity_fun(5,round_tokens[l])
        if prediction != new_df['Topic'][l]:
            miss+=1
    print('KNN: ','Miss: ',miss,' Correct: ',round_size-miss,' Accuracy: ',((round_size-miss)*100)/round_size)
    knn_acc.append(((round_size-miss)*100)/round_size)


In [None]:
from scipy.stats import ttest_ind

# Two-sample t-test on the per-round accuracies of the two classifiers.
ttest_result = ttest_ind(knn_acc, naive_acc)
stat, p = ttest_result
print('Statistics=%.3f, p=%.3f' % (stat, p))