In [1]:
import pandas as pd 
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

In [2]:
def dataset(path, isTest=False):
    """Load a CSV and keep only the first rows of every topic.

    The file is read with ``pd.read_csv`` and grouped by its ``Topic`` column.
    From each group the leading rows are kept (25 when ``isTest`` is true,
    100 otherwise) and the pieces are stacked in group-key order.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file; it must contain a ``Topic`` column.
    isTest : bool, default False
        Selects the smaller per-topic cap used for the test split.

    Returns
    -------
    pandas.DataFrame
        The selected rows with a fresh ``0..n-1`` index. When the file has no
        groupable rows an empty frame with the same columns is returned.
    """
    data = pd.read_csv(path)
    rows_per_topic = 25 if isTest else 100

    # Slice every group first and concatenate once: growing a frame inside the
    # loop re-copies all previous rows on each pass.
    pieces = [rows.head(rows_per_topic) for _, rows in data.groupby('Topic')]
    if not pieces:
        # pd.concat refuses an empty list, so hand back an empty frame instead.
        return pd.DataFrame([], columns=data.columns)

    # axis must be passed by keyword; pandas 2.x rejects it as a positional argument.
    return pd.concat(pieces, axis=0, ignore_index=True)

In [3]:
# Load the per-topic capped training split.
train_data = dataset('./Data/train.csv')
# Shuffle with a fixed seed so a fresh kernel reproduces the same order, and
# rebuild the index so that label i is the i-th row of the shuffled frame
# (without this, label-based lookups such as train_data['Body'][i] still walk
# the rows in their pre-shuffle order).
train_data = train_data.sample(frac=1, random_state=42).reset_index(drop=True)
train_data.shape

(400, 2)

In [4]:
# Give every topic a dense integer id and keep the inverse lookup as well.
topic_map = {}
topic_map_reverse = {}
# sorted() pins the id assignment: the iteration order of a bare set of
# strings is not guaranteed to be the same from one interpreter run to the next.
for i, topic in enumerate(sorted(set(train_data['Topic']))):
    topic_map[topic] = i
    topic_map_reverse[i] = topic
total_topic = len(topic_map)


In [5]:
# Build the vocabulary: every distinct processed token of the training bodies
# gets the next free column position.
vocabulary = {}
pos = 0

# The NLTK helpers do not depend on the document, so create them once instead
# of re-reading the stop-word list and re-instantiating both stemmers per row.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

for text in train_data['Body']:
    # Skip missing or empty bodies (pandas reads empty cells as float NaN).
    if text is None or str(text) == 'nan' or len(text) == 0:
        continue

    # Tokenize -> drop stop words -> lemmatize -> stem.
    # NOTE(review): tokens are not lower-cased, so capitalised stop words pass
    # the filter; every cell that preprocesses text must keep doing the same
    # thing or its tokens will not match this vocabulary.
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    tokens = [stemmer.stem(word) for word in tokens]

    for word in tokens:
        if word not in vocabulary:
            vocabulary[word] = pos
            pos += 1

total_vocabulary_size = pos
print(pos)

3928


In [6]:
# One all-zero row per training document, one column per vocabulary entry.
# Each row is its own list, so setting a cell in one row leaves the others alone.
documents_feature = [
    [0] * total_vocabulary_size
    for _ in range(train_data.shape[0])
]

In [7]:
# Mark, for every training document, which vocabulary entries occur in it
# (binary presence features, not counts).

# The NLTK helpers do not depend on the document, so create them once instead
# of rebuilding them for every row.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

for i in range(train_data.shape[0]):
    # Label-based lookup: row i of documents_feature describes the document
    # whose index label is i.
    text = train_data['Body'][i]
    # Skip missing or empty bodies; their feature row stays all zeros.
    if text is None or str(text) == 'nan' or len(text) == 0:
        continue

    # Same pipeline as the vocabulary build: tokenize -> drop stop words ->
    # lemmatize -> stem, so every token is a known vocabulary key.
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    tokens = [stemmer.stem(word) for word in tokens]

    for word in tokens:
        column = vocabulary[word]
        documents_feature[i][column] = 1

In [8]:
# Load the evaluation split; isTest selects the smaller per-topic cap.
test_data = dataset('./Data/test.csv', isTest=True)
test_data.shape

(100, 2)

In [9]:
# k-nearest-neighbour evaluation: every test document is turned into a binary
# bag-of-words vector, compared to all training vectors by Hamming distance,
# and labelled with the majority topic of its k closest training documents.
miss = 0
evaluated = 0  # test documents that actually received a prediction

k = 5
if k < 1:
    raise ValueError('k must be a positive integer')
if len(documents_feature) == 0:
    raise ValueError('no training documents to compare against')

# The NLTK helpers do not depend on the document, so create them once.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# A single 2-D array lets numpy compute all distances for a test document at
# once instead of a nested Python loop over vocabulary x documents.
train_matrix = np.asarray(documents_feature)
neighbours = min(k, train_matrix.shape[0])

for l in range(test_data.shape[0]):
    text = test_data['Body'][l]
    # Missing or empty bodies cannot be classified; skip them before doing any work.
    if text is None or str(text) == 'nan' or len(text) == 0:
        continue

    # Same pipeline as for the training documents: tokenize -> drop stop
    # words -> lemmatize -> stem.
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    tokens = [stemmer.stem(word) for word in tokens]

    test_vector = np.zeros(total_vocabulary_size, dtype=train_matrix.dtype)
    for word in tokens:
        column = vocabulary.get(word)
        # Tokens never seen during training have no column and are ignored.
        if column is not None:
            test_vector[column] = 1

    # Hamming distance = number of positions where the two binary vectors differ.
    hamming_distance = np.count_nonzero(train_matrix != test_vector, axis=1)

    # argsort yields the row INDICES of the closest training documents
    # (sorting the distances themselves would yield distance values, which are
    # not row numbers). The stable sort makes ties resolve to the lower row.
    nearest = np.argsort(hamming_distance, kind='stable')[:neighbours]

    # Majority vote over the neighbours' topics. Votes are inserted nearest
    # first, so on a tie max() keeps the topic of the closer neighbour.
    votes = {}
    for row in nearest:
        topic = train_data['Topic'][int(row)]
        votes[topic] = votes.get(topic, 0) + 1
    prediction = max(votes, key=votes.get)

    evaluated += 1
    if prediction != test_data['Topic'][l]:
        miss += 1
    print('Predicted: ',prediction,' Target: ',test_data['Topic'][l])

# Score only the documents that were classified; skipped ones must not be
# counted as correct answers.
correct = evaluated - miss
accuracy = (correct*100)/evaluated if evaluated else 0.0
print('Miss: ',miss,' Correct: ',correct,' Accuracy: ',accuracy)

Predicted:  3d_Printer  Target:  3d_Printer
Predicted:  3d_Printer  Target:  3d_Printer
Predicted:  3d_Printer  Target:  3d_Printer
Predicted:  3d_Printer  Target:  3d_Printer
Predicted:  3d_Printer  Target:  3d_Printer
Predicted:  3d_Printer  Target:  3d_Printer
Predicted:  3d_Printer  Target:  3d_Printer
Predicted:  3d_Printer  Target:  3d_Printer
Predicted:  3d_Printer  Target:  3d_Printer
Predicted:  3d_Printer  Target:  3d_Printer
Predicted:  3d_Printer  Target:  3d_Printer
Predicted:  3d_Printer  Target:  3d_Printer
Predicted:  3d_Printer  Target:  3d_Printer
Predicted:  3d_Printer  Target:  3d_Printer
Predicted:  3d_Printer  Target:  3d_Printer
Predicted:  3d_Printer  Target:  3d_Printer
Predicted:  3d_Printer  Target:  3d_Printer
Predicted:  3d_Printer  Target:  3d_Printer
Predicted:  3d_Printer  Target:  3d_Printer
Predicted:  3d_Printer  Target:  3d_Printer
Predicted:  3d_Printer  Target:  3d_Printer
Predicted:  3d_Printer  Target:  3d_Printer
Predicted:  3d_Printer  Target: 