## KNN Model 57.7% Accuracy

In [1]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

from nltk import word_tokenize
from time import time
import pandas as pd
import re
import numpy as np
import gensim
import string
import nltk
# Fetch the NLTK resources used below (skipped when the local copy is already up to date).
# 'punkt' provides the tokenizer models used by word_tokenize.
# NOTE(review): 'wordnet' is downloaded too, but no lemmatizer call appears in the
# visible cells — presumably a leftover; confirm before removing.
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/ktyser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ktyser/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Load the training set.
# NOTE(review): the file name suggests stop words were already removed upstream — confirm.
train = pd.read_csv("Train-no-stopwords.csv")

# Preview five random rows; the fixed seed keeps the displayed rows identical
# across "Restart & Run All" executions.
train.sample(5, random_state=42)

Unnamed: 0,ID,Text,Label
545,ID_UBOLNVpY,papa wasankha episkopi watsopano mdziko la zam...,POLITICS
1156,ID_rRdPFWiq,chilima akhazikitsa utm ku blantyre wachiwir...,POLITICS
1174,ID_rppBoGTW,aletsa zionetsero ku nkhata bay gulu la omwe...,POLITICS
263,ID_JgJAOkcP,kafukufuku wa cashgate sadayankhe mabungwe o...,LAW/ORDER
1415,ID_zIhIOWoW,achotsa mavenda popanda ziwawa ku lilongwe k...,ECONOMY


In [3]:
# Load the unlabeled test set (raw text; it is cleaned later with preprocess_text).
test = pd.read_csv("Test.csv")

# Preview five random rows; the fixed seed keeps the displayed rows identical
# across "Restart & Run All" executions.
test.sample(5, random_state=42)

Unnamed: 0,ID,Text
204,ID_PtjshlCW,Superior Boxing Yakonza Nkhonya Yapamwamba Bun...
2,ID_AUJIHpZr,Anatcheleza: Akundiopseza a gogo wanga Akundi...
194,ID_OhTcDpdl,Za pasipoti ya Gaba sizikumveka Sabatayi yakh...
20,ID_BYTupytC,ECM Yapereka Ndondomeko Zatsopano za Mapempher...
230,ID_SkRPoSoE,Pulogalamu ya ECRP yatha Pulogalamu yophunzit...


In [4]:
# Characters treated as punctuation: ASCII punctuation plus a few typographic symbols.
punctuations = "".join((string.punctuation, "’¶•@°©®™"))

In [5]:
def preprocess_text(text):
    """
    Normalize a raw document into a space-separated string of word tokens.

    @param text string (missing values such as None/NaN are treated as empty text)
    @return text string

    Steps: lowercase the text, replace every character that is not an ASCII
    letter or in the Latin-1 range À-ÿ with a space, collapse whitespace,
    tokenize with NLTK's word_tokenize and drop tokens of one or two characters.
    No stop-word list and no lemmatizer are applied by this function.
    """
    # pandas represents empty CSV cells as NaN (a float); treat missing input
    # as empty text instead of failing on `.lower()`
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # string to lowercase
    txt = str(text).lower()

    # keep only ASCII letters and the Latin-1 range À-ÿ; every other character
    # (digits, punctuation, symbols) becomes a space. Because this already
    # strips all punctuation and digits, no separate punctuation/digit pass
    # is needed afterwards.
    txt = re.sub(r"[^a-zA-ZÀ-ÿ]", " ", txt)

    # collapse runs of whitespace into single spaces
    cleaner = " ".join(txt.split())
    if not cleaner:
        # nothing left to tokenize
        return ""

    # tokenize words and drop very short tokens (one or two characters)
    word_tokens = word_tokenize(cleaner)
    return " ".join(w for w in word_tokens if len(w) > 2)

In [6]:
# preprocessing the text
# Add a 'clean_text' column to both frames, produced by the same cleaning function.
for frame in (train, test):
    frame["clean_text"] = frame["Text"].apply(preprocess_text)

In [7]:
# Compare raw 'Text' with 'clean_text' on five random rows; the fixed seed keeps
# the displayed rows identical across "Restart & Run All" executions.
train.sample(5, random_state=42)

Unnamed: 0,ID,Text,Label,clean_text
37,ID_BxXcyHsk,mafumu ku zambia akulowerera kwa mkanda ch...,POLITICS,mafumu zambia akulowerera kwa mkanda chisale w...
1413,ID_zEqcsLPA,mutharika asankha nduna pangotha maola sipik...,POLITICS,mutharika asankha nduna pangotha maola sipikal...
1318,ID_vzswgDcX,covid-19 yakolera chiwerengero cha amalawi a...,HEALTH,covid yakolera chiwerengero cha amalawi amene ...
667,ID_ZNamCvoS,akayidi atsopano adziyezedwa coronavirus mps n...,HEALTH,akayidi atsopano adziyezedwa coronavirus mps n...
1097,ID_pMEOQLAq,mabungwe athotha galu wakuda ku mj mboma la ...,FARMING,mabungwe athotha galu wakuda mboma mulanje omw...


In [8]:
# creating the pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Token counts -> TF-IDF weighting -> k-nearest-neighbours classifier.
# All three steps are constructed with their default arguments.
pipeline = Pipeline(steps=[
    ("vect", CountVectorizer()),
    ("tfidf", TfidfTransformer()),
    ("classifier", KNeighborsClassifier()),
])

In [9]:
# evaluating model
# 5-fold cross-validation of the whole pipeline on the cleaned training text.
# For single-label multiclass targets, micro-averaged precision is the same
# number as accuracy.
scores = cross_val_score(
    estimator=pipeline,
    X=train["clean_text"],
    y=train["Label"],
    scoring="precision_micro",
    cv=5,
)
print(scores)

[0.57291667 0.54006969 0.59930314 0.54703833 0.55749129]


In [10]:
# Average of the per-fold cross-validation scores
mean_score = scores.mean()
print(f"Final score is {mean_score}")

Final score is 0.5633638211382113


In [11]:
# making predictions
# Fit on the cleaned training text, then predict on the *cleaned* test text so
# the test documents go through the same preprocessing the model was trained on
# (predicting on the raw 'Text' column would feed the vectorizer differently
# tokenized input than it saw during fitting).
pipeline.fit(train["clean_text"], train["Label"])
test["Label"] = pipeline.predict(test["clean_text"])

In [12]:
# Show the predicted label next to each test ID
test.loc[:, ["ID", "Label"]]

Unnamed: 0,ID,Label
0,ID_ADHEtjTi,RELIGION
1,ID_AHfJktdQ,RELIGION
2,ID_AUJIHpZr,RELATIONSHIPS
3,ID_AUKYBbIM,POLITICS
4,ID_AZnsVPEi,EDUCATION
...,...,...
615,ID_zdpOUWyJ,POLITICS
616,ID_zhnOomuu,RELATIONSHIPS
617,ID_zmWHvBJb,LAW/ORDER
618,ID_zphjdFIb,RELATIONSHIPS


In [13]:
#test[['ID','Label']].to_csv("submission.csv", index=False) 

In [14]:
# Add this model's predictions as a 'KNN' column to combiner.csv and write the
# file back in place.
# The column is aligned by row index, so this assumes combiner.csv lists the
# test rows in the same order as `test` — TODO confirm.
combiner = pd.read_csv("combiner.csv").assign(KNN=test["Label"])
combiner.to_csv("combiner.csv", index=False)
combiner.head()

Unnamed: 0,ID,Text,SGD,MultiNB,Logistic Regression,KNN
0,ID_ADHEtjTi,Abambo odzikhweza akuchuluka Kafukufuku wa ap...,SOCIAL ISSUES,SOCIAL ISSUES,SOCIAL ISSUES,RELIGION
1,ID_AHfJktdQ,Ambuye Ziyaye Ayamikira Aphunzitsi a Tilitonse...,RELIGION,RELIGION,RELIGION,RELIGION
2,ID_AUJIHpZr,Anatcheleza: Akundiopseza a gogo wanga Akundi...,RELATIONSHIPS,RELATIONSHIPS,RELATIONSHIPS,RELATIONSHIPS
3,ID_AUKYBbIM,Ulova wafika posauzana Adatenga digiri ya uph...,POLITICS,LAW/ORDER,SOCIAL ISSUES,POLITICS
4,ID_AZnsVPEi,"Dzombe kukoma, koma Kuyambira makedzana, pant...",HEALTH,HEALTH,HEALTH,EDUCATION


#### Submitted to the website and got an accuracy of 57.7%