## Linear SVM Model with SMOTE: 63.87% Accuracy on Website

In [1]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn import svm
from sklearn.model_selection import cross_val_score
import pandas as pd
import imblearn
from imblearn.over_sampling import SMOTE,RandomOverSampler

from nltk import word_tokenize
from time import time
import pandas as pd
import re
import numpy as np
import gensim
import string
import nltk
# Fetch the NLTK data packages used by this notebook; already-installed packages
# are reported as up-to-date rather than downloaded again.
# 'punkt' holds the tokenizer models behind nltk.word_tokenize.
nltk.download('punkt')
# NOTE(review): nothing in the visible cells uses WordNet — confirm this download is still needed.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/ktyser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ktyser/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Read the labelled training set and the unlabelled test set from the working directory.
# NOTE(review): the training file name suggests stop words were already removed, while
# the test file carries no such suffix — confirm both received the same treatment.
train_csv, test_csv = "Train-no-stopwords.csv", "Test.csv"
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

In [3]:
# Kept for any other cell that references it. preprocess_text no longer needs it:
# every character listed here is already replaced by the letter-only regex below.
punctuations = string.punctuation + "’¶•@°©®™"

# Matches every character that is NOT an ASCII letter or in the Latin-1 range À-ÿ
# (accented letters; that range also happens to include × and ÷).
# Compiled once instead of on every call.
_NON_LETTER_RE = re.compile(r"[^a-zA-ZÀ-ÿ]")


def preprocess_text(text):
    """Normalise one document into a space-separated string of lowercase word tokens.

    Steps:
      1. lowercase the text;
      2. replace everything except ASCII letters and Latin-1 accented characters
         (À-ÿ) with a space — this already removes digits and punctuation, so no
         separate digit/punctuation pass is required;
      3. collapse runs of whitespace;
      4. tokenize with nltk.word_tokenize and drop tokens of two characters or
         fewer (a length filter, not a stop-word list).

    Parameters
    ----------
    text : str
        Raw document. ``None`` and ``NaN`` (how pandas represents a missing
        cell) are treated as an empty document; other non-string values are
        converted with ``str()``.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``""`` when nothing survives.
    """
    # A missing cell reaches Series.apply as None or float NaN; calling .lower()
    # on it would raise AttributeError and abort the whole column.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    lowered = str(text).lower()
    letters_only = _NON_LETTER_RE.sub(" ", lowered)
    collapsed = " ".join(letters_only.split())

    tokens = word_tokenize(collapsed)
    return " ".join(token for token in tokens if len(token) > 2)

In [4]:
# Add a `clean_text` column to both frames by running every raw `Text` entry
# through preprocess_text, so train and test share the same cleaning.
for frame in (train, test):
    frame['clean_text'] = frame['Text'].apply(preprocess_text)

In [5]:
# defining pipelines
def _bow_tfidf_pipeline(classifier):
    """Return a new pipeline: token counts -> TF-IDF weighting -> `classifier`."""
    return Pipeline(
        [
            ("vect", CountVectorizer()),
            ("tfidf", TfidfTransformer()),
            ("classifier", classifier),
        ]
    )


# Same feature extraction for both; only the final estimator differs
# (default-parameter SVC versus LinearSVC).
pipeline = _bow_tfidf_pipeline(svm.SVC())
pipeline2 = _bow_tfidf_pipeline(svm.LinearSVC())

In [6]:
# evaluating models: 5-fold cross-validation of each pipeline on the cleaned training text.
# With one label per document, micro-averaged precision is the fraction of
# documents classified correctly.
X_train, y_train = train['clean_text'], train['Label']

scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='precision_micro')
print(scores)

scores2 = cross_val_score(pipeline2, X_train, y_train, cv=5, scoring='precision_micro')
print(scores2)

[0.55208333 0.57142857 0.63414634 0.55052265 0.51567944]
[0.63194444 0.6271777  0.67595819 0.66202091 0.65156794]


In [7]:
# Mean cross-validation score per model, in definition order: SVC first, then LinearSVC.
for fold_scores in (scores, scores2):
    print(f"Final score is {fold_scores.mean()}")

Final score is 0.5647720673635307
Final score is 0.6497338366240806


In [8]:
# trying SMOTE sampling
# Both samplers and LinearSVC draw random numbers; pin their seeds so the
# cross-validation scores and the final predictions can be reproduced on re-run.
SEED = 42

# The samplers sit after TF-IDF because they operate on the numeric feature matrix;
# inside an imblearn Pipeline they are applied while fitting only, so each
# validation fold is scored on its original, un-resampled documents.
# NOTE(review): with both samplers left at their default sampling_strategy,
# RandomOverSampler already brings every class up to the majority count, which
# leaves SMOTE nothing to synthesise — confirm whether SMOTE is meant to contribute.
pipeline = imblearn.pipeline.Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ('ros', RandomOverSampler(random_state=SEED)),
        ('oversampler', SMOTE(random_state=SEED)),
        ("classifier", svm.LinearSVC(random_state=SEED)),
    ]
)

In [9]:
# 5-fold cross-validation of the over-sampling pipeline, using the same metric
# as the baseline runs so the numbers are directly comparable.
scores = cross_val_score(
    estimator=pipeline,
    X=train['clean_text'],
    y=train['Label'],
    cv=5,
    scoring='precision_micro',
)
print(scores)

[0.63888889 0.63763066 0.67595819 0.65156794 0.6445993 ]


In [10]:
# Collapse the per-fold scores into a single headline figure.
mean_cv_score = scores.mean()
print(f"Final score is {mean_cv_score}")


Final score is 0.6497289972899729


In [11]:
# making predictions with the best model
# Refit on the full cleaned training set (cross_val_score only fits clones).
pipeline.fit(train['clean_text'], train['Label'])
# Predict on the *cleaned* test text. The model was fitted on `clean_text`, so
# scoring the raw `Text` column would feed it tokens (digits, two-letter words, ...)
# that the training vocabulary never saw in that form.
test.loc[:, "Label"] = pipeline.predict(test['clean_text'])

In [12]:
# Show the predicted label next to each test document ID.
test.loc[:, ['ID', 'Label']]

Unnamed: 0,ID,Label
0,ID_ADHEtjTi,SOCIAL ISSUES
1,ID_AHfJktdQ,RELIGION
2,ID_AUJIHpZr,RELATIONSHIPS
3,ID_AUKYBbIM,SOCIAL ISSUES
4,ID_AZnsVPEi,HEALTH
...,...,...
615,ID_zdpOUWyJ,LAW/ORDER
616,ID_zhnOomuu,RELATIONSHIPS
617,ID_zmWHvBJb,LAW/ORDER
618,ID_zphjdFIb,SOCIAL ISSUES


In [13]:
# Uncomment to write the predictions out as a submission file:
#test[['ID','Label']].to_csv("submission.csv", index=False) 

In [14]:
# Add this model's predictions as the `SVM` column of the shared model-comparison
# sheet, then write the sheet back in place.
combiner = pd.read_csv("combiner.csv")

# Align on ID rather than on row position: a plain column assignment matches rows
# by index, so any difference in row order between combiner.csv and `test` would
# silently attach labels to the wrong documents before the file is overwritten.
# IDs missing from `test` come out as NaN.
svm_label_by_id = test.set_index('ID')['Label']
combiner['SVM'] = combiner['ID'].map(svm_label_by_id)

combiner.to_csv('combiner.csv',index=False)
combiner.head()

Unnamed: 0,ID,Text,SGD,MultiNB,Logistic Regression,KNN,SVM
0,ID_ADHEtjTi,Abambo odzikhweza akuchuluka Kafukufuku wa ap...,SOCIAL ISSUES,SOCIAL ISSUES,SOCIAL ISSUES,RELIGION,SOCIAL ISSUES
1,ID_AHfJktdQ,Ambuye Ziyaye Ayamikira Aphunzitsi a Tilitonse...,RELIGION,RELIGION,RELIGION,RELIGION,RELIGION
2,ID_AUJIHpZr,Anatcheleza: Akundiopseza a gogo wanga Akundi...,RELATIONSHIPS,RELATIONSHIPS,RELATIONSHIPS,RELATIONSHIPS,RELATIONSHIPS
3,ID_AUKYBbIM,Ulova wafika posauzana Adatenga digiri ya uph...,POLITICS,LAW/ORDER,SOCIAL ISSUES,POLITICS,SOCIAL ISSUES
4,ID_AZnsVPEi,"Dzombe kukoma, koma Kuyambira makedzana, pant...",HEALTH,HEALTH,HEALTH,EDUCATION,HEALTH
