In [1]:
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer, LabelEncoder
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split
from collections import Counter
import imblearn
from imblearn.over_sampling import SMOTE,RandomOverSampler

from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from time import time
import pandas as pd
import re
import numpy as np
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\minhc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\minhc\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
train = pd.read_csv("Train.csv")
train.sample(5)

Unnamed: 0,ID,Text,Label
1413,ID_zEqcsLPA,Mutharika asankha nduna Pangotha maola sipika...,POLITICS
1393,ID_ydgaxsDF,Kulimbana ndi alaliki Ndidakhala pa Wenela ts...,RELIGION
667,ID_ZNamCvoS,Akayidi Atsopano Adziyezedwa Coronavirus-MPS N...,HEALTH
590,ID_VxGQuOxk,Radio Maria Itsekera Nyengo ya Mariatona Wolem...,RELIGION
30,ID_BesFEpeJ,Tinkaimba limodzi kwaya Mdalitso suoneka paku...,SOCIAL ISSUES


In [3]:
train_no_stop = pd.read_csv("Train-no-stopwords.csv")
train_no_stop.sample(5)

Unnamed: 0,ID,Text,Label
1117,ID_pnMsdnVA,lamulo latsopano la zipani alikambirana lino...,POLITICS
1414,ID_zHBsAcgx,dpp yalonjeza kampeni ya bata chipani cha demo...,POLITICS
140,ID_FXJDGOak,zipani zikufuna machawi chisankho zipani zan...,POLITICS
45,ID_COPLUDra,msoliza kayuni share spoils chisale watulu...,SOCIAL ISSUES
127,ID_EtUHjqWm,mwana wa zaka 13 mavuto osakata ikakuona lit...,SOCIAL ISSUES


In [4]:
test = pd.read_csv("Test.csv")
test.sample(5)

Unnamed: 0,ID,Text
443,ID_mKaODodw,Adzudzula MEC Katswiri wa ndale wa sukulu yau...
130,ID_JqVQfLOR,Mabungwe ati boma likonze zinthu mu 2015 Kuph...
188,ID_OSqLregQ,Adzudzula Admarc Mafumu ndi alimi ena mzigawo...
435,ID_lPYiWTXw,Taphulanji mzaka 52? Anthu komanso atsogoleri...
504,ID_rIkPdtZy,Za mafumu mmatuni ndi mmizinda Kalata yangayi...


In [5]:
wordnet = WordNetLemmatizer()

def preprocess(text):
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()
    text = text.split()
    text = [wordnet.lemmatize(word) for word in text]
    text = ' '.join(text)
    return text

In [6]:
train['Text'] = train['Text'].apply(preprocess)
train_no_stop['Text'] = train_no_stop['Text'].apply(preprocess)
test['Text'] = test['Text'].apply(preprocess)
print(train.head())
print(train_no_stop.head())
print(test.head())


            ID                                               Text      Label
0  ID_AASHwXxg  mwangonde khansala wachinyamata akamati achiny...   POLITICS
1  ID_AGoFySzn  mcp siidakhutire ndi kalembera chipani cha mal...   POLITICS
2  ID_AGrrkBGP  bungwe la manepo lapempha boma liganizire anth...     HEALTH
3  ID_AIJeigeG  ndale zogawanitsa miyambo zanyanya si zachilen...   POLITICS
4  ID_APMprMbV  nanga wapolisi ataphofomoka masiku ano sichikh...  LAW/ORDER
            ID                                               Text      Label
0  ID_AASHwXxg  mwangonde khansala wachinyamata akamati achiny...   POLITICS
1  ID_AGoFySzn  mcp siidakhutire kalembera chipani cha malawi ...   POLITICS
2  ID_AGrrkBGP  bungwe la manepo lapempha boma liganizire achi...     HEALTH
3  ID_AIJeigeG  ndale zogawanitsa miyambo zanyanya si zachilen...   POLITICS
4  ID_APMprMbV  nanga wapolisi ataphofomoka masiku ano sichikh...  LAW/ORDER
            ID                                               Text
0  ID_ADHE

### With stop words ###

In [7]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(train['Text']).toarray()
training = pd.DataFrame(X, columns=vectorizer.get_feature_names())
print(training.shape)

X_test = vectorizer.transform(test['Text']).toarray()
test_new = pd.DataFrame(X_test, columns=vectorizer.get_feature_names())
print(test_new.shape)

(1436, 49582)
(620, 49582)


In [8]:
X = training
y = train['Label']
label_encoder = LabelEncoder()
y_label = label_encoder.fit_transform(y)
smote = SMOTE()


In [9]:
X, y_label = smote.fit_resample(X, y_label)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y_label, test_size = 0.1, random_state = 42)

sgd = SGDClassifier(loss='hinge', max_iter = 20)
sgd.fit(X_train, y_train)

predictions = sgd.predict(X_test)
print("Train Accuracy Score:", sgd.score(X_train, y_train))
print("Test Accuracy Score:", accuracy_score(y_test, predictions))

Train Accuracy Score: 0.9990043807248108
Test Accuracy Score: 0.96415770609319


In [11]:
test.loc[:,"Label"] = label_encoder.inverse_transform(sgd.predict(test_new))
test[['ID','Label']].to_csv("SGD_with_StopWords_submission.csv", index=False) 

### Score on website: 0.632258064516129 ###

### Without stop words ###

In [9]:
vectorizer = TfidfVectorizer()
X_no_stop = vectorizer.fit_transform(train_no_stop['Text']).toarray()
training_no_stop = pd.DataFrame(X_no_stop, columns=vectorizer.get_feature_names())
print(training_no_stop.shape)

X_test = vectorizer.transform(test['Text']).toarray()
test_new = pd.DataFrame(X_test, columns=vectorizer.get_feature_names())
print(test_new.shape)

X_no_stop = training_no_stop
y = train['Label']

label_encoder = LabelEncoder()
y_label = label_encoder.fit_transform(y)
smote = SMOTE()

(1436, 49515)
(620, 49515)


In [10]:
X, y_label = smote.fit_resample(X_no_stop, y_label)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y_label, test_size = 0.1, random_state = 42)

sgd = SGDClassifier(loss='hinge', max_iter = 20)
sgd.fit(X_train, y_train)

predictions = sgd.predict(X_test)
print("Train Accuracy Score:", sgd.score(X_train, y_train))
print("Test Accuracy Score:", accuracy_score(y_test, predictions))

Train Accuracy Score: 0.9986061330147352
Test Accuracy Score: 0.967741935483871


In [12]:
test.loc[:,"Label"] = label_encoder.inverse_transform(sgd.predict(test_new))
test[['ID','Label']].to_csv("SGD_without_StopWords_submission.csv", index=False) 

### Score on website: 0.6161290322580645 ###

In [34]:
combiner = pd.read_csv("combiner.csv")
combiner['SGD'] = test['Label']
combiner.to_csv('combiner.csv', index = False)

In [35]:
combiner.head()

Unnamed: 0,ID,Text,SGD
0,ID_ADHEtjTi,Abambo odzikhweza akuchuluka Kafukufuku wa ap...,SOCIAL ISSUES
1,ID_AHfJktdQ,Ambuye Ziyaye Ayamikira Aphunzitsi a Tilitonse...,RELIGION
2,ID_AUJIHpZr,Anatcheleza: Akundiopseza a gogo wanga Akundi...,RELATIONSHIPS
3,ID_AUKYBbIM,Ulova wafika posauzana Adatenga digiri ya uph...,POLITICS
4,ID_AZnsVPEi,"Dzombe kukoma, koma Kuyambira makedzana, pant...",HEALTH
