# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import spacy
import random
import re
import time

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

In [3]:
import imblearn
from imblearn.over_sampling import SMOTE,RandomOverSampler

In [4]:
nlp_spacy = spacy.load('en_core_web_sm')

#Storing English Stop Words in a List
english_stop_words = nlp_spacy.Defaults.stop_words
print("English Stop Words-",len(english_stop_words))

English Stop Words- 326


# Reading Datasets

In [5]:
train_df = pd.read_csv("Train.csv")
print("No of training comments =",train_df.shape[0])

No of training comments = 1436


In [6]:
print("Distribution of Labels in Training Dataset\n")
print(train_df.Label.value_counts())

Distribution of Labels in Training Dataset

POLITICS                279
SOCIAL                  152
RELIGION                147
LAW/ORDER               136
SOCIAL ISSUES           134
HEALTH                  127
ECONOMY                  86
FARMING                  78
SPORTS                   49
EDUCATION                43
RELATIONSHIPS            39
WILDLIFE/ENVIRONMENT     36
OPINION/ESSAY            26
LOCALCHIEFS              25
CULTURE                  23
WITCHCRAFT               16
MUSIC                    15
TRANSPORT                11
ARTS AND CRAFTS           7
FLOODING                  7
Name: Label, dtype: int64


In [7]:
test_df = pd.read_csv("Test.csv")
print("No of test comments =",test_df.shape[0])

No of test comments = 620


In [8]:
# Loading 1000 common Chichewa words

stop_df = pd.read_csv("stopwords.csv")
chichewa_1000_words = dict(zip(stop_df["Chichewa"], stop_df["in English"]))

In [9]:
print("Some examples of chichewa words and their english meanings")
random.sample(list(chichewa_1000_words.items()),5)

Some examples of chichewa words and their english meanings


[('wotentha', 'hot'),
 ('kuphunzira', 'learn'),
 ('thanthwe', 'rock'),
 ('izi', 'this'),
 ('nyimbo', 'song')]

In [10]:
sample_df = pd.read_csv('SampleSubmission.csv')

# Pre-Processing

In [11]:
# Lower Cased, keep only ASCII characters, tokenized, remove punctuation, remove stop words (flag used), removed digits

def preprocess(text, remove_stop=False):
    
    #Convert to lower case
    text_2 = text.lower()

    # keep only ascii characters
    text_3 = re.sub(r"[^a-zA-ZÀ-ÿ]", " ", text_2)

    spacy_object = nlp_spacy(text_3)
    sentences = list(spacy_object.sents) 
    tokens = []

    for sentence in sentences:
        for token in sentence:
            if not(token.is_punct):  #Remove Punctuations
                if remove_stop and token.text in chichewa_1000_words.keys(): #Check if word is in the list of 1000 common Chichewa words 
                    if not (chichewa_1000_words[token.text] in english_stop_words): #If word is a stop word, remove it.
                        tokens.append(token.text)  
                else:
                    tokens.append(token.text)

    tokenized = " ".join(tokens) #Join all tokens as a string.
    
    result = ''.join([i for i in tokenized if not i.isdigit()]) #Remove all digits
  
    return result

In [12]:
train_df_clean = train_df.copy()
for i in train_df_clean.index:
    train_df_clean.loc[i,"Text"] = preprocess(train_df_clean.loc[i,"Text"])

In [13]:
test_df_clean = test_df.copy()
for i in test_df_clean.index:
    test_df_clean.loc[i,"Text"] = preprocess(test_df_clean.loc[i,"Text"])

In [14]:
train_df_no_stop = train_df.copy()
for i in train_df_no_stop.index:
    train_df_no_stop.loc[i,"Text"] = preprocess(train_df_no_stop.loc[i,"Text"], remove_stop=True)

In [15]:
test_df_no_stop = test_df.copy()
for i in test_df_no_stop.index:
    test_df_no_stop.loc[i,"Text"] = preprocess(test_df_no_stop.loc[i,"Text"], remove_stop=True)

In [16]:
print("Example Raw Data-\n")
print(train_df.loc[0,"Text"][0:200])
print("\nExample Processed Data-\n")
print(train_df_clean.loc[0,"Text"][0:200])
print("\nExample Processed Data without stop words-\n")
print(train_df_no_stop.loc[0,"Text"][0:200])

Example Raw Data-

 Mwangonde: Khansala wachinyamata Akamati achinyamata ndi atsogoleri a mawa, ambiri amaganiza kuti izi ndi nkhambakamwa chabe. Koma achinyamata ena, monga Lusubilo Mwangonde, akukwaniritsa akupherezet

Example Processed Data-

  mwangonde   khansala wachinyamata akamati achinyamata ndi atsogoleri a mawa   ambiri amaganiza kuti izi ndi nkhambakamwa chabe   koma achinyamata ena   monga lusubilo mwangonde   akukwaniritsa akuph

Example Processed Data without stop words-

  mwangonde   khansala wachinyamata akamati achinyamata atsogoleri mawa   ambiri amaganiza nkhambakamwa chabe   achinyamata ena   monga lusubilo mwangonde   akukwaniritsa akupherezetsa mawuwa osati po


# Bagging Model (Score on website - 0.5870967741935483)

In [17]:
X_train = train_df_clean.Text
y_train = train_df_clean.Label

In [18]:
model_1 = imblearn.pipeline.Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', BaggingClassifier(KNeighborsClassifier())),
                   ])

In [19]:
scores = cross_val_score(model_1, X_train, y_train, cv=5, scoring='accuracy')
print(f"5-fold cross validation accuracy = {scores.mean()}")

5-fold cross validation accuracy = 0.5633734998064266


In [20]:
model_1.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf',
                 BaggingClassifier(base_estimator=KNeighborsClassifier()))])

In [21]:
submission_dict = {"ID":[],"Label":[]}
for i in sample_df.ID:

    submission_dict["ID"].append(i)
    
    text = test_df_clean.loc[test_df_clean['ID'] == i,"Text"].values[0]
    text_input = [text]
    pred = model_1.predict(text_input)
    
    submission_dict["Label"].append(pred.item())

submission_df_1 = pd.DataFrame(submission_dict)

In [22]:
submission_df_1.head()

Unnamed: 0,ID,Label
0,ID_sQaPRMWO,LAW/ORDER
1,ID_TanclvfR,RELIGION
2,ID_CNbveyvk,POLITICS
3,ID_MclKMhyP,SOCIAL ISSUES
4,ID_rNrmXOGD,ECONOMY


In [23]:
#submission_df_1.to_csv('Bagging_Predicted_Submission.csv', index = False)

# Bagging Model (with Oversampling) (Score on website - 0.4967741935483871)

In [24]:
X_train = train_df_clean.Text
y_train = train_df_clean.Label

In [25]:
model_2 = imblearn.pipeline.Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('ros', RandomOverSampler()),
                    ('oversampler', SMOTE()),
                    ('clf', BaggingClassifier(KNeighborsClassifier())),
                   ])

In [26]:
scores = cross_val_score(model_2, X_train, y_train, cv=5, scoring='accuracy')
print(f"5-fold cross validation accuracy = {scores.mean()}")

5-fold cross validation accuracy = 0.5056039488966318


In [27]:
model_2.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('ros', RandomOverSampler()), ('oversampler', SMOTE()),
                ('clf',
                 BaggingClassifier(base_estimator=KNeighborsClassifier()))])

In [28]:
submission_dict = {"ID":[],"Label":[]}
for i in sample_df.ID:

    submission_dict["ID"].append(i)
    
    text = test_df_clean.loc[test_df_clean['ID'] == i,"Text"].values[0]
    text_input = [text]
    pred = model_2.predict(text_input)
    
    submission_dict["Label"].append(pred.item())

submission_df_2 = pd.DataFrame(submission_dict)

In [29]:
submission_df_2.head()

Unnamed: 0,ID,Label
0,ID_sQaPRMWO,LAW/ORDER
1,ID_TanclvfR,RELIGION
2,ID_CNbveyvk,SOCIAL ISSUES
3,ID_MclKMhyP,SOCIAL ISSUES
4,ID_rNrmXOGD,ECONOMY


In [30]:
#submission_df_2.to_csv('Bagging_Predicted_Submission_2.csv', index = False)

# Bagging Model (without Stop Words) (Score on website - 0.6064516129032258)

In [31]:
X_train = train_df_no_stop.Text
y_train = train_df_no_stop.Label

In [32]:
model_3 = imblearn.pipeline.Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', BaggingClassifier(KNeighborsClassifier())),
                   ])

In [33]:
scores = cross_val_score(model_3, X_train, y_train, cv=5, scoring='accuracy')
print(f"5-fold cross validation accuracy = {scores.mean()}")

5-fold cross validation accuracy = 0.5773011033681765


In [34]:
model_3.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf',
                 BaggingClassifier(base_estimator=KNeighborsClassifier()))])

In [35]:
submission_dict = {"ID":[],"Label":[]}
for i in sample_df.ID:

    submission_dict["ID"].append(i)
    
    text = test_df_no_stop.loc[test_df_no_stop['ID'] == i,"Text"].values[0]
    text_input = [text]
    pred = model_3.predict(text_input)
    
    submission_dict["Label"].append(pred.item())

submission_df_3 = pd.DataFrame(submission_dict)

In [36]:
submission_df_3.head()

Unnamed: 0,ID,Label
0,ID_sQaPRMWO,LAW/ORDER
1,ID_TanclvfR,RELIGION
2,ID_CNbveyvk,POLITICS
3,ID_MclKMhyP,SOCIAL ISSUES
4,ID_rNrmXOGD,ECONOMY


In [37]:
#submission_df_3.to_csv('Bagging_Predicted_Submission_3.csv', index = False)
submission_df_3.to_csv('Bagging_Predicted_Submission.csv', index = False)

# Combiner

In [39]:
combiner = pd.read_csv("combiner.csv")
combiner['Bagging'] = submission_df_3['Label']
combiner.to_csv('combiner.csv', index = False)
combiner.head()