In [55]:
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from collections import Counter
import imblearn
from imblearn.over_sampling import SMOTE,RandomOverSampler

from nltk import word_tokenize
from time import time
import pandas as pd
import re
import numpy as np
import gensim
import string

In [4]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

# Data Preprocessing

In [101]:
training_set = pd.read_csv("Train.csv")
training_set.sample(5)

Unnamed: 0,ID,Text,Label
330,ID_MhRLVrCv,Sabuside yayamba ndi njengunje Pulogalamu ya ...,FARMING
1271,ID_uaIRnKEg,Woganiziridwa kuba khanda anjatidwa ku Mzuzu ...,LAW/ORDER
753,ID_cGoUjmkl,Boma Lati Lilemba Ntchito Madotolo Ochuluka Bo...,HEALTH
908,ID_hqiqQOJU,HRDC Yauza A Malawi Akhale Tcheru pa Chisankho...,POLITICS
1235,ID_tctVZmuZ,Papa Akuyembekezeka kuchita Misa Yopemphelera ...,RELIGION


In [102]:
training_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1436 entries, 0 to 1435
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      1436 non-null   object
 1   Text    1436 non-null   object
 2   Label   1436 non-null   object
dtypes: object(3)
memory usage: 33.8+ KB


In [103]:
test_set = pd.read_csv("Test.csv")
test_set.sample(5)

Unnamed: 0,ID,Text
587,ID_xOPvEpYq,"Mutharika apempha bata Peter Mutharika, yemwe..."
325,ID_cVAHPaJJ,Tikolole motani chimanga? Alimi mmadera a dzi...
195,ID_OlMYVZHE,Papa Wakhazikitsa Masiku Apadera Oteteza Chile...
48,ID_DvKpsrBg,Anatchezera Akunyengana ndi mnzanga Zikomo An...
468,ID_nwWsswqZ,Sukulu ya Maranatha Yatseka Nthambi Zake Zitat...


In [104]:
punctuations = string.punctuation + "’¶•@°©®™"
stop_words = pd.read_csv("stopwords.csv", usecols=["Chichewa"])
sw = set(stop_words.values[:100].flatten().tolist())

In [105]:
def preprocess_text(text):
    """
    @param text string
    @return text string
    
    This function preprocess a given raw text by normalizing it to lowercase removing the stop words,
    punctuations and lemmatization
    """
        
    #string to lowercase
    txt = text.lower()
    
    # keep only ascii characters
    txt = re.sub(r"[^a-zA-ZÀ-ÿ]", " ", txt)
    
    # punctuation removal and map it to space
    translator = str.maketrans(punctuations, " "*len(punctuations))
    s = txt.translate(translator)
    
    # remove digits 
    no_digits = ''.join([i for i in s if not i.isdigit()])
    cleaner = " ".join(no_digits.split())
    
    # tokenize words and removing stop words 
    word_tokens = word_tokenize(cleaner)
    filtered_sentence = [w for w in word_tokens if len(w)>2 and w not in sw]
    filtered_sentence = " ".join(filtered_sentence)
    
    return filtered_sentence

In [106]:
# testing function
sample_text = training_set.Text[7]
print("RAW text before preprocessing :\n")
print(sample_text)
print("\n-------------------------------\n")
print("Processed text after preprocessing :\n")
print(preprocess_text(sample_text))

RAW text before preprocessing :

 Mayi wamalonda avulazidwa ku bt Titha Masamba, wa zaka 31, akumva ululu wadzaoneni. Kuti ayende akuyenera agwirire ndodo; sangagone chafufumimba koma chammbali kapena chagada; moyo wamtendere watha.
 Akuti adamuphera tsogolo lake: Masamba kumva ululu kunyumba kwake Akuti izitu zili chonchi chifukwa cha bala lomwe lili pabondo lake la kumanja lomwe lidasokedwa kuchipatala pambuyo pokhapidwa ndi chikwanje.
  Chisale watuluka nkumangidwanso  Sipakala waimitsa Nyumba ya Malamulo  Pa Wenela pasintha zedi Ulendo wa mayiyu wokagulitsa mandasi pa 7 July ndi womwe udabweretsa mavutowa pomwe anthu ena, omwe akuwaganizira kuti ogwira ntchito kukhonsolo ya mzinda wa Blantyre (city rangers) amene adamuchita chiwembu pomulanda malonda ake komanso kumuvulaza ndi chikwanje.
 Masamba akuti atangomwalira amuna ake mu 2007, iye adayamba geni yogulitsa mandasi kuti azisamalira banja lake la ana awiri. Malo amene amagulitsira malonda akewo akuti ndi ku Cold Storage pafu

In [107]:
# Applying the preprocessing function through all the data
training_set['clean_text'] = training_set.Text.apply(preprocess_text)
test_set['clean_text'] = test_set.Text.apply(preprocess_text)

In [109]:
training_set['clean_text'] = training_set['clean_text'].str.replace('ndi', '').replace('kuti', '')
test_set['clean_text'] = test_set['clean_text'].str.replace('ndi', '').replace('kuti', '')

In [110]:
training_set.sample(5)

Unnamed: 0,ID,Text,Label,clean_text
427,ID_QNAHlieY,Alimbana ndi ntchemberezandonda ku Zomba Undu...,FARMING,alimbana ntchemberezandonda zomba unduna zamal...
1189,ID_sIaymPRo,Bande Walimbikitsa Aphungu Anzake Kuti Agwire ...,POLITICS,bande walimbikitsa aphungu anzake agwire ntchi...
1008,ID_ltyEVrOY,World Vision Donates Buckets to Mangochi Polic...,SOCIAL,world vision donates buckets mangochi police m...
522,ID_TUIsSydd,Bambo Wafa Atawombedwa ndi Galimoto ku Kasungu...,SOCIAL,wafa atawombedwa galimoto kasungu wina zaka za...
296,ID_LRqnhrXL,Ndalama zakale zitha pa 23 May Banki yaikulu ...,ECONOMY,ndalama zakale zitha may banki yakumbutsa maba...


# Decision Tree

## Grid Search without oversampling

In [56]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

pipeline = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("classifier", DecisionTreeClassifier()),
    ]
)

In [57]:
parameters = {
    'classifier__max_leaf_nodes': list(range(2, 100)),
    'classifier__min_samples_split': [2, 3, 4],
    'classifier__max_depth' : [10, 20, 40, 60, None]
}

In [58]:
from sklearn.model_selection import GridSearchCV
from pprint import pprint

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

In [59]:
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(training_set['clean_text'], training_set['Label'])
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'classifier']
parameters:
{'classifier__max_leaf_nodes': [2,
                                3,
                                4,
                                5,
                                6,
                                7,
                                8,
                                9,
                                10,
                                11,
                                12,
                                13,
                                14,
                                15,
                                16,
                                17,
                                18,
                                19,
                                20,
                                21,
                                22,
                                23,
                                24,
                                25,
                                26,
                            

## Train best model Without Oversampling

In [92]:
%%time
pipeline = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("classifier", DecisionTreeClassifier(max_leaf_nodes=34, min_samples_split=3)),
    ]
)

CPU times: user 200 µs, sys: 8 µs, total: 208 µs
Wall time: 217 µs


In [85]:
scores = cross_val_score(pipeline, training_set['clean_text'], training_set['Label'], cv=5, scoring='precision_micro')
print(scores)

[0.45833333 0.48780488 0.50174216 0.51567944 0.49825784]


In [86]:
print(f"Final score is {scores.mean()}")

Final score is 0.4923635307781649


## Grid Search and Train With Oversampling

In [96]:
pipeline = imblearn.pipeline.Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ('ros', RandomOverSampler()),
        ('oversampler', SMOTE()),
        ("classifier", DecisionTreeClassifier())
    ]
)
parameters = {
    'classifier__max_leaf_nodes': list(range(2, 100)),
    'classifier__min_samples_split': [2, 3, 4]
}
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

In [97]:
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(training_set['clean_text'], training_set['Label'])
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'ros', 'oversampler', 'classifier']
parameters:
{'classifier__max_leaf_nodes': [2,
                                3,
                                4,
                                5,
                                6,
                                7,
                                8,
                                9,
                                10,
                                11,
                                12,
                                13,
                                14,
                                15,
                                16,
                                17,
                                18,
                                19,
                                20,
                                21,
                                22,
                                23,
                                24,
                                25,
                                26,
      

In [98]:
%%time
pipeline = imblearn.pipeline.Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ('ros', RandomOverSampler()),
        ('oversampler', SMOTE()),
        ("classifier", DecisionTreeClassifier(max_leaf_nodes=75, min_samples_split=3)),
    ]
)

CPU times: user 2.02 ms, sys: 0 ns, total: 2.02 ms
Wall time: 19.1 ms


In [99]:
scores = cross_val_score(pipeline, training_set['clean_text'], training_set['Label'], cv=5, scoring='precision_micro')
print(scores)

[0.36805556 0.36933798 0.39372822 0.48432056 0.40418118]


In [100]:
print(f"Final score is {scores.mean()}")

Final score is 0.4039246999612853


# Random Forest

## Grid Search without oversampling

In [16]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

pipeline = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("classifier", RandomForestClassifier()),
    ]
)

In [22]:
parameters = {
    'classifier__bootstrap': [True, False],
    'classifier__max_depth': [10, 20, 40, 60, None],
    'classifier__max_features': ['auto', 'sqrt'],
    'classifier__n_estimators': [200, 500, 1000, 1500]
}

In [24]:
from sklearn.model_selection import GridSearchCV
from pprint import pprint

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

In [25]:
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(training_set['clean_text'], training_set['Label'])
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'classifier']
parameters:
{'classifier__bootstrap': [True, False],
 'classifier__max_depth': [10, 20, 40, 60, None],
 'classifier__max_features': ['auto', 'sqrt'],
 'classifier__n_estimators': [200, 500, 1000, 1500]}
Fitting 5 folds for each of 80 candidates, totalling 400 fits




done in 4931.146s

Best score: 0.581
Best parameters set:
	classifier__bootstrap: False
	classifier__max_depth: 40
	classifier__max_features: 'sqrt'
	classifier__n_estimators: 200


## Train best model Without Oversampling

In [114]:
%%time
pipeline = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("classifier", RandomForestClassifier(n_estimators=200, max_features='sqrt', max_depth=40, bootstrap=False)),
    ]
)

CPU times: user 253 µs, sys: 0 ns, total: 253 µs
Wall time: 260 µs


In [115]:
scores = cross_val_score(pipeline, training_set['clean_text'], training_set['Label'], cv=5, scoring='precision_micro')
print(scores)

[0.58680556 0.57491289 0.61324042 0.57491289 0.55052265]


In [116]:
print(f"Final score is {scores.mean()}")

Final score is 0.5800788811459543


## Grid Search and Train With Oversampling

In [40]:
pipeline = imblearn.pipeline.Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ('ros', RandomOverSampler()),
        ('oversampler', SMOTE()),
        ("classifier", RandomForestClassifier())
    ]
)
parameters = {
    'classifier__bootstrap': [True, False],
    'classifier__max_depth': [10, 20, 40, 60, None],
    'classifier__max_features': ['auto', 'sqrt'],
    'classifier__n_estimators': [100, 200, 500, 1000]
}
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

In [41]:
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(training_set['clean_text'], training_set['Label'])
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'ros', 'oversampler', 'classifier']
parameters:
{'classifier__bootstrap': [True, False],
 'classifier__max_depth': [10, 20, 40, 60, None],
 'classifier__max_features': ['auto', 'sqrt'],
 'classifier__n_estimators': [100, 200, 500, 1000]}
Fitting 5 folds for each of 80 candidates, totalling 400 fits




done in 4760.212s

Best score: 0.610
Best parameters set:
	classifier__bootstrap: True
	classifier__max_depth: 40
	classifier__max_features: 'sqrt'
	classifier__n_estimators: 500


In [120]:
%%time
pipeline = imblearn.pipeline.Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ('ros', RandomOverSampler()),
        ('oversampler', SMOTE()),
        ("classifier", RandomForestClassifier(n_estimators=500, max_features='sqrt', max_depth=40, bootstrap=True)),
        #("classifier", RandomForestClassifier())
    ]
)

CPU times: user 509 µs, sys: 0 ns, total: 509 µs
Wall time: 539 µs


In [121]:
scores = cross_val_score(pipeline, training_set['clean_text'], training_set['Label'], cv=5, scoring='precision_micro')
print(scores)

[0.59722222 0.57491289 0.64808362 0.6271777  0.57491289]


In [122]:
print(f"Final score is {scores.mean()}")

Final score is 0.604461866047232


# Predict test data

In [123]:
pipeline.fit( training_set['clean_text'], training_set['Label'])
test_set.loc[:,"Label"] = pipeline.predict(test_set['Text'])

In [124]:
test_set[['ID','Label']]

Unnamed: 0,ID,Label
0,ID_ADHEtjTi,SOCIAL ISSUES
1,ID_AHfJktdQ,RELIGION
2,ID_AUJIHpZr,RELATIONSHIPS
3,ID_AUKYBbIM,LAW/ORDER
4,ID_AZnsVPEi,FARMING
...,...,...
615,ID_zdpOUWyJ,SOCIAL
616,ID_zhnOomuu,RELATIONSHIPS
617,ID_zmWHvBJb,LAW/ORDER
618,ID_zphjdFIb,SOCIAL ISSUES


In [126]:
test_set[['ID','Label']].to_csv("submission.csv", index=False) 

Best Score on website: 0.61935 \
By Random Forest with Oversampling