In [24]:
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from collections import Counter
import imblearn
from imblearn.over_sampling import SMOTE,RandomOverSampler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score

from nltk import word_tokenize
from time import time
import pandas as pd
import re
import numpy as np
import gensim
import string
from gensim.models import Word2Vec, Phrases

In [4]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

# Data Preprocessing

In [5]:
training_set = pd.read_csv("Train.csv")
training_set.sample(5)

Unnamed: 0,ID,Text,Label
606,ID_WnLgBJdY,WESM Ikuthandiza Alimi Kupanga Manyowa Bungwe ...,FARMING
223,ID_IVUpzLZS,YCW Imangira Nyumba Mayi Wachikulire Wolemba: ...,SOCIAL
1154,ID_rJHYjLHj,Kasambara: Akhazikitsa chimbale mwezi uno Mar...,MUSIC
1115,ID_pmLnsGZZ,Ndale pamaliro nchitonzo Katswiri pa zachikha...,POLITICS
77,ID_DQZzdkEq,Awiri Afa Pochokera Ku Interview Ya Zaumoyo An...,SOCIAL


In [6]:
training_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1436 entries, 0 to 1435
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      1436 non-null   object
 1   Text    1436 non-null   object
 2   Label   1436 non-null   object
dtypes: object(3)
memory usage: 33.8+ KB


In [7]:
test_set = pd.read_csv("Test.csv")
test_set.sample(5)

Unnamed: 0,ID,Text
346,ID_eCJzliVf,Papa Wati Masewero Olimbitsa Thupi Amabweretsa...
602,ID_yPLbZVkn,Arkidayosizi ya Blantyre Yayamikira Umodzi Pak...
529,ID_sluGXbji,TB ya kumsana imapha ziwalo Dokotala wothandi...
318,ID_bMpMazLm,Rasta akaseweza zaka 6 Mlandu woba njinga kom...
496,ID_qPSOUOBY,"Anthu 29 Anjatidwa ku Mchesi, Biwi Apolisi mu ..."


In [8]:
punctuations = string.punctuation + "’¶•@°©®™"
stop_words = pd.read_csv("stopwords.csv", usecols=["Chichewa"])
sw = set(stop_words.values[:100].flatten().tolist())

In [9]:
def preprocess_text(text):
    """
    @param text string
    @return text string
    
    This function preprocess a given raw text by normalizing it to lowercase removing the stop words,
    punctuations and lemmatization
    """
        
    #string to lowercase
    txt = text.lower()
    
    # keep only ascii characters
    txt = re.sub(r"[^a-zA-ZÀ-ÿ]", " ", txt)
    
    # punctuation removal and map it to space
    translator = str.maketrans(punctuations, " "*len(punctuations))
    s = txt.translate(translator)
    
    # remove digits 
    no_digits = ''.join([i for i in s if not i.isdigit()])
    cleaner = " ".join(no_digits.split())
    
    # tokenize words and removing stop words 
    word_tokens = word_tokenize(cleaner)
    filtered_sentence = [w for w in word_tokens if len(w)>2 and w not in sw]
    filtered_sentence = " ".join(filtered_sentence)
    
    return filtered_sentence

In [10]:
# testing function
sample_text = training_set.Text[7]
print("RAW text before preprocessing :\n")
print(sample_text)
print("\n-------------------------------\n")
print("Processed text after preprocessing :\n")
print(preprocess_text(sample_text))

RAW text before preprocessing :

 Mayi wamalonda avulazidwa ku bt Titha Masamba, wa zaka 31, akumva ululu wadzaoneni. Kuti ayende akuyenera agwirire ndodo; sangagone chafufumimba koma chammbali kapena chagada; moyo wamtendere watha.
 Akuti adamuphera tsogolo lake: Masamba kumva ululu kunyumba kwake Akuti izitu zili chonchi chifukwa cha bala lomwe lili pabondo lake la kumanja lomwe lidasokedwa kuchipatala pambuyo pokhapidwa ndi chikwanje.
  Chisale watuluka nkumangidwanso  Sipakala waimitsa Nyumba ya Malamulo  Pa Wenela pasintha zedi Ulendo wa mayiyu wokagulitsa mandasi pa 7 July ndi womwe udabweretsa mavutowa pomwe anthu ena, omwe akuwaganizira kuti ogwira ntchito kukhonsolo ya mzinda wa Blantyre (city rangers) amene adamuchita chiwembu pomulanda malonda ake komanso kumuvulaza ndi chikwanje.
 Masamba akuti atangomwalira amuna ake mu 2007, iye adayamba geni yogulitsa mandasi kuti azisamalira banja lake la ana awiri. Malo amene amagulitsira malonda akewo akuti ndi ku Cold Storage pafu

In [11]:
# Applying the preprocessing function through all the data
training_set['clean_text'] = training_set.Text.apply(preprocess_text)
test_set['clean_text'] = test_set.Text.apply(preprocess_text)

In [12]:
training_set['clean_text'] = training_set['clean_text'].str.replace('ndi', '').replace('kuti', '')
test_set['clean_text'] = test_set['clean_text'].str.replace('ndi', '').replace('kuti', '')

In [13]:
training_set.sample(5)

Unnamed: 0,ID,Text,Label,clean_text
873,ID_ggGOcZRW,Olowa mdziko mozemba aonjeza mavuto mndende M...,POLITICS,olowa mdziko mozemba aonjeza mavuto mndende ma...
1251,ID_uETOtdkb,Tinkakhala nyumba zoyandikana Mwayi wa banja ...,SOCIAL ISSUES,tinkakhala nyumba zoyakana mwayi banja umapeze...
1108,ID_pdNmeuRK,Joyce Banda Wauza Anthu Aku Zomba Avotere Chak...,POLITICS,joyce banda wauza aku zomba avotere chakwera m...
691,ID_ZtXFlfQC,MSE Ipereka Maphunziro apa Intaneti kwa Atolan...,ECONOMY,mse ipereka maphunziro apa intaneti kwa atolan...
1340,ID_wpLpuNtY,Papa Achita Misa Yokumbukira Ulendo Wake Wokac...,RELIGION,papa achita misa yokumbukira ulendo wokacheza ...


# Random Forest w2v

In [14]:
chichewa_tkns = []
for s in training_set['Text']:
  chichewa_tkns.append(word_tokenize(s))

In [15]:
bigram_transformer = Phrases(chichewa_tkns)

In [16]:
w2vmodel = Word2Vec(bigram_transformer[chichewa_tkns], size = 300, window = 5, min_count = 1, workers = 2)
#model.save('chichewa_w2v.model')



In [19]:
def get_avg_w2v(s):
  vec = 0
  n = 0
  for token in word_tokenize(s):
    if token in w2vmodel:
      vec += w2vmodel.wv[token]
      n += 1
  return vec / n

In [22]:
x_train, x_test, y_train, y_test = train_test_split(training_set['clean_text'].values, training_set['Label'].values, test_size = .15, stratify = training_set['Label'].values)

vectorizer = TfidfVectorizer()
x_train_vec = vectorizer.fit_transform(x_train).toarray()
x_test_vec = vectorizer.transform(x_test).toarray()

x_train_w2v = np.array([get_avg_w2v(x) for x in x_train])
x_test_w2v = np.array([get_avg_w2v(x) for x in x_test])

x_train_w2v = np.hstack([x_train_vec, x_train_w2v])
x_test_w2v = np.hstack([x_test_vec, x_test_w2v])

  """


In [23]:
clf = RandomForestClassifier()
clf.fit(x_train_w2v, y_train)

RandomForestClassifier()

In [25]:
pred = clf.predict(x_test_w2v)
accuracy_score(y_test, pred)

0.46296296296296297

In [26]:
vectorizer = TfidfVectorizer()
x_train_vec = vectorizer.fit_transform(training_set['clean_text'].values).toarray()
x_test_vec = vectorizer.transform(test_set['clean_text'].values).toarray()

x_train_w2v = np.array([get_avg_w2v(x) for x in training_set['clean_text'].values])
x_test_w2v = np.array([get_avg_w2v(x) for x in test_set['clean_text'].values])

x_train_w2v = np.hstack([x_train_vec, x_train_w2v])
x_test_w2v = np.hstack([x_test_vec, x_test_w2v])

clf = RandomForestClassifier()
clf.fit(x_train_w2v, training_set['Label'].values)

  """


RandomForestClassifier()

In [27]:
pred = clf.predict(x_test_w2v)
test_set["Label"] = pred

In [28]:
test_set[['ID','Label']]

Unnamed: 0,ID,Label
0,ID_ADHEtjTi,POLITICS
1,ID_AHfJktdQ,RELIGION
2,ID_AUJIHpZr,SOCIAL ISSUES
3,ID_AUKYBbIM,POLITICS
4,ID_AZnsVPEi,LAW/ORDER
...,...,...
615,ID_zdpOUWyJ,POLITICS
616,ID_zhnOomuu,SOCIAL ISSUES
617,ID_zmWHvBJb,POLITICS
618,ID_zphjdFIb,SOCIAL ISSUES


In [29]:
test_set[['ID','Label']].to_csv("submission.csv", index=False) 

# Decision Tree tfidf

## Grid Search without oversampling

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

pipeline = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("classifier", DecisionTreeClassifier()),
    ]
)

In [None]:
parameters = {
    'classifier__max_leaf_nodes': list(range(2, 100)),
    'classifier__min_samples_split': [2, 3, 4],
    'classifier__max_depth' : [10, 20, 40, 60, None]
}

In [None]:
from sklearn.model_selection import GridSearchCV
from pprint import pprint

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

In [None]:
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(training_set['clean_text'], training_set['Label'])
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'classifier']
parameters:
{'classifier__max_leaf_nodes': [2,
                                3,
                                4,
                                5,
                                6,
                                7,
                                8,
                                9,
                                10,
                                11,
                                12,
                                13,
                                14,
                                15,
                                16,
                                17,
                                18,
                                19,
                                20,
                                21,
                                22,
                                23,
                                24,
                                25,
                                26,
                            

## Train best model Without Oversampling

In [None]:
%%time
pipeline = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("classifier", DecisionTreeClassifier(max_leaf_nodes=34, min_samples_split=3)),
    ]
)

CPU times: user 200 µs, sys: 8 µs, total: 208 µs
Wall time: 217 µs


In [None]:
scores = cross_val_score(pipeline, training_set['clean_text'], training_set['Label'], cv=5, scoring='precision_micro')
print(scores)

[0.45833333 0.48780488 0.50174216 0.51567944 0.49825784]


In [None]:
print(f"Final score is {scores.mean()}")

Final score is 0.4923635307781649


## Grid Search and Train With Oversampling

In [None]:
pipeline = imblearn.pipeline.Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ('ros', RandomOverSampler()),
        ('oversampler', SMOTE()),
        ("classifier", DecisionTreeClassifier())
    ]
)
parameters = {
    'classifier__max_leaf_nodes': list(range(2, 100)),
    'classifier__min_samples_split': [2, 3, 4]
}
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

In [None]:
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(training_set['clean_text'], training_set['Label'])
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'ros', 'oversampler', 'classifier']
parameters:
{'classifier__max_leaf_nodes': [2,
                                3,
                                4,
                                5,
                                6,
                                7,
                                8,
                                9,
                                10,
                                11,
                                12,
                                13,
                                14,
                                15,
                                16,
                                17,
                                18,
                                19,
                                20,
                                21,
                                22,
                                23,
                                24,
                                25,
                                26,
      

In [None]:
%%time
pipeline = imblearn.pipeline.Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ('ros', RandomOverSampler()),
        ('oversampler', SMOTE()),
        ("classifier", DecisionTreeClassifier(max_leaf_nodes=75, min_samples_split=3)),
    ]
)

CPU times: user 2.02 ms, sys: 0 ns, total: 2.02 ms
Wall time: 19.1 ms


In [None]:
scores = cross_val_score(pipeline, training_set['clean_text'], training_set['Label'], cv=5, scoring='precision_micro')
print(scores)

[0.36805556 0.36933798 0.39372822 0.48432056 0.40418118]


In [None]:
print(f"Final score is {scores.mean()}")

Final score is 0.4039246999612853


# Random Forest tfidf

## Grid Search without oversampling

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

pipeline = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("classifier", RandomForestClassifier()),
    ]
)

In [None]:
parameters = {
    'classifier__bootstrap': [True, False],
    'classifier__max_depth': [10, 20, 40, 60, None],
    'classifier__max_features': ['auto', 'sqrt'],
    'classifier__n_estimators': [200, 500, 1000, 1500]
}

In [None]:
from sklearn.model_selection import GridSearchCV
from pprint import pprint

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

In [None]:
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(training_set['clean_text'], training_set['Label'])
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'classifier']
parameters:
{'classifier__bootstrap': [True, False],
 'classifier__max_depth': [10, 20, 40, 60, None],
 'classifier__max_features': ['auto', 'sqrt'],
 'classifier__n_estimators': [200, 500, 1000, 1500]}
Fitting 5 folds for each of 80 candidates, totalling 400 fits




done in 4931.146s

Best score: 0.581
Best parameters set:
	classifier__bootstrap: False
	classifier__max_depth: 40
	classifier__max_features: 'sqrt'
	classifier__n_estimators: 200


## Train best model Without Oversampling

In [None]:
%%time
pipeline = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("classifier", RandomForestClassifier(n_estimators=200, max_features='sqrt', max_depth=40, bootstrap=False)),
    ]
)

CPU times: user 253 µs, sys: 0 ns, total: 253 µs
Wall time: 260 µs


In [None]:
scores = cross_val_score(pipeline, training_set['clean_text'], training_set['Label'], cv=5, scoring='precision_micro')
print(scores)

[0.58680556 0.57491289 0.61324042 0.57491289 0.55052265]


In [None]:
print(f"Final score is {scores.mean()}")

Final score is 0.5800788811459543


## Grid Search and Train With Oversampling

In [None]:
pipeline = imblearn.pipeline.Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ('ros', RandomOverSampler()),
        ('oversampler', SMOTE()),
        ("classifier", RandomForestClassifier())
    ]
)
parameters = {
    'classifier__bootstrap': [True, False],
    'classifier__max_depth': [10, 20, 40, 60, None],
    'classifier__max_features': ['auto', 'sqrt'],
    'classifier__n_estimators': [100, 200, 500, 1000]
}
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

In [None]:
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(training_set['clean_text'], training_set['Label'])
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'ros', 'oversampler', 'classifier']
parameters:
{'classifier__bootstrap': [True, False],
 'classifier__max_depth': [10, 20, 40, 60, None],
 'classifier__max_features': ['auto', 'sqrt'],
 'classifier__n_estimators': [100, 200, 500, 1000]}
Fitting 5 folds for each of 80 candidates, totalling 400 fits




done in 4760.212s

Best score: 0.610
Best parameters set:
	classifier__bootstrap: True
	classifier__max_depth: 40
	classifier__max_features: 'sqrt'
	classifier__n_estimators: 500


In [None]:
%%time
pipeline = imblearn.pipeline.Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ('ros', RandomOverSampler()),
        ('oversampler', SMOTE()),
        ("classifier", RandomForestClassifier(n_estimators=500, max_features='sqrt', max_depth=40, bootstrap=True)),
        #("classifier", RandomForestClassifier())
    ]
)

CPU times: user 509 µs, sys: 0 ns, total: 509 µs
Wall time: 539 µs


In [None]:
scores = cross_val_score(pipeline, training_set['clean_text'], training_set['Label'], cv=5, scoring='precision_micro')
print(scores)

[0.59722222 0.57491289 0.64808362 0.6271777  0.57491289]


In [None]:
print(f"Final score is {scores.mean()}")

Final score is 0.604461866047232


# Predict test data

In [None]:
pipeline.fit( training_set['clean_text'], training_set['Label'])
test_set.loc[:,"Label"] = pipeline.predict(test_set['Text'])

In [None]:
test_set[['ID','Label']]

Unnamed: 0,ID,Label
0,ID_ADHEtjTi,SOCIAL ISSUES
1,ID_AHfJktdQ,RELIGION
2,ID_AUJIHpZr,RELATIONSHIPS
3,ID_AUKYBbIM,LAW/ORDER
4,ID_AZnsVPEi,FARMING
...,...,...
615,ID_zdpOUWyJ,SOCIAL
616,ID_zhnOomuu,RELATIONSHIPS
617,ID_zmWHvBJb,LAW/ORDER
618,ID_zphjdFIb,SOCIAL ISSUES


In [None]:
test_set[['ID','Label']].to_csv("submission.csv", index=False) 

In [None]:
combiner = pd.read_csv("combiner.csv")
combiner['Random Forest'] = test_set['Label']
combiner.to_csv('combiner.csv',index=False)

Best Score on website: 0.61935 \
By Random Forest with Oversampling