In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split, StratifiedKFold
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
import numpy as np
import tqdm.notebook as tq
import itertools
import pickle
from gensim.models import Word2Vec, Phrases
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Load the labelled training articles: "Text" is the article body, "Label" its topic.
train = pd.read_csv("Train.csv")
# Optional experiment (disabled): keep only the frequent classes, or fold the rare
# classes into a single "OTHER" label. The prediction cells still contain the code
# that expands "OTHER" back into a rare class.
#train = train[train["Label"].isin(["POLITICS", "SOCIAL", "RELIGION", "LAW/ORDER", "SOCIAL ISSUES", "HEALTH", "ECONOMY", "FARMING"])]
#train["Label"][train["Label"].isin(["FLOODING", "ARTS AND CRAFTS", "TRANSPORT", "MUSIC", "WITCHCRAFT", "CULTURE", "LOCALCHIEFS", "OPINION/ESSAY"])] = "OTHER"
train_data = train["Text"]

# Class distribution of the raw (string) labels.
raw_labels = train["Label"]
labels = raw_labels.unique()
print(raw_labels.value_counts() / len(raw_labels))

# Integer class ids follow the order in which each label first appears in Train.csv.
int2Label = dict(enumerate(labels))
label2Int = {label: class_id for class_id, label in int2Label.items()}

# Encode with an explicit mapping instead of masked assignment through an alias of
# the column (chained assignment). The encoded column is written back to `train`
# because a later cell reads train["Label"] and expects numeric ids.
train["Label"] = raw_labels.map(label2Int)
train_labels = train["Label"]

# SECURITY NOTE: pickle.load can execute arbitrary code. Only load this file if it
# was produced by you or comes from a trusted source.
with open("nationArticleText.txt", "rb") as file:
  nationArticles = pickle.load(file)

# Tokenise the training texts plus the extra articles loaded from the pickle.
chichewa_tkns = [word_tokenize(s) for s in train['Text'].values.tolist() + nationArticles]

# Word2Vec is trained on the phrase-merged token stream, so frequent bigrams
# become single vocabulary entries joined with "_".
bigram_transformer = Phrases(chichewa_tkns)
w2vmodel = Word2Vec(bigram_transformer[chichewa_tkns], size = 100, min_count = 1, workers = 4)

POLITICS                0.194290
SOCIAL                  0.105850
RELIGION                0.102368
LAW/ORDER               0.094708
SOCIAL ISSUES           0.093315
HEALTH                  0.088440
ECONOMY                 0.059889
FARMING                 0.054318
SPORTS                  0.034123
EDUCATION               0.029944
RELATIONSHIPS           0.027159
WILDLIFE/ENVIRONMENT    0.025070
OPINION/ESSAY           0.018106
LOCALCHIEFS             0.017409
CULTURE                 0.016017
WITCHCRAFT              0.011142
MUSIC                   0.010446
TRANSPORT               0.007660
ARTS AND CRAFTS         0.004875
FLOODING                0.004875
Name: Label, dtype: float64




In [None]:
# Inverse-frequency weight per class, keyed by the integer class id stored in
# train_labels. value_counts() is ordered by frequency, so its position (what
# enumerate() yields) is a frequency rank, not the class id; the id is the index
# of the value_counts() result and must be used as the dictionary key.
class_weights = {
  int(class_id): 1 / count
  for class_id, count in train_labels.value_counts().items()
}
class_weights

{0: 0.0035842293906810036,
 1: 0.006578947368421052,
 2: 0.006802721088435374,
 3: 0.007352941176470588,
 4: 0.007462686567164179,
 5: 0.007874015748031496,
 6: 0.011627906976744186,
 7: 0.01282051282051282,
 8: 0.02040816326530612,
 9: 0.023255813953488372,
 10: 0.02564102564102564,
 11: 0.027777777777777776,
 12: 0.038461538461538464,
 13: 0.04,
 14: 0.043478260869565216,
 15: 0.0625,
 16: 0.06666666666666667,
 17: 0.09090909090909091,
 18: 0.14285714285714285,
 19: 0.14285714285714285}

In [None]:
# Peek at the raw text of the training article with row label 0.
train_data[0]

' Mwangonde: Khansala wachinyamata Akamati achinyamata ndi atsogoleri a mawa, ambiri amaganiza kuti izi ndi nkhambakamwa chabe. Koma achinyamata ena, monga Lusubilo Mwangonde, akukwaniritsa akupherezetsa mawuwa osati pongolota kuti adzakhala, koma kutsogolera kumene chifukwa nthawi yawo yakwana. DAILES BANDA adacheza ndi Mwangonde, khansala wachinyama, yemwe akuimira Jumbo Ward mumzinda wa Mzuzu, motere:  Chisale watuluka nkumangidwanso  Sipakala waimitsa Nyumba ya Malamulo  Pa Wenela pasintha zedi Ali ndi masomphenya: Mwangonde Tikudziweni  Ndine Lusubilo Mwangonde, ndili ndi zaka 27 zakubadwa. Ndinabadwa mbanja la ana asanu ndipo ndine wachinayi kubadwa. Ndimachokera mmudzi mwa Mwamalopa, kwa Paramount Chief Kyungu mboma la Karonga. Sindili pabanja pakadalipano.\n Mbiri ya maphunziro anu ndi yotani? Maphunziro anga a pulaimale ndidachitira kusukula yapulaiveti ya Viphya mumzinda wa Mzuzu ndipo asekondale ndidachitira pa Phwezi Boys mboma la Rumphi. Ndili ndi diploma ya Accounting ndi

In [None]:
# Sanity-check the embeddings: look up the vector for "pulezidenti" and list its
# ten nearest neighbours. Only the last expression is displayed by the notebook.
w2vmodel.wv["pulezidenti"] # "pulezidenti" means "president"
w2vmodel.wv.most_similar("pulezidenti", topn=10)
# Approximate English glosses of the neighbours shown in the output:
# Saulos_Chilima - current vice president of Malawi
# nawo_pampando - participate in the chair
# chipanicho - the party
# zamtengatenga - transport
# akufuna_kupikisana - he wants to compete
# mneneli - speaker
# ulamuliro_wa - rule of
# chipanichi - party
# wachiwiri_kwa - second to
# msonkhano_waukulu - general assembly

[('Saulos_Chilima', 0.9732174277305603),
 ('nawo_pampando', 0.9731787443161011),
 ('chipanicho', 0.9722374677658081),
 ('zamtengatenga', 0.9721649885177612),
 ('akufuna_kupikisana', 0.9694526195526123),
 ('mneneli', 0.9682236313819885),
 ('ulamuliro_wa', 0.9662355184555054),
 ('chipanichi', 0.965084433555603),
 ('wachiwiri_kwa', 0.9643409252166748),
 ('msonkhano_waukulu', 0.9640751481056213)]

# With NN Embeddings

In [None]:
# Hold out 10% of the labelled articles, stratified so every class keeps its share.
train_sentences, test_sentences, y_train, y_test = train_test_split(
    train_data, train_labels, test_size = .1, stratify = train_labels)

# The word index is learned from the training split only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_sentences)
x_train = tokenizer.texts_to_sequences(train_sentences)
x_test = tokenizer.texts_to_sequences(test_sentences)
vocab_size = len(tokenizer.word_index) + 1  # +1: index 0 is reserved for padding
print(vocab_size)
# The split keeps the original row labels in shuffled order, so `[0]` would be a
# label lookup that fails when row 0 lands in the test split and would not match
# x_train[0] anyway. `.iloc[0]` is the first training sentence by position.
print(train_sentences.iloc[0])
print(x_train[0])

# Pad (post) every sequence to the longest training sequence.
maxLen = max(len(seq) for seq in x_train)
print(maxLen)
x_train = pad_sequences(x_train, padding = 'post', maxlen=maxLen)
x_test = pad_sequences(x_test, padding = 'post', maxlen=maxLen)

# `np.float` was only an alias of the builtin float; it is deprecated since
# NumPy 1.20 (see the warnings this cell used to emit) and removed in later
# releases, so use the builtin directly.
x_train = np.array(x_train, dtype=float)
x_test = np.array(x_test, dtype=float)
y_train = np.array(y_train, dtype=float)
y_test = np.array(y_test, dtype=float)

47433
 Mwangonde: Khansala wachinyamata Akamati achinyamata ndi atsogoleri a mawa, ambiri amaganiza kuti izi ndi nkhambakamwa chabe. Koma achinyamata ena, monga Lusubilo Mwangonde, akukwaniritsa akupherezetsa mawuwa osati pongolota kuti adzakhala, koma kutsogolera kumene chifukwa nthawi yawo yakwana. DAILES BANDA adacheza ndi Mwangonde, khansala wachinyama, yemwe akuimira Jumbo Ward mumzinda wa Mzuzu, motere:  Chisale watuluka nkumangidwanso  Sipakala waimitsa Nyumba ya Malamulo  Pa Wenela pasintha zedi Ali ndi masomphenya: Mwangonde Tikudziweni  Ndine Lusubilo Mwangonde, ndili ndi zaka 27 zakubadwa. Ndinabadwa mbanja la ana asanu ndipo ndine wachinayi kubadwa. Ndimachokera mmudzi mwa Mwamalopa, kwa Paramount Chief Kyungu mboma la Karonga. Sindili pabanja pakadalipano.
 Mbiri ya maphunziro anu ndi yotani? Maphunziro anga a pulaimale ndidachitira kusukula yapulaiveti ya Viphya mumzinda wa Mzuzu ndipo asekondale ndidachitira pa Phwezi Boys mboma la Rumphi. Ndili ndi diploma ya Accounting

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  from ipykernel import kernelapp as app
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  app.launch_new_instance()
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


In [None]:
# 1-D convolutional classifier over embeddings learned from scratch:
# Embedding -> Conv1D(128 filters, width 5) -> global max pool -> Dense(20) -> 20-way softmax.
embedding_dim = 100

model = keras.models.Sequential([
    keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=maxLen,
    ),
    keras.layers.Conv1D(128, 5, activation='relu'),
    keras.layers.GlobalMaxPool1D(),
    keras.layers.Dense(20, activation='relu'),
    keras.layers.Dense(20, activation='softmax'),
])
# Labels are integer class ids, hence the sparse categorical loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 909, 100)          4743300   
                                                                 
 conv1d (Conv1D)             (None, 905, 128)          64128     
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense_4 (Dense)             (None, 20)                2580      
                                                                 
 dense_5 (Dense)             (None, 20)                420       
                                                                 
Total params: 4,810,428
Trainable params: 4,810,428
Non-trainable params: 0
____________________________________________

In [None]:
# Train for 30 epochs; (x_test, y_test) is passed as validation data, so its
# loss/accuracy is reported after every epoch.
model.fit(x_train, y_train, epochs = 30, validation_data = (x_test, y_test))

In [None]:
# evaluate() returns [loss, accuracy] because the model was compiled with
# metrics=['accuracy']; index 1 selects the accuracy.
model.evaluate(x_test, y_test, verbose=False)[1]

0.5277777910232544

# Grid Search with stratified 5-fold validation

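The grid search crashed after running out of memory. Before the crash, the best accuracy was 64.9%, obtained with 600 hidden units, relu activation, dropout .8, CountVectorizer features, and no word2vec features.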

In [None]:
# TF-IDF bag-of-words features: the vocabulary and IDF weights are fitted on
# train_sentences, then the same fitted vectorizer transforms test_sentences.
vectorizer = TfidfVectorizer()
x_train_vec = vectorizer.fit_transform(train_sentences)
x_test_vec = vectorizer.transform(test_sentences)
# Display (n_training_documents, vocabulary_size).
x_train_vec.shape

(1292, 47229)

In [None]:
def createVecModel(input_size, denseLayer, activation, dropout, num_classes=20):
    """Build and compile a one-hidden-layer softmax classifier for dense feature vectors.

    Architecture: Input -> Dense(denseLayer, activation) -> Dropout(dropout)
    -> Dense(num_classes, softmax).

    Parameters
    ----------
    input_size : int
        Number of features per sample.
    denseLayer : int
        Number of units in the hidden Dense layer.
    activation : str
        Activation of the hidden layer (e.g. 'relu', 'elu', 'selu').
    dropout : float
        Dropout rate applied after the hidden layer.
    num_classes : int, optional
        Number of output classes. Defaults to 20, the value that used to be
        hard-coded, so existing four-argument calls behave as before.

    Returns
    -------
    keras.models.Sequential
        Model compiled with the adam optimizer, sparse categorical
        cross-entropy loss (labels are integer class ids) and accuracy metric.
    """
    vecModel = keras.models.Sequential()
    # Pass the shape as a tuple, the form the Keras API documents, rather than a bare int.
    vecModel.add(keras.layers.InputLayer((input_size,)))
    vecModel.add(keras.layers.Dense(denseLayer, activation=activation))
    vecModel.add(keras.layers.Dropout(dropout))
    vecModel.add(keras.layers.Dense(num_classes, activation='softmax'))
    vecModel.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])
    return vecModel

In [None]:
def vectorize_w2v(s, model=None, tokenize=None):
  """Return the mean word2vec vector of the in-vocabulary tokens of `s`.

  Parameters
  ----------
  s : str
      Text to embed.
  model : optional
      Object exposing `.wv` (token -> vector lookup with a `vector_size`
      attribute). Defaults to the notebook-level `w2vmodel`.
  tokenize : callable, optional
      Function turning `s` into an iterable of tokens. Defaults to
      `word_tokenize`.

  Returns
  -------
  numpy.ndarray
      Average of the vectors of all tokens found in the vocabulary. If no
      token is found (empty text, or only unknown words) a zero vector of
      length `vector_size` is returned instead of failing on a division by zero.

  NOTE(review): `w2vmodel` is trained on phrase-merged tokens, while this
  function looks up plain tokens, so merged bigram entries are never matched
  here — confirm whether that is intended.
  """
  if model is None:
    model = w2vmodel
  if tokenize is None:
    tokenize = word_tokenize

  keyed_vectors = model.wv
  # Fresh accumulator, so the model's stored vectors are never modified in place.
  total = np.zeros(keyed_vectors.vector_size, dtype=np.float32)
  n = 0
  for token in tokenize(s):
    try:
      total += keyed_vectors[token]
    except KeyError:
      # Out-of-vocabulary token: skip it. Any other error is a real bug and
      # is allowed to propagate instead of being silently swallowed.
      continue
    n += 1

  if n == 0:
    return total
  return total / n

In [None]:
# Hyper-parameter grid. Each entry is a tuple
# (hidden units, activation, dropout rate, vectorizer kind, word2vec switch);
# the rightmost axis varies fastest.
params = list(itertools.product(
    (400, 600, 800),
    ('relu', 'elu', 'selu'),
    (.4, .6, .8, .9),
    ("countvectorizer", "tfidf"),
    ("withw2v", "withoutw2v"),
))
len(params)

144

In [None]:
# Grid search: for each hyper-parameter combination, train a fresh model on every
# stratified fold and keep the combination with the highest mean fold accuracy.
skf = StratifiedKFold(n_splits=5)
# The fold assignment does not depend on the hyper-parameters, so encode the
# stratification labels once instead of refitting a LabelEncoder per combination.
stratify_labels = LabelEncoder().fit_transform(train_labels)
# Only combinations from this position onward are evaluated (the first 100 are skipped).
start_index = 100
best_acc = 0
best_params = None
for i in tq.tqdm(range(start_index, len(params))):
    combo = params[i]
    denseLayer, activation, dropout, mode, w2v = combo
    fold_accs = []
    for trainidx, testidx in skf.split(train_data, stratify_labels):
        # split() yields positional indices, so select rows by position with .iloc
        # rather than by label with [].
        x_train, y_train = train_data.iloc[trainidx], train_labels.iloc[trainidx]
        x_test, y_test = train_data.iloc[testidx], train_labels.iloc[testidx]

        if mode == 'countvectorizer':
            vectorizer = CountVectorizer()
        else:
            vectorizer = TfidfVectorizer()

        # Fit the vocabulary on the training fold only; densify for the Dense network.
        x_train_vec = vectorizer.fit_transform(x_train).toarray()
        x_test_vec = vectorizer.transform(x_test).toarray()
        if w2v == 'withw2v':
            # Append the averaged word2vec vector of each document to its bag-of-words row.
            x_train_w2v = np.array([vectorize_w2v(x) for x in x_train.values])
            x_test_w2v = np.array([vectorize_w2v(x) for x in x_test.values])
            x_train_vec = np.hstack([x_train_vec, x_train_w2v])
            x_test_vec = np.hstack([x_test_vec, x_test_w2v])

        y_train = np.array(y_train, dtype='float')
        y_test = np.array(y_test, dtype='float')

        vecModel = createVecModel(x_train_vec.shape[1], denseLayer, activation, dropout)
        vecModel.fit(x_train_vec, y_train, epochs = 15, verbose = False)
        # evaluate() returns [loss, accuracy]; keep the accuracy of this fold.
        fold_accs.append(vecModel.evaluate(x_test_vec, y_test, verbose = False)[1])

        # Dropping the Python reference alone does not release the state Keras
        # keeps for every model it has built; clear it so memory does not grow
        # across the many models this search creates.
        del vecModel
        keras.backend.clear_session()

    # Average over the folds actually run instead of a hard-coded 5.
    acc = sum(fold_accs) / len(fold_accs)

    if acc > best_acc:
        best_acc = acc
        best_params = combo
        print("Best Params: ", best_params)
        print("Best Accuracy: ", best_acc)


print(best_params, best_acc)
# Persist the winner so it survives a kernel restart (pickle format despite the .txt name).
with open("best_params_acc.txt", "wb") as file:
    pickle.dump((best_params, best_acc), file)

  0%|          | 0/44 [00:00<?, ?it/s]

Best Params:  (800, 'relu', 0.6, 'countvectorizer', 'withw2v')
Best Accuracy:  0.6365103602409363
Best Params:  (800, 'relu', 0.6, 'countvectorizer', 'withoutw2v')
Best Accuracy:  0.6378968358039856
Best Params:  (800, 'relu', 0.6, 'tfidf', 'withoutw2v')
Best Accuracy:  0.6483376979827881
Best Params:  (800, 'elu', 0.9, 'tfidf', 'withoutw2v')
Best Accuracy:  0.6483425259590149
Best Params:  (800, 'selu', 0.6, 'tfidf', 'withoutw2v')
Best Accuracy:  0.6490393877029419
(800, 'selu', 0.6, 'tfidf', 'withoutw2v') 0.6490393877029419


In [None]:
# Retrain on every labelled article with TF-IDF features, 800 selu units and
# dropout .6, weighting each class by `class_weights`.
x_train = train["Text"]
vectorizer = TfidfVectorizer()
x_train_vec = vectorizer.fit_transform(x_train).toarray()
# Use the integer-encoded `train_labels` directly. Reading train["Label"] here only
# works if that column was mutated in place through the `train_labels` alias
# earlier in the notebook; if it still holds the label strings, the float
# conversion fails.
y_train = np.array(train_labels, dtype='float')
model = createVecModel(x_train_vec.shape[1], 800, 'selu', .6)
model.fit(x_train_vec, y_train, epochs=15, class_weight = class_weights)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f9349d1a7d0>

# Predictions with the selected model: 800 hidden units, selu, TF-IDF, dropout .6, without word2vec



In [None]:
# Predict a label for every test article with the model/vectorizer fitted above.
test = pd.read_csv("Test.csv")
x_test = test["Text"]

x_test_vec = vectorizer.transform(x_test)
x_test_vec = x_test_vec.toarray()
# argmax over the softmax outputs gives the integer class id; map it back to its label.
preds = np.argmax(model.predict(x_test_vec), axis=1)
preds = [int2Label[pred] for pred in preds]
# 'OTHER' can only be predicted when the (currently disabled) grouping of rare
# classes is enabled at load time; in that case a rare class is drawn at random.
for i in range(len(preds)):
  if preds[i] == 'OTHER':
    preds[i] = np.random.choice(["FLOODING", "ARTS AND CRAFTS", "TRANSPORT", "MUSIC", "WITCHCRAFT", "CULTURE", "LOCALCHIEFS", "OPINION/ESSAY"])
test["Label"] = preds
test[["ID", "Label"]].to_csv("submission_nn.csv", index=False)

# Record these predictions as a column of combiner.csv.
# NOTE(review): combiner.csv is presumably a table shared with other models'
# predictions, one row per test article — confirm. The recorded run of this cell
# ended in FileNotFoundError, so a missing file is tolerated by starting a new
# table from the test IDs instead of aborting after the submission was written.
try:
  combiner = pd.read_csv("combiner.csv")
except FileNotFoundError:
  combiner = test[["ID"]].copy()
combiner['Neural Network'] = preds
combiner.to_csv('combiner.csv', index = False)
combiner.head()

FileNotFoundError: ignored

In [None]:
# Alternative final model on all labelled data: TF-IDF features, 600 selu units,
# dropout .9, 12 epochs, no class weighting.
x_train, y_train = train_data, train_labels

vectorizer = TfidfVectorizer()
x_train_vec = vectorizer.fit_transform(x_train).toarray()
y_train = np.array(y_train, dtype='float')

model = createVecModel(x_train_vec.shape[1], 600, 'selu', .9)
model.fit(x_train_vec, y_train, epochs=12)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.History at 0x7f2b4516bfd0>

In [None]:
# Write submission_nn.csv: one predicted label per test article.
test = pd.read_csv("Test.csv")
x_test = test["Text"]
x_test_vec = vectorizer.transform(x_test).toarray()

# Most probable class id per row, translated back to its label string.
preds = [int2Label[class_id] for class_id in np.argmax(model.predict(x_test_vec), axis=1)]

# Expand any 'OTHER' prediction into a randomly drawn rare class.
for i, label in enumerate(preds):
  if label == 'OTHER':
    preds[i] = np.random.choice(["FLOODING", "ARTS AND CRAFTS", "TRANSPORT", "MUSIC", "WITCHCRAFT", "CULTURE", "LOCALCHIEFS", "OPINION/ESSAY"])

test["Label"] = preds
test[["ID", "Label"]].to_csv("submission_nn.csv", index=False)

# With word2vec

In [None]:
# Features for the final word2vec-augmented model: TF-IDF columns followed by the
# averaged word2vec vector of each document. All labelled data is used for
# training (an 85/15 stratified hold-out was used while experimenting) and
# Test.csv provides the articles to predict.
x_train, y_train = train_data, train_labels

test = pd.read_csv("Test.csv")
x_test = test["Text"]

vectorizer = TfidfVectorizer()
x_train_vec = vectorizer.fit_transform(x_train).toarray()
x_test_vec = vectorizer.transform(x_test).toarray()

train_doc_vectors = np.array([vectorize_w2v(x) for x in x_train.values])
test_doc_vectors = np.array([vectorize_w2v(x) for x in x_test.values])
y_train = np.array(y_train, dtype='float')

# Despite the names, these hold the concatenated TF-IDF + word2vec matrices.
x_train_w2v = np.hstack([x_train_vec, train_doc_vectors])
x_test_w2v = np.hstack([x_test_vec, test_doc_vectors])


In [None]:
# Both matrices must have the same number of columns (TF-IDF vocabulary + word2vec dimensions).
x_train_w2v.shape, x_test_w2v.shape

((1436, 50777), (620, 50777))

{0: 0.0035842293906810036,
 1: 0.006578947368421052,
 2: 0.006802721088435374,
 3: 0.007352941176470588,
 4: 0.007462686567164179,
 5: 0.007874015748031496,
 6: 0.011627906976744186,
 7: 0.01282051282051282,
 8: 0.02040816326530612,
 9: 0.023255813953488372,
 10: 0.02564102564102564,
 11: 0.027777777777777776,
 12: 0.038461538461538464,
 13: 0.04,
 14: 0.043478260869565216,
 15: 0.0625,
 16: 0.06666666666666667,
 17: 0.09090909090909091,
 18: 0.14285714285714285,
 19: 0.14285714285714285}

In [None]:
# Same architecture as the TF-IDF-only model (800 selu units, dropout .6, 20-way
# softmax, adam + sparse categorical cross-entropy), so build it with the shared
# createVecModel helper instead of repeating the layer stack by hand.
model = createVecModel(x_train_w2v.shape[1], 800, 'selu', .6)
model.fit(x_train_w2v, y_train, epochs = 20, class_weight = class_weights)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f9349e32390>

In [None]:
# Write submission_nn_w2v.csv from the TF-IDF + word2vec model.
# Most probable class id per test row, translated back to its label string.
preds = [int2Label[class_id] for class_id in np.argmax(model.predict(x_test_w2v), axis=1)]

# Expand any 'OTHER' prediction into a randomly drawn rare class.
for i, label in enumerate(preds):
  if label == 'OTHER':
    preds[i] = np.random.choice(["FLOODING", "ARTS AND CRAFTS", "TRANSPORT", "MUSIC", "WITCHCRAFT", "CULTURE", "LOCALCHIEFS", "OPINION/ESSAY"])

test["Label"] = preds
test[["ID", "Label"]].to_csv("submission_nn_w2v.csv", index=False)