In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import numpy as np

In [3]:
data = pd.read_csv("Train.csv")
data

Unnamed: 0,ID,Text,Label
0,ID_AASHwXxg,Mwangonde: Khansala wachinyamata Akamati achi...,POLITICS
1,ID_AGoFySzn,MCP siidakhutire ndi kalembera Chipani cha Ma...,POLITICS
2,ID_AGrrkBGP,Bungwe la MANEPO Lapempha Boma Liganizire Anth...,HEALTH
3,ID_AIJeigeG,Ndale zogawanitsa miyambo zanyanya Si zachile...,POLITICS
4,ID_APMprMbV,Nanga wapolisi ataphofomoka? Masiku ano sichi...,LAW/ORDER
...,...,...,...
1431,ID_zmTmmEio,Eni Minibus Ati Ali ndi Ufulu Wokweza Mitengo ...,TRANSPORT
1432,ID_znOlIaGQ,Kachali apepesa: Kulankhula motumbwa kuthe An...,POLITICS
1433,ID_znracTjN,Mawu supports non-fiction writers The Malawi ...,EDUCATION
1434,ID_ztdsmmva,Tame Mwawa: Phwete ndiye kudya kwake Sewero l...,SOCIAL ISSUES


In [4]:
X = data["Text"]
Y = data["Label"]

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = .1, stratify = Y)

In [5]:
vectorizer = CountVectorizer(max_features=1000)
x_train_vec = vectorizer.fit_transform(x_train)
x_test_vec = vectorizer.transform(x_test)
x_train_vec

<1292x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 121112 stored elements in Compressed Sparse Row format>

In [6]:
mnb = MultinomialNB()
mnb.fit(x_train_vec, y_train)

MultinomialNB()

In [7]:
top_categories = ["POLITICS", "SOCIAL", "RELIGION", "LAW/ORDER", "SOCIAL ISSUES", "HEALTH", "ECONOMY", "FARMING"]

In [8]:
preds = mnb.predict(x_test_vec)
print(classification_report(y_test, preds))

                      precision    recall  f1-score   support

     ARTS AND CRAFTS       0.00      0.00      0.00         1
             CULTURE       0.50      0.50      0.50         2
             ECONOMY       0.44      0.44      0.44         9
           EDUCATION       0.57      1.00      0.73         4
             FARMING       1.00      0.75      0.86         8
            FLOODING       0.00      0.00      0.00         1
              HEALTH       0.89      0.62      0.73        13
           LAW/ORDER       0.67      0.57      0.62        14
         LOCALCHIEFS       0.50      0.50      0.50         2
               MUSIC       0.00      0.00      0.00         1
       OPINION/ESSAY       0.00      0.00      0.00         3
            POLITICS       0.70      0.57      0.63        28
       RELATIONSHIPS       0.80      1.00      0.89         4
            RELIGION       0.73      0.73      0.73        15
              SOCIAL       0.45      0.67      0.54        15
       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
cluster_y_train = []
cluster_y_test = []
for label in y_train:
    if label in top_categories:
        cluster_y_train.append(1)
    else:
        cluster_y_train.append(0)
for label in y_test:
    if label in top_categories:
        cluster_y_test.append(1)
    else:
        cluster_y_test.append(0)
        
cluster_y_train = np.array(cluster_y_train)
cluster_y_test = np.array(cluster_y_test)

In [10]:
mnb_cluster = MultinomialNB()
mnb_cluster.fit(x_train_vec, cluster_y_train)

MultinomialNB()

In [11]:
preds = mnb_cluster.predict(x_test_vec)
print(classification_report(cluster_y_test, preds))

              precision    recall  f1-score   support

           0       0.50      0.66      0.57        29
           1       0.91      0.83      0.87       115

    accuracy                           0.80       144
   macro avg       0.70      0.74      0.72       144
weighted avg       0.82      0.80      0.81       144



### ADDED PART FOR TEST.CSV ###

In [12]:
test = pd.read_csv("Test.csv")
print(test.sample(5))
print(test.shape)
test_data = vectorizer.transform(test['Text'])

              ID                                               Text
43   ID_DceQPvFY  Blantyre CWO Ithandiza St. Peters Seminary Wol...
493  ID_qFaSwbLt  Alimbikitsa Maparishi Kukonza Misa za Ana Ofes...
289  ID_YhmxjqFW   UDF siidaganize za mgwirizanoAtupele Phungu w...
361  ID_elLSceTJ   Bungwe lilangiza alimi kubzala mbewu zopirira...
572  ID_wPNZsraS  Mlandu wa Chisankho cha Aphungu Udzapitilira p...
(620, 2)


In [13]:
test.loc[:,"Label"] = mnb.predict(test_data)

In [14]:
test[['ID','Label']]

Unnamed: 0,ID,Label
0,ID_ADHEtjTi,SOCIAL ISSUES
1,ID_AHfJktdQ,RELIGION
2,ID_AUJIHpZr,RELATIONSHIPS
3,ID_AUKYBbIM,LAW/ORDER
4,ID_AZnsVPEi,HEALTH
...,...,...
615,ID_zdpOUWyJ,POLITICS
616,ID_zhnOomuu,RELATIONSHIPS
617,ID_zmWHvBJb,LAW/ORDER
618,ID_zphjdFIb,SOCIAL ISSUES


In [15]:
test[['ID','Label']].to_csv("MultiNBsubmission.csv", index=False) 

### Website score: 0.6225806451612903 ###

### Combiner ###

In [16]:
combiner = pd.read_csv("combiner.csv")
combiner['MultiNB'] = test['Label']
combiner.to_csv('combiner.csv', index = False)
combiner.head()

Unnamed: 0,ID,Text,SGD,MultiNB
0,ID_ADHEtjTi,Abambo odzikhweza akuchuluka Kafukufuku wa ap...,SOCIAL ISSUES,SOCIAL ISSUES
1,ID_AHfJktdQ,Ambuye Ziyaye Ayamikira Aphunzitsi a Tilitonse...,RELIGION,RELIGION
2,ID_AUJIHpZr,Anatcheleza: Akundiopseza a gogo wanga Akundi...,RELATIONSHIPS,RELATIONSHIPS
3,ID_AUKYBbIM,Ulova wafika posauzana Adatenga digiri ya uph...,POLITICS,LAW/ORDER
4,ID_AZnsVPEi,"Dzombe kukoma, koma Kuyambira makedzana, pant...",HEALTH,HEALTH
