In [1]:
import os
import pandas as pd
import tqdm
import numpy as np

from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn import metrics
from sklearn.metrics import accuracy_score,f1_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
import os

In [3]:
folder_name = '../../data/'

feature_column = "headline"
label_column = "category"

np.random.seed(42)

language = ['amh','eng','fra','hau','ibo','lin','pcm','run','swa','yor'][0]

In [4]:
print('-------------------------------------------------')
print(f'--------------Working on {language}-----------------')

train_data = pd.read_csv(f'{folder_name}/{language}/train.tsv',sep='\t')
dev_data = pd.read_csv(f'{folder_name}/{language}/dev.tsv',sep='\t')
test_data = pd.read_csv(f'{folder_name}/{language}/dev.tsv',sep='\t')

print(f' Training set size : {train_data.size}   Dev set size: {dev_data.size}')

-------------------------------------------------
--------------Working on amh-----------------
 Training set size : 5244   Dev set size: 752


In [5]:
all_text_list  = train_data[feature_column].values.tolist()+dev_data[feature_column].values.tolist() 
    
print('[INFO] Sample data \n',all_text_list[:3])

train_text,train_label = train_data[feature_column].values.tolist(),train_data[label_column].values.tolist()
dev_text,dev_label = dev_data[feature_column].values.tolist(),dev_data[label_column].values.tolist()
test_text,test_label = test_data[feature_column].values.tolist(),test_data[label_column].values.tolist()


unique_label = train_data[label_column].unique().tolist()

print('[INFO] Found Labels : ',unique_label)
#

[INFO] Sample data 
 ['የስፖርት ኮከቦች እና የንግድ ምልክቶቻቸው- ከቦልት እስከ ክርስቲያኖ ሮናልዶ', 'እግር ኳስ፡ ዩናይትድ፣ አርሴናል፣ ቼልሲ . . . ምን አስበዋል?', 'ዓለምን ካስጨነቃት የዋጋ ንረት ተጠቃሚዎቹ እነማን ናቸው?']
[INFO] Found Labels :  ['sports', 'business', 'health', 'politics']


In [6]:
vectorizer = CountVectorizer(analyzer='char_wb',ngram_range=(1, 3))
vectorizer.fit_transform(all_text_list)

# # TfidfVectorizer
# vectorizer = TfidfVectorizer(analyzer='char_wb',ngram_range=(1, 3))
# vectorizer.fit_transform(all_text_list)

X_train = vectorizer.transform(train_text).toarray()
X_dev= vectorizer.transform(dev_text).toarray()
X_test= vectorizer.transform(test_text).toarray()

y_train = []
for i in train_label:
    y_train.append(unique_label.index(i))

y_dev = []
for i in dev_label:
    y_dev.append(unique_label.index(i))

y_test = []
for i in test_label:
    y_test.append(unique_label.index(i))

print(f'Sizes : {X_train.shape,X_dev.shape,X_test.shape,len(y_train),len(y_dev),len(y_test)}')


Sizes : ((1311, 19739), (188, 19739), (188, 19739), 1311, 188, 188)


In [7]:
print('=======   GaussianNB   =========')

classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predict Class
y_pred = classifier.predict(X_dev)

# Accuracy 
accuracy = metrics.accuracy_score(y_dev, y_pred)
f1 = metrics.f1_score(y_dev, y_pred, average='macro')


print(f'acc: {accuracy}     |  f1_score: {f1}')
print(metrics.classification_report(y_dev, y_pred, target_names=unique_label))


if not os.path.exists(f"{language}/GaussianNB"):
    os.makedirs(f"{language}/GaussianNB")

acc = metrics.accuracy_score(y_dev, y_pred)
f1 = metrics.f1_score(y_dev, y_pred,average='weighted')
precision = metrics.precision_score(y_dev, y_pred,average='weighted')
recall = metrics.recall_score(y_dev, y_pred,average='weighted')

print(f"f1 = {f1}")
print(f"loss = {None}")
print(f"precision = {precision}")
print(f"recall = {recall}")

acc: 0.776595744680851     |  f1_score: 0.7611702347299674
              precision    recall  f1-score   support

      sports       0.80      0.87      0.84        47
    business       0.77      0.49      0.60        41
      health       0.76      0.88      0.81        50
    politics       0.77      0.82      0.80        50

    accuracy                           0.78       188
   macro avg       0.78      0.77      0.76       188
weighted avg       0.78      0.78      0.77       188

f1 = 0.7678229177446947
loss = None
precision = 0.7762396544134534
recall = 0.776595744680851


In [8]:
print('=======   MultinomialNB   =========')

classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Predict Class
y_pred = classifier.predict(X_dev)

# Accuracy 
accuracy = metrics.accuracy_score(y_dev, y_pred)
f1 = metrics.f1_score(y_dev, y_pred, average='macro')


print(f'acc: {accuracy}     |  f1_score: {f1}')
print(metrics.classification_report(y_dev, y_pred, target_names=unique_label))

if not os.path.exists(f"{language}/MultinomialNB"):
    os.makedirs(f"{language}/MultinomialNB")

acc = metrics.accuracy_score(y_dev, y_pred)
f1 = metrics.f1_score(y_dev, y_pred,average='weighted')
precision = metrics.precision_score(y_dev, y_pred,average='weighted')
recall = metrics.recall_score(y_dev, y_pred,average='weighted')

print(f"f1 = {f1}")
print(f"loss = {None}")
print(f"precision = {precision}")
print(f"recall = {recall}")

acc: 0.8191489361702128     |  f1_score: 0.8130844477055927
              precision    recall  f1-score   support

      sports       0.95      0.79      0.86        47
    business       0.81      0.63      0.71        41
      health       0.73      0.92      0.81        50
    politics       0.83      0.90      0.87        50

    accuracy                           0.82       188
   macro avg       0.83      0.81      0.81       188
weighted avg       0.83      0.82      0.82       188

f1 = 0.8171517834477221
loss = None
precision = 0.8301959934273765
recall = 0.8191489361702128


In [9]:
print('=======   KNeighborsClassifier   =========')

classifier = KNeighborsClassifier(n_neighbors=3)
classifier.fit(X_train, y_train)

# Predict Class
y_pred = classifier.predict(X_dev)

# Accuracy 
accuracy = metrics.accuracy_score(y_dev, y_pred)
f1 = metrics.f1_score(y_dev, y_pred, average='macro')


print(f'acc: {accuracy}     |  f1_score: {f1}')
print(metrics.classification_report(y_dev, y_pred, target_names=unique_label))

if not os.path.exists(f"{language}/KNeighborsClassifier"):
    os.makedirs(f"{language}/KNeighborsClassifier")

acc = metrics.accuracy_score(y_dev, y_pred)
f1 = metrics.f1_score(y_dev, y_pred,average='weighted')
precision = metrics.precision_score(y_dev, y_pred,average='weighted')
recall = metrics.recall_score(y_dev, y_pred,average='weighted')

print(f"f1 = {f1}")
print(f"loss = {None}")
print(f"precision = {precision}")
print(f"recall = {recall}")

acc: 0.6276595744680851     |  f1_score: 0.6181197127187802
              precision    recall  f1-score   support

      sports       0.57      0.79      0.66        47
    business       0.50      0.44      0.47        41
      health       0.69      0.70      0.69        50
    politics       0.78      0.56      0.65        50

    accuracy                           0.63       188
   macro avg       0.63      0.62      0.62       188
weighted avg       0.64      0.63      0.62       188

f1 = 0.6246489759511754
loss = None
precision = 0.6407258538985697
recall = 0.6276595744680851


In [10]:
print('=======   MLPClassifier   =========')

classifier = MLPClassifier(random_state=1, max_iter=300)
classifier.fit(X_train, y_train)

# Predict Class
y_pred = classifier.predict(X_dev)

# Accuracy 
accuracy = metrics.accuracy_score(y_dev, y_pred)
f1 = metrics.f1_score(y_dev, y_pred, average='macro')


print(f'acc: {accuracy}     |  f1_score: {f1}')
print(metrics.classification_report(y_dev, y_pred, target_names=unique_label))

if not os.path.exists(f"{language}/MLPClassifier"):
    os.makedirs(f"{language}/MLPClassifier")

acc = metrics.accuracy_score(y_dev, y_pred)
f1 = metrics.f1_score(y_dev, y_pred,average='weighted')
precision = metrics.precision_score(y_dev, y_pred,average='weighted')
recall = metrics.recall_score(y_dev, y_pred,average='weighted')

print(f"f1 = {f1}")
print(f"loss = {None}")
print(f"precision = {precision}")
print(f"recall = {recall}")
    

acc: 0.8138297872340425     |  f1_score: 0.805532853873815
              precision    recall  f1-score   support

      sports       0.91      0.87      0.89        47
    business       0.77      0.59      0.67        41
      health       0.72      0.92      0.81        50
    politics       0.88      0.84      0.86        50

    accuracy                           0.81       188
   macro avg       0.82      0.80      0.81       188
weighted avg       0.82      0.81      0.81       188

f1 = 0.8108120092089613
loss = None
precision = 0.8204875409898574
recall = 0.8138297872340425


In [11]:
print('=======   XGBClassifier   =========')

classifier = XGBClassifier()
classifier.fit(X_train, y_train)

# Predict Class
y_pred = classifier.predict(X_dev)

# Accuracy 
accuracy = metrics.accuracy_score(y_dev, y_pred)
f1 = metrics.f1_score(y_dev, y_pred, average='macro')


print(f'acc: {accuracy}     |  f1_score: {f1}')
print(metrics.classification_report(y_dev, y_pred, target_names=unique_label))

if not os.path.exists(f"{language}/XGBClassifier"):
    os.makedirs(f"{language}/XGBClassifier")

acc = metrics.accuracy_score(y_dev, y_pred)
f1 = metrics.f1_score(y_dev, y_pred,average='weighted')
precision = metrics.precision_score(y_dev, y_pred,average='weighted')
recall = metrics.recall_score(y_dev, y_pred,average='weighted')

print(f"f1 = {f1}")
print(f"loss = {None}")
print(f"precision = {precision}")
print(f"recall = {recall}")

acc: 0.8138297872340425     |  f1_score: 0.8109039309503706
              precision    recall  f1-score   support

      sports       0.93      0.81      0.86        47
    business       0.80      0.68      0.74        41
      health       0.73      0.88      0.80        50
    politics       0.83      0.86      0.84        50

    accuracy                           0.81       188
   macro avg       0.82      0.81      0.81       188
weighted avg       0.82      0.81      0.81       188

f1 = 0.8136079688925352
loss = None
precision = 0.8211372134179606
recall = 0.8138297872340425


In [12]:
print('=======   SVC   =========')
classifier = SVC(gamma='auto')
classifier.fit(X_train, y_train)
# Predict Class
y_pred = classifier.predict(X_dev)

# Accuracy 
accuracy = metrics.accuracy_score(y_dev, y_pred)
f1 = metrics.f1_score(y_dev, y_pred, average='micro')


print(f'acc: {accuracy}     |  f1_score: {f1}')
print(metrics.classification_report(y_dev, y_pred, target_names=unique_label))

if not os.path.exists(f"{language}/SVC"):
    os.makedirs(f"{language}/SVC")

acc = metrics.accuracy_score(y_dev, y_pred)
f1 = metrics.f1_score(y_dev, y_pred,average='weighted')
precision = metrics.precision_score(y_dev, y_pred,average='weighted')
recall = metrics.recall_score(y_dev, y_pred,average='weighted')

print(f"f1 = {f1}")
print(f"loss = {None}")
print(f"precision = {precision}")
print(f"recall = {recall}")
    
    

acc: 0.3882978723404255     |  f1_score: 0.3882978723404255
              precision    recall  f1-score   support

      sports       0.00      0.00      0.00        47
    business       0.00      0.00      0.00        41
      health       0.68      0.46      0.55        50
    politics       0.32      1.00      0.49        50

    accuracy                           0.39       188
   macro avg       0.25      0.36      0.26       188
weighted avg       0.27      0.39      0.28       188

f1 = 0.2760146611836224
loss = None
precision = 0.26626221088048374
recall = 0.3882978723404255


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
