# Text classification

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics

In [2]:
class CreateDataset(object):
    """Hold one split ("train" or "test") of a labelled text dataframe.

    The split is positional: the first ``split_index`` rows form the training
    split and every remaining row forms the test split. No shuffling happens
    here, so the caller must pass an already-shuffled dataframe if a random
    split is wanted.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``"text"`` column (documents) and a ``"label"`` column
        (targets).
    category : {"train", "test"}
        Which split to expose.
    split_index : int, default 150
        Row position at which the dataframe is cut into train / test.

    Attributes
    ----------
    data : list
        Documents of the selected split (values of the ``"text"`` column).
    target : numpy.ndarray
        Labels of the selected split (values of the ``"label"`` column).
    target_names : list of str
        Display names for the classes, ``['non-violence', 'violence']``.
        NOTE(review): assumes labels are encoded 0 = non-violence,
        1 = violence, so that the list index matches the label value.

    Raises
    ------
    ValueError
        If ``category`` is neither ``"train"`` nor ``"test"``.
    """

    def __init__(self, dataframe, category=None, split_index=150):
        # Fail fast: previously an invalid category only printed a hint and
        # left the instance without `data` / `target`, which surfaced later as
        # an unrelated AttributeError.
        if category not in ("train", "test"):
            raise ValueError(
                'set category="train" | "test" (got {!r})'.format(category)
            )

        data = dataframe["text"].tolist()
        target = dataframe["label"].to_numpy()

        self.target_names = ['non-violence', 'violence']

        if category == "train":
            self.data = data[:split_index]
            self.target = target[:split_index]
        else:
            self.data = data[split_index:]
            self.target = target[split_index:]

In [3]:
# Read at most 200 rows of the sheet, then melt it to long format: every
# column header becomes a value in "label" and every cell a value in "text".
demo_news = (
    pd.read_excel(
        "selected-training-data.xlsx",
        sheet_name="Sheet1",
        engine="openpyxl",
        nrows=200,
    )
    .melt(var_name="label", value_name="text")
)
demo_news

Unnamed: 0,label,text
0,violence,"BANDUNG, (PR).- Sidang kasus penganiayaan dua ..."
1,violence,"BOGOR, (PR).- Ratusan warga dari Kelurahan Kat..."
2,violence,"BANDUNG, (PR).- Pendukung Persib Bandung tenga..."
3,violence,"SUMEDANG, (PR).- Sidang Paripurna Istimewa DPR..."
4,violence,"NGAMPRAH, (PR).- Manajemen PT Ultrajaya Milk I..."
...,...,...
217,non-violence,"TRIBUNJABAR.ID, INDRAMAYU- Krisis air bersih m..."
218,non-violence,"TRIBUNJABAR.ID, TASIKMALAYA - Badan Penanggula..."
219,non-violence,"TRIBUNJABAR.ID, CIANJUR - Bencana kekeringan m..."
220,non-violence,"TRIBUNJABAR.ID, MAJALENGKA - Warga Desa Paning..."


In [4]:
# Shuffle all rows reproducibly (fixed seed), renumber the index, and replace
# the text label by an integer: 1 where the label starts with "violence"
# (regex match is anchored at the start, so "non-violence" maps to 0).
demo_news_shuffled = (
    demo_news
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .assign(label=lambda frame: np.where(frame["label"].str.match("violence"), 1, 0))
)
demo_news_shuffled

Unnamed: 0,label,text
0,0,"INDRAMAYU, (PR).- PT Polytama Propindo members..."
1,0,"TRIBUNJABAR.ID, BANDUNG- Ribuan pelanggan air ..."
2,1,"BANDUNG, (PR).- Kantor KPU Jawa Barat didatang..."
3,0,Purwakarta (ANTARA) - Dinas Pemadam Kebakaran ...
4,0,Jemaat melaksanakan ibadah kebaktian dengan me...
...,...,...
217,1,"TRIBUNCIREBON, MAJALENGKA - Kepala Desa (Kuwu)..."
218,1,"CIBINONG, (PR).- Polres Bogor mengibarkan bend..."
219,1,"\nTRIBUNJABAR.ID, BANDUNG - Ribuan mahasiswa k..."
220,0,"Sukabumi, Jabar (ANTARA) - Badan Penanggulanga..."


In [5]:
# Build the train and test views over the same shuffled frame.
news_train, news_test = (
    CreateDataset(demo_news_shuffled, category=split_name)
    for split_name in ("train", "test")
)

## Naïve Bayes classifier

In [6]:
# Bag-of-words counts -> TF-IDF weighting -> multinomial Naïve Bayes.
text_clf = Pipeline(
    steps=[
        ("vect", CountVectorizer()),    # raw token counts
        ("tfidf", TfidfTransformer()),  # re-weight counts by TF-IDF
        ("clf", MultinomialNB()),       # classifier on the weighted features
    ]
)

In [7]:
# Fit every pipeline step on the training documents and their labels.
text_clf.fit(X=news_train.data, y=news_train.target)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

### Evaluation of the performance on the test set

In [8]:
# Predict the held-out documents; the last expression is the accuracy
# (fraction of predictions equal to the true label).
predicted = text_clf.predict(news_test.data)
(predicted == news_test.target).mean()

0.9305555555555556

In [9]:
# Per-class precision / recall / F1 for the current `predicted` array.
print("Naïve Bayes classifier\n")
print(
    metrics.classification_report(
        y_true=news_test.target,
        y_pred=predicted,
        target_names=news_test.target_names,
    )
)

Naïve Bayes classifier

              precision    recall  f1-score   support

non-violence       0.87      1.00      0.93        34
    violence       1.00      0.87      0.93        38

    accuracy                           0.93        72
   macro avg       0.94      0.93      0.93        72
weighted avg       0.94      0.93      0.93        72



In [10]:
# Confusion matrix: rows are true classes, columns are predicted classes.
metrics.confusion_matrix(y_true=news_test.target, y_pred=predicted)

array([[34,  0],
       [ 5, 33]], dtype=int64)

## SVM classifier

In [11]:
# Same feature extraction as before, but the final step is a linear model
# trained by SGD with hinge loss (i.e. a linear SVM objective).
text_clf = Pipeline(
    steps=[
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        (
            "clf",
            SGDClassifier(
                loss="hinge",
                penalty="l2",
                alpha=1e-3,
                random_state=42,  # fixed seed for reproducible training
                max_iter=5,
                tol=None,         # no early stopping: always run max_iter epochs
            ),
        ),
    ]
)

In [12]:
# Fit the SGD-based pipeline on the training documents and their labels.
text_clf.fit(X=news_train.data, y=news_train.target)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf',
                 SGDClassifier(alpha=0.001, max_iter=5, random_state=42,
                               tol=None))])

### Evaluation of the performance on the test set

In [13]:
# Overwrite `predicted` with the current pipeline's test-set predictions and
# show the accuracy (fraction of predictions equal to the true label).
predicted = text_clf.predict(news_test.data)
(predicted == news_test.target).mean()

0.9861111111111112

In [14]:
# Per-class precision / recall / F1 for the current `predicted` array.
print("SVM classifier\n")
print(
    metrics.classification_report(
        y_true=news_test.target,
        y_pred=predicted,
        target_names=news_test.target_names,
    )
)

SVM classifier

              precision    recall  f1-score   support

non-violence       0.97      1.00      0.99        34
    violence       1.00      0.97      0.99        38

    accuracy                           0.99        72
   macro avg       0.99      0.99      0.99        72
weighted avg       0.99      0.99      0.99        72



In [15]:
# Confusion matrix: rows are true classes, columns are predicted classes.
metrics.confusion_matrix(y_true=news_test.target, y_pred=predicted)

array([[34,  0],
       [ 1, 37]], dtype=int64)