In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics

In [8]:
# Read the data to be classified from "predict-2021-data.xlsx" (sheet "Sheet1"),
# taking at most the first 200 rows.
deploy_news = pd.read_excel(
    "predict-2021-data.xlsx",
    sheet_name="Sheet1",
    engine="openpyxl",
    nrows=200,
)

In [9]:
# Display the frame that will be classified (the pipeline later reads its `berita` column).
deploy_news

Unnamed: 0,bulan,berita
0,5,"PIKIRAN RAKYAT - Bupati Garut, Rudy Gunawan me..."
1,6,PIKIRAN RAKYAT - Polemik Gereja Kristen Indone...
2,1,PIKIRAN RAKYAT - Gubernur Jawa Barat Ridwan Ka...
3,12,"PIKIRAN RAKYAT - Wali Kota Bogor, Bima Arya Su..."
4,11,PIKIRAN RAKYAT - Penanganan pandemi covid-19 d...
...,...,...
92,3,"PIKIRAN RAKYAT - Kabid Distribusi Perdagangan,..."
93,2,PIKIRAN RAKYAT - Tingginya harga cabai rawit y...
94,1,PIKIRAN RAKYAT - Kenaikan harga daging sapi ya...
95,1,PIKIRAN RAKYAT - Sudah tiga hari terakhir di p...


In [10]:
# Read the labelled training texts from "selected-training-data.xlsx"
# (sheet "Sheet1"), taking at most the first 200 rows.
train_news = pd.read_excel(
    "selected-training-data.xlsx",
    sheet_name="Sheet1",
    engine="openpyxl",
    nrows=200,
)

In [11]:
# Display the training frame as loaded: wide layout, one column per class.
train_news

Unnamed: 0,violence,non-violence
0,"BANDUNG, (PR).- Sidang kasus penganiayaan dua ...","SOREANG, (PR).- Sedikitnya 104 eksemplar tablo..."
1,"BOGOR, (PR).- Ratusan warga dari Kelurahan Kat...","NGAMPRAH, (PR).- Lantaran tak ada dana operasi..."
2,"BANDUNG, (PR).- Pendukung Persib Bandung tenga...","NGAMPRAH, (PR).- Lantaran tak ada dana operasi..."
3,"SUMEDANG, (PR).- Sidang Paripurna Istimewa DPR...","CIKARANG, (PR).- Pemerintah Kabupaten Bekasi d..."
4,"NGAMPRAH, (PR).- Manajemen PT Ultrajaya Milk I...","TANPA alas kaki, Adimin (83) melangkah perlaha..."
...,...,...
106,"TRIBUNCIREBON, MAJALENGKA - Kepala Desa (Kuwu)...","TRIBUNJABAR.ID, INDRAMAYU- Krisis air bersih m..."
107,"TRIBUNJABAR.ID, BANDUNG - Di sela kegitannya b...","TRIBUNJABAR.ID, TASIKMALAYA - Badan Penanggula..."
108,"TRIBUNJABAR.ID,BANDUNG - Sejumlah warga Kabupa...","TRIBUNJABAR.ID, CIANJUR - Bencana kekeringan m..."
109,Dugaan tersebut berdasarkan temuan yang didapa...,"TRIBUNJABAR.ID, MAJALENGKA - Warga Desa Paning..."


In [12]:
# Reshape from wide (one column per class) to long form: each former column
# name goes into "label" and each cell value into "text".
train_news = pd.melt(train_news, var_name="label", value_name="text")

In [13]:
# Display the melted frame: one row per text, with its class name in `label`.
train_news

Unnamed: 0,label,text
0,violence,"BANDUNG, (PR).- Sidang kasus penganiayaan dua ..."
1,violence,"BOGOR, (PR).- Ratusan warga dari Kelurahan Kat..."
2,violence,"BANDUNG, (PR).- Pendukung Persib Bandung tenga..."
3,violence,"SUMEDANG, (PR).- Sidang Paripurna Istimewa DPR..."
4,violence,"NGAMPRAH, (PR).- Manajemen PT Ultrajaya Milk I..."
...,...,...
217,non-violence,"TRIBUNJABAR.ID, INDRAMAYU- Krisis air bersih m..."
218,non-violence,"TRIBUNJABAR.ID, TASIKMALAYA - Badan Penanggula..."
219,non-violence,"TRIBUNJABAR.ID, CIANJUR - Bencana kekeringan m..."
220,non-violence,"TRIBUNJABAR.ID, MAJALENGKA - Warga Desa Paning..."


In [14]:
# Encode the class names as integers. `str.match` anchors at the start of the
# string, so "violence" maps to 1 while "non-violence" maps to 0.
train_news["label"] = np.where(
    train_news["label"].str.match("violence"),
    1,
    0,
)

In [15]:
# Display the frame again to check that `label` now holds the integer codes.
train_news

Unnamed: 0,label,text
0,1,"BANDUNG, (PR).- Sidang kasus penganiayaan dua ..."
1,1,"BOGOR, (PR).- Ratusan warga dari Kelurahan Kat..."
2,1,"BANDUNG, (PR).- Pendukung Persib Bandung tenga..."
3,1,"SUMEDANG, (PR).- Sidang Paripurna Istimewa DPR..."
4,1,"NGAMPRAH, (PR).- Manajemen PT Ultrajaya Milk I..."
...,...,...
217,0,"TRIBUNJABAR.ID, INDRAMAYU- Krisis air bersih m..."
218,0,"TRIBUNJABAR.ID, TASIKMALAYA - Badan Penanggula..."
219,0,"TRIBUNJABAR.ID, CIANJUR - Bencana kekeringan m..."
220,0,"TRIBUNJABAR.ID, MAJALENGKA - Warga Desa Paning..."


In [46]:
# Classification pipeline: token counts -> tf-idf weighting -> multinomial
# naive Bayes. The step names ("vect", "tfidf", "clf") are kept as-is.
text_clf = Pipeline(
    steps=[
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("clf", MultinomialNB()),
    ]
)

In [20]:
class CreateDataset(object):
    """Expose a labelled frame as parallel ``data`` / ``target`` sequences.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame providing a ``"text"`` column and a ``"label"`` column.
    category : optional
        Stored unchanged on the instance; it is not interpreted here.

    Attributes
    ----------
    data : list
        Values of the ``"text"`` column, in row order.
    target : numpy.ndarray
        Values of the ``"label"`` column, in row order.
    category
        The ``category`` argument as passed in.

    Raises
    ------
    KeyError
        If ``dataframe`` lacks a ``"text"`` or ``"label"`` column.
    """

    def __init__(self, dataframe, category=None):
        # Keep the extracted values on the instance so they remain readable
        # after construction (plain locals would be thrown away on return).
        self.data = dataframe["text"].tolist()
        self.target = dataframe["label"].to_numpy()
        self.category = category

In [25]:
# Wrap the training frame in a CreateDataset instance.
# NOTE(review): `news_train` is not referenced by any later cell visible here;
# the pipeline is fit on `train_news` directly. Confirm whether it is still needed.
news_train = CreateDataset(train_news)

In [47]:
# Fit every pipeline step on the training texts and their integer labels.
# As the cell's final expression, the fitted pipeline is also displayed.
text_clf.fit(
    train_news["text"],
    train_news["label"],
)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

In [48]:
# Classify each text in the `berita` column with the fitted pipeline.
predicted = text_clf.predict(
    deploy_news["berita"],
)

In [51]:
# Display the predicted class per row (1 = "violence", 0 = "non-violence",
# following the integer encoding applied to the training labels).
predicted

array([1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0])

In [52]:
# Attach the predictions to the frame as a new `predict` column (modifies
# `deploy_news` in place).
deploy_news["predict"] = predicted

In [53]:
# Show the frame together with its new `predict` column. A bare trailing
# expression gets the notebook's rich DataFrame rendering, which print()
# would flatten to plain text.
deploy_news

    bulan                                             berita  predict
0       5  PIKIRAN RAKYAT - Bupati Garut, Rudy Gunawan me...        1
1       6  PIKIRAN RAKYAT - Polemik Gereja Kristen Indone...        1
2       1  PIKIRAN RAKYAT - Gubernur Jawa Barat Ridwan Ka...        1
3      12  PIKIRAN RAKYAT - Wali Kota Bogor, Bima Arya Su...        0
4      11  PIKIRAN RAKYAT - Penanganan pandemi covid-19 d...        0
..    ...                                                ...      ...
92      3  PIKIRAN RAKYAT - Kabid Distribusi Perdagangan,...        0
93      2  PIKIRAN RAKYAT - Tingginya harga cabai rawit y...        0
94      1  PIKIRAN RAKYAT - Kenaikan harga daging sapi ya...        0
95      1  PIKIRAN RAKYAT - Sudah tiga hari terakhir di p...        0
96      1  PIKIRAN RAKYAT - Tahu dan tempe, makanan berba...        0

[97 rows x 3 columns]


In [54]:
# Destination workbook for the classified data.
file_name = "data_2021.xlsx"

In [55]:
# Write the frame, including the `predict` column, to the workbook named by
# `file_name`, then print a confirmation message.
deploy_news.to_excel(
    file_name,
)
print("Data 2021 successfully exported into Excel File")

Data 2021 successfully exported into Excel File
