In [1]:
import pandas as pd
import sklearn
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# LIBRARY UNTUK PREPROCESSING
import nltk #import library nltk
from nltk.tokenize import word_tokenize #import word_tokenize for tokenizing text into words 
from nltk.tokenize import sent_tokenize #import sent_tokenize for tokenizing paragraph into sentences
from nltk.stem.porter import PorterStemmer #import Porter Stemmer Algorithm 
from nltk.stem import WordNetLemmatizer #import WordNet lemmatizer 
from nltk.corpus import stopwords #import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory #import Indonesian Stemmer
import re,string #import regular expression

import gensim
from gensim.models import KeyedVectors
from tqdm import tqdm
import warnings



In [2]:
warnings.filterwarnings("ignore")

In [3]:
dta = pd.read_csv('new_Data - Bismillah - Stemming.csv')

In [4]:
dtf = dta

In [5]:
dtf.head()

Unnamed: 0,Date,Text,Label,Text Clean,meaningless,Case Folding,normalisasi,Stopword,Stemming,tokenizing
0,2019-07-20,Bagaimana ini gk ada pemberitahuan kpn selesai...,-1,Bagaimana ini gk ada pemberitahuan kpn selesai...,Bagaimana gk ada pemberitahuan kpn selesainy...,bagaimana ini gk ada pemberitahuan kpn selesai...,bagaimana ini tidak ada pemberitahuan kapan se...,bagaimana ini tidak ada pemberitahuan kapan se...,bagaimana ini tidak ada pemberitahuan kapan se...,"['bagaimana', 'ini', 'tidak', 'ada', 'pemberit..."
1,2019-07-20,HOREEE. awal saldo nol. Setelah cek 175jt. gue...,0,HOREE awal saldo nol Setelah cek jt gue tarik ...,HOREE awal saldo nol Setelah cek jt gue tarik ...,horee awal saldo nol setelah cek jt gue tarik ...,hore awal saldo nol setelah cek juta saya tari...,hore awal saldo nol setelah cek juta saya tari...,hore awal saldo nol telah cek juta saya tarik ...,"['hore', 'awal', 'saldo', 'nol', 'telah', 'cek..."
2,2019-07-20,Bankerot nih namanya,-1,Bankerot nih namanya,Bankerot nih namanya,bankerot nih namanya,bangkrut ini namanya,bangkrut ini namanya,bangkrut ini nama,"['bangkrut', 'ini', 'nama']"
3,2019-07-20,Alhandulillah . . . .Rek Mandiri saya amaannn ...,1,Alhandulillah Rek Mandiri saya amaann masih se...,Alhandulillah Rek Mandiri amaann kemarin,alhandulillah rek mandiri saya amaann masih se...,alhamdulillah rekening mandiri saya aman masih...,alhamdulillah rekening mandiri saya aman masih...,alhamdulillah rekening mandiri saya aman masih...,"['alhamdulillah', 'rekening', 'mandiri', 'saya..."
4,2019-07-20,mohon infonya kapan mandiri bisa normal lagi u...,0,mohon infonya kapan mandiri bisa normal lagi u...,mohon infonya kapan mandiri normal lagi urus...,mohon infonya kapan mandiri bisa normal lagi u...,mohon infonya kapan mandiri bisa normal lagi u...,mohon infonya kapan mandiri bisa normal lagi u...,mohon info kapan mandiri bisa normal lagi urus...,"['mohon', 'info', 'kapan', 'mandiri', 'bisa', ..."


In [6]:
dtf = dtf.drop(columns=['Text', 'Text Clean', 'meaningless', 'Case Folding', 'normalisasi', 'Stopword', 'tokenizing'])

In [7]:
dtf['id'] = dtf.index
dtf = dtf.reindex(['id','Date','Stemming','Label'], axis=1)
dtf.head()

Unnamed: 0,id,Date,Stemming,Label
0,0,2019-07-20,bagaimana ini tidak ada pemberitahuan kapan se...,-1
1,1,2019-07-20,hore awal saldo nol telah cek juta saya tarik ...,0
2,2,2019-07-20,bangkrut ini nama,-1
3,3,2019-07-20,alhamdulillah rekening mandiri saya aman masih...,1
4,4,2019-07-20,mohon info kapan mandiri bisa normal lagi urus...,0


In [8]:
dtf['Label'].value_counts()

-1    7827
 1    6974
 0    5724
Name: Label, dtype: int64

In [9]:
cekDup = dtf.duplicated(subset = 'Stemming')
cekDup.sum()

167

In [10]:
dtf.drop_duplicates(subset='Stemming', inplace = True)
dtf.reset_index(drop=True,inplace=True)

In [11]:
cekDup = dtf.duplicated(subset = 'Stemming')
cekDup.sum()

0

In [12]:
dtf.isna().sum()

id          0
Date        0
Stemming    1
Label       0
dtype: int64

In [13]:
dtf.dropna(inplace=True)

In [14]:
dtf.isna().sum()

id          0
Date        0
Stemming    0
Label       0
dtype: int64

In [15]:
dtf['Label'].value_counts()

-1    7769
 1    6940
 0    5648
Name: Label, dtype: int64

## split using library

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
x_train, x_test, y_train, y_test = train_test_split(dtf['Stemming'], dtf['Label'], test_size=0.10, shuffle = True, random_state=12)

# FITUR EKSTRAKSI

## TF-IDF (UNIGRAM)

In [18]:
#mencoba tanpa max feature
vectorizer = TfidfVectorizer(ngram_range=(1,1), max_features = 7000)

In [19]:
# df_tfidf = vectorizer.fit_transform(dtf['Stemming'].astype(str))

x_train_tfidf = vectorizer.fit_transform(x_train.astype(str))
x_test_tfidf = vectorizer.transform(x_test.astype(str))

In [20]:
print(x_train_tfidf.shape)
print(x_test_tfidf.shape)

(18321, 7000)
(2036, 7000)


# MODELLING

In [21]:
# Membuat Model Logistic Regression
logreg = LogisticRegression()

## Baseline 

In [22]:
# Baseline
baseline_params = {"penalty" : ['l2'],
                   "dual" : [False], 
                   "tol" : [0.0001], 
                   "C" : [1.0], 
                   "fit_intercept" : [True], 
                   "intercept_scaling" : [1], 
                   "class_weight" : [None], 
                   "random_state" : [None], 
                   "solver" : ['lbfgs'], 
                   "max_iter" : [10000], 
                   "multi_class" : ['auto'], 
                   "verbose" : [0], 
                   "warm_start" : [False], 
                   "n_jobs" : [None], 
                   "l1_ratio" : [None]}

In [23]:
# Baseline Model GridSearch
baseline_clf = GridSearchCV(logreg, baseline_params, cv=5, refit = True, verbose = 3)
baseline_clf.fit(x_train_tfidf,y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=10000, multi_class=auto, n_jobs=None, penalty=l2, random_state=None, solver=lbfgs, tol=0.0001, verbose=0, warm_start=False; total time=   1.6s
[CV 2/5] END C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=10000, multi_class=auto, n_jobs=None, penalty=l2, random_state=None, solver=lbfgs, tol=0.0001, verbose=0, warm_start=False; total time=   1.8s
[CV 3/5] END C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=10000, multi_class=auto, n_jobs=None, penalty=l2, random_state=None, solver=lbfgs, tol=0.0001, verbose=0, warm_start=False; total time=   1.5s
[CV 4/5] END C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=10000, multi_class=auto, n_jobs=None, penalty=l2

GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'C': [1.0], 'class_weight': [None], 'dual': [False],
                         'fit_intercept': [True], 'intercept_scaling': [1],
                         'l1_ratio': [None], 'max_iter': [10000],
                         'multi_class': ['auto'], 'n_jobs': [None],
                         'penalty': ['l2'], 'random_state': [None],
                         'solver': ['lbfgs'], 'tol': [0.0001], 'verbose': [0],
                         'warm_start': [False]},
             verbose=3)

In [24]:
# Predict Baseline
y_predi_baseline = baseline_clf.predict(x_test_tfidf)
print(classification_report(y_test, y_predi_baseline))

              precision    recall  f1-score   support

          -1       0.72      0.82      0.77       764
           0       0.62      0.54      0.58       602
           1       0.79      0.75      0.77       670

    accuracy                           0.72      2036
   macro avg       0.71      0.71      0.71      2036
weighted avg       0.71      0.72      0.71      2036



In [25]:
accuracy_score(y_test, y_predi_baseline)

0.7161100196463654

## Parameter Tunning

In [26]:
#izn
# solver = ['sag','saga']
# penalty = ['none','l2']
# multi_class = ['multinomial','ovr']
# C = [1,2,5]

#tes
solver = ['saga']
penalty = ['l2']
multi_class = ['ovr']
C = [1]


# C = [0.1, 1, 10]
# penalty = ['l1', 'l2']
# penalty = ['l2']
# C = np.logspace(-4,4,20)
# C = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
# penalty = ['l2','none']
# C = [0.1, 1, 2, 5]
# solver = ['saga']

In [27]:
# Import to dictionary
hyperparameters = dict(C = C,  multi_class = multi_class, penalty = penalty, solver = solver, max_iter = [10000])

In [28]:
#Memasukan ke Grid Search
#CV itu Cross Validation
#Menggunakan 10-Fold CV
tuned_clf = GridSearchCV(logreg, hyperparameters, cv=5, refit = True, verbose = 3)

In [29]:
#Fitting Model
tuned_clf.fit(x_train_tfidf,y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END C=1, max_iter=10000, multi_class=ovr, penalty=l2, solver=saga; total time=   0.3s
[CV 2/5] END C=1, max_iter=10000, multi_class=ovr, penalty=l2, solver=saga; total time=   0.3s
[CV 3/5] END C=1, max_iter=10000, multi_class=ovr, penalty=l2, solver=saga; total time=   0.3s
[CV 4/5] END C=1, max_iter=10000, multi_class=ovr, penalty=l2, solver=saga; total time=   0.2s
[CV 5/5] END C=1, max_iter=10000, multi_class=ovr, penalty=l2, solver=saga; total time=   0.2s


GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'C': [1], 'max_iter': [10000], 'multi_class': ['ovr'],
                         'penalty': ['l2'], 'solver': ['saga']},
             verbose=3)

In [30]:
#Nilai hyperparameters terbaik
print('Best Penalty:', tuned_clf.best_estimator_.get_params()['penalty'])
print('Best C:', tuned_clf.best_estimator_.get_params()['C'])
print('Best Solver:', tuned_clf.best_estimator_.get_params()['solver'])
print('Best Multi Class:', tuned_clf.best_estimator_.get_params()['multi_class'])

Best Penalty: l2
Best C: 1
Best Solver: saga
Best Multi Class: ovr


In [31]:
#Prediksi menggunakan model baru
y_predi_tuned = tuned_clf.predict(x_test_tfidf)

In [32]:
print(classification_report(y_test, y_predi_tuned))

              precision    recall  f1-score   support

          -1       0.71      0.84      0.77       764
           0       0.63      0.52      0.57       602
           1       0.79      0.75      0.77       670

    accuracy                           0.72      2036
   macro avg       0.71      0.71      0.71      2036
weighted avg       0.71      0.72      0.71      2036



In [33]:
accuracy_score(y_test, y_predi_tuned)

0.7175834970530451