In [1]:
import pandas as pd
import gensim
from gensim.models import FastText

import pandas as pd
import sklearn
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# LIBRARY UNTUK PREPROCESSING
import nltk #import library nltk
from nltk.tokenize import word_tokenize #import word_tokenize for tokenizing text into words 
from nltk.tokenize import sent_tokenize #import sent_tokenize for tokenizing paragraph into sentences
from nltk.stem.porter import PorterStemmer #import Porter Stemmer Algorithm 
from nltk.stem import WordNetLemmatizer #import WordNet lemmatizer 
from nltk.corpus import stopwords #import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory #import Indonesian Stemmer
import re,string #import regular expression

import gensim
from gensim.models import KeyedVectors
from tqdm import tqdm



In [2]:
# Raw data exactly as read from disk.
dta = pd.read_csv('new_Data - Bismillah - Stemming.csv')
# Work on an independent copy: a plain `dtf = dta` binds both names to the same
# object, so the in-place cleaning steps that follow would also rewrite the raw frame.
dtf = dta.copy()

In [3]:
# Keep only the first row for each distinct 'Stemming' value, then renumber the
# index from 0. Both calls modify dtf in place.
dtf.drop_duplicates(subset='Stemming', inplace = True)
dtf.reset_index(drop=True,inplace=True)

In [4]:
# Sanity check: flag rows whose 'Stemming' value repeats an earlier row and count them.
cekDup = dtf.duplicated(subset = 'Stemming')
cekDup.sum()

0

In [5]:
# Number of rows per Label value.
dtf['Label'].value_counts()

-1    7769
 1    6940
 0    5649
Name: Label, dtype: int64

In [6]:
# Count of missing values in each column.
dtf.isna().sum()

Date            0
Text            0
Label           0
Text Clean      1
meaningless     1
Case Folding    1
normalisasi     1
Stopword        1
Stemming        1
tokenizing      0
dtype: int64

In [7]:
# Remove every row that has a missing value in any column (modifies dtf in place).
dtf.dropna(inplace=True)

In [8]:
# Re-count missing values to confirm none remain.
dtf.isna().sum()

Date            0
Text            0
Label           0
Text Clean      0
meaningless     0
Case Folding    0
normalisasi     0
Stopword        0
Stemming        0
tokenizing      0
dtype: int64

## CEK EKSPANSI

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; the fixed random_state makes the
# shuffled split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    dtf['Stemming'],
    dtf['Label'],
    test_size=0.10,
    shuffle=True,
    random_state=12,
)

In [10]:
# TF-IDF over unigrams with max_features=7000.
# NOTE(review): the previous comment said "trying without max feature", which
# does not match the code - max_features is set here.
vectorizer = TfidfVectorizer(ngram_range=(1,1), max_features = 7000)

In [11]:
# Earlier variant, kept for reference: fitting the vectorizer on the whole corpus.
# df_tfidf = vectorizer.fit_transform(dtf['Stemming'].astype(str))

# Fit the vectorizer on the training split only, then apply the fitted
# vectorizer unchanged to the test split.
x_train_tfidf = vectorizer.fit_transform(x_train.astype(str))
x_test_tfidf = vectorizer.transform(x_test.astype(str))

In [12]:
# Sanity check: shape of the train matrix, then of the test matrix, one per line.
print(x_train_tfidf.shape, x_test_tfidf.shape, sep='\n')

(18321, 7000)
(2036, 7000)


In [13]:
# Load the word vectors used for feature expansion (word2vec-format file).
model_ef = KeyedVectors.load_word2vec_format('cc.id.300.vec.gz')

In [14]:
# Alternative: load the FastText model trained on the actual data corpus (sg = 0)
# model_ef = FastText.load('modelft-26jan-Dataactual')

In [15]:
# Alternative: load the FastText model trained on the combined actual + pretrained corpus (sg = 0)
# model_ef = FastText.load('modelft-26jan-actualpretrained')

In [16]:
def feature_expansion(df, feature):
    """Fill zero cells of each term column with its nearest neighbour's value.

    For every column of ``df`` the single most similar word is looked up in the
    module-level embedding model ``model_ef``. When that neighbour is listed in
    ``feature``, every row where the column is 0 but the neighbour's column is
    non-zero receives the neighbour's value.

    Args:
        df: DataFrame with one column per term. It is modified in place and
            must contain a column for every neighbour that appears in
            ``feature``.
        feature: Collection of term names allowed to act as donors.

    Returns:
        The same ``df`` object, after expansion.

    Note:
        Columns are processed in order on the live frame, so a column that was
        filled earlier can pass those filled values on to a later column.
    """
    # Build the lookup once: membership in a set is O(1), a list is scanned per column.
    donors = set(feature)

    for col in tqdm(df.columns):
        try:
            neighbours = model_ef.similar_by_word(col, topn=1)
        except KeyError:
            # The term is not in the embedding vocabulary, so there is nothing
            # to expand. Only KeyError is caught: a bare except would also
            # swallow KeyboardInterrupt and hide real failures.
            continue

        for term in (hit[0] for hit in neighbours):
            if term not in donors:
                continue
            # Rows where this column is empty but the neighbour carries a weight.
            fill = (df[col] == 0) & (df[term] != 0)
            # One .loc assignment writes into df itself; the chained form
            # df[col][mask] = ... may write to a temporary copy instead.
            df.loc[fill, col] = df.loc[fill, term]
    return df

In [17]:
# Vocabulary terms in column order. get_feature_names() was removed in
# scikit-learn 1.2 in favour of get_feature_names_out() (added in 1.0), so use
# whichever exists and keep the result a plain list as before.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_ef = vectorizer.get_feature_names_out().tolist()
else:
    feature_ef = vectorizer.get_feature_names()

In [18]:
# Build dense DataFrames for the train and test TF-IDF matrices,
# with one column per vocabulary term in feature_ef.

df_x_train_tfidf = pd.DataFrame(x_train_tfidf.todense(), columns = feature_ef)
df_x_test_tfidf = pd.DataFrame(x_test_tfidf.todense(), columns = feature_ef)

In [19]:
# Base logistic-regression estimator with default settings; the grid search
# below supplies the actual hyper-parameters.
logreg = LogisticRegression()

In [20]:
# Expand the training features. feature_expansion modifies and returns the frame
# it receives, so df_x_train_ef and df_x_train_tfidf are the same DataFrame.
df_x_train_ef = feature_expansion(df_x_train_tfidf, feature_ef)

100%|██████████| 7000/7000 [13:39<00:00,  8.54it/s]


In [21]:
# Expand the test features the same way. The returned frame is the same object
# as df_x_test_tfidf, which is modified in place.
df_x_test_ef = feature_expansion(df_x_test_tfidf, feature_ef)

100%|██████████| 7000/7000 [13:51<00:00,  8.42it/s]


In [22]:
# Parameter grid with a single candidate per hyper-parameter.
ef_params = dict(
    penalty=['l2'],
    C=[1],
    solver=['saga'],
    multi_class=['ovr'],
    max_iter=[10000],
)

# 5-fold cross-validated search over that grid, with refit enabled.
ef_clf = GridSearchCV(estimator=logreg, param_grid=ef_params, cv=5, refit=True, verbose=3)
ef_clf.fit(df_x_train_ef, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END C=1, max_iter=10000, multi_class=ovr, penalty=l2, solver=saga; total time= 2.4min
[CV 2/5] END C=1, max_iter=10000, multi_class=ovr, penalty=l2, solver=saga; total time= 2.6min
[CV 3/5] END C=1, max_iter=10000, multi_class=ovr, penalty=l2, solver=saga; total time= 2.2min
[CV 4/5] END C=1, max_iter=10000, multi_class=ovr, penalty=l2, solver=saga; total time= 2.3min
[CV 5/5] END C=1, max_iter=10000, multi_class=ovr, penalty=l2, solver=saga; total time= 1.7min


GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'C': [1], 'max_iter': [10000], 'multi_class': ['ovr'],
                         'penalty': ['l2'], 'solver': ['saga']},
             verbose=3)

In [23]:
# Predict labels for the expanded test features with the fitted grid-search model.
y_predi_ef = ef_clf.predict(df_x_test_ef)

In [24]:
# Per-class precision, recall and F1 of the grid-search predictions on the test split.
print(classification_report(y_test, y_predi_ef))

              precision    recall  f1-score   support

          -1       0.72      0.84      0.77       764
           0       0.63      0.53      0.58       602
           1       0.79      0.75      0.77       670

    accuracy                           0.72      2036
   macro avg       0.71      0.71      0.71      2036
weighted avg       0.71      0.72      0.71      2036



In [25]:
# Overall accuracy of the grid-search predictions on the test split.
accuracy_score(y_test, y_predi_ef)

0.7180746561886051

## Cek nilai intercept (w0) dan coef (wp)

In [26]:
# Standalone model with the same hyper-parameters as the single grid-search
# candidate, fitted directly so its intercept_ and coef_ can be inspected below.
lr = LogisticRegression(penalty='l2', C=1, solver='saga', max_iter=10000, multi_class='ovr')
lr.fit(df_x_train_ef,y_train)

LogisticRegression(C=1, max_iter=10000, multi_class='ovr', solver='saga')

In [27]:
# Predict test labels with the standalone model.
y_predi_ef_new = lr.predict(df_x_test_ef)

In [28]:
# Per-class precision, recall and F1 of the standalone model on the test split.
print(classification_report(y_test, y_predi_ef_new))

              precision    recall  f1-score   support

          -1       0.72      0.84      0.77       764
           0       0.63      0.53      0.58       602
           1       0.79      0.75      0.77       670

    accuracy                           0.72      2036
   macro avg       0.71      0.71      0.71      2036
weighted avg       0.71      0.72      0.71      2036



In [29]:
# Overall accuracy of the standalone model on the test split.
accuracy_score(y_test, y_predi_ef_new)

0.7180746561886051

In [30]:
# Class labels known to the fitted model.
print(lr.classes_)

[-1  0  1]


In [31]:
# w0: the fitted intercept terms.
print(lr.intercept_)

[-0.9742267  -0.0151603  -1.49498743]


In [32]:
# wp: the fitted feature coefficients.
print(lr.coef_)

[[-0.11697668 -0.23284422  0.14729029 ... -0.00825756 -0.45683273
   0.08156204]
 [ 0.31487428  0.37215244  0.04929144 ... -0.07852135  0.44320813
  -0.10596031]
 [-0.21519281 -0.13489993 -0.15241508 ...  0.09036992 -0.01131184
  -0.0137427 ]]


In [33]:
# Stack the transposed coefficient matrix on top of the intercepts, so the
# intercept values form the last row of the combined array.
cek_coef = np.vstack((lr.coef_.T, lr.intercept_))
cek_coef

array([[-0.11697668,  0.31487428, -0.21519281],
       [-0.23284422,  0.37215244, -0.13489993],
       [ 0.14729029,  0.04929144, -0.15241508],
       ...,
       [-0.45683273,  0.44320813, -0.01131184],
       [ 0.08156204, -0.10596031, -0.0137427 ],
       [-0.9742267 , -0.0151603 , -1.49498743]])