In [1]:
import pandas as pd
import nltk
import numpy as np

In [2]:
train_df = pd.read_csv('data_worthcheck/train.csv', usecols=['text_a', 'label'])
test_df = pd.read_csv('data_worthcheck/test.csv')

In [3]:
train_df.head()

Unnamed: 0,text_a,label
0,betewe buka twitter cuman ngetweet liat home b...,no
1,mas piyuuu mugo2 corona tuh mulut tersumpal ma...,no
2,e100ss gini buka informasi sejelas nya identit...,yes
3,neng solo wes ono terduga corona cobo neng ati...,no
4,midiahn nii akun gak takut takut nya isu coron...,no


In [4]:
test_df.head()

Unnamed: 0,text_a,label
0,jek dajal ga depok bang,no
1,detikcom untung depok masuk wilayah nya ridwan...,no
2,df dom jakarta depok yg gunain vc cabang nya c...,no
3,your2rl depok jkt,no
4,doakan indonesia selamat virus corona pkb depo...,yes


In [5]:
train_df['label'] = train_df['label'].map({'yes':1, 'no': 0})

In [6]:
test_df['label'] = test_df['label'].map({'yes':1, 'no': 0})

In [7]:
train_df['text_clean'] = train_df['text_a'].str.lower()
test_df['text_clean'] = test_df['text_a'].str.lower()

In [8]:
import re
import string

In [9]:
# digit removal
train_df['text_clean'] = train_df['text_clean'].apply(lambda x: re.sub(r"\d+", "", x))
test_df['text_clean'] = test_df['text_clean'].apply(lambda x: re.sub(r"\d+", "", x))

In [10]:
# punctuation removal
train_df['text_clean'] = train_df['text_clean'].apply(lambda x: x.translate(str.maketrans("","",string.punctuation)))
test_df['text_clean'] = test_df['text_clean'].apply(lambda x: x.translate(str.maketrans("","",string.punctuation)))

In [11]:
# tokenization
train_df['text_clean'] = train_df['text_clean'].apply(lambda x: nltk.tokenize.word_tokenize(x))
test_df['text_clean'] = test_df['text_clean'].apply(lambda x: nltk.tokenize.word_tokenize(x))

In [12]:
X_train = train_df['text_clean']
y_train = train_df['label']

X_test = test_df['text_clean']
y_test = test_df['label']

In [13]:
train_df.head()

Unnamed: 0,text_a,label,text_clean
0,betewe buka twitter cuman ngetweet liat home b...,0,"[betewe, buka, twitter, cuman, ngetweet, liat,..."
1,mas piyuuu mugo2 corona tuh mulut tersumpal ma...,0,"[mas, piyuuu, mugo, corona, tuh, mulut, tersum..."
2,e100ss gini buka informasi sejelas nya identit...,1,"[ess, gini, buka, informasi, sejelas, nya, ide..."
3,neng solo wes ono terduga corona cobo neng ati...,0,"[neng, solo, wes, ono, terduga, corona, cobo, ..."
4,midiahn nii akun gak takut takut nya isu coron...,0,"[midiahn, nii, akun, gak, takut, takut, nya, i..."


### word2vec using train data

In [14]:
import gensim

In [15]:
w2v_model = gensim.models.Word2Vec(
    X_train,
    vector_size=100,
    window=5,
    min_count=2
)

In [16]:
w2v_model.wv.most_similar('menkes')

[('mpud', 0.9577487707138062),
 ('gusdur', 0.9220761656761169),
 ('parewa', 0.9126297831535339),
 ('staff', 0.9123938083648682),
 ('speechless', 0.910317599773407),
 ('terawan', 0.9095292687416077),
 ('menempuh', 0.9086610674858093),
 ('pecat', 0.9082935452461243),
 ('utuhwibowo', 0.9061846137046814),
 ('ruh', 0.9019810557365417)]

In [17]:
words = set(w2v_model.wv.index_to_key )

In [18]:
X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_train], dtype=object)
X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_test], dtype=object)

In [19]:
X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100, dtype=float))
        
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100, dtype=float))

## training

### Random Forest

In [20]:
from sklearn.ensemble import RandomForestClassifier

In [34]:
rf_gini_100 = RandomForestClassifier(random_state=0, criterion='gini', n_estimators=100).fit(X_train_vect_avg, y_train)

In [35]:
rf_gini_300 = RandomForestClassifier(random_state=0, criterion='gini', n_estimators=300).fit(X_train_vect_avg, y_train)

In [36]:
rf_entropy_100 = RandomForestClassifier(random_state=0, criterion='entropy', n_estimators=100).fit(X_train_vect_avg, y_train)

In [37]:
rf_entropy_300 = RandomForestClassifier(random_state=0, criterion='entropy', n_estimators=300).fit(X_train_vect_avg, y_train)

### XGBoost

In [21]:
from sklearn.ensemble import GradientBoostingClassifier

In [39]:
xgb_friedman_100 = GradientBoostingClassifier(
                        criterion='friedman_mse', n_estimators=100, random_state=0).fit(X_train_vect_avg, y_train)

In [40]:
xgb_friedman_300 = GradientBoostingClassifier(
                        criterion='friedman_mse', n_estimators=300, random_state=0).fit(X_train_vect_avg, y_train)

In [41]:
xgb_se_100 = GradientBoostingClassifier(
                        criterion='squared_error', n_estimators=100, random_state=0).fit(X_train_vect_avg, y_train)

In [42]:
xgb_se_300 = GradientBoostingClassifier(
                        criterion='squared_error', n_estimators=300, random_state=0).fit(X_train_vect_avg, y_train)

### SVM

In [22]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [57]:
svc_auto = make_pipeline(StandardScaler(), SVC(gamma='auto', random_state=0)).fit(X_train_vect_avg, y_train)

In [58]:
svc_scale = make_pipeline(StandardScaler(), SVC(gamma='scale', random_state=0)).fit(X_train_vect_avg, y_train)

### Prediction

In [45]:
from sklearn.metrics import precision_score, recall_score

#### Random Forrest

In [47]:
y_pred_rf_gini_100 = rf_gini_100.predict(X_test_vect_avg)

precision = precision_score(y_test, y_pred_rf_gini_100)
recall = recall_score(y_test, y_pred_rf_gini_100)
print('Gini with 100 estimator')
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred_rf_gini_100==y_test).sum()/len(y_pred_rf_gini_100), 3)))

Gini with 100 estimator
Precision: 0.727 / Recall: 0.55 / Accuracy: 0.834


In [48]:
y_pred_rf_gini_300 = rf_gini_300.predict(X_test_vect_avg)

precision = precision_score(y_test, y_pred_rf_gini_300)
recall = recall_score(y_test, y_pred_rf_gini_300)
print('Gini with 300 estimator')
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred_rf_gini_300==y_test).sum()/len(y_pred_rf_gini_300), 3)))

Gini with 300 estimator
Precision: 0.743 / Recall: 0.556 / Accuracy: 0.839


In [49]:
y_pred_rf_entropy_100 = rf_entropy_100.predict(X_test_vect_avg)

precision = precision_score(y_test, y_pred_rf_entropy_100)
recall = recall_score(y_test, y_pred_rf_entropy_100)
print('Entropy with 100 estimator')
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred_rf_entropy_100==y_test).sum()/len(y_pred_rf_entropy_100), 3)))

Entropy with 100 estimator
Precision: 0.742 / Recall: 0.57 / Accuracy: 0.841


In [50]:
y_pred_rf_entropy_300 = rf_entropy_300.predict(X_test_vect_avg)

precision = precision_score(y_test, y_pred_rf_entropy_300)
recall = recall_score(y_test, y_pred_rf_entropy_300)
print('Entropy with 300 estimator')
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred_rf_entropy_300==y_test).sum()/len(y_pred_rf_entropy_300), 3)))

Entropy with 300 estimator
Precision: 0.74 / Recall: 0.569 / Accuracy: 0.841


#### XGBoost

In [51]:
y_pred_xgb_friedman_100 = xgb_friedman_100.predict(X_test_vect_avg)

precision = precision_score(y_test, y_pred_xgb_friedman_100)
recall = recall_score(y_test, y_pred_xgb_friedman_100)
print('Friedman mse with 100 estimator')
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred_xgb_friedman_100==y_test).sum()/len(y_pred_xgb_friedman_100), 3)))

Friedman mse with 100 estimator
Precision: 0.676 / Recall: 0.54 / Accuracy: 0.819


In [52]:
y_pred_xgb_friedman_300 = xgb_friedman_300.predict(X_test_vect_avg)

precision = precision_score(y_test, y_pred_xgb_friedman_300)
recall = recall_score(y_test, y_pred_xgb_friedman_300)
print('Friedman mse with 300 estimator')
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred_xgb_friedman_300==y_test).sum()/len(y_pred_xgb_friedman_300), 3)))

Friedman mse with 300 estimator
Precision: 0.686 / Recall: 0.557 / Accuracy: 0.824


In [53]:
y_pred_xgb_se_100 = xgb_se_100.predict(X_test_vect_avg)

precision = precision_score(y_test, y_pred_xgb_se_100)
recall = recall_score(y_test, y_pred_xgb_se_100)
print('se with 100 estimator')
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred_xgb_se_100==y_test).sum()/len(y_pred_xgb_se_100), 3)))

se with 100 estimator
Precision: 0.676 / Recall: 0.54 / Accuracy: 0.819


In [54]:
y_pred_xgb_se_300 = xgb_se_300.predict(X_test_vect_avg)

precision = precision_score(y_test, y_pred_xgb_se_300)
recall = recall_score(y_test, y_pred_xgb_se_300)
print('se with 300 estimator')
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred_xgb_se_300==y_test).sum()/len(y_pred_xgb_se_300), 3)))

se with 300 estimator
Precision: 0.686 / Recall: 0.557 / Accuracy: 0.824


#### SVC

In [59]:
y_pred_svc_auto = svc_auto.predict(X_test_vect_avg)

precision = precision_score(y_test, y_pred_svc_auto)
recall = recall_score(y_test, y_pred_svc_auto)
print('Gamma=auto')
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred_svc_auto==y_test).sum()/len(y_pred_svc_auto), 3)))

Gamma=auto
Precision: 0.741 / Recall: 0.515 / Accuracy: 0.832


In [60]:
y_pred_svc_scale = svc_scale.predict(X_test_vect_avg)

precision = precision_score(y_test, y_pred_svc_scale)
recall = recall_score(y_test, y_pred_svc_scale)
print('Gamma=scale')
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred_svc_scale==y_test).sum()/len(y_pred_svc_scale), 3)))

Gamma=scale
Precision: 0.741 / Recall: 0.515 / Accuracy: 0.832
