In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pickle

In [2]:
# Mount Google Drive inside the Colab runtime; later cells read and write
# files below this mount point.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [4]:
# Load the labelled corpus. `df` is kept as a module-level name because the
# validation-set cell further down reuses it.
df = pd.read_csv('/content/drive/MyDrive/data/elongated_word_10k.csv')

# Model input is the `news` text column; the target is the `is_fake` column.
X, y = df['news'], df['is_fake']

# Reserve 20% of the rows as a holdout set. Stratifying on `y` keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Text-classification pipeline: TF-IDF vectorisation followed by a
# Passive-Aggressive linear classifier. The vectoriser lives inside the
# pipeline, so during cross-validation it is re-fitted on each training fold.
pipe_pac = Pipeline(
    steps=[
        ("vectorizer", TfidfVectorizer()),
        ("classifier", PassiveAggressiveClassifier(max_iter=1000, random_state=42)),
    ],
    verbose=True,  # print the elapsed time of every step on each fit
)

# Shuffled, stratified K-fold with a fixed seed so the folds are repeatable.
n_splits = 5
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=17)

scoring = "accuracy"

# One score per fold, computed on the training portion only; the holdout set
# is not touched here.
scores_pac = cross_val_score(pipe_pac, X_train, y_train, scoring=scoring, cv=cv)

# Report the mean score and a spread of two standard deviations across folds.
# A single f-string is used on purpose: combining an f-string with the `%`
# operator would misbehave if `scoring` ever contained a `%` character.
print(f"{scoring}: {scores_pac.mean():0.2f} (+/- {scores_pac.std() * 2:0.2f})")

[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=   3.3s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   0.0s
[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=   1.0s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   0.0s
[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=   1.0s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   0.0s
[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=   1.0s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   0.1s
[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=   1.0s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   0.0s
accuracy: 0.95 (+/- 0.01)


In [None]:
# Fit the pipeline on the full training split (cross-validation above only
# fitted clones, so `pipe_pac` itself is still unfitted at this point).
pipe_pac.fit(X=X_train, y=y_train)

[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=   2.7s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   0.1s


In [None]:
# Persist the fitted pipeline (vectoriser + classifier) to Google Drive.
# The path uses the `MyDrive` spelling, matching every other Drive path in
# this notebook, instead of the original `My Drive` variant with a space.
with open('/content/drive/MyDrive/data/pac_pipeline.pkl', 'wb') as model_file:
    pickle.dump(pipe_pac, model_file)

In [None]:
# Score the fitted pipeline on the holdout split that was set aside before
# cross-validation.
y_pred = pipe_pac.predict(X_holdout)

holdout_accuracy = accuracy_score(y_holdout, y_pred)
print("\nHoldout Accuracy:", holdout_accuracy)

# Each remaining section is a heading line followed by the metric's own
# printed representation.
holdout_sections = (
    ("\nClassification Report on Holdout Set:", classification_report),
    ("\nConfusion Matrix on Holdout Set:", confusion_matrix),
)
for heading, metric in holdout_sections:
    print(heading)
    print(metric(y_holdout, y_pred))


Holdout Accuracy: 0.9545

Classification Report on Holdout Set:
              precision    recall  f1-score   support

           0       0.97      0.94      0.95      1000
           1       0.94      0.97      0.96      1000

    accuracy                           0.95      2000
   macro avg       0.95      0.95      0.95      2000
weighted avg       0.95      0.95      0.95      2000


Confusion Matrix on Holdout Set:
[[942  58]
 [ 33 967]]


# Build the validation dataset

In [10]:
# Larger corpus from which the validation rows are drawn.
df_20k = pd.read_csv('/content/drive/MyDrive/data/elongated_word_20k.csv')

# Anti-join on the `news` text: keep only rows of `df_20k` whose text does not
# occur in `df`, then draw a reproducible sample of 10,000 of them.
# `indicator=True` adds a `_merge` column that marks unmatched rows as
# 'left_only'; the column is dropped again once the filter has been applied.
val_10k = (
    df_20k
    .merge(df[['news']], on='news', how='left', indicator=True)
    .loc[lambda merged: merged['_merge'] == 'left_only']
    .drop(columns=['_merge'])
    .sample(n=10000, random_state=42)
)

# NOTE(review): `to_csv` is called without `index=False`, so the sampled
# frame's index is written as an unnamed first column - confirm that readers
# of val_10k.csv expect that.
val_10k.to_csv('/content/drive/MyDrive/data/val_10k.csv')

In [9]:
# Show the DataFrame loaded from elongated_word_10k.csv; as the cell's last
# expression it is rendered by the notebook.
df

Unnamed: 0,news,is_fake,tokens
0,covid <NUMBER> subvarian xb berbeda mematikan...,1,127
1,video salat berbahasa indonesia kursus salat d...,1,8
2,warga bogor meninggal hidup kembali ajaib ...,1,14
3,ikan alien aneh dijumpai di sungai madeira br...,1,27
4,foto foto pemberontakan ulama dan santri di ma...,1,21
...,...,...,...
9995,dicari partai partai pemersatu bangsa endor...,0,300
9996,ksal yudo akui siap jalani fit and proper test...,0,288
9997,veronica koman dan data tahanan politik papua ...,0,148
9998,p ungkap kader ingin usung anies di pilpres <N...,0,269


In [8]:
# Show the DataFrame loaded from elongated_word_10k.csv.
# NOTE(review): this repeats the `df` display in the preceding cell, and the
# execution counts in this part of the notebook decrease down the page -
# looks like leftover exploration; confirm whether this cell is still needed.
df

Unnamed: 0,news,is_fake,tokens
0,covid <NUMBER> subvarian xb berbeda mematikan...,1,127
1,video salat berbahasa indonesia kursus salat d...,1,8
2,warga bogor meninggal hidup kembali ajaib ...,1,14
3,ikan alien aneh dijumpai di sungai madeira br...,1,27
4,foto foto pemberontakan ulama dan santri di ma...,1,21
...,...,...,...
9995,dicari partai partai pemersatu bangsa endor...,0,300
9996,ksal yudo akui siap jalani fit and proper test...,0,288
9997,veronica koman dan data tahanan politik papua ...,0,148
9998,p ungkap kader ingin usung anies di pilpres <N...,0,269
