In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pickle
import numpy as np

In [None]:
# Colab-only step: attach the user's Google Drive to the runtime filesystem so
# that later cells can read and write files under the mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Read the dataset from the mounted Drive. The 'news' column is used as the
# model input (it is later fed to a text vectorizer) and 'is_fake' as the target.
DATA_CSV = '/content/drive/MyDrive/data/elongated_word_10k.csv'
df = pd.read_csv(DATA_CSV)
X, y = df['news'], df['is_fake']

# Keep 20% of the rows as a holdout set. Stratifying on the target keeps the
# class proportions the same in both partitions; the fixed seed makes the
# split repeatable across runs.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# --- Cross-validation on the training split ---------------------------------
# The vectorizer and classifier are wrapped in a single Pipeline and handed the
# raw text, so cross_val_score re-fits TF-IDF on each training fold only.
# Fitting the vectorizer on all of X_train beforehand would let every
# validation fold contribute to the vocabulary and IDF weights, which leaks
# information into the cross-validation estimate.
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

cv_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(solver='liblinear', random_state=42)),
])
scores = cross_val_score(cv_pipeline, X_train, y_train, cv=skf, scoring='accuracy')

print(f"Cross-Validation Accuracy: {scores.mean():.2f} (+/- {scores.std() * 2:.2f})")

# --- Final fit on the full training split ------------------------------------
# The fitted vectorizer and classifier are kept as separate module-level
# objects (`vectorizer`, `logistic`) because later cells refer to them by name.
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_holdout_vectorized = vectorizer.transform(X_holdout)  # transform only: no re-fit on holdout

logistic = LogisticRegression(solver='liblinear', random_state=42)
logistic.fit(X_train_vectorized, y_train)

# Column 1 of predict_proba is the probability of the second class in
# logistic.classes_ (label 1 when the target is coded 0/1).
y_prob = logistic.predict_proba(X_holdout_vectorized)[:, 1]

# --- Decision-threshold sweep on the holdout set -----------------------------
# Thresholds 0.1 .. 0.9. A cutoff of exactly 1.0 is left out: a logistic
# probability is essentially never >= 1.0, so that row would predict a single
# class for every sample and only produce undefined-metric warnings.
cutoffs = np.linspace(0.1, 0.9, 9)

for cutoff in cutoffs:
    y_pred = (y_prob >= cutoff).astype(int)
    # Format to one decimal so float noise (e.g. 0.30000000000000004) is not printed.
    print(f"\n\n CUTOFF {cutoff:.1f}")
    print("\nHoldout Accuracy:", accuracy_score(y_holdout, y_pred))
    print("\nClassification Report on Holdout Set:")
    # zero_division=0 reports 0.0 instead of warning when a class is never predicted.
    print(classification_report(y_holdout, y_pred, zero_division=0))
    print("\nConfusion Matrix on Holdout Set:")
    print(confusion_matrix(y_holdout, y_pred))

Cross-Validation Accuracy: 0.94 (+/- 0.01)


 CUTOFF 0.1

Holdout Accuracy: 0.7955

Classification Report on Holdout Set:
              precision    recall  f1-score   support

           0       1.00      0.59      0.74      1000
           1       0.71      1.00      0.83      1000

    accuracy                           0.80      2000
   macro avg       0.85      0.80      0.79      2000
weighted avg       0.85      0.80      0.79      2000


Confusion Matrix on Holdout Set:
[[ 591  409]
 [   0 1000]]


 CUTOFF 0.2

Holdout Accuracy: 0.8815

Classification Report on Holdout Set:
              precision    recall  f1-score   support

           0       1.00      0.76      0.87      1000
           1       0.81      1.00      0.89      1000

    accuracy                           0.88      2000
   macro avg       0.90      0.88      0.88      2000
weighted avg       0.90      0.88      0.88      2000


Confusion Matrix on Holdout Set:
[[764 236]
 [  1 999]]


 CUTOFF 0.300000000000000

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Persist the fitted artifacts to the mounted Drive. The directory uses the
# same 'MyDrive' root as the dataset path in the loading cell.
MODEL_DIR = '/content/drive/MyDrive/data'

# Fitted classifier (same file name as before).
with open(f'{MODEL_DIR}/logistic.pkl', 'wb') as file:
    pickle.dump(logistic, file)

# The classifier was trained on TF-IDF features, so it cannot score raw text
# on its own; the fitted vectorizer is saved next to it so both can be
# reloaded together for inference.
with open(f'{MODEL_DIR}/tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(vectorizer, file)