In [1]:
import pandas as pd

from pathlib import Path

BASE_DIR = Path().resolve().parent
cleaned_csv = BASE_DIR / r'data\raw\cleaned_news_data.csv'

cleaned_df = pd.read_csv(cleaned_csv)
cleaned_df

Unnamed: 0,cleaned_text,subject,label
0,breaking gop chairman grassley enough demands ...,News,0
1,failed gop candidates remembered hilarious moc...,News,0
2,mike pence new dc neighbors hilariously trolli...,News,0
3,california ag pledges defend birth control ins...,politicsNews,1
4,az ranchers living us mexico border destroy na...,politics,0
...,...,...,...
44893,nigeria says u agrees delayed million fighter ...,worldnews,1
44894,boiler room fatal illusions tune alternate cur...,Middle-east,0
44895,atheists sue governor texas display capitol gr...,Government News,0
44896,republican tax plan would deal financial hit u...,politicsNews,1


In [7]:
any(cleaned_df['cleaned_text'].isna())
cleaned_df['cleaned_text'] = cleaned_df['cleaned_text'].fillna("")

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfvec = TfidfVectorizer(ngram_range=(1,2), max_features=5000)

In [9]:
X = tfvec.fit_transform(cleaned_df['cleaned_text'])

In [10]:
feature_names = tfvec.get_feature_names_out()
feature_names

array(['abadi', 'abandoned', 'abc', ..., 'zones', 'zor', 'zuma'],
      dtype=object)

In [11]:
X[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [12]:
y = pd.get_dummies(cleaned_df['label'])
y = y.iloc[:,1].values
y = y.astype(int)
y

array([0, 0, 0, ..., 0, 1, 1])

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score, cross_validate

lr = LogisticRegression(max_iter=1000)

# kf = KFold(n_splits=5, shuffle=True, random_state=42)

# # scores = cross_val_score(lr, X, y, cv = kf, scoring='accuracy')
# # print("Scores per fold:", scores)
# # print("Mean accuracy:", scores.mean())
# # print("Std deviation:", scores.std())


# scoring = ['accuracy', 'precision', 'recall', 'f1']

# results = cross_validate(lr, X, y, cv=5, scoring=scoring)

# print("Accuracy:", results['test_accuracy'].mean())
# print("Precision:", results['test_precision'].mean())
# print("Recall:", results['test_recall'].mean())
# print("F1:", results['test_f1'].mean())


In [15]:
model = lr.fit(X_train, y_train)

In [16]:
y_pred = model.predict(X_test)

In [17]:
from sklearn.metrics import accuracy_score, classification_report

score = accuracy_score(y_test, y_pred)
print(score)

0.9905345211581291


In [18]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4669
           1       0.99      0.99      0.99      4311

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [19]:
coefficients = model.coef_[0]
coefficients

array([ 0.24658555,  0.15959395, -0.75998545, ..., -0.08712864,
        0.52604751,  1.32572683])

In [20]:
import numpy as np

# Top FAKE indicators
top_fake_idx = np.argsort(coefficients)[-20:]  # 20 most positive
top_fake_words = feature_names[top_fake_idx]

# Top REAL indicators
top_real_idx = np.argsort(coefficients)[:20]  # 20 most negative
top_real_words = feature_names[top_real_idx]

In [21]:
print(top_real_words)
print(top_fake_words)

['via' 'video' 'us' 'read' 'president trump' 'hillary' 'gop' 'image'
 'obama' 'featured image' 'watch' 'mr' 'featured' 'even' 'america' 'wire'
 'image via' 'breaking' 'com' 'getty']
['democratic' 'presidential' 'comment' 'said statement' 'president barack'
 'monday' 'minister' 'edt' 'nov' 'friday' 'tuesday' 'thursday' 'wednesday'
 'reuters president' 'republican' 'washington' 'president donald'
 'washington reuters' 'said' 'reuters']


In [None]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()

In [None]:
nb_model = nb.fit(X_train, y_train)
y_pred = nb_model.predict(X_test)
score = accuracy_score(y_test, y_pred)
print(score)

0.9430957683741648


In [23]:
import pickle

with open("fake_news_model.pkl", "wb") as f:
    pickle.dump(lr, f)