In [None]:
from pathlib import Path

BASE_DIR = Path().resolve().parent
print(BASE_DIR)

import pandas as pd
csv_path = BASE_DIR / r'data\raw\combined_news_data.csv'

C:\Users\atalb\Documents\Coding\MLandAI\Fake-news-detection


In [13]:
news_data = pd.read_csv(csv_path)
news_data

Unnamed: 0,full_text,subject,label
0,BREAKING: GOP Chairman Grassley Has Had Enoug...,News,0
1,Failed GOP Candidates Remembered In Hilarious...,News,0
2,Mike Pence’s New DC Neighbors Are HILARIOUSLY...,News,0
3,California AG pledges to defend birth control ...,politicsNews,1
4,AZ RANCHERS Living On US-Mexico Border Destroy...,politics,0
...,...,...,...
44893,Nigeria says U.S. agrees delayed $593 million ...,worldnews,1
44894,Boiler Room #62 – Fatal Illusions Tune in to t...,Middle-east,0
44895,ATHEISTS SUE GOVERNOR OF TEXAS Over Display on...,Government News,0
44896,Republican tax plan would deal financial hit t...,politicsNews,1


In [14]:
news_data.shape

(44898, 3)

In [15]:
news_data.isna()

Unnamed: 0,full_text,subject,label
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
...,...,...,...
44893,False,False,False
44894,False,False,False
44895,False,False,False
44896,False,False,False


In [16]:
news_data['full_text']

0         BREAKING: GOP Chairman Grassley Has Had Enoug...
1         Failed GOP Candidates Remembered In Hilarious...
2         Mike Pence’s New DC Neighbors Are HILARIOUSLY...
3        California AG pledges to defend birth control ...
4        AZ RANCHERS Living On US-Mexico Border Destroy...
                               ...                        
44893    Nigeria says U.S. agrees delayed $593 million ...
44894    Boiler Room #62 – Fatal Illusions Tune in to t...
44895    ATHEISTS SUE GOVERNOR OF TEXAS Over Display on...
44896    Republican tax plan would deal financial hit t...
44897    U.N. refugee commissioner says Australia must ...
Name: full_text, Length: 44898, dtype: object

In [17]:
import re 
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def remove_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()


def clean_text(text, lemmatize=False):
    text = text.lower()  

    text = remove_html(text)  
    text = re.sub(r"http\S+|www\S+", "", text)   
    text = re.sub(r"[^a-zA-Z]", " ", text)  

    tokens = text.split()   
    tokens = [w for w in tokens if w not in stop_words]  

    # if lemmatize:
    #     tokens = [lemmatizer.lemmatize(w) for w in tokens]  # ✅ optional lemmatization  

    return " ".join(tokens)

In [19]:
news_data['cleaned_text'] = news_data['full_text'].apply(lambda x: clean_text(x))
news_data

  soup = BeautifulSoup(text, "html.parser")


Unnamed: 0,full_text,subject,label,cleaned_text
0,BREAKING: GOP Chairman Grassley Has Had Enoug...,News,0,breaking gop chairman grassley enough demands ...
1,Failed GOP Candidates Remembered In Hilarious...,News,0,failed gop candidates remembered hilarious moc...
2,Mike Pence’s New DC Neighbors Are HILARIOUSLY...,News,0,mike pence new dc neighbors hilariously trolli...
3,California AG pledges to defend birth control ...,politicsNews,1,california ag pledges defend birth control ins...
4,AZ RANCHERS Living On US-Mexico Border Destroy...,politics,0,az ranchers living us mexico border destroy na...
...,...,...,...,...
44893,Nigeria says U.S. agrees delayed $593 million ...,worldnews,1,nigeria says u agrees delayed million fighter ...
44894,Boiler Room #62 – Fatal Illusions Tune in to t...,Middle-east,0,boiler room fatal illusions tune alternate cur...
44895,ATHEISTS SUE GOVERNOR OF TEXAS Over Display on...,Government News,0,atheists sue governor texas display capitol gr...
44896,Republican tax plan would deal financial hit t...,politicsNews,1,republican tax plan would deal financial hit u...


In [20]:
cleaned_df = news_data[['cleaned_text', 'subject', 'label']]
cleaned_df

Unnamed: 0,cleaned_text,subject,label
0,breaking gop chairman grassley enough demands ...,News,0
1,failed gop candidates remembered hilarious moc...,News,0
2,mike pence new dc neighbors hilariously trolli...,News,0
3,california ag pledges defend birth control ins...,politicsNews,1
4,az ranchers living us mexico border destroy na...,politics,0
...,...,...,...
44893,nigeria says u agrees delayed million fighter ...,worldnews,1
44894,boiler room fatal illusions tune alternate cur...,Middle-east,0
44895,atheists sue governor texas display capitol gr...,Government News,0
44896,republican tax plan would deal financial hit u...,politicsNews,1


In [21]:
cleaned_csv = BASE_DIR / r'data\raw\cleaned_news_data.csv'
cleaned_df.to_csv(cleaned_csv, index=False)

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfvec = TfidfVectorizer(ngram_range=(1,2), max_features=5000)

In [24]:
X = tfvec.fit_transform(cleaned_df['cleaned_text'])

In [51]:
feature_names = tfvec.get_feature_names_out()
feature_names

array(['abadi', 'abandoned', 'abc', ..., 'zones', 'zor', 'zuma'],
      dtype=object)

In [27]:
X[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [32]:
y = pd.get_dummies(cleaned_df['label'])
y = y.iloc[:,1].values
y = y.astype(int)
y

array([0, 0, 0, ..., 0, 1, 1])

In [33]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score, cross_validate

lr = LogisticRegression(max_iter=1000)

# kf = KFold(n_splits=5, shuffle=True, random_state=42)

# # scores = cross_val_score(lr, X, y, cv = kf, scoring='accuracy')
# # print("Scores per fold:", scores)
# # print("Mean accuracy:", scores.mean())
# # print("Std deviation:", scores.std())


# scoring = ['accuracy', 'precision', 'recall', 'f1']

# results = cross_validate(lr, X, y, cv=5, scoring=scoring)

# print("Accuracy:", results['test_accuracy'].mean())
# print("Precision:", results['test_precision'].mean())
# print("Recall:", results['test_recall'].mean())
# print("F1:", results['test_f1'].mean())


In [44]:
model = lr.fit(X_train, y_train)

In [45]:
y_pred = model.predict(X_test)

In [46]:
from sklearn.metrics import accuracy_score, classification_report

score = accuracy_score(y_test, y_pred)
print(score)

0.9905345211581291


In [47]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4669
           1       0.99      0.99      0.99      4311

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [53]:
coefficients = model.coef_[0]
coefficients

array([ 0.24658555,  0.15959395, -0.75998545, ..., -0.08712864,
        0.52604751,  1.32572683])

In [54]:
import numpy as np

# Top FAKE indicators
top_fake_idx = np.argsort(coefficients)[-20:]  # 20 most positive
top_fake_words = feature_names[top_fake_idx]

# Top REAL indicators
top_real_idx = np.argsort(coefficients)[:20]  # 20 most negative
top_real_words = feature_names[top_real_idx]

In [55]:
print(top_real_words)
print(top_fake_words)

['via' 'video' 'us' 'read' 'president trump' 'hillary' 'gop' 'image'
 'obama' 'featured image' 'watch' 'mr' 'featured' 'even' 'america' 'wire'
 'image via' 'breaking' 'com' 'getty']
['democratic' 'presidential' 'comment' 'said statement' 'president barack'
 'monday' 'minister' 'edt' 'nov' 'friday' 'tuesday' 'thursday' 'wednesday'
 'reuters president' 'republican' 'washington' 'president donald'
 'washington reuters' 'said' 'reuters']


In [48]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()

In [50]:
nb_model = nb.fit(X_train, y_train)
y_pred = nb_model.predict(X_test)
score = accuracy_score(y_test, y_pred)
print(score)

0.9430957683741648
