In [11]:
import zipfile
import os
import pandas as pd

# Define paths.
# The defaults are the original author's local locations. Set FAKE_NEWS_ZIP
# and/or FAKE_NEWS_DATA_DIR in the environment to run this notebook on another
# machine without editing the cell.
zip_path = os.environ.get(
    "FAKE_NEWS_ZIP", r"C:\Users\melha\Downloads\archive.zip"
)
extract_to = os.environ.get(
    "FAKE_NEWS_DATA_DIR", r"C:\Users\melha\Documents\fake-news-detector\data"
)

# Unzip the whole archive into the data directory.
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to)

print("✅ Extracted!")

# Load the two CSV files the archive is expected to contain:
# Fake.csv -> fake_df, True.csv -> real_df.
fake_df = pd.read_csv(os.path.join(extract_to, "Fake.csv"))
real_df = pd.read_csv(os.path.join(extract_to, "True.csv"))

# Quick visual sanity check of the fake-news frame.
print("📰 Fake News Sample:")
print(fake_df.head())



✅ Extracted!
📰 Fake News Sample:
                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  \
0  Donald Trump just couldn t wish all Americans ...    News   
1  House Intelligence Committee Chairman Devin Nu...    News   
2  On Friday, it was revealed that former Milwauk...    News   
3  On Christmas day, Donald Trump announced that ...    News   
4  Pope Francis used his annual Christmas Day mes...    News   

                date  
0  December 31, 2017  
1  December 31, 2017  
2  December 30, 2017  
3  December 29, 2017  
4  December 25, 2017  


In [13]:
import pandas as pd

# Add labels (written onto the source frames, as before).
fake_df['label'] = 0  # fake = 0
real_df['label'] = 1  # real = 1

# Combine and shuffle. The fixed seed makes the row order reproducible across
# kernel restarts; without it every run produced a different ordering.
SHUFFLE_SEED = 42
df = (
    pd.concat([fake_df, real_df], ignore_index=True)
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)

# Combine title + text into one field.
# fillna('') guards against missing values: str + NaN would otherwise turn the
# whole combined cell into NaN. Rows with both fields present are unaffected.
df['text'] = df['title'].fillna('') + ' ' + df['text'].fillna('')

# Drop unnecessary columns
df = df[['text', 'label']]

print(df.head())
print(df['label'].value_counts())


                                                text  label
0  Trump, at NATO, vows unwavering fight against ...      1
1  Hacking attacks: a pre-election setback for It...      1
2  THE STATE THAT GETS MORE REFUGEES THAN ANY OTH...      0
3  Philippines suspends trade with North Korea to...      1
4  Libyan force ready to cooperate on UK extradit...      1
label
0    23481
1    21417
Name: count, dtype: int64


In [18]:
pip install nltk





In [21]:
from sklearn.model_selection import train_test_split

# Features come from the `clean_text` column, targets from `label`.
# NOTE(review): no visible cell creates `clean_text` — presumably a
# text-cleaning cell was removed or lives elsewhere; confirm it runs before
# this one, otherwise this lookup raises KeyError on a fresh kernel.
X, y = df['clean_text'], df['label']

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# partition stable for a given row order.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print(f"Train samples: {len(X_train)} | Test samples: {len(X_test)}")


Train samples: 35918 | Test samples: 8980


In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Cap the vocabulary size (max_features) to keep vectorisation fast.
vectorizer_options = {"max_features": 5000}
tfidf = TfidfVectorizer(**vectorizer_options)

# Fit on the training split only; the test split is transformed with the
# already-fitted vectorizer. The tuple is evaluated left to right, so the fit
# happens before the test transform.
X_train_tfidf, X_test_tfidf = tfidf.fit_transform(X_train), tfidf.transform(X_test)


In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Fit a logistic-regression classifier (library defaults) on the TF-IDF
# training matrix; fit() returns the estimator itself.
model = LogisticRegression().fit(X_train_tfidf, y_train)

# Predict labels for the test documents.
y_pred = model.predict(X_test_tfidf)

# Print the confusion matrix first, then the classification report.
for report in (confusion_matrix, classification_report):
    print(report(y_test, y_pred))


[[4635   71]
 [  33 4241]]
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      4706
           1       0.98      0.99      0.99      4274

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980

