In [1]:
import pandas as pd

# Load the two source files: one holds the fake articles, the other the real ones
fake = pd.read_csv("data/Fake.csv")
true = pd.read_csv("data/True.csv")

# Add the binary target column to each frame
fake['label'] = 0  # fake = 0
true['label'] = 1  # true = 1

# Stack both frames into a single dataset with a fresh 0..n-1 index
data = pd.concat([fake, true], ignore_index=True)

# Shuffle the rows so the two classes are interleaved.
# The shuffle is seeded (same seed as the later train/test split): without it
# the row order changes on every run, so the seeded split downstream would
# still select different rows and the reported metrics could not be reproduced.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Show a sample
data.head()


Unnamed: 0,title,text,subject,date,label
0,REP SHEILA JACKSON LEE Suggests First Class Cu...,Jean-Marie Simon was a passenger on a flight f...,politics,"Dec 26, 2017",0
1,SYRIA: British and American Presence Directly ...,US paratrooper on security duty during a miss...,Middle-east,"June 16, 2017",0
2,"German officials receive threatening letters, ...",Berlin (Reuters) - Five top German politicians...,worldnews,"September 21, 2017",1
3,Turkey's release of German citizen sign of tha...,BERLIN (Reuters) - Turkey s decision to releas...,worldnews,"October 26, 2017",1
4,TIME TO GO? 84-Yr Old Supreme Court Justice Gi...,Justice Ruth Bader Ginsburg erroneously labele...,left-news,"Apr 11, 2017",0


In [2]:
import re
import nltk
# Fetch the NLTK resources used below: the stop-word list, the tokenizer
# models, and the WordNet data for the lemmatizer.
# Recent NLTK releases (3.9+) make `nltk.word_tokenize` load `punkt_tab`
# instead of `punkt`; requesting both keeps the notebook runnable on old and
# new NLTK versions (a fresh install with only `punkt` raises LookupError).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Shared helpers for the cleaning step: a WordNet lemmatizer and the English
# stop-word list held as a set for fast membership tests
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

# Text normalisation applied to every document before vectorisation
def clean_text(text):
    """Return `text` reduced to lower-case, lemmatized, non-stop-word tokens.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lower-cased and tokenized with NLTK, tokens found in the
    module-level `stop_words` set are discarded, and the remaining tokens are
    lemmatized with the module-level `lemmatizer`.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces.
    """
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text).lower()

    lemmas = []
    for word in nltk.word_tokenize(letters_only):
        if word in stop_words:
            continue  # stop words are dropped before lemmatization
        lemmas.append(lemmatizer.lemmatize(word))

    return ' '.join(lemmas)


# Rows missing either the title or the body cannot be concatenated/cleaned
data.dropna(subset=['title', 'text'], inplace=True)

# Prepend the headline to the article body, then overwrite the `text` column
# with the cleaned version of that combined string
headline_and_body = data['title'] + ' ' + data['text']
data['text'] = headline_and_body.apply(clean_text)

# Preview
data[['text', 'label']].head()



[nltk_data] Downloading package stopwords to C:\Users\samrat
[nltk_data]     majhi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\samrat
[nltk_data]     majhi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\samrat
[nltk_data]     majhi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,text,label
0,rep sheila jackson lee suggests first class cu...,0
1,syria british american presence directly escal...,0
2,german official receive threatening letter fak...,1
3,turkey release german citizen sign thawing tie...,1
4,time go yr old supreme court justice ginsberg ...,0


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Target vector: the 0/1 label column
y = data['label']

# TF-IDF features over the cleaned text, with the vocabulary capped at 5000 terms
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_sparse = vectorizer.fit_transform(data['text'])

# Densify the sparse TF-IDF matrix into a regular NumPy array
X = tfidf_sparse.toarray()


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the seed fixes which row positions
# are selected by the split itself
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [5]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier (library default settings) on the
# training split; the trailing expression displays the fitted estimator
model = LogisticRegression()
model.fit(X=X_train, y=y_train)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [6]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Score the held-out rows
y_pred = model.predict(X_test)

# Compute each metric once, then report them in order
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

print("✅ Accuracy:", accuracy)
print("\n🧾 Classification Report:\n", report)
print("\n🔍 Confusion Matrix:\n", matrix)


✅ Accuracy: 0.9875278396436525

🧾 Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.99      4648
           1       0.98      0.99      0.99      4332

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980


🔍 Confusion Matrix:
 [[4575   73]
 [  39 4293]]


In [7]:
# Spot-check: compare the model's prediction for the first held-out row with
# that row's true label
first_row = X_test[:1]  # slicing keeps the row two-dimensional for predict()
print("Predicted label:", model.predict(first_row))
print("Actual label:", y_test.iloc[0])


Predicted label: [1]
Actual label: 1


In [8]:
import joblib

# Persist the fitted classifier and the fitted vectorizer side by side.
# The vectorizer was fitted on text already passed through clean_text, so any
# consumer of these files needs to apply the same cleaning before transform().
for artifact, filename in (
    (model, 'fake_news_model.pkl'),
    (vectorizer, 'tfidf_vectorizer.pkl'),
):
    joblib.dump(artifact, filename)

print("✅ Model and Vectorizer saved successfully!")



✅ Model and Vectorizer saved successfully!
