In [None]:
!pip install nltk




In [2]:
from google.colab import files
uploaded = files.upload()


Saving Fake.csv to Fake.csv
Saving True.csv to True.csv


In [3]:
import pandas as pd

# Read CSVs
fake_df = pd.read_csv("Fake.csv")
real_df = pd.read_csv("True.csv")

# Add labels
fake_df["label"] = 0  # Fake
real_df["label"] = 1  # Real

# Combine
data = pd.concat([fake_df, real_df], axis=0)
data = data.sample(frac=1).reset_index(drop=True)

# Keep only text and label
data = data[["text", "label"]]
data.head()


Unnamed: 0,text,label
0,This is hypocrisy of the highest order and pro...,0
1,In a stunning example of just how morally bank...,0
2,The Obama s just added millions onto the taxpa...,0
3,Anti-Trumper and faux conservative Jedediah Bi...,0
4,NEW YORK (Reuters) - Seventeen congressional R...,1


In [4]:
import re
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
def clean_text(text):
    text = text.lower()  # lowercase
    text = re.sub('\[.*?\]', '', text)  # remove [text]
    text = re.sub('\\W', ' ', text)  # remove non-word chars
    text = re.sub('https?://\S+|www\.\S+', '', text)  # remove URLs
    text = re.sub('<.*?>+', '', text)  # remove HTML tags
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # remove punctuation
    text = re.sub('\n', '', text)  # remove newlines
    text = re.sub('\w*\d\w*', '', text)  # remove digits
    return text


In [6]:
data["text"] = data["text"].apply(clean_text)
data.head()


Unnamed: 0,text,label
0,this is hypocrisy of the highest order and pro...,0
1,in a stunning example of just how morally bank...,0
2,the obama s just added millions onto the taxpa...,0
3,anti trumper and faux conservative jedediah bi...,0
4,new york reuters seventeen congressional r...,1


In [7]:
from sklearn.model_selection import train_test_split

X = data["text"]
y = data["label"]

# 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TF-IDF object
tfidf = TfidfVectorizer(stop_words='english', max_df=0.7)

# Fit and transform
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [9]:
from sklearn.linear_model import PassiveAggressiveClassifier

# Create and train the model
model = PassiveAggressiveClassifier(max_iter=1000)
model.fit(X_train_tfidf, y_train)


In [10]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predict
y_pred = model.predict(X_test_tfidf)

# Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

# Confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Full report
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9933184855233853
Confusion Matrix:
 [[4636   39]
 [  21 4284]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99      4675
           1       0.99      1.00      0.99      4305

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [11]:
import pickle

# Save model
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Save vectorizer
with open('tfidf.pkl', 'wb') as f:
    pickle.dump(tfidf, f)


In [12]:
from google.colab import files

# Download both files
files.download('model.pkl')
files.download('tfidf.pkl')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>