# Project: Fake News Classification

# 1. Imports & Setup

In [None]:
%pip install pandas
import pandas as pd
import numpy as np
%pip install matplotlib
import matplotlib.pyplot as plt
%pip install seaborn
import seaborn as sns
import re
import string

%pip install scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix



# 2. Load Data

In [None]:
# Labelled articles used to fit the model, and the validation file that is
# scored in the final section.
df = pd.read_csv("dataset/data.csv")
val_df = pd.read_csv("dataset/validation_data.csv")

# A notebook cell only renders its *last* expression, so the first preview is
# printed explicitly; as a bare expression its result was silently discarded.
print(df.head())
val_df.head()
# Debug aid: list the distinct values found in the validation 'label' column.
# print(val_df['label'].unique())


# 3. Cleaning Up Data

In [23]:
# Keep only rows that are usable for training.

# One row per distinct article body (the first occurrence is kept).
df = df.drop_duplicates(subset=['text'])

# Both the body and its label must be present.
df = df.dropna(subset=['text', 'label'])

# Drop bodies that are empty or contain only whitespace.
# .copy() detaches the filtered result from the frame it was sliced from, so
# columns added to `df` afterwards are written to an independent frame
# (this avoids SettingWithCopyWarning on pandas versions without
# copy-on-write).
df = df[df['text'].str.strip() != ''].copy()

# 4. Preprocessing

In [24]:
def clean_text(text):
    """Normalise one raw document for bag-of-words vectorisation.

    Steps, in order: lowercase; drop ``[...]`` spans; drop URL-like tokens
    starting with ``http`` or ``www``; drop ``<...>`` tags; drop ASCII
    punctuation; turn newlines into spaces; drop every token that contains
    a digit.

    Args:
        text: The raw document. Anything that is not a ``str`` (for example
            ``None`` or the float NaN pandas uses for missing cells) is
            treated as an empty document.

    Returns:
        The cleaned string. Runs of spaces left behind by removed tokens are
        not collapsed.
    """
    if not isinstance(text, str):
        # Missing cells arrive as None/NaN and have no .lower(); treat them
        # as empty text rather than raising AttributeError.
        return ''

    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    # Replace (not delete) line breaks: deleting them would glue the last
    # word of one line to the first word of the next.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# One document per article: title followed by body, normalised with
# clean_text. The validation set is passed through clean_text before it is
# scored, so the training documents must be cleaned as well for the model to
# see the same kind of input at fit and predict time.
# A missing title is treated as empty; otherwise the string concatenation
# would yield NaN and the row's body text would be lost.
df['text_clean'] = (df['title'].fillna('') + " " + df['text']).apply(clean_text)


# 5. Data Exploration

In [None]:
# Print the column summary, then draw one bar per label value showing how many
# rows carry it.
df.info()
df['label'].value_counts().plot.bar(title='Class Balance')


# 6. Train/Test Split

In [26]:
# 'text_clean' already consists of the title followed by the body, so it is
# used as-is. Prepending the title a second time would count headline tokens
# twice in the training documents, while validation documents contain the
# title only once.
X = df['text_clean']
y = df['label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


# 7. Vectorization + Model Training

In [27]:
# TF-IDF features capped at 5000 terms. The vectorizer is fitted on the
# training split only and then applied unchanged to the held-out split.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec, X_test_vec = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)

# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression().fit(X_train_vec, y_train)

# Predicted labels for the held-out split, consumed by the evaluation cell.
y_pred = model.predict(X_test_vec)


# 8. Evaluation

In [None]:
# Text report of the held-out predictions, followed by the confusion matrix
# drawn as an annotated heatmap with integer cell labels.
print(classification_report(y_true=y_test, y_pred=y_pred))
sns.heatmap(
    data=confusion_matrix(y_true=y_test, y_pred=y_pred),
    annot=True,
    fmt='d',
)


# 9. Predict Validation Set

In [29]:
# Build one document per validation row: title followed by body, normalised
# with clean_text. Missing titles/bodies are replaced by empty strings first;
# otherwise the concatenation yields NaN and clean_text fails on a float.
val_docs = val_df['title'].fillna('') + " " + val_df['text'].fillna('')
val_df['text_clean'] = val_docs.apply(clean_text)

# Reuse the already-fitted vectorizer and model; nothing is re-fitted here.
X_val_vec = vectorizer.transform(val_df['text_clean'])
val_df['label'] = model.predict(X_val_vec)

# Save predictions keyed by the frame's row index.
# The guard keeps this cell safe to re-run: an unconditional in-place
# reset_index would insert a further index column on the next run and
# eventually fail once that column name is taken.
if 'index' not in val_df.columns:
    val_df.reset_index(inplace=True)
val_df[['index', 'label']].to_csv("predictions.csv", index=False)
