# Data Download and Initial Exploration

This notebook downloads the Fake and Real News Dataset from Kaggle and performs initial exploration.

In [None]:
# Import required packages (they must already be installed in the kernel's environment)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import os
import kaggle

# Plot defaults for the whole notebook: apply the matplotlib style sheet
# first, then the seaborn palette (same order as before).
plt.style.use("seaborn-v0_8")
sns.set_palette(palette="husl")

## Step 1: Download Dataset from Kaggle

**Before running this cell, you need to:**
1. Create a Kaggle account at kaggle.com
2. Go to Account settings and create an API token
3. Download the kaggle.json file
4. Place it in the `~/.kaggle/` directory (create the directory if it doesn't exist)
5. Set permissions: `chmod 600 ~/.kaggle/kaggle.json`

In [None]:
# Download the dataset
!kaggle datasets download -d clmentbisaillon/fake-and-real-news-dataset -p ../data/ --unzip

print("Dataset downloaded successfully!")
print("Files in data directory:")
!ls -la ../data/

## Step 2: Load and Examine the Data

In [None]:
# Load the two source files (one CSV per class)
fake_df = pd.read_csv('../data/Fake.csv')
true_df = pd.read_csv('../data/True.csv')

print(f"Fake news articles: {len(fake_df)}")
print(f"True news articles: {len(true_df)}")

# Add the target label: 0 = fake, 1 = true
fake_df['label'] = 0  # Fake
true_df['label'] = 1  # True

# Stack both classes into one frame with a fresh 0..n-1 index
df = pd.concat([fake_df, true_df], ignore_index=True)
print(f"\nTotal articles: {len(df)}")

# Shuffle the rows (fixed seed for reproducibility) so the two classes are
# interleaved rather than stored as two contiguous halves
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

print("\nDataset info:")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() emitted an extra "None" line after the report.
df.info()

In [None]:
# Show the schema first, then preview the leading rows
column_names = list(df.columns)
print("Column names:")
print(column_names)

print("\nFirst few rows:")
df.head()

In [None]:
# Per-column count of null entries
null_counts = df.isna().sum()
print("Missing values:")
print(null_counts)

# Class balance: absolute counts plus the share of each label
label_counts = df['label'].value_counts()
fake_share = (df['label'] == 0).mean()
true_share = (df['label'] == 1).mean()
print("\nLabel distribution:")
print(label_counts)
print(f"\nPercentage - Fake: {fake_share:.1%}, True: {true_share:.1%}")

## Step 3: Basic Text Statistics

In [None]:
# Derive simple size features: character counts for title and body, and a
# whitespace-delimited token count for the body
length_columns = ['title_length', 'text_length', 'word_count']

df['title_length'] = df['title'].str.len()
df['text_length'] = df['text'].str.len()
body_tokens = df['text'].str.split()
df['word_count'] = body_tokens.str.len()

# Summary statistics for the three derived columns
length_summary = df[length_columns].describe()
print("Text Length Statistics:")
print(length_summary)

In [None]:
# Visualize text length distributions
#
# The class names are used directly as the hue so that seaborn builds each
# legend from the data. The previous version relabelled the legends with
# ax.legend(['Fake', 'True']), which assigns labels to artists purely by
# drawing order - not by hue level - and can therefore attach the wrong
# class name to a colour.
class_order = ['Fake', 'True']
label_names = df['label'].map({0: 'Fake', 1: 'True'})

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# One histogram panel per length feature, split by class
histogram_panels = [
    (axes[0, 0], 'title_length', 'Title Length Distribution'),
    (axes[0, 1], 'text_length', 'Text Length Distribution'),
    (axes[1, 0], 'word_count', 'Word Count Distribution'),
]
for ax, column, title in histogram_panels:
    sns.histplot(data=df, x=column, hue=label_names, hue_order=class_order,
                 bins=50, ax=ax)
    ax.set_title(title)

# Label distribution pie chart. value_counts() orders by frequency, so the
# wedge labels come from the (named) index rather than a hard-coded list that
# silently assumes 'Fake' is the more frequent class.
class_counts = label_names.value_counts().reindex(class_order, fill_value=0)
class_counts.plot(kind='pie', ax=axes[1, 1], autopct='%1.1f%%')
axes[1, 1].set_title('Label Distribution')

plt.tight_layout()
plt.show()

## Step 4: Subject Analysis

In [None]:
# Overview of the 'subject' column: number of distinct values and the most
# frequent ones
subject_counts = df['subject'].value_counts()
unique_subjects = df['subject'].nunique()

print("Subjects in the dataset:")
print(f"Total unique subjects: {unique_subjects}")
print("\nTop subjects:")
print(subject_counts.head(10))

In [None]:
# Article counts per (subject, label), one row per subject and one column per
# label; keep the ten subjects with the most articles overall
label_columns = [0, 1]
counts_by_subject = df.groupby(['subject', 'label']).size().unstack(fill_value=0)
counts_by_subject['total'] = counts_by_subject.sum(axis=1)
subject_label = counts_by_subject.sort_values('total', ascending=False).head(10)

# Absolute counts and the per-subject share of each label
top_counts = subject_label[label_columns]
subject_prop = top_counts.div(top_counts.sum(axis=1), axis=0)

# Left panel: stacked counts. Right panel: stacked proportions.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
bar_panels = [
    (axes[0], top_counts, 'Top 10 Subjects by Label', 'Count'),
    (axes[1], subject_prop, 'Top 10 Subjects by Proportion', 'Proportion'),
]
for ax, frame, title, ylabel in bar_panels:
    frame.plot(kind='bar', stacked=True, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Subject')
    ax.set_ylabel(ylabel)
    ax.legend(['Fake', 'True'])
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## Step 5: Word Clouds

In [None]:
# Side-by-side word clouds, one per class. Only the first 1000 articles of
# each class are used to keep generation fast.
cloud_options = dict(width=400, height=400, background_color='white')
fig, axes = plt.subplots(1, 2, figsize=(20, 8))
fake_ax, true_ax = axes

# Fake news (label 0)
fake_articles = df.loc[df['label'] == 0, 'text'].head(1000)
fake_text = ' '.join(fake_articles)
wordcloud_fake = WordCloud(**cloud_options).generate(fake_text)
fake_ax.imshow(wordcloud_fake, interpolation='bilinear')
fake_ax.set_title('Fake News Word Cloud', fontsize=16)
fake_ax.axis('off')

# True news (label 1)
true_articles = df.loc[df['label'] == 1, 'text'].head(1000)
true_text = ' '.join(true_articles)
wordcloud_true = WordCloud(**cloud_options).generate(true_text)
true_ax.imshow(wordcloud_true, interpolation='bilinear')
true_ax.set_title('True News Word Cloud', fontsize=16)
true_ax.axis('off')

plt.tight_layout()
plt.show()

## Step 6: Save Processed Dataset

In [None]:
# Output locations for the full dataset and the smaller test sample
combined_path = '../data/combined_news_dataset.csv'
sample_path = '../data/sample_news_dataset.csv'

# Write the full combined, shuffled frame (without the index column)
df.to_csv(combined_path, index=False)
print(f"Combined dataset saved with {len(df)} articles")

# Write a fixed-seed 5000-row random sample for quick experiments
sample_df = df.sample(n=5000, random_state=42)
sample_df.to_csv(sample_path, index=False)
print(f"Sample dataset saved with {len(sample_df)} articles for quick testing")

## Key Findings

1. **Dataset Size**: The dataset contains approximately 44,000 articles total
2. **Balance**: The dataset appears well-balanced between fake and true news
3. **Text Length**: Articles vary significantly in length, which may require preprocessing
4. **Subjects**: Multiple news subjects are represented, providing diversity
5. **Word Patterns**: Initial word clouds show different vocabulary patterns between fake and true news

## Next Steps
1. Implement text preprocessing pipeline
2. Build baseline models
3. Experiment with different NLP architectures