## Summary

✅ **LIAR Dataset Overview:**
- **Total Claims**: ~12,836 labeled claims
- **Labels**: 6 categories (true, mostly-true, half-true, false, barely-true, pants-fire)
- **Features**: Speaker, context, job title, party affiliation, and counts of different labels
- **Task**: Convert 6 labels → 3 labels (REAL, FAKE, NOT_ENOUGH_INFO)

**Next Step**: Run Notebook 02 for Data Preprocessing & Label Conversion 🚀

In [1]:
# Display sample claims by label
# Reads `train_df`, which is assigned by the dataset-loading cell; that cell
# has to run before this one.
print("=" * 60)
print("SAMPLE CLAIMS BY LABEL")
print("=" * 60)

PREVIEW_CHARS = 100  # longest claim excerpt printed per label

# One row per label. drop_duplicates keeps the first occurrence of each label,
# in first-seen order, so the frame is no longer re-filtered twice per label.
# Rows without a label are removed first: NaN has no .upper() and used to
# crash the loop.
first_rows = (
    train_df
    .dropna(subset=['label'])
    .drop_duplicates(subset='label')
    .loc[:, ['label', 'claim', 'speaker']]
)

for label, claim, speaker in first_rows.itertuples(index=False, name=None):
    print(f"\n📌 Label: {str(label).upper()}")
    if pd.isna(claim):
        # A missing claim cannot be sliced; say so instead of raising.
        preview = "<missing claim>"
    else:
        claim = str(claim)
        # Mark the excerpt as truncated only when text was actually cut off.
        preview = claim[:PREVIEW_CHARS] + ("..." if len(claim) > PREVIEW_CHARS else "")
    print(f"   {preview}")
    print(f"   Speaker: {speaker}")

print("\n✅ Sample exploration complete!")

SAMPLE CLAIMS BY LABEL


NameError: name 'train_df' is not defined

## Step 6: Sample Claims Exploration

In [None]:
# Missing-value report: per-column null counts for every split, followed by a
# horizontal bar chart of the training-set columns that contain nulls.
banner = "=" * 60
print(banner)
print("MISSING VALUES ANALYSIS")
print(banner)

# Same report for each split, driven by a (heading, frame) table
split_reports = (
    ("\nüîç Train set missing values:", train_df),
    ("\nüîç Test set missing values:", test_df),
    ("\nüîç Validation set missing values:", val_df),
)
for heading, split_frame in split_reports:
    print(heading)
    print(split_frame.isnull().sum())

# Visualize missing values (training set only)
fig, ax = plt.subplots(figsize=(10, 4))
train_nulls = train_df.isnull().sum().loc[lambda counts: counts > 0]

if train_nulls.empty:
    # Nothing to chart: show a centred message on a unit-sized axes instead
    ax.text(0.5, 0.5, 'No missing values!', ha='center', va='center', fontsize=14)
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
else:
    train_nulls.plot(kind='barh', color='red', ax=ax)
    ax.set_title('Missing Values in Training Set')

plt.tight_layout()
plt.show()

print("‚úÖ Missing values analysis complete!")

## Step 5: Missing Values Analysis

In [None]:
# Text statistics: add character/word counts for every claim, summarise them
# for the training set and plot both distributions side by side.
rule = "=" * 60
print(rule)
print("TEXT STATISTICS")
print(rule)

# Add the two length columns to each split (the frames are modified in place)
for split_frame in (train_df, test_df, val_df):
    claim_text = split_frame['claim']
    split_frame['claim_length'] = claim_text.str.len()
    split_frame['claim_word_count'] = claim_text.str.split().str.len()

# describe() summary of each new column, training set only
summaries = (
    ("\nüìù Claim Length Statistics (Training set):", 'claim_length'),
    ("\nüìù Claim Word Count Statistics (Training set):", 'claim_word_count'),
)
for heading, column in summaries:
    print(heading)
    print(train_df[column].describe())

# One histogram panel per column: (column, bar colour, title, x-axis label)
fig, axes = plt.subplots(1, 2, figsize=(14, 4))
panels = (
    ('claim_length', 'coral', 'Distribution of Claim Length (characters)', 'Length'),
    ('claim_word_count', 'lightgreen', 'Distribution of Claim Word Count', 'Word Count'),
)
for panel_ax, (column, bar_color, panel_title, x_label) in zip(axes, panels):
    panel_ax.hist(train_df[column], bins=50, color=bar_color, edgecolor='black')
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

print("‚úÖ Text statistics analyzed!")

## Step 4: Analyze Text Statistics

In [None]:
# Label Distribution
# Prints the label counts of every split and draws one bar chart per split.
print("=" * 60)
print("LABEL DISTRIBUTION")
print("=" * 60)

# Shared category order for the charts below.
labels = ['mostly-true', 'true', 'half-true', 'false', 'barely-true', 'pants-fire']

splits = [(train_df, 'Train'), (test_df, 'Test'), (val_df, 'Validation')]

for split_frame, split_name in splits:
    print(f"\n📊 {split_name} set labels:")
    print(split_frame['label'].value_counts().sort_index())

# Plot label distribution
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for panel_ax, (split_frame, split_name) in zip(axes, splits):
    label_counts = split_frame['label'].value_counts()
    # value_counts() orders by frequency, which differs from split to split.
    # Re-ordering to the shared `labels` list puts every panel on the same
    # x-axis so the three charts can be compared. A label absent from a split
    # is drawn as 0. Labels outside the expected six are appended at the end
    # so they stay visible rather than being dropped.
    unexpected = [value for value in label_counts.index if value not in labels]
    label_counts = label_counts.reindex(labels + unexpected, fill_value=0)

    panel_ax.bar(label_counts.index, label_counts.values, color='steelblue')
    panel_ax.set_title(f'{split_name} Set Label Distribution')
    panel_ax.set_xlabel('Label')
    panel_ax.set_ylabel('Count')
    panel_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

print("✅ Label distribution visualized!")

## Step 3: Analyze Label Distribution

In [None]:
# Display basic information about the three splits:
# shapes, combined row count, column summary and the first training rows.
print("=" * 60)
print("DATASET OVERVIEW")
print("=" * 60)
print(f"\n📊 Train set shape: {train_df.shape}")
print(f"📊 Test set shape: {test_df.shape}")
print(f"📊 Validation set shape: {val_df.shape}")

total_samples = len(train_df) + len(test_df) + len(val_df)
print(f"\n📈 Total samples: {total_samples:,}")

# Display column info
print("\n📋 Column Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() appended a stray "None" line after the report.
train_df.info()

# Display first few rows
print("\n🔍 First 3 rows of training data:")
print(train_df.head(3))

In [None]:
# Load the three LIAR splits from ./data into train_df / test_df / val_df.
import csv  # stdlib; provides the QUOTE_NONE constant used below

# List available files in data directory (sorted so the listing is stable)
print("📁 Files in data directory:")
for filename in sorted(os.listdir('./data')):
    print(f"  - {filename}")

# LIAR dataset column names.
# Order follows the README of the original LIAR release: 14 tab-separated
# fields with no header row - statement id, label, statement, subject(s),
# speaker, job title, state, party, five credit-history counts, context.
# The previous 13-name list omitted the subject field and listed the claim
# before the label, so the names did not line up with the file's fields.
# NOTE(review): confirm the archive fetched by the download cell uses this
# standard layout (LIAR-PLUS style files carry extra columns).
columns = ['claim_id', 'label', 'claim', 'subject', 'speaker',
           'speaker_job_title', 'state_info', 'party_affiliation',
           'barely_true_counts', 'false_counts', 'half_true_counts',
           'mostly_true_counts', 'pants_on_fire_counts', 'context']

read_options = dict(
    sep='\t',
    header=None,
    names=columns,
    # The files are plain TSV with no CSV-style quoting. With the default
    # quoting, a stray double quote inside a claim makes the parser merge the
    # following lines into one field, silently losing rows.
    quoting=csv.QUOTE_NONE,
    # Malformed rows are still skipped, but now reported instead of vanishing.
    on_bad_lines='warn',
)

# Load train, test, validation sets
# NOTE(review): the original LIAR release names its validation file
# 'valid.tsv' - confirm 'val.tsv' matches the downloaded archive.
train_df = pd.read_csv('./data/train.tsv', **read_options)
test_df = pd.read_csv('./data/test.tsv', **read_options)
val_df = pd.read_csv('./data/val.tsv', **read_options)

print("✅ Datasets loaded successfully!")

## Step 2: Load and Explore Dataset Structure

In [None]:
# For Colab: Download LIAR dataset from Kaggle
import os
import subprocess

# Create data directory if it doesn't exist
os.makedirs('./data', exist_ok=True)

# Download LIAR dataset (requires Kaggle API setup)
# NOTE(review): the dataset slug is kept from the original cell - confirm it
# resolves on Kaggle.
download_cmd = ['kaggle', 'datasets', 'download', '-d', 'liar-dataset/liar-plus',
                '--unzip', '-p', './data']

try:
    # Capture the CLI output so it can be shown in the notebook, and keep the
    # exit status: the previous os.system() call discarded it, so the success
    # message was printed even when the download had failed.
    result = subprocess.run(download_cmd, capture_output=True, text=True, check=False)
except FileNotFoundError:
    # Raised when the `kaggle` executable is not on PATH.
    result = None
    print("❌ The 'kaggle' command was not found. Install the kaggle package first.")

if result is not None:
    cli_output = (result.stdout + result.stderr).strip()
    if cli_output:
        print(cli_output)

if result is not None and result.returncode == 0:
    print("✅ Dataset downloaded to ./data/")
else:
    # Report only, no exception: the files may already have been placed in
    # ./data by hand, in which case the cells that read them still work.
    print("❌ Dataset download failed - check your Kaggle credentials and the dataset name.")

## Step 1: Download LIAR Dataset

**Note for Colab**: Follow these steps:
1. Go to https://www.kaggle.com/
2. Create/login to account
3. Go to Settings → API → "Create New API Token"
4. Upload kaggle.json to Colab (or use !mkdir ~/.kaggle and paste credentials)
5. Run cells below to download dataset

In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from collections import Counter

# Seed NumPy's global random generator so random draws repeat from run to run
SEED = 42
np.random.seed(SEED)

print("‚úÖ All libraries imported successfully!")

In [None]:
# Install required packages
# Each package that cannot be located in the running kernel's environment is
# installed with pip for that same interpreter.
import importlib.util
import subprocess
import sys

packages = ['pandas', 'numpy', 'matplotlib', 'seaborn', 'kaggle']

for package in packages:
    # find_spec() only locates the package; it does not execute it. The
    # previous __import__() check ran each package's import-time code, which
    # can fail for reasons other than the package being absent (the kaggle
    # client, for example, appears to check credentials on import) and such
    # errors are not ImportError, so they crashed the cell.
    if importlib.util.find_spec(package) is not None:
        print(f"✅ {package} already installed")
    else:
        print(f"📦 Installing {package}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        # Let the import system notice the freshly installed package.
        importlib.invalidate_caches()
        print(f"✅ {package} installed")

# 01 - LIAR Dataset Exploration
## Fake News Detection - Data Acquisition & EDA

This notebook downloads and explores the LIAR dataset for fake news detection.