# Simple AI Project - Data Exploration

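This notebook builds a small, hand-labelled sample of email messages for spam classification and walks through the project workflow end to end: feature creation, visualization, model training, and evaluation.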

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Set style
# Global plot appearance for every figure drawn later in the notebook:
# matplotlib's 'default' style sheet plus seaborn's "husl" colour palette.
# NOTE(review): presumably the palette has to be set after the style sheet so
# that the style call does not reset it -- confirm before reordering these lines.
plt.style.use('default')
sns.set_palette("husl")

## 1. Load and Explore Data

In [None]:
# Create sample data
# Each record pairs a message with its class code (1 = spam, 0 = legitimate);
# keeping text and label side by side makes mislabelled rows easy to spot.
sample_messages = [
    ('Free money now! Click here!', 1),
    ('Meeting tomorrow at 3 PM', 0),
    ('You won a prize! Claim now!', 1),
    ('Project update: Q4 results', 0),
    ('Limited time offer! Buy now!', 1),
    ('Team lunch this Friday', 0),
    ('Urgent: Your account is suspended', 1),
    ('Weekly report attached', 0),
    ('Exclusive deal for you!', 1),
    ('Code review meeting notes', 0),
]

# Unzip the records into the column-oriented mapping the DataFrame is built from
texts, labels = zip(*sample_messages)
data = {'text': list(texts), 'label': list(labels)}

df = pd.DataFrame(data)
print(f"Dataset shape: {df.shape}")
df.head()

## 2. Data Preprocessing

In [None]:
# Create features
# Four simple numeric features are derived from the raw message text and
# stored as new columns on `df`.

# Size features: number of characters and number of whitespace-separated tokens
df['length'] = df['text'].str.len()
df['word_count'] = df['text'].str.split().str.len()

# '!' is a literal character, not a pattern, so match it without the regex engine
df['has_exclamation'] = df['text'].str.contains('!', regex=False).astype(int)

# The keyword match must ignore case: the pattern is lowercase, but messages
# such as 'Free money now!' and 'Urgent: Your account is suspended' start with
# a capital letter and would otherwise be flagged 0.
df['has_urgent'] = df['text'].str.contains(
    'urgent|free|prize|offer|deal', case=False
).astype(int)

print("Features created:")
print(df[['length', 'word_count', 'has_exclamation', 'has_urgent']].describe())

## 3. Data Visualization

In [None]:
# Create visualizations
# A 2x2 grid: per-class histograms of the two size features on the top row,
# binary-flag totals and the class balance on the bottom row.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# Display names keyed by the numeric class code used in df['label']
class_names = {0: 'Legitimate', 1: 'Spam'}

# Top row: one overlaid histogram per class for each numeric feature
histogram_specs = [
    (axes[0, 0], 'length', 'Text Length Distribution', 'Length'),
    (axes[0, 1], 'word_count', 'Word Count Distribution', 'Word Count'),
]
for ax, column, title, xlabel in histogram_specs:
    for code, name in class_names.items():
        ax.hist(df.loc[df['label'] == code, column], alpha=0.7, label=name, bins=5)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    ax.legend()

# Feature distribution: how many messages set each binary flag.
# Named `flag_columns` so it does not shadow the model's feature list.
flag_columns = ['has_exclamation', 'has_urgent']
feature_counts = df[flag_columns].sum()
axes[1, 0].bar(feature_counts.index, feature_counts.values)
axes[1, 0].set_title('Feature Distribution')
axes[1, 0].set_ylabel('Count')

# Label distribution
# value_counts() orders its result by frequency, not by class code, so the
# wedge labels are derived from the (sorted) index instead of being hard-coded;
# a fixed ['Legitimate', 'Spam'] list can end up attached to the wrong wedges.
label_counts = df['label'].value_counts().sort_index().rename(index=class_names)
axes[1, 1].pie(label_counts.values, labels=label_counts.index, autopct='%1.1f%%')
axes[1, 1].set_title('Label Distribution')

plt.tight_layout()
plt.show()

## 4. Model Training

In [None]:
# Assemble the design matrix (engineered columns) and the target vector
features = ['length', 'word_count', 'has_exclamation', 'has_urgent']
X, y = df[features], df['label']

# Hold out 30% of the rows for evaluation; the split is seeded and stratified
# on the label column.
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Fit a seeded 100-tree random forest on the training partition
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict the held-out rows and score the predictions against the true labels
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Model Accuracy: {accuracy:.2%}")
print("\nClassification Report:")
print(
    classification_report(
        y_test,
        y_pred,
        target_names=['Legitimate', 'Spam'],
    )
)

## 5. Results Summary

In [None]:
# Recap the run: one bullet per headline number, using values computed in the
# earlier cells (dataset, feature list, accuracy, and the train/test partitions).
summary_lines = [
    f"   • Dataset size: {len(df)} samples",
    f"   • Features: {len(features)}",
    f"   • Model accuracy: {accuracy:.2%}",
    f"   • Training samples: {len(X_train)}",
    f"   • Test samples: {len(X_test)}",
]

print("📊 Project Summary:")
for summary_line in summary_lines:
    print(summary_line)

print("\n✅ Simple AI Project completed successfully!")