# Data Exploration and Preparation

This notebook covers:
- Generating a synthetic dataset
- Exploratory data analysis
- Feature engineering
- Uploading data to S3

In [None]:
# Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import boto3
import sagemaker
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Resolve the SageMaker context shared by the rest of the notebook:
# the session, its default bucket, the execution role, and the region.
# NOTE(review): get_execution_role() presumably requires running with a
# SageMaker-provided role (e.g. inside a SageMaker notebook) — confirm before
# running this locally.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = sess.boto_region_name

# Echo the resolved context so the reader can verify where data will land.
print(
    f'SageMaker bucket: {bucket}',
    f'Region: {region}',
    f'Role: {role}',
    sep='\n',
)

In [None]:
# Generate dataset
# Synthetic binary-classification data: 1000 rows and 20 features, of which
# 15 are informative and 5 are redundant. `weights` requests a ~70/30 class
# split and `flip_y` injects label noise, so the realized class proportions
# printed below will not be exactly 0.7 / 0.3. The fixed random_state keeps
# the dataset reproducible across runs.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    n_classes=2,
    weights=[0.7, 0.3],
    flip_y=0.05,
    random_state=42
)

# Derive the column names from the generated matrix instead of repeating the
# literal feature count, so editing n_features above cannot leave the names
# out of sync with X (which would make the DataFrame constructor fail).
feature_names = [f'feature_{i}' for i in range(X.shape[1])]
df = pd.DataFrame(X, columns=feature_names)
df['target'] = y

print(f'Dataset shape: {df.shape}')
print(f'Class distribution:\n{df["target"].value_counts(normalize=True)}')

In [None]:
# Basic statistics
# Summary statistics for every column of df (the generated features plus
# `target`). Kept as the cell's last expression so the notebook renders the
# result as a table rather than plain text.
df.describe()

In [None]:
# Class distribution plot
# Bar chart of the number of rows per target class, drawn on an explicit
# Axes so every styling call targets the same plot.
fig, ax = plt.subplots(figsize=(8, 5))
class_counts = df['target'].value_counts()
class_counts.plot(kind='bar', ax=ax, rot=0)  # rot=0 keeps class labels horizontal
ax.set_title('Class Distribution')
ax.set_xlabel('Class')
ax.set_ylabel('Count')
fig.tight_layout()
plt.show()

In [None]:
# Feature distributions
# Histograms of the first six features, one per panel of a 2x3 grid.
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
axes = axes.ravel()  # flatten the 2x3 grid so panels can be walked in order

for ax, col in zip(axes, feature_names[:6]):
    ax.hist(df[col], bins=30, edgecolor='black', alpha=0.7)
    ax.set(title=f'{col} Distribution', xlabel=col, ylabel='Frequency')

fig.tight_layout()
plt.show()

In [None]:
# Correlation heatmap
# Pairwise correlations among the first ten features only, which keeps the
# annotated grid readable.
corr_matrix = df[feature_names[:10]].corr()

corr_fig, corr_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,  # centre the diverging colormap on zero correlation
    ax=corr_ax,
)
corr_ax.set_title('Feature Correlation Matrix')
corr_fig.tight_layout()
plt.show()

In [None]:
# Feature engineering
# Adds four derived columns to df in place: an interaction term, a squared
# term, and the row-wise sum and mean of the first five features.
leading = df[feature_names[:5]]  # feature_0 .. feature_4

df['feature_0_x_1'] = leading['feature_0'] * leading['feature_1']
df['feature_2_squared'] = leading['feature_2'] ** 2
# NOTE(review): sum and mean are taken over the same five columns, so for
# complete rows feature_mean is just feature_sum / 5 — confirm both are wanted.
df['feature_sum'] = leading.sum(axis=1)
df['feature_mean'] = leading.mean(axis=1)

print('Added engineered features')

In [None]:
# Train/test split
# Hold out 20% of the rows for testing. Stratifying on the target column
# keeps the class balance comparable across both splits, and the fixed seed
# makes the split reproducible.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['target'],
    random_state=42,
)

print(
    f'Training set: {train_df.shape}',
    f'Test set: {test_df.shape}',
    sep='\n',
)

In [None]:
# Upload to S3
# Both splits are written as CSV files (header row, no index column) in the
# working directory and then uploaded under the same key prefix in `bucket`.
prefix = 'classification-demo'
data_key_prefix = f'{prefix}/data'

train_df.to_csv('train.csv', index=False)
test_df.to_csv('test.csv', index=False)

# The values returned by upload_data are kept so they can be printed here and
# reused later; presumably they are the S3 locations of the uploaded files —
# confirm against the SageMaker SDK docs.
train_s3_path = sess.upload_data('train.csv', bucket=bucket, key_prefix=data_key_prefix)
test_s3_path = sess.upload_data('test.csv', bucket=bucket, key_prefix=data_key_prefix)

print(
    f'Training data: {train_s3_path}',
    f'Test data: {test_s3_path}',
    sep='\n',
)

In [None]:
# Save for next notebook
# %store is an IPython line magic that persists the named variable outside the
# running kernel; another notebook can restore the values with `%store -r`.
# Persisted here: the two uploaded data locations, the bucket name, and the
# execution role.
%store train_s3_path
%store test_s3_path
%store bucket
%store role