# Data Exploration

This notebook explores the synthetic dataset generated for predicting youth classification. It loads the data, reviews summary statistics, and uses visualizations (class distribution, age by class, and a correlation heatmap) to understand the relationships between features.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the synthetic dataset; the path is resolved relative to the
# notebook's working directory.
data = pd.read_csv(filepath_or_buffer='../data/synthetic_data.csv')

# Preview the first five rows as a quick sanity check on the load
data.head(n=5)

In [2]:
# Descriptive statistics for every column (include='all' covers
# non-numeric columns as well as numeric ones)
summary_stats = data.describe(include='all')
summary_stats

In [3]:
# Number of rows per youth classification label
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=data, x='youthClassification', ax=ax)
ax.set_title('Distribution of Youth Classification')
ax.set_xlabel('Youth Classification')
ax.set_ylabel('Count')
plt.show()

In [4]:
# Compare the spread of ages across the youth classification groups
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=data, x='youthClassification', y='age', ax=ax)
ax.set(
    title='Age Distribution by Youth Classification',
    xlabel='Youth Classification',
    ylabel='Age',
)
plt.show()

In [5]:
# Correlation heatmap
#
# Correlation is only defined for numeric data, so restrict the frame to
# numeric/boolean columns before calling .corr(). Since pandas 2.0,
# DataFrame.corr() defaults to numeric_only=False and raises a ValueError if
# the frame still contains non-numeric columns (e.g. string labels); older
# versions silently dropped them. Selecting the columns explicitly gives the
# same result on every pandas version.
numeric_features = data.select_dtypes(include=['number', 'bool'])
correlation = numeric_features.corr()

plt.figure(figsize=(10, 8))
# annot + fmt print each coefficient, rounded to two decimals, in its cell
sns.heatmap(correlation, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()