# Exploratory Data Analysis (EDA) for Drug Prediction Model

This notebook contains exploratory data analysis for the drug prediction dataset. The goal is to understand the data, visualize relationships, and prepare for modeling.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the raw drug dataset; the relative path is resolved against the
# current working directory of the kernel.
raw_csv = '../data/raw/drug200.csv'
df = pd.read_csv(raw_csv)

# Peek at the top five rows to confirm the file parsed as expected
df.head(n=5)

In [None]:
# Summary statistics for the loaded DataFrame.
# NOTE: with pandas defaults, describe() reports only numeric columns when the
# frame has any; df.describe(include='all') would also summarize the
# non-numeric columns.
df.describe()

In [None]:
# Count missing entries per column (isna is the canonical spelling; isnull is
# its alias and returns the same result)
df.isna().sum()

In [None]:
# Bar chart of how many rows fall into each value of the 'Drug' column,
# drawn on an explicitly created Axes instead of pyplot's implicit state.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='Drug', ax=ax)
ax.set_title('Distribution of Drug Classes')
ax.set_xlabel('Drug Class')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Box plot of 'Age' grouped by 'Drug', drawn on an explicitly created Axes.
# Axis labels are left to seaborn, which takes them from the column names.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='Drug', y='Age', ax=ax)
ax.set_title('Age Distribution by Drug Class')
plt.show()

In [None]:
# Correlation heatmap
# Restrict to numeric columns before computing correlations: on pandas >= 2.0,
# DataFrame.corr() raises ValueError if any column holds non-numeric values
# (e.g. string categories), whereas older versions silently dropped them.
# select_dtypes gives the same result on both old and new pandas.
plt.figure(figsize=(12, 10))
numeric_df = df.select_dtypes(include='number')
correlation = numeric_df.corr()
sns.heatmap(correlation, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Save the dataset for further processing.
# NOTE: no cleaning step is applied in the cells visible here, so the file
# written is the DataFrame as loaded (the 'cleaned' filename is kept so that
# downstream code expecting this path keeps working).
from pathlib import Path

output_path = Path('../data/processed/drug200_cleaned.csv')
# to_csv does not create missing directories; make sure the target exists so
# the notebook runs on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)