# Exploratory Data Analysis

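This notebook performs exploratory data analysis (EDA) on the insurance dataset: it checks for missing values, looks at the distribution of the premium amount, and examines how the premium relates to annual income, gender, the categorical features, and vehicle age.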

In [28]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def load_data(path='data/train.csv'):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file to read. Defaults to ``'data/train.csv'``.

    Returns
    -------
    pandas.DataFrame
        The contents of the file as parsed by ``pd.read_csv`` with its
        default options.
    """
    return pd.read_csv(path)

# Load the dataset from the default path; `data` is the frame every later cell reads from.
data = load_data()
# Show the first rows as the cell output for a quick look at the columns and values.
data.head()

### Missing Values by Feature

In [None]:
# Null count per column, restricted to the columns that have at least one null.
missing_values = data.isnull().sum()
missing_values = missing_values.loc[missing_values.gt(0)]

if missing_values.empty:
    print("No missing values found in the dataset.")
else:
    # One bar per affected column, bar height = number of nulls in that column.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=missing_values.index, y=missing_values.values, ax=ax)
    ax.set_title('Missing Values by Feature')
    # Tilt the column names so long labels do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45)
    ax.set_xlabel('Features')
    ax.set_ylabel('Count')
    fig.tight_layout()
    plt.show()

### Distribution of Premium Amount

In [None]:
# Histogram of the premium amounts with a kernel density estimate drawn on top.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data['Premium Amount'], kde=True, ax=ax)
ax.set(
    title='Distribution of Premium Amount',
    xlabel='Premium Amount ($)',
    ylabel='Frequency',
)
fig.tight_layout()
plt.show()

### Annual Income vs Premium Amount

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

# Semi-transparent points first, then a red regression line on the same axes
# (scatter=False so the points are not drawn a second time).
sns.scatterplot(data=data, x='Annual Income', y='Premium Amount', alpha=0.6, ax=ax)
sns.regplot(data=data, x='Annual Income', y='Premium Amount',
            scatter=False, color='red', ax=ax)

# Correlation between the two columns, shown in a boxed label near the
# top-left corner of the axes (coordinates are fractions of the axes).
corr = data['Annual Income'].corr(data['Premium Amount'])
ax.annotate(
    f'Correlation: {corr:.2f}',
    xy=(0.05, 0.95),
    xycoords='axes fraction',
    fontsize=12,
    bbox={'boxstyle': "round,pad=0.3", 'fc': "white", 'ec': "black", 'alpha': 0.8},
)

ax.set_title('Annual Income vs Premium Amount')
ax.set_xlabel('Annual Income ($)')
ax.set_ylabel('Premium Amount ($)')
fig.tight_layout()
plt.show()

### Average Premium by Gender

In [None]:
# Mean premium per gender, largest first, flattened back into a frame with
# 'Gender' and 'Premium Amount' columns for plotting.
avg_premium = (
    data.groupby('Gender')['Premium Amount']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)

fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=avg_premium, x='Gender', y='Premium Amount', ax=ax)
ax.set_title('Average Premium by Gender', pad=15, fontsize=12)
ax.set_xlabel('Gender', labelpad=10, fontsize=10)
ax.set_ylabel('Average Premium Amount ($)', labelpad=10, fontsize=10)

# Write each mean above its bar; the +50 offset is expressed in y-axis (data) units.
for bar_position, mean_premium in enumerate(avg_premium['Premium Amount']):
    ax.text(bar_position, mean_premium + 50, f'${mean_premium:.2f}',
            ha='center', fontsize=10)

# Extra headroom on the y-axis so the value labels have space above the bars.
ax.margins(y=0.15)
fig.tight_layout()
plt.show()

### Impact of Categorical Features on Premium

In [None]:
# Split the columns by dtype: numeric columns vs. object-dtype columns.
numerical_data = data.select_dtypes(include=['number'])
categorical_data = data.select_dtypes(include=['object'])


def _premium_spread(column):
    """Return max minus min of the mean 'Premium Amount' across the groups of ``column``."""
    group_means = data.groupby(column)['Premium Amount'].mean()
    return group_means.max() - group_means.min()


# "Impact" of an object column = spread of its per-category mean premium.
# 'Policy Start Date' is object-typed here but is skipped by name.
categorical_means = {
    column: _premium_spread(column)
    for column in categorical_data.columns
    if column != 'Policy Start Date'
}

# Rank the columns by spread and keep the five largest.
categorical_impact = pd.Series(categorical_means).sort_values(ascending=False)
top_5_categorical = categorical_impact.iloc[:5]

# Pie chart of the top-five spreads; each wedge is that column's share of
# the sum of the five values shown.
fig, ax = plt.subplots(figsize=(8, 6))
fig.suptitle('Impact of Categorical Features on Premium', fontsize=14, y=0.95)
colors = ['#ff9999', '#66b3ff', '#99ff99', '#ffcc99', '#ff99cc']
ax.pie(
    top_5_categorical.values,
    labels=top_5_categorical.index,
    autopct='%.0f%%',
    colors=colors,
    startangle=90,
)
ax.axis('equal')
plt.show()

### Average Premium Amount by Vehicle Age

In [None]:
# Mean premium for each distinct vehicle age, as a frame with
# 'Vehicle Age' and 'Premium Amount' columns.
age_premium = data.groupby('Vehicle Age')['Premium Amount'].mean().reset_index()

fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(age_premium['Vehicle Age'], age_premium['Premium Amount'],
        marker='o', linewidth=2, markersize=8)

ax.set_title('Average Premium Amount by Vehicle Age', pad=20, fontsize=14)
ax.set_xlabel('Vehicle Age (Years)', labelpad=15, fontsize=12)
ax.set_ylabel('Average Premium Amount ($)', labelpad=15, fontsize=12)

# Label every point with its rounded mean. The label is offset in screen
# points rather than data units: the y-limits autoscale to the range of the
# group means, so a fixed data-unit offset (previously +20) pushes the labels
# outside the axes whenever the means differ by less than that offset.
for vehicle_age, mean_premium in zip(age_premium['Vehicle Age'], age_premium['Premium Amount']):
    ax.annotate(
        f'${mean_premium:.0f}',
        xy=(vehicle_age, mean_premium),
        xytext=(0, 8),
        textcoords='offset points',
        ha='center',
        va='bottom',
        fontsize=11,
    )

ax.grid(True, linestyle='--', alpha=0.7)
# 10% headroom on the y-axis leaves space for the label above the highest point.
ax.margins(y=0.1)
fig.subplots_adjust(left=0.12, right=0.95, top=0.85, bottom=0.12)
plt.show()