In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# helper for splitting data into train and test sets
from sklearn.model_selection import train_test_split

from scripts.processing import clean_data, pre_process_data
from scripts.tp_col_groups import TP_REGION_COLS, TP_INJURY_COLS, TP_TYPE_COLS

# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [None]:
# Load the raw training set and run it through the project's cleaning step.
raw_train_data = pd.read_csv('data/train.csv')
train_data = clean_data(raw_train_data)

# Drop high-value outliers: keep only rows whose 'Incurred' z-score is below
# the threshold. The filter is one-sided, so unusually low values are kept.
# Rows with a missing 'Incurred' fail the comparison and are dropped as well.
INCURRED_Z_MAX = 2
incurred_z = (train_data['Incurred'] - train_data['Incurred'].mean()) / train_data['Incurred'].std()

# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignments made in later cells do not raise SettingWithCopyWarning.
train_data = train_data[incurred_z < INCURRED_Z_MAX].copy()

In [None]:
# Exploration: heatmap of the pairwise correlations between numeric columns.
correlation_matrix = train_data.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(20, 12))
sns.heatmap(
    correlation_matrix,
    cmap='coolwarm',
    annot=False,
    fmt=".2f",
    ax=ax,
)
plt.show()

In [None]:
# Display the column labels currently present in train_data.
train_data.columns

In [None]:
# Add a 'Total' column holding the row-wise sum of the TP_REGION_COLS columns.
# NOTE(review): the previous comment referred to the TP_TYPE list, but the code
# sums TP_REGION_COLS — confirm which column group is intended.
train_data['Total'] = train_data[TP_REGION_COLS].sum(axis=1)

In [None]:
# Scatter of Incurred against Total, with the mean Incurred for each Total
# value drawn as a line on the same axes.
mean_incurred_by_total = train_data.groupby('Total')['Incurred'].mean()
ax = train_data.plot.scatter(x='Total', y='Incurred', alpha=0.5)
mean_incurred_by_total.plot(ax=ax)

In [None]:
# One figure per column listed in TP_TYPE_COLS: scatter of Incurred against the
# column, overlaid with the mean Incurred for each value of that column.
for col in TP_TYPE_COLS:
    mean_incurred = train_data.groupby(col)['Incurred'].mean()
    ax = train_data.plot.scatter(x=col, y='Incurred', alpha=0.5)
    mean_incurred.plot(ax=ax)

In [None]:
# One figure per column listed in TP_REGION_COLS: scatter of Incurred against
# the column, overlaid with the mean Incurred for each value of that column.
for col in TP_REGION_COLS:
    mean_incurred = train_data.groupby(col)['Incurred'].mean()
    ax = train_data.plot.scatter(x=col, y='Incurred', alpha=0.5)
    mean_incurred.plot(ax=ax)

In [None]:
# One figure per column listed in TP_INJURY_COLS: scatter of Incurred against
# the column, overlaid with the mean Incurred for each value of that column.
for col in TP_INJURY_COLS:
    mean_incurred = train_data.groupby(col)['Incurred'].mean()
    ax = train_data.plot.scatter(x=col, y='Incurred', alpha=0.5)
    mean_incurred.plot(ax=ax)

In [None]:
# Scatter of Incurred against Notification_period.
train_data.plot(kind='scatter', x='Notification_period', y='Incurred', alpha=0.5)

In [None]:
# Box plot of Incurred for each Location_of_incident category.
sns.catplot(
    data=train_data,
    x='Location_of_incident',
    y='Incurred',
    kind='box',
)

In [None]:
# Box plot of Incurred for each Weather_conditions category.
sns.catplot(
    data=train_data,
    x='Weather_conditions',
    y='Incurred',
    kind='box',
)

In [None]:
# New column 'is_night': 1 if Time_hour is greater than or equal to 20 or less
# than or equal to 5, otherwise 0.
# Uses a vectorized comparison instead of a per-row lambda. NaN hours compare
# False on both sides and therefore map to 0, the same result the lambda gave.
incident_hour = train_data['Time_hour']
train_data['is_night'] = ((incident_hour >= 20) | (incident_hour <= 5)).astype(int)

In [None]:
# Preview the first rows (including the newly added columns) rather than
# rendering the whole frame.
train_data.head()

In [None]:
# Box plot of Incurred split by the is_night flag.
sns.catplot(
    data=train_data,
    x='is_night',
    y='Incurred',
    kind='box',
)

In [None]:
# Box plot of Incurred for each Incident_details_present value.
sns.catplot(
    data=train_data,
    x='Incident_details_present',
    y='Incurred',
    kind='box',
)

In [None]:
# Scatter of Incurred against days_since_loss.
train_data.plot(kind='scatter', x='days_since_loss', y='Incurred', alpha=0.5)

# Print the correlation between Incurred and days_since_loss
# (Series.corr with its default method).
incurred_days_corr = train_data['Incurred'].corr(train_data['days_since_loss'])
print(incurred_days_corr)

In [None]:
# Scatter of Incurred against TP_region_northw, with the mean Incurred for each
# TP_region_northw value drawn as a line on the same axes.
mean_incurred_by_northw = train_data.groupby('TP_region_northw')['Incurred'].mean()
ax = train_data.plot.scatter(x='TP_region_northw', y='Incurred', alpha=0.5)
mean_incurred_by_northw.plot(ax=ax)

In [None]:
# Scatter of Incurred against TP_injury_whiplash, with the mean Incurred for
# each TP_injury_whiplash value drawn as a line on the same axes.
mean_incurred_by_whiplash = train_data.groupby('TP_injury_whiplash')['Incurred'].mean()
ax = train_data.plot.scatter(x='TP_injury_whiplash', y='Incurred', alpha=0.5)
mean_incurred_by_whiplash.plot(ax=ax)

In [None]:
# Scatter of Incurred against TP_injury_fatality, with the mean Incurred for
# each TP_injury_fatality value drawn as a line on the same axes.
mean_incurred_by_fatality = train_data.groupby('TP_injury_fatality')['Incurred'].mean()
ax = train_data.plot.scatter(x='TP_injury_fatality', y='Incurred', alpha=0.5)
mean_incurred_by_fatality.plot(ax=ax)

In [None]:
# Scatter of Incurred against Time_hour.
ax = train_data.plot.scatter(x='Time_hour', y='Incurred', alpha=0.5)

# Overlay the mean Incurred per hour on the same axes.
mean_incurred_by_hour = train_data.groupby('Time_hour')['Incurred'].mean()
mean_incurred_by_hour.plot(ax=ax)

In [None]:
# Mean Incurred for each Time_hour value, displayed as a table.
(
    train_data
    .groupby('Time_hour')['Incurred']
    .mean()
)