# US Accidents

Tujuan: Mengetahui pola kecelakaan di Amerika Serikat (US) serta faktor-faktor yang mungkin memengaruhi tingkat kecelakaannya.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Load Data

In [None]:
# Load the US Accidents dataset from the local CSV file and preview the first rows
csv_path = 'E:\\Career\\Data Science\\Dataset\\US Accidents\\US_Accidents_March23.csv'
df = pd.read_csv(csv_path)
df.head()

## Handle Data Type

In [None]:
# Check the data types and summary information of the dataset
df.info()

In [None]:
# Severity is the target variable: cast it to the categorical dtype,
# then show how many rows fall into each severity level
df['Severity'] = df['Severity'].astype('category')
df['Severity'].value_counts()

In [None]:
# Parse Start_Time and End_Time into datetime; the format is inferred per value
# (format='mixed') and anything that cannot be parsed becomes NaT (errors='coerce')
for time_col in ('Start_Time', 'End_Time'):
    df[time_col] = pd.to_datetime(df[time_col], format='mixed', errors='coerce')
df.iloc[3639775:3639776]  # Show one specific row to verify the conversion

In [None]:
# Convert Weather_Timestamp to datetime using the fixed '%Y-%m-%d %H:%M:%S' format;
# errors='coerce' turns every value that does not match this format into NaT.
# NOTE(review): Start_Time/End_Time are parsed with format='mixed' instead — confirm
# that no Weather_Timestamp value uses a different layout, otherwise it is silently
# converted to a missing value here.
df['Weather_Timestamp'] = pd.to_datetime(df['Weather_Timestamp'], format='%Y-%m-%d %H:%M:%S', errors='coerce')

In [None]:
# Other columns that hold categorical data: cast each one to the category dtype
categorical_columns = ['Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight']
for col in categorical_columns:
    df[col] = df[col].astype('category')

In [None]:
# Re-inspect the dataset summary to verify the dtype conversions
df.info()

## Handle Missing Data

In [None]:
# Count the missing values in every column
df.isna().sum()

In [None]:
# Share of missing values per column, in percent;
# display only the columns that have at least one missing value
missing_percentage = df.isna().mean() * 100
missing_percentage.loc[missing_percentage.gt(0)]

In [None]:
# Drop the 'End_Lat' and 'End_Lng' columns entirely.
# NOTE(review): presumably because a large share of their values is missing —
# confirm against the missing-percentage output.
# errors='ignore' keeps this cell re-runnable after the columns are already gone.
df = df.drop(columns=['End_Lat', 'End_Lng'], errors='ignore')

# For columns where fewer than `threshold` percent of the values are missing,
# drop the affected ROWS (the columns themselves are kept).
# Only columns that actually have missing values and still exist in df are used,
# so a column that was dropped above can never trigger a KeyError in dropna.
threshold = 5
has_few_missing = (missing_percentage > 0) & (missing_percentage < threshold)
cols_to_dropna = missing_percentage[has_few_missing].index.intersection(df.columns)
df = df.dropna(subset=cols_to_dropna)

# Categorical columns: label the remaining missing values as 'Unknown'.
# The category is added only when a column really has missing values and does not
# already contain it: add_categories raises ValueError for an existing category
# (e.g. when the cell is re-run), and an unused 'Unknown' level would otherwise
# be carried along into the categorical plots.
for col in df.select_dtypes(include='category').columns:
    if not df[col].isna().any():
        continue
    if 'Unknown' not in df[col].cat.categories:
        df[col] = df[col].cat.add_categories('Unknown')
    df[col] = df[col].fillna('Unknown')

# Numeric columns: impute the remaining missing values with the column median
for col in df.select_dtypes(include='number').columns:
    df[col] = df[col].fillna(df[col].median())

In [None]:
# Recompute the share of missing values per column (in percent)
# and display the columns that still have missing values
missing_percentage = df.isna().mean() * 100
missing_percentage.loc[missing_percentage.gt(0)]

In [None]:
# Inspect the dataset summary again after handling missing data
df.info()

## Visualize Data

In [None]:
# Distribution of the numeric variables: one box plot per column, each on its own
# subplot with independent axes.
# layout=(-1, 4) keeps 4 plots per row and lets pandas derive the number of rows
# from the number of plotted columns, so the grid no longer fails when more than
# 12 columns are plotted (a fixed 3x4 layout raises ValueError in that case).
df.plot(kind='box', figsize=(20, 10), subplots=True, layout=(-1, 4), sharex=False, sharey=False)

In [None]:
# Number of distinct values in each categorical column
df.select_dtypes(include='category').nunique()

In [None]:
# Distribution of the categorical variables: one count plot per category column
categorical_cols = df.select_dtypes(include='category').columns
for col in categorical_cols:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.countplot(data=df, x=col, ax=ax)
    ax.set_title(f'Distribusi {col}')
    plt.show()

In [None]:
# Monthly accident trend: derive the calendar month and year from Start_Time,
# count the accidents per (Year, Month) pair and draw the counts as a line
start_time = df['Start_Time']
df['Month'] = start_time.dt.month
df['Year'] = start_time.dt.year
monthly_counts = df.groupby(['Year', 'Month']).size()
monthly_counts.plot(kind='line')
plt.title('Jumlah Kecelakaan per Bulan')
plt.ylabel('Jumlah')
plt.show()

In [None]:
# Accident location map: scatter the start coordinates, colored by Severity
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=df, x='Start_Lng', y='Start_Lat', hue='Severity', alpha=0.6, ax=ax)
ax.set_title('Peta Lokasi Kecelakaan')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.legend(title='Severity')
plt.show()

In [None]:
# Correlation: pairwise correlation between the numeric columns, shown as an
# annotated heatmap
numeric_df = df.select_dtypes(include='number')
correlation_matrix = numeric_df.corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Matriks Korelasi')
plt.show()

In [None]:
# Compare every categorical feature against Severity: one count plot per column,
# with the bars split by severity level
for col in df.select_dtypes(include='category').columns:
    # Severity is itself a category column; plotting it against itself carries
    # no information, so it is skipped
    if col == 'Severity':
        continue
    plt.figure(figsize=(10, 5))
    sns.countplot(x=col, hue='Severity', data=df)
    plt.title(f'Perbandingan {col} berdasarkan Severity')
    plt.show()

## Insight dan Interpretasi

Tuliskan insight dari visualisasi yang telah dibuat.