## Import 

### Import Libraries

In [122]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# setting seaborn style
sns.set()

from sklearn.model_selection import train_test_split

# Notebook-wide pandas display settings.
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_colwidth', None)  # Do not truncate long cell contents


## Import library that enables colours in Markdowns
from IPython.display import Markdown


### Import Datasets

In [123]:
train_data = pd.read_csv("train_data.csv", low_memory=False)
test_data = pd.read_csv("test_data.csv")

## 1. Dataset overview

### 1.1 Descriptive analysis

In [None]:
train_data.head()


In [None]:
train_data.tail()

In [None]:
test_data.head()

In [None]:
test_data.tail()

In [None]:
test_data.columns.values

In [129]:
# set "Claim Identifier" as the index for the train and test datasets
train_data = train_data.set_index("Claim Identifier")
test_data = test_data.set_index("Claim Identifier")

In [None]:
train_data.describe(include="all").T

In [None]:
train_data.info()

## DATA TYPES:
# Convert accident date (to date type)
# convert age (to int)
# Alternative dispute resolution (??)
# Assembly date (to date type)
# birth year (to int)
# C2 date (to date type)
# C3 date (to date type)
# First hearing date (to date)
# IME-4 count (to int)
# Industry code (object)
# OIICS Nature of Injury Description (to object) - all nulls (relevant??)
# WCIO cause of injury code (float to object)
# WCIO Nature of Injury Code (float to object)
# WCIO Part Of Body Code (float to object)
# Agreement reached (??) - boolean/ int(???)
# Number of dependents (to int)

## 2. Data Cleaning

### 2.1 Duplicates

In [None]:
# Count the rows that are exact copies (all columns equal) of an earlier row.
duplicates_sum = train_data.duplicated().sum()
print(f"There are {duplicates_sum} duplicated rows in the whole training data set. \nThose duplicates are dropped in the cell below.")

In [133]:
train_data.drop_duplicates(inplace=True)

### 2.2 Target variable - Claim Injury Type Missing & Unique values

In [None]:
train_data["Claim Injury Type"].unique()

In [None]:
missing_data_target = train_data[train_data['Claim Injury Type'].isna()]
missing_data_target.head()

In [None]:
missing_data_target.info()
# all features with missing values on the target, are also missing, except Assembly Date -> not relevant (decision -> drop rows)

In [137]:
train_data= train_data.drop(missing_data_target.index)

Re-check the unique values in the target variable

In [None]:
train_data["Claim Injury Type"].unique().tolist()

### 2.3 All independent variables - Missing & Unique Values

##### a) Checking the number of missing values in each variable in the train_data (number and percentage)

In [None]:
missing_values = train_data.isnull().sum().sort_values(ascending=False)
print(missing_values)

In [None]:
missing_percentage = ((train_data.isnull().sum() / len(train_data)) * 100).sort_values(ascending=False)
print(missing_percentage)

## Visual Exploration of Variables Before Preprocessing the Data

## Define Metric and Non Metric Variables

In [141]:
# Column names split by measurement type, used for the exploratory plots below.

# Columns treated as numeric (metric) variables.
metric_features = [
    "Age at Injury",
    "Average Weekly Wage",
    "Birth Year",
    "IME-4 Count",
    "Number of Dependents",
]

# Columns treated as non-metric variables (codes, dates, categories, descriptions).
non_metric_features = [
    "OIICS Nature of Injury Description",
    "Industry Code",
    "WCIO Cause of Injury Code",
    "WCIO Nature of Injury Code",
    "WCIO Part Of Body Code",
    "Accident Date",
    "Alternative Dispute Resolution",
    "Assembly Date",
    "Attorney/Representative",
    "C-2 Date",
    "C-3 Date",
    "Carrier Name",
    "Carrier Type",
    "County of Injury",
    "COVID-19 Indicator",
    "District Name",
    "First Hearing Date",
    "Gender",
    "Industry Code Description",
    "Medical Fee Region",
    "WCIO Cause of Injury Description",
    "WCIO Nature of Injury Description",
    "WCIO Part Of Body Description",
    "Zip Code",
    "Agreement Reached",
    "WCB Decision",
]


### Metric Variables

### Histograms

In [None]:
train_data['Age at Injury'].plot(
    kind='hist',
    color='bisque', 
    edgecolor='black',
    bins=20
)

plt.title('Histogram of Age at Injury')
plt.xlabel('Age at Injury')
plt.ylabel('Frequency') 
plt.show()


In [None]:
train_data['Average Weekly Wage'].plot(
    kind='hist',
    color='bisque',
    edgecolor='black',
    bins=50
)

plt.title('Histogram of Average Weekly Wage')
plt.xlabel('Average Weekly Wage')
plt.ylabel('Log10(Frequency)')
plt.yscale('log')  
plt.show()   

In [None]:
train_data['IME-4 Count'].plot(
    kind='hist',
    color='bisque', 
    edgecolor='black',
    bins=20
)

plt.title('Histogram of IME-4 Count')
plt.xlabel('IME-4 Count')
plt.ylabel('Log10(Frequency)')
plt.yscale('log')  
plt.show()   

In [None]:
# "Number of Dependents" is a discrete count, so draw one unit-wide bin per integer value
# (edges at k - 0.5 and k + 0.5) instead of 5 arbitrary bins that merge neighbouring counts.
dependents = train_data['Number of Dependents'].dropna()
dependents_bins = np.arange(dependents.min() - 0.5, dependents.max() + 1.5, 1)

train_data['Number of Dependents'].plot(
    kind='hist',
    color='bisque', 
    edgecolor='black',
    bins=dependents_bins
)

plt.title('Histogram of Number of Dependents')
plt.xlabel('Number of Dependents')
plt.ylabel('Frequency') 
plt.show()

In [None]:
train_data['Birth Year'].plot(
    kind='hist',
    color='bisque', 
    edgecolor='black',
    bins=5
)

plt.title('Histogram of Birth Year')
plt.xlabel('Birth Year')
plt.ylabel('Frequency') 
plt.show()

In [147]:
def plot_filtered_histogram(data, column_name, min_value, max_value, bins=20):
    """Plot a histogram of one column, restricted to a closed value range.

    Args:
        data: DataFrame that contains ``column_name``.
        column_name: Name of the column to plot.
        min_value: Lower bound of the range (inclusive).
        max_value: Upper bound of the range (inclusive).
        bins: Number of bins (or bin edges) passed on to ``plt.hist``.
    """
    # Use the DataFrame passed by the caller; previously the global `train_data`
    # was read and the `data` argument was silently ignored.
    in_range = (data[column_name] >= min_value) & (data[column_name] <= max_value)
    filtered_data = data[in_range][column_name].dropna()

    # Histogram
    plt.figure(figsize=(10, 6))
    plt.hist(filtered_data, bins=bins, color='bisque', edgecolor='black')
    plt.title(f'Histogram of {column_name} (range: {min_value} to {max_value})')
    plt.xlabel(column_name)
    plt.ylabel('Frequency')
    plt.grid(axis='y', alpha=0.75)
    plt.show()


In [None]:
plot_filtered_histogram(train_data, column_name="Birth Year", min_value=1920, max_value=2024, bins=20)

## Non Metric Variables

### Histograms

In [None]:
# One horizontal bar chart per non-metric feature, showing its 10 most frequent values.
for feature_name in non_metric_features:
    # value_counts() sorts by frequency (descending), so the first 10 entries are the top 10.
    most_frequent = train_data[feature_name].value_counts().iloc[:10]
    category_labels = most_frequent.index
    category_counts = most_frequent.values

    plt.figure(figsize=(8, 4))
    # hue is set to the categories (with the legend hidden) so every bar gets its own colour.
    sns.barplot(
        x=category_counts,
        y=category_labels,
        hue=category_labels,
        palette='tab20b',
        legend=False,
    )

    plt.xlabel('Count', fontsize=14)
    plt.ylabel(feature_name, fontsize=14)
    plt.tight_layout()
    plt.show()

In [150]:
#Codes?
#First Hearing Date?
#Agreement Reached?
#WCB Decision?

Variables such as ***OIICS Nature of Injury Description, IME-4 Count, First Hearing Date*** and ***C-3 Date*** have more than 60% of missing values. 
<br> We decided to drop ***OIICS Nature of Injury Description, IME-4 Count*** and ***C-3 Date*** because they have so many missing values that it does not make sense to impute them (there is not enough information to treat the observed values as a sample of the whole population).
<br> We do not drop the ***First Hearing Date*** variable, because a blank date (missing value) means the claim has not yet had a hearing held. Thus, it is information that we want to keep.




In [151]:
columns_to_drop = ['IME-4 Count','C-3 Date', 'OIICS Nature of Injury Description']
train_data =train_data.drop(columns_to_drop, axis=1)

We decided to change the variable First Hearing Date into a binary variable: NaN values are replaced by 0, which means that the claim has not had a hearing held yet, and dates are replaced by 1, which means that a hearing has already been held.

In [None]:
train_data['First Hearing Date Binary'] = train_data['First Hearing Date'].notna().astype(int)
train_data[['First Hearing Date', 'First Hearing Date Binary']].head()

In [None]:
train_data['First Hearing Date Binary'].describe()

The First Hearing Date variable can be dropped after creating the binary variable.

In [154]:
train_data =train_data.drop("First Hearing Date", axis=1)

##### b) Checking unique values in each of the variable (looking for some values that can indicate missing values)

In [None]:
# For every column, print its distinct values followed by how many distinct
# non-missing values it has (nunique() does not count NaN).
for column_name in train_data.columns:
    column_values = train_data[column_name]

    print(f"Unique values in '{column_name}':")
    print(column_values.unique())
    print(f"Number of unique values in '{column_name}':")
    print(column_values.nunique())

    print()

#### All listed Variables - summary 

1. **Accident Date**: 0.64% NaN and dates (5539 unique values)

2. **Age at Injury**: 0% NaN and no strange values (108 unique values) (there are some values >100)
3. **Alternative Dispute Resolution**: 0% NaN and no strange values (3 unique values)
4. **Assembly Date**: 0% NaN and dates (897 unique values)
5. **Attorney/Representative**: 0% NaN and no strange values (2 unique values)
6. **Average Weekly Wage**: around 5% NaN and a lot of unique values (120024 unique values)
7. **Birth Year**: 5% NaN and one strange value "0" that occurs 25081 times (Birth Year cannot be 0) (107 unique values - 0 included)
8. **C-2 Date**: 2.5% NaN and dates (2475 unique values)
11. **Carrier Name**: 0% NaN and name of carrier (2046 unique values) 
12. **Carrier Type**: 0% NaN and no strange values (8 unique values)
13. **Claim Injury Type** - target variable
14. **County of Injury**: 0% NaN and there are 1196 "UNKNOWN" values (63 unique values)
15. **COVID-19 Indicator**: 0% NaN and no strange values (2 unique values)
16. **District Name**: 0% NaN and no strange values (8 unique values)
17. **First Hearing Date**: 73.7% NaN and dates (1094 unique values)
18. **Gender**: 0% NaN and male, female, non-binary and unspecified (4 unique values)
19. **Industry Code**: 1.73% NaN and no strange values (24 unique values)
20. **Industry Code Description**: 1.73% NaN and descripion (20 unique values)
21. **Medical Fee Region**: 0% NaN and one strange value 'UK' (33472 such values) other are numbers (4 unique values)
22. **WCIO Cause of Injury Code**: 2.724% NaN and no strange values (77 unique values) 
23. **WCIO Cause of Injury Description**: 2.724% NaN and no strange values (74 unique values)
24. **WCIO Nature of Injury Code**: 2.727% NaN and no strange values (56 unique values)
25. **WCIO Nature of Injury Description**: 2.727% NaN and no strange values (56 unique values)
26. **WCIO Part Of Body Code**: 2.98% NaN and 42011 "-9" values (should be treated as missing value) (57 unique values)
27. **WCIO Part Of Body Description**: 2.98% NaN and one strange value: INSUFFICIENT INFO TO PROPERLY IDENTIFY - UNCLASSIFIED (54 unique values)
28. **Zip Code**: 4.98% NaN and the zip codes are a bit strange -> there are a lot of unique values; some zip codes start with numbers and some start with letters; one unique value is "UNKNO". According to the definition of a Zip Code, zip codes contain only numeric characters.
29. **Agreement Reached**: 0% NaN and no strange values (2 unique values)
30. **WCB Decision**: 0% NaN and only 1 unique value -> variable that should be deleted it is not informative
31. **Number of Dependents**: 0% NaN and no strange values (7 unique values)



We decided to drop the variable WCB Decision, because it contains only one unique value, so it is not informative.

In [156]:
train_data =train_data.drop('WCB Decision', axis=1)

After reviewing the unique values in the dataset, we also found placeholder values in some variables, e.g. "UNKNOWN", "UK" and "UNKNO".
Therefore, we changed those values to missing values.

In [None]:
# Series.replace returns a new Series; assign it back, otherwise the placeholder stays in the data.
# np.nan is used as the missing-value marker so the column behaves like the other columns with NaN.
train_data['Zip Code'] = train_data['Zip Code'].replace('UNKNO', np.nan)

In [None]:
# Series.replace returns a new Series; assign it back, otherwise the placeholder stays in the data.
# np.nan is used as the missing-value marker so the column behaves like the other columns with NaN.
train_data['Medical Fee Region'] = train_data['Medical Fee Region'].replace('UK', np.nan)

In [None]:
# Series.replace returns a new Series; assign it back, otherwise the placeholder stays in the data.
# np.nan is used as the missing-value marker so the column behaves like the other columns with NaN.
train_data['County of Injury'] = train_data['County of Injury'].replace('UNKNOWN', np.nan)

There is also a strange value in the variable WCIO Part Of Body Code. According to the WCIO website there is no such code as -9. We checked, using the variable WCIO Part Of Body Description, which description corresponds to code -9 in our data set. We found out that -9 is always associated with the description "Multiple".

In [None]:
# Display only rows where "WCIO Part Of Body Code" equals -9
filtered_data = train_data[train_data["WCIO Part Of Body Code"] == -9]

# Display the result
filtered_data["WCIO Part Of Body Description"].value_counts()


According to the WCIO website, the codes whose description contains the word "multiple" are: 10, 20, 30, 40, 50, 90, 91. Thus, we decided to replace the -9 values with the mode among the codes that correspond to "multiple". We used the mode to replace -9 because WCIO Part Of Body Code is a categorical variable.

In [None]:
# Define the valid codes for "Multiple"
multiple_codes = [10, 20, 30, 40, 50, 90, 91]

# Calculate the mode among the valid multiple codes
part_of_body_codes = train_data["WCIO Part Of Body Code"]
multiple_code_modes = part_of_body_codes[part_of_body_codes.isin(multiple_codes)].mode()

# mode() is empty when none of the "Multiple" codes occurs in the data; in that case there is
# nothing to impute with, so the -9 values are left untouched instead of failing with an IndexError.
if not multiple_code_modes.empty:
    mode_value = multiple_code_modes[0]
    # Replace only -9 values with the mode, leaving other NaN values unchanged
    train_data.loc[part_of_body_codes == -9, "WCIO Part Of Body Code"] = mode_value

print(train_data["WCIO Part Of Body Code"].value_counts(dropna=False))


In [None]:
# All WCIO code/description columns; a row with every one of them missing
# carries no injury classification at all.
WCIO_columns = [
    'WCIO Part Of Body Code', 
    'WCIO Part Of Body Description', 
    'WCIO Nature of Injury Description', 
    'WCIO Nature of Injury Code', 
    'WCIO Cause of Injury Description', 
    'WCIO Cause of Injury Code'
]

# Boolean mask: True for rows in which all WCIO columns are NaN
all_wcio_missing = train_data[WCIO_columns].isna().all(axis=1)

# Rows selected by the mask and their number
WCIO_nan_rows = train_data[all_wcio_missing]
count_all_nan_WCIO = all_wcio_missing.sum()
print("Rows with all NaN values in WCIO columns:", count_all_nan_WCIO)

# Share of those rows in the whole dataset, in percent
percentage_all_nan_WCIO = (count_all_nan_WCIO / len(train_data)) * 100
print(f"Percentage of rows with all NaN values in WCIO columns: {percentage_all_nan_WCIO:.2f}%")

WCIO_nan_rows.head()


Dropping rows where all WCIO columns contain NaN values.

In [163]:
train_data = train_data[~train_data[WCIO_columns].isna().all(axis=1)]

Checking the percentage of missing values after deleting the rows where all WCIO columns are missing

In [None]:
missing_percentage = ((train_data.isnull().sum() / len(train_data)) * 100).sort_values(ascending=False)
missing_percentage = missing_percentage[missing_percentage > 0]
print(missing_percentage)

##### Dealing with remaining missing values

1. There is a chance that ***Birth Year*** is correlated with Age at the Injury, so we do not take care of missing values for now.

2. All missing values in the variable ***Average Weekly Wage*** are replaced with the median (it is a continuous variable)

In [None]:
# Impute missing wages with the median of the observed wages.
median_wage = train_data["Average Weekly Wage"].median()
# Assign the filled column back: calling fillna(inplace=True) on `train_data[col]` acts on an
# intermediate object, which is deprecated and does not update the DataFrame under copy-on-write.
train_data["Average Weekly Wage"] = train_data["Average Weekly Wage"].fillna(median_wage)

3. The variable ***Zip Code*** has a lot of unique values. Therefore, we decided to group the Zip Codes by their first character (see `categorize_zip` below).

In [None]:
train_data["Zip Code"].value_counts(dropna=False).sort_index()

##### <span style="color: orange;">I am not sure what should we do with those strange values in Zip Code? (There are 11 such Zip Codes in general) Should we drop those rows??  For now i classified them with zip codes that starts with letters but i am not sure if its a good solution <br> See the cells below</span>

In [None]:
# Select only the rows where "Zip Code" equals ".1605" or "00000"
filtered_data = train_data[(train_data["Zip Code"] == ".1605") | (train_data["Zip Code"] == "00000")]

# Count the distinct rows among them. dropna=False keeps rows that have a missing value in
# any column; by default DataFrame.value_counts() silently leaves such rows out.
filtered_data.value_counts(dropna=False)


In [None]:
# Creating new variable zip_code dividied into categories 
def categorize_zip(zip_code):
    """Map a raw zip code value to a coarse category.

    Returns NaN for missing input, the first character for string zip codes
    that start with a digit, and "Other" for everything else, including the
    two special values ".1605" and "00000".
    """
    # Missing values stay missing so they can be imputed later.
    if pd.isna(zip_code):
        return np.nan

    starts_with_digit = (
        isinstance(zip_code, str)
        and zip_code not in (".1605", "00000")
        and zip_code[0].isdigit()
    )
    return zip_code[0] if starts_with_digit else "Other"


# Apply the function to create the new 'zip_code_cat' column
train_data['zip_code_cat'] = train_data['Zip Code'].apply(categorize_zip)

# If you want to see the distribution of the new category
print(train_data['zip_code_cat'].value_counts(dropna=False))


Replacing missing values in ***zip_code_cat*** with mode - categorical variable

In [None]:
# Most frequent zip code category (first entry if several categories are tied)
mode_value = train_data['zip_code_cat'].mode()[0]  

# Replace NaN values in 'zip_code_cat' with the mode. The filled column is assigned back:
# calling fillna(inplace=True) on `train_data[col]` acts on an intermediate object, which is
# deprecated and does not update the DataFrame under copy-on-write.
train_data['zip_code_cat'] = train_data['zip_code_cat'].fillna(mode_value)

# Display the distribution of the categories after imputation
train_data['zip_code_cat'].value_counts(dropna=False)  # Include NaN if still any 

Dropping variable ***Zip Code*** since we created zip_code_cat

In [170]:
train_data =train_data.drop("Zip Code", axis=1)

4. Variables that have less than 1% of missing values

In [171]:
# List of categorical columns that has less than 1% of missing values
categorical_columns = [
    'WCIO Part Of Body Description',
    'WCIO Part Of Body Code',
    'Accident Date',
    'Industry Code',
    'Industry Code Description',
    'WCIO Nature of Injury Description',
    'WCIO Nature of Injury Code',
    'WCIO Cause of Injury Description',
    'WCIO Cause of Injury Code',
    'C-2 Date'
]

In [None]:
# Fill missing values with the mode for each specified categorical column
for column in categorical_columns:
    mode_value = train_data[column].mode()
    # mode() is empty when the column has no non-missing values; skip such a column.
    if not mode_value.empty:
        # Assign the filled column back: calling fillna(inplace=True) on `train_data[col]` acts on an
        # intermediate object, which is deprecated and does not update the DataFrame under copy-on-write.
        train_data[column] = train_data[column].fillna(mode_value[0])

In [None]:
# Final check if there are still any missing values. We will deal with Birth Year later.

missing_percentage = ((train_data.isnull().sum() / len(train_data)) * 100).sort_values(ascending=False)
print(missing_percentage)

##### <span style="color: orange;">I am not sure what should we do with those strange value in Gender. I would maybe just leave them as they are since it is hard to distinguish and actually say what U and X can mean. We should look at it more closely during data visualisation <br> I mostly checked values for gender X since there are only 41 such values. See the cells below</span>

In [None]:
train_data["Gender"].value_counts()

In [None]:
# Rows whose Gender is recorded as "X"
filtered_data_gender = train_data[train_data["Gender"] == "X"]
# Count the distinct rows among them. dropna=False keeps rows that still have a missing value
# in some column; by default DataFrame.value_counts() silently leaves such rows out.
filtered_data_gender.value_counts(dropna=False)

## 3. Data Transformation

In [None]:
train_data.info()

In [177]:
train_data["Age at Injury"] = train_data["Age at Injury"].astype("Int64")

In [178]:
train_data["Number of Dependents"] = train_data["Number of Dependents"].astype("Int64")

In [179]:
train_data["Birth Year"] = train_data["Birth Year"].astype("Int64")

In [180]:
train_data["WCIO Part Of Body Code"] = train_data["WCIO Part Of Body Code"].astype("Int64")

In [181]:
train_data["WCIO Nature of Injury Code"] = train_data["WCIO Nature of Injury Code"].astype("Int64")

In [182]:
train_data["WCIO Cause of Injury Code"] = train_data["WCIO Cause of Injury Code"].astype("Int64")

In [183]:
train_data["Industry Code"] = train_data["Industry Code"].astype("Int64")

In [184]:
train_data["Accident Date"] = pd.to_datetime(train_data['Accident Date'], format = '%Y-%m-%d')

In [185]:
# Parse the column's own values; previously 'C-2 Date' was parsed here, which overwrote
# every Assembly Date with the C-2 Date.
# NOTE(review): assumes Assembly Date uses the same YYYY-MM-DD format as the other date columns - confirm.
train_data["Assembly Date"] = pd.to_datetime(train_data['Assembly Date'], format = '%Y-%m-%d')

In [186]:
train_data["C-2 Date"] = pd.to_datetime(train_data['C-2 Date'], format = '%Y-%m-%d')

In [None]:
train_data.info()

In [188]:
#### Convert birth year into age:

In [189]:
# Turn the birth year into an age in years, relative to the year in which this cell is run.
# The result is stored under the original column name "Birth Year", which later cells rely on.
# A birth year of 0 is not a real year (it shows up as a placeholder value in the data), so it
# is treated as missing; otherwise it would turn into an age equal to the current year.
valid_birth_year = train_data["Birth Year"].where(train_data["Birth Year"] != 0)
train_data["Birth Year"] = datetime.now().year - valid_birth_year

#### Dates analysis:

In [None]:
train_data["Days Between Accident_Assembly"] = train_data['Assembly Date'] - train_data['Accident Date']
print(f"maximum days between assembly: {train_data['Days Between Accident_Assembly'].max()}")
print(f"minimum days between assembly: {train_data['Days Between Accident_Assembly'].min()}") 
# strange values assembly date < accident date?? 
# #same max & min as days between accident and C2.

In [None]:
# Time elapsed between the accident and the C-2 date (a timedelta column).
train_data["Days Between Accident_C2"] = train_data['C-2 Date'] - train_data['Accident Date']
# The labels name the quantity printed here (accident -> C-2), not the assembly duration.
print(f"maximum days between accident and C-2: {train_data['Days Between Accident_C2'].max()}")
print(f"minimum days between accident and C-2: {train_data['Days Between Accident_C2'].min()}") 
# strange values C2 date < accident date?? 
# NOTE(review): the authors observed the same max & min as for the days between accident and
# assembly - re-check whether both durations are really computed from different dates.

In [192]:
# train_data["Days Between Accident_C3"] = train_data['C-3 Date'] - train_data['Accident Date']
# print(f"maximum days between assembly: {train_data['Days Between Accident_C3'].max()}")
# print(f"minimum days between assembly: {train_data['Days Between Accident_C3'].min()}")
# # strange values C3 date < accident date?? 

In [193]:
# train_data["Days Between Accident_1st_Hearing"] = train_data['First Hearing Date'] - train_data['Accident Date']
# print(f"maximum days between assembly: {train_data["Days Between Accident_1st_Hearing"].max()}")
# print(f"minimum days between assembly: {train_data["Days Between Accident_1st_Hearing"].min()}")
# # strange values 1st_Hearing < accident date?? 

#### Group descriptions by code: 

In [None]:
group_by_industry = train_data.groupby("Industry Code")["Industry Code Description"].unique()
group_by_industry

# Different codes with same description

In [None]:
group_by_injury = train_data.groupby("WCIO Cause of Injury Code")["WCIO Cause of Injury Description"].unique()
group_by_injury

# Different codes with same description

In [None]:
group_by_body_part = train_data.groupby("WCIO Part Of Body Code")["WCIO Part Of Body Description"].unique()
group_by_body_part

# different codes with the same description

## 4. Defining target variable Again

In [197]:
x = train_data.drop('Claim Injury Type', axis = 1) # axis=1 means the operation must be done on columns
y = train_data['Claim Injury Type'] 

In [219]:
### Define metric & non-metric features
# Only columns that exist in train_data at this point are listed:
# "IME-4 Count" was dropped during cleaning, and the C-3 / first-hearing duration
# columns are never created (their cells are commented out).
metric_features_new = ["Age at Injury", "Average Weekly Wage", "Birth Year", "Number of Dependents",
                   "Days Between Accident_Assembly", "Days Between Accident_C2"]

non_metric_features_new = ["Industry Code", "WCIO Cause of Injury Code", "WCIO Nature of Injury Code", "WCIO Part Of Body Code", "Accident Date", "Alternative Dispute Resolution",
                       "Assembly Date", "Attorney/Representative", "C-2 Date", "Carrier Name", "Carrier Type", "County of Injury", "COVID-19 Indicator",
                       "District Name", "First Hearing Date Binary", "Gender", "Industry Code Description", "Medical Fee Region", "WCIO Cause of Injury Description", 
                       "WCIO Nature of Injury Description", "WCIO Part Of Body Description", "zip_code_cat", "Agreement Reached"]



## Visual Exploration of Variables Again

## Metric Variables

### 
Birth Year has the info about age now

In [None]:
train_data['Age at Injury'].plot(
    kind='hist',
    color='bisque', 
    edgecolor='black',
    bins=20
)

plt.title('Histogram of Age at Injury')
plt.xlabel('Age at Injury')
plt.ylabel('Frequency') 
plt.show()

In [None]:
train_data['Average Weekly Wage'].plot(
    kind='hist',
    color='bisque',
    edgecolor='black',
    bins=50
)

plt.title('Histogram of Average Weekly Wage')
plt.xlabel('Average Weekly Wage')
plt.ylabel('Log10(Frequency)')
plt.yscale('log')  
plt.show()

In [None]:
# "Number of Dependents" is a discrete count, so draw one unit-wide bin per integer value
# (edges at k - 0.5 and k + 0.5) instead of 5 arbitrary bins that merge neighbouring counts.
dependents = train_data['Number of Dependents'].dropna()
dependents_bins = np.arange(dependents.min() - 0.5, dependents.max() + 1.5, 1)

train_data['Number of Dependents'].plot(
    kind='hist',
    color='bisque', 
    edgecolor='black',
    bins=dependents_bins
)

plt.title('Histogram of Number of Dependents')
plt.xlabel('Number of Dependents')
plt.ylabel('Frequency') 
plt.show()

In [None]:
train_data['Birth Year'].plot(
    kind='hist',
    color='bisque', 
    edgecolor='black',
    bins=5
)

plt.title('Histogram of Age')
plt.xlabel('Age')
plt.ylabel('Frequency') 
plt.show()

In [206]:
def plot_filtered_histogram_a(data, column_name, min_value, max_value, bins=20):
    """Plot a histogram of an age column, restricted to a closed value range.

    Same as a plain range-filtered histogram, except that the title and the
    x-axis are always labelled "Age", whatever ``column_name`` is.

    Args:
        data: DataFrame that contains ``column_name``.
        column_name: Name of the column holding the age values.
        min_value: Lower bound of the range (inclusive).
        max_value: Upper bound of the range (inclusive).
        bins: Number of bins (or bin edges) passed on to ``plt.hist``.
    """
    # Use the DataFrame passed by the caller; previously the global `train_data`
    # was read and the `data` argument was silently ignored.
    in_range = (data[column_name] >= min_value) & (data[column_name] <= max_value)
    filtered_data = data[in_range][column_name].dropna()

    # Histogram
    plt.figure(figsize=(10, 6))
    plt.hist(filtered_data, bins=bins, color='bisque', edgecolor='black')
    # "Age" is written directly into the text: reusing the same quote character inside an
    # f-string replacement field is a SyntaxError on Python versions before 3.12.
    plt.title(f'Histogram of Age (range: {min_value} to {max_value})')
    plt.xlabel('Age')
    plt.ylabel('Frequency')
    plt.grid(axis='y', alpha=0.75)
    plt.show()

In [None]:
plot_filtered_histogram_a(train_data, column_name="Birth Year", min_value=0, max_value=100, bins=20)

In [None]:
# Convert timedelta to whole days. The conversion is skipped when the column has already
# been converted, so this cell can be re-run without failing on `.dt`.
if pd.api.types.is_timedelta64_dtype(train_data['Days Between Accident_Assembly']):
    train_data['Days Between Accident_Assembly'] = train_data['Days Between Accident_Assembly'].dt.days

train_data['Days Between Accident_Assembly'].plot(
    kind='hist',
    color='bisque', 
    edgecolor='black',
    bins=20  
)

plt.title('Histogram of Days Between Accident Assembly')
plt.xlabel('Days Between Accident Assembly (days)')
plt.ylabel('Log10(Frequency)')
plt.yscale('log')  
plt.show()

In [None]:
# Convert timedelta to whole days before plotting, so the x-axis really is in days as its
# label says. The conversion is skipped when the column has already been converted, so this
# cell can be re-run safely.
if pd.api.types.is_timedelta64_dtype(train_data['Days Between Accident_C2']):
    train_data['Days Between Accident_C2'] = train_data['Days Between Accident_C2'].dt.days

train_data['Days Between Accident_C2'].plot(
    kind='hist',
    color='bisque', 
    edgecolor='black',
    bins=20 
)

plt.title('Histogram of Days Between Accident_C2')
plt.xlabel('Days Between Accident_C2 (days)')
plt.ylabel('Log10(Frequency)')
plt.yscale('log')
plt.show()

In [215]:
#Days Between Accident C3?
#Days Between Accident_1st_Hearing?

## Non Metric Variables

### Histograms

In [None]:
# One horizontal bar chart per non-metric feature, showing its 10 most frequent values.
for feature_name in non_metric_features_new:
    # value_counts() sorts by frequency (descending), so the first 10 entries are the top 10.
    most_frequent = train_data[feature_name].value_counts().iloc[:10]
    category_labels = most_frequent.index
    category_counts = most_frequent.values

    plt.figure(figsize=(8, 4))
    # hue is set to the categories (with the legend hidden) so every bar gets its own colour.
    sns.barplot(
        x=category_counts,
        y=category_labels,
        hue=category_labels,
        palette='tab20b',
        legend=False,
    )

    plt.xlabel('Count', fontsize=14)
    plt.ylabel(feature_name, fontsize=14)
    plt.tight_layout()
    plt.show()
    plt.show()