## Import 

### Import Libraries

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# setting seaborn style
sns.set()

from sklearn.model_selection import train_test_split

pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_colwidth', None) #Show all columns


### Import Datasets

In [4]:
train_data = pd.read_csv("project_data/train_data.csv", low_memory=False)
test_data = pd.read_csv("project_data/test_data.csv")

In [5]:
print(train_data.head())
print(test_data.head())

  Accident Date  Age at Injury Alternative Dispute Resolution Assembly Date  \
0    2019-12-30           31.0                              N    2020-01-01   
1    2019-08-30           46.0                              N    2020-01-01   
2    2019-12-06           40.0                              N    2020-01-01   
3           NaN            NaN                            NaN    2020-01-01   
4    2019-12-30           61.0                              N    2020-01-01   

  Attorney/Representative  Average Weekly Wage  Birth Year    C-2 Date  \
0                       N                 0.00      1988.0  2019-12-31   
1                       Y              1745.93      1973.0  2020-01-01   
2                       N              1434.80      1979.0  2020-01-01   
3                     NaN                  NaN         NaN         NaN   
4                       N                  NaN      1958.0  2019-12-31   

     C-3 Date                  Carrier Name  ... WCIO Cause of Injury Code  \
0 

In [6]:
print(train_data.tail())
print(test_data.tail())

       Accident Date  Age at Injury Alternative Dispute Resolution  \
593466           NaN            NaN                            NaN   
593467    2022-12-13           72.0                              N   
593468           NaN            NaN                            NaN   
593469           NaN            NaN                            NaN   
593470           NaN            NaN                            NaN   

       Assembly Date Attorney/Representative  Average Weekly Wage  Birth Year  \
593466    2022-12-31                     NaN                  NaN         NaN   
593467    2022-12-31                       N                  0.0      1950.0   
593468    2022-12-31                     NaN                  NaN         NaN   
593469    2022-12-31                     NaN                  NaN         NaN   
593470    2022-12-31                     NaN                  NaN         NaN   

          C-2 Date C-3 Date                   Carrier Name  ...  \
593466         NaN      N

### Descriptive analysis

In [7]:
# set "Claim Identifier" as the index for the train and test datasets
train_data = train_data.set_index("Claim Identifier")
test_data = test_data.set_index("Claim Identifier")

In [8]:
train_data.columns.values

array(['Accident Date', 'Age at Injury', 'Alternative Dispute Resolution',
       'Assembly Date', 'Attorney/Representative', 'Average Weekly Wage',
       'Birth Year', 'C-2 Date', 'C-3 Date', 'Carrier Name',
       'Carrier Type', 'Claim Injury Type', 'County of Injury',
       'COVID-19 Indicator', 'District Name', 'First Hearing Date',
       'Gender', 'IME-4 Count', 'Industry Code',
       'Industry Code Description', 'Medical Fee Region',
       'OIICS Nature of Injury Description', 'WCIO Cause of Injury Code',
       'WCIO Cause of Injury Description', 'WCIO Nature of Injury Code',
       'WCIO Nature of Injury Description', 'WCIO Part Of Body Code',
       'WCIO Part Of Body Description', 'Zip Code', 'Agreement Reached',
       'WCB Decision', 'Number of Dependents'], dtype=object)

In [9]:
train_data.describe(include="all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Accident Date,570337.0,5539.0,2020-03-01,1245.0,,,,,,,
Age at Injury,574026.0,,,,42.11427,14.256432,0.0,31.0,42.0,54.0,117.0
Alternative Dispute Resolution,574026.0,3.0,N,571412.0,,,,,,,
Assembly Date,593471.0,1096.0,2020-03-06,1422.0,,,,,,,
Attorney/Representative,574026.0,2.0,N,392291.0,,,,,,,
Average Weekly Wage,545375.0,,,,491.088321,6092.91812,0.0,0.0,0.0,841.0,2828079.0
Birth Year,544948.0,,,,1886.767604,414.644423,0.0,1965.0,1977.0,1989.0,2018.0
C-2 Date,559466.0,2475.0,2021-05-11,1847.0,,,,,,,
C-3 Date,187245.0,1648.0,2021-04-21,350.0,,,,,,,
Carrier Name,574026.0,2046.0,STATE INSURANCE FUND,111144.0,,,,,,,


In [10]:
train_data.info()

## DATA TYPES:
# Convert accident date (to date type)
# convert age (to int)
# Alternative dispute resolution (??)
# Assembly date (to date type)
# birth year (to int)
# C2 date (to date type)
# C3 date (to date type)
# First hearing date (to date)
# IME-4 count (to int)
# Industry code (object)
# OIICS Nature of Injury Description (to object) - all nulls (relevant??)
# WCIO cause of injury code (float to object)
# WCIO Nature of Injury Code (float to object)
# WCIO Part Of Body Code (float to object)
# Agreement reached (??) - boolean/ int(???)
# Number of dependents (to int)

<class 'pandas.core.frame.DataFrame'>
Index: 593471 entries, 5393875 to 818961390
Data columns (total 32 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   Accident Date                       570337 non-null  object 
 1   Age at Injury                       574026 non-null  float64
 2   Alternative Dispute Resolution      574026 non-null  object 
 3   Assembly Date                       593471 non-null  object 
 4   Attorney/Representative             574026 non-null  object 
 5   Average Weekly Wage                 545375 non-null  float64
 6   Birth Year                          544948 non-null  float64
 7   C-2 Date                            559466 non-null  object 
 8   C-3 Date                            187245 non-null  object 
 9   Carrier Name                        574026 non-null  object 
 10  Carrier Type                        574026 non-null  object 
 11  Claim Injury Type     

#### Check for duplicates & missing values in target variable:

In [11]:
duplicates =train_data.duplicated().value_counts()
duplicates

False    575121
True      18350
Name: count, dtype: int64

In [12]:
train_data["Claim Injury Type"].unique()

array(['2. NON-COMP', '4. TEMPORARY', nan, '3. MED ONLY',
       '5. PPD SCH LOSS', '6. PPD NSL', '1. CANCELLED', '8. DEATH',
       '7. PTD'], dtype=object)

In [13]:
missing_data_target = train_data[train_data['Claim Injury Type'].isna()]
missing_data_target.head()

Unnamed: 0_level_0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,...,WCIO Cause of Injury Code,WCIO Cause of Injury Description,WCIO Nature of Injury Code,WCIO Nature of Injury Description,WCIO Part Of Body Code,WCIO Part Of Body Description,Zip Code,Agreement Reached,WCB Decision,Number of Dependents
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
957648180,,,,2020-01-01,,,,,,,...,,,,,,,,,,
363651794,,,,2020-01-01,,,,,,,...,,,,,,,,,,
209507552,,,,2020-01-01,,,,,,,...,,,,,,,,,,
970865999,,,,2020-01-01,,,,,,,...,,,,,,,,,,
807753247,,,,2020-01-01,,,,,,,...,,,,,,,,,,


In [595]:
missing_data_target.info()
# all features with missing values on the target, are also missing, except Assembly Date -> not relevant (decision -> drop rows)

<class 'pandas.core.frame.DataFrame'>
Index: 19445 entries, 957648180 to 818961390
Data columns (total 32 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Accident Date                       0 non-null      object 
 1   Age at Injury                       0 non-null      float64
 2   Alternative Dispute Resolution      0 non-null      object 
 3   Assembly Date                       19445 non-null  object 
 4   Attorney/Representative             0 non-null      object 
 5   Average Weekly Wage                 0 non-null      float64
 6   Birth Year                          0 non-null      float64
 7   C-2 Date                            0 non-null      object 
 8   C-3 Date                            0 non-null      object 
 9   Carrier Name                        0 non-null      object 
 10  Carrier Type                        0 non-null      object 
 11  Claim Injury Type                 

#### Drop 'OIICS Nature of Injury Description' feature & rows with missing target values 

In [596]:
train_data = train_data.drop(missing_data_target.index)

In [597]:
train_data=train_data.drop(['OIICS Nature of Injury Description'], axis=1)

#### Re-Check for duplicates 

In [598]:
# check for duplicate values on train and test dataset:
print(f"Train dataset:\n{train_data.duplicated().value_counts()}") # true: nr of duplicated rows in dataset

Train dataset:
False    574025
True          1
Name: count, dtype: int64


In [599]:
# Check the percentage of duplicates for the entire training dataset
train_data.duplicated().mean()*100
# conclusion: Not relevant for analysis, drop duplicates

0.00017420813691365895

In [600]:
train_data.drop_duplicates(inplace=True)
train_data.duplicated().value_counts()
# Remove duplicates

False    574025
Name: count, dtype: int64

#### Re check descriptives:

In [601]:
train_data.describe(include="all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Accident Date,570337.0,5539.0,2020-03-01,1245.0,,,,,,,
Age at Injury,574025.0,,,,42.114343,14.256336,0.0,31.0,42.0,54.0,117.0
Alternative Dispute Resolution,574025.0,3.0,N,571411.0,,,,,,,
Assembly Date,574025.0,897.0,2020-03-06,1413.0,,,,,,,
Attorney/Representative,574025.0,2.0,N,392291.0,,,,,,,
Average Weekly Wage,545374.0,,,,491.089221,6092.92367,0.0,0.0,0.0,841.0,2828079.0
Birth Year,544947.0,,,,1886.767454,414.644789,0.0,1965.0,1977.0,1989.0,2018.0
C-2 Date,559466.0,2475.0,2021-05-11,1847.0,,,,,,,
C-3 Date,187244.0,1648.0,2021-04-21,350.0,,,,,,,
Carrier Name,574025.0,2046.0,STATE INSURANCE FUND,111144.0,,,,,,,


#### Missing Values: (Drop/Don't drop??)

In [602]:
# Check for missing values on train dataset: 
missing_values_train = train_data.isna().sum()
missing_values_train_percentage = (train_data.isna().mean()*100).round(2)
print("Missing values\n",missing_values_train,"\n")
print("Percentage of Missing values\n",missing_values_train_percentage)

Missing values
 Accident Date                          3688
Age at Injury                             0
Alternative Dispute Resolution            0
Assembly Date                             0
Attorney/Representative                   0
Average Weekly Wage                   28651
Birth Year                            29078
C-2 Date                              14559
C-3 Date                             386781
Carrier Name                              0
Carrier Type                              0
Claim Injury Type                         0
County of Injury                          0
COVID-19 Indicator                        0
District Name                             0
First Hearing Date                   423228
Gender                                    0
IME-4 Count                          441222
Industry Code                          9957
Industry Code Description              9957
Medical Fee Region                        0
WCIO Cause of Injury Code             15639
WCIO Cause of In

#### - Unique values (search for strange values)

In [603]:
train_data.columns.values

array(['Accident Date', 'Age at Injury', 'Alternative Dispute Resolution',
       'Assembly Date', 'Attorney/Representative', 'Average Weekly Wage',
       'Birth Year', 'C-2 Date', 'C-3 Date', 'Carrier Name',
       'Carrier Type', 'Claim Injury Type', 'County of Injury',
       'COVID-19 Indicator', 'District Name', 'First Hearing Date',
       'Gender', 'IME-4 Count', 'Industry Code',
       'Industry Code Description', 'Medical Fee Region',
       'WCIO Cause of Injury Code', 'WCIO Cause of Injury Description',
       'WCIO Nature of Injury Code', 'WCIO Nature of Injury Description',
       'WCIO Part Of Body Code', 'WCIO Part Of Body Description',
       'Zip Code', 'Agreement Reached', 'WCB Decision',
       'Number of Dependents'], dtype=object)

In [604]:
train_data.shape

(574025, 31)

In [605]:
train_data["Claim Injury Type"].unique().tolist()
# Accident date
# Age at injury (0)????
# Alternative dispute resolution (U ??)
# Attorney representative (no issues)
# avg weekly wage (didn't check)
# Birth year (0), (2018 - baby)???
# carrier names - check if carrier names are weird (like general business names - walmart??)
# Carrier type: 'UNKNOWN'/ 5D. SPECIAL FUND - UNKNOWN (no issues)
# Claim injury type (cancelled ??)
#County of injury (unknown)
# Covid19 (no issues)
# district name ("Statewide")
# Gender ("U", "X")
# IME-4 Count (no issues - outlier)
# Industry code (no issues)
# Industry code description (no issues)
# Medical Fee region (UK??)
# OIICS Nature of Injury description (no values) --> removed 
# WCIO Cause of Injury Code (no issues)
# WCIO Cause of Injury Description(no issues)
# WCIO Nature of Injury Code (no issues)
# WCIO Nature of Injury Description (no issues)
# WCIO Part Of Body Code (n issues)
# WCIO Part Of Body Description ("Insufficient info to properly classify")
# Zip Code (no issues)
# Agreement Reached (no issues)
# WCB Decision (no issues)
# Number of Dependents (no issues)

['2. NON-COMP',
 '4. TEMPORARY',
 '3. MED ONLY',
 '5. PPD SCH LOSS',
 '6. PPD NSL',
 '1. CANCELLED',
 '8. DEATH',
 '7. PTD']

#### Data types:

In [606]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 574025 entries, 5393875 to 6165075
Data columns (total 31 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Accident Date                      570337 non-null  object 
 1   Age at Injury                      574025 non-null  float64
 2   Alternative Dispute Resolution     574025 non-null  object 
 3   Assembly Date                      574025 non-null  object 
 4   Attorney/Representative            574025 non-null  object 
 5   Average Weekly Wage                545374 non-null  float64
 6   Birth Year                         544947 non-null  float64
 7   C-2 Date                           559466 non-null  object 
 8   C-3 Date                           187244 non-null  object 
 9   Carrier Name                       574025 non-null  object 
 10  Carrier Type                       574025 non-null  object 
 11  Claim Injury Type                  57

In [607]:
train_data["Age at Injury"] = train_data["Age at Injury"].astype("Int64")

In [608]:
train_data["Number of Dependents"] = train_data["Number of Dependents"].astype("Int64")

In [609]:
train_data["Birth Year"] = train_data["Birth Year"].astype("Int64")

In [610]:
train_data["WCIO Part Of Body Code"] = train_data["WCIO Part Of Body Code"].astype("Int64")

In [611]:
train_data["WCIO Nature of Injury Code"] = train_data["WCIO Nature of Injury Code"].astype("Int64")

In [612]:
train_data["WCIO Cause of Injury Code"] = train_data["WCIO Cause of Injury Code"].astype("Int64")

In [613]:
train_data["Industry Code"] = train_data["Industry Code"].astype("Int64")

In [614]:
train_data["Accident Date"] = pd.to_datetime(train_data['Accident Date'], format = '%Y-%m-%d')

In [615]:
train_data["Assembly Date"] = pd.to_datetime(train_data['C-2 Date'], format = '%Y-%m-%d')

In [616]:
train_data["First Hearing Date"] = pd.to_datetime(train_data['First Hearing Date'], format = '%Y-%m-%d')

In [617]:
train_data["C-2 Date"] = pd.to_datetime(train_data['C-2 Date'], format = '%Y-%m-%d')

In [618]:
train_data["C-3 Date"] = pd.to_datetime(train_data['C-3 Date'], format = '%Y-%m-%d')

In [619]:
train_data["IME-4 Count"] = train_data["IME-4 Count"].astype("Int64")

In [620]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 574025 entries, 5393875 to 6165075
Data columns (total 31 columns):
 #   Column                             Non-Null Count   Dtype         
---  ------                             --------------   -----         
 0   Accident Date                      570337 non-null  datetime64[ns]
 1   Age at Injury                      574025 non-null  Int64         
 2   Alternative Dispute Resolution     574025 non-null  object        
 3   Assembly Date                      559466 non-null  datetime64[ns]
 4   Attorney/Representative            574025 non-null  object        
 5   Average Weekly Wage                545374 non-null  float64       
 6   Birth Year                         544947 non-null  Int64         
 7   C-2 Date                           559466 non-null  datetime64[ns]
 8   C-3 Date                           187244 non-null  datetime64[ns]
 9   Carrier Name                       574025 non-null  object        
 10  Carrier Type      

In [621]:
train_data["Agreement Reached"].head()

Claim Identifier
5393875    0.0
5393091    1.0
5393889    0.0
5393887    0.0
5393863    0.0
Name: Agreement Reached, dtype: float64

In [622]:
train_data.columns.values

array(['Accident Date', 'Age at Injury', 'Alternative Dispute Resolution',
       'Assembly Date', 'Attorney/Representative', 'Average Weekly Wage',
       'Birth Year', 'C-2 Date', 'C-3 Date', 'Carrier Name',
       'Carrier Type', 'Claim Injury Type', 'County of Injury',
       'COVID-19 Indicator', 'District Name', 'First Hearing Date',
       'Gender', 'IME-4 Count', 'Industry Code',
       'Industry Code Description', 'Medical Fee Region',
       'WCIO Cause of Injury Code', 'WCIO Cause of Injury Description',
       'WCIO Nature of Injury Code', 'WCIO Nature of Injury Description',
       'WCIO Part Of Body Code', 'WCIO Part Of Body Description',
       'Zip Code', 'Agreement Reached', 'WCB Decision',
       'Number of Dependents'], dtype=object)

#### Convert birth year into age:

In [623]:
train_data["Birth Year"]= datetime.now().year - train_data["Birth Year"]

In [624]:
train_data["Claim Injury Type"].unique().tolist()

['2. NON-COMP',
 '4. TEMPORARY',
 '3. MED ONLY',
 '5. PPD SCH LOSS',
 '6. PPD NSL',
 '1. CANCELLED',
 '8. DEATH',
 '7. PTD']

#### Dates analysis:

In [625]:
train_data["Days Between Accident_Assembly"] = train_data['Assembly Date'] - train_data['Accident Date']
print(f"maximum days between assembly: {train_data['Days Between Accident_Assembly'].max()}")
print(f"minimum days between assembly: {train_data['Days Between Accident_Assembly'].min()}") 
# strange values assembly date < accident date?? 
# #same max & min as days between accident and C2.

maximum days between assembly: 21503 days 00:00:00
minimum days between assembly: -8842 days +00:00:00


In [626]:
train_data["Days Between Accident_C2"] = train_data['C-2 Date'] - train_data['Accident Date']
print(f"maximum days between assembly: {train_data['Days Between Accident_C2'].max()}")
print(f"minimum days between assembly: {train_data['Days Between Accident_C2'].min()}") 
# strange values C2 date < accident date?? 
# #same max & min as days between accident and C2.

maximum days between assembly: 21503 days 00:00:00
minimum days between assembly: -8842 days +00:00:00


In [627]:
train_data["Days Between Accident_C3"] = train_data['C-3 Date'] - train_data['Accident Date']
print(f"maximum days between assembly: {train_data['Days Between Accident_C3'].max()}")
print(f"minimum days between assembly: {train_data['Days Between Accident_C3'].min()}")
# strange values C3 date < accident date?? 

maximum days between assembly: 18566 days 00:00:00
minimum days between assembly: -9870 days +00:00:00


In [628]:
train_data["Days Between Accident_1st_Hearing"] = train_data['First Hearing Date'] - train_data['Accident Date']
print(f"maximum days between assembly: {train_data["Days Between Accident_1st_Hearing"].max()}")
print(f"minimum days between assembly: {train_data["Days Between Accident_1st_Hearing"].min()}")
# strange values 1st_Hearing < accident date?? 

maximum days between assembly: 16373 days 00:00:00
minimum days between assembly: -576 days +00:00:00


#### Group descriptions by code: 

In [629]:
group_by_industry = train_data.groupby("Industry Code")["Industry Code Description"].unique()
group_by_industry

# Different codes with same description

Industry Code
11                      [AGRICULTURE, FORESTRY, FISHING AND HUNTING]
21                                                          [MINING]
22                                                       [UTILITIES]
23                                                    [CONSTRUCTION]
31                                                   [MANUFACTURING]
32                                                   [MANUFACTURING]
33                                                   [MANUFACTURING]
42                                                 [WHOLESALE TRADE]
44                                                    [RETAIL TRADE]
45                                                    [RETAIL TRADE]
48                                  [TRANSPORTATION AND WAREHOUSING]
49                                  [TRANSPORTATION AND WAREHOUSING]
51                                                     [INFORMATION]
52                                           [FINANCE AND INSURANCE]
53                  

In [630]:
group_by_injury = train_data.groupby("WCIO Cause of Injury Code")["WCIO Cause of Injury Description"].unique()
group_by_injury

# Different codes with same description

WCIO Cause of Injury Code
1                                       [CHEMICALS]
2                       [HOT OBJECTS OR SUBSTANCES]
3                            [TEMPERATURE EXTREMES]
4                                   [FIRE OR FLAME]
5                             [STEAM OR HOT FLUIDS]
6                    [DUST, GASES, FUMES OR VAPORS]
7                               [WELDING OPERATION]
8                                       [RADIATION]
9                               [CONTACT WITH, NOC]
10                           [MACHINE OR MACHINERY]
11                     [COLD OBJECTS OR SUBSTANCES]
12                                 [OBJECT HANDLED]
13               [CAUGHT IN, UNDER OR BETWEEN, NOC]
14                          [ABNORMAL AIR PRESSURE]
15                                   [BROKEN GLASS]
16                [HAND TOOL, UTENSIL; NOT POWERED]
17                 [OBJECT BEING LIFTED OR HANDLED]
18                   [POWERED HAND TOOL, APPLIANCE]
19                     [CUT, PUNCTURE,

In [631]:
group_by_body_part = train_data.groupby("WCIO Part Of Body Code")["WCIO Part Of Body Description"].unique()
group_by_body_part

# different codes with the same description

WCIO Part Of Body Code
-9                                                 [MULTIPLE]
10                                     [MULTIPLE HEAD INJURY]
11                                                    [SKULL]
12                                                    [BRAIN]
13                                                   [EAR(S)]
14                                                   [EYE(S)]
15                                                     [NOSE]
16                                                    [TEETH]
17                                                    [MOUTH]
18                                              [SOFT TISSUE]
19                                             [FACIAL BONES]
20                                     [MULTIPLE NECK INJURY]
21                                                [VERTEBRAE]
22                                                     [DISC]
23                                              [SPINAL CORD]
24                                             

In [632]:
# Use describe() to get the descriptive statistics for all variables for train dataset
train_data.describe(include="all").T


Unnamed: 0,count,unique,top,freq,mean,min,25%,50%,75%,max,std
Accident Date,570337.0,,,,2021-04-28 21:00:27.892982784,1961-09-06 00:00:00,2020-09-14 00:00:00,2021-06-27 00:00:00,2022-03-21 00:00:00,2023-09-29 00:00:00,
Age at Injury,574025.0,,,,42.114343,0.0,31.0,42.0,54.0,117.0,14.256336
Alternative Dispute Resolution,574025.0,3.0,N,571411.0,,,,,,,
Assembly Date,559466.0,,,,2021-07-16 20:11:19.865443328,1996-01-12 00:00:00,2020-11-06 00:00:00,2021-08-09 00:00:00,2022-04-26 00:00:00,2024-06-01 00:00:00,
Attorney/Representative,574025.0,2.0,N,392291.0,,,,,,,
Average Weekly Wage,545374.0,,,,491.089221,0.0,0.0,0.0,841.0,2828079.0,6092.92367
Birth Year,544947.0,,,,137.232546,6.0,35.0,47.0,59.0,2024.0,414.644789
C-2 Date,559466.0,,,,2021-07-16 20:11:19.865443328,1996-01-12 00:00:00,2020-11-06 00:00:00,2021-08-09 00:00:00,2022-04-26 00:00:00,2024-06-01 00:00:00,
C-3 Date,187244.0,,,,2021-07-26 21:32:29.772489728,1992-11-13 00:00:00,2020-10-27 00:00:00,2021-07-21 00:00:00,2022-04-20 00:00:00,2024-05-31 00:00:00,
Carrier Name,574025.0,2046.0,STATE INSURANCE FUND,111144.0,,,,,,,


#### STRANGE VALUES:

In [633]:
# Check for strange characters by NaN on train dataset:
# Strange values indicative of missing values: 
# Alternative dispute resolution (percentage)
# Birth year (0) (No issues) 
# Carrier type: 'UNKNOWN'()/ 5D. SPECIAL FUND - UNKNOWN (percentage)
# Claim injury type (cancelled ??) ------------------
# County of injury (unknown) (percentage)
# Gender ("U") (percentage)
# Medical Fee region (UK??) (percentage) 
# WCIO Part Of Body Description ("Insufficient info to properly classify") (percentage)--------------
# Zip code == 0000 (??)
# Weekly wage (0) 75% of values

#### - Check percentage of strange values in the dataset:

In [634]:
# % of UK values in medical fee region - it's significant (can't remove it)
medical_fee_count = (train_data['Medical Fee Region'] == 'UK').sum()
total_count = len(train_data)
percentage_medical_fee = (medical_fee_count / total_count) * 100
percentage_medical_fee.round(4)

5.8311

In [635]:
# % of U values in alternative dispute resolution - U Gender is significant. X gender, significant??
gender_count = (train_data['Gender'] == 'U').sum()
total_count = len(train_data)  
percentage_gender_U = (gender_count / total_count) * 100
print(f"Gender U: {percentage_gender_U.round(4)}%")

gender_count = (train_data['Gender'] == 'X').sum()
total_count = len(train_data)
percentage_gender_X = (gender_count / total_count) * 100
print(f"Gender X: {percentage_gender_X.round(4)}%")

Gender U: 0.8207%
Gender X: 0.008%


In [636]:
# % of "Insufficient info to properly classify" in WCIO Part Of Body Description - It's significant!
gender_count = (train_data['WCIO Part Of Body Description'] == 'INSUFFICIENT INFO TO PROPERLY IDENTIFY - UNCLASSIFIED').sum()
total_count = len(train_data)
percentage_gender_X = (gender_count / total_count) * 100
percentage_gender_X.round(4)

0.9252

In [637]:
# % of U values in alternative dispute resolution - significant??
alternative_dispute_resolution_count = (train_data['Alternative Dispute Resolution'] == 'U').sum()  
total_count = len(train_data)  
percentage_alternative_dispute_resolution = (alternative_dispute_resolution_count / total_count) * 100
percentage_alternative_dispute_resolution

0.0008710422019946866

In [638]:
# % of unknown values in county of injury - significant?? 
county_injury_count = (train_data['County of Injury'] == 'UNKNOWN').sum()
total_count = len(train_data)  
percentage_county_injury = (county_injury_count / total_count) * 100
percentage_county_injury

0.20835329471712905

#### Define target variable

In [652]:
x = train_data.drop('Claim Injury Type', axis = 1) # axis=1 means the operation must be done on columns
y = train_data['Claim Injury Type'] 


### Define metric & non-metric features

In [653]:
metric_features = ["Age at Injury", "Average Weekly Wage", "Birth Year", "IME-4 Count", "Number_of_Dependents", "Days Between Accident_Assembly", "Days Between Accident_C2", 
                   "Days Between Accident_C3", "Days Between Accident_1st_Hearing"]

non_metric_features = ["Industry Code", "WCIO Cause of Injury Code", "WCIO Nature of Injury Code", "WCIO Part Of Body Code", "Accident Date", "Alternative Dispute Resolution",
                       "Assembly Date", "Attorney/Representative", "C-2 Date", "C-3 Date", "Carrier Name", "Carrier Type", "County of Injury", "COVID-19 Indicator",
                       "District Name", "First Hearing Date", "Gender", "Industry Code Description", "Medical Fee Region", "WCIO Cause of Injury Description", 
                       "WCIO Nature of Injury Description", "WCIO Part Of Body Description", "Zip Code", "Agreement Reached", "WCB Decision"]



In [None]:
# handling missing values in the dataset 