## Import 

### Import Libraries

In [259]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# setting seaborn style
sns.set()

from sklearn.model_selection import train_test_split


### Import Datasets

In [260]:
train_data = pd.read_csv("project_data/train_data.csv", low_memory=False)
test_data = pd.read_csv("project_data/test_data.csv")

In [261]:
print(train_data.head())
print(test_data.head())

  Accident Date  Age at Injury Alternative Dispute Resolution Assembly Date  \
0    2019-12-30           31.0                              N    2020-01-01   
1    2019-08-30           46.0                              N    2020-01-01   
2    2019-12-06           40.0                              N    2020-01-01   
3           NaN            NaN                            NaN    2020-01-01   
4    2019-12-30           61.0                              N    2020-01-01   

  Attorney/Representative  Average Weekly Wage  Birth Year    C-2 Date  \
0                       N                 0.00      1988.0  2019-12-31   
1                       Y              1745.93      1973.0  2020-01-01   
2                       N              1434.80      1979.0  2020-01-01   
3                     NaN                  NaN         NaN         NaN   
4                       N                  NaN      1958.0  2019-12-31   

     C-3 Date                  Carrier Name  ... WCIO Cause of Injury Code  \
0 

In [262]:
train_data.columns.values

array(['Accident Date', 'Age at Injury', 'Alternative Dispute Resolution',
       'Assembly Date', 'Attorney/Representative', 'Average Weekly Wage',
       'Birth Year', 'C-2 Date', 'C-3 Date', 'Carrier Name',
       'Carrier Type', 'Claim Identifier', 'Claim Injury Type',
       'County of Injury', 'COVID-19 Indicator', 'District Name',
       'First Hearing Date', 'Gender', 'IME-4 Count', 'Industry Code',
       'Industry Code Description', 'Medical Fee Region',
       'OIICS Nature of Injury Description', 'WCIO Cause of Injury Code',
       'WCIO Cause of Injury Description', 'WCIO Nature of Injury Code',
       'WCIO Nature of Injury Description', 'WCIO Part Of Body Code',
       'WCIO Part Of Body Description', 'Zip Code', 'Agreement Reached',
       'WCB Decision', 'Number of Dependents'], dtype=object)

### Descriptive analysis

In [263]:
# set "Claim Identifier" as the index for the train and test datasets
train_data = train_data.set_index("Claim Identifier")
test_data = test_data.set_index("Claim Identifier")

In [264]:
# CHECK UNIQUE VALUES FOR EACH COLUMN TO FIND STRANGE VALUES!

train_data["Number of Dependents"].unique().tolist()
# Alternative dispute resolution (U ??)
# Attorney representative (no issues)
# avg weekly wage (didnt check)
# Birth year (0)???
# carrier names - check if carrier names are weird (like general business names - walmart??)
# Carrier type: 'UNKNOWN'/ 5D. SPECIAL FUND - UNKNOWN
# Claim injury type (cancelled ??)
#County of injury (unknown)
# Covid19 (no issues)
# district name ("Statewide")
# Gender ("U", "X")
# IME-4 Count (no issues - outlier)
# Industry code (no issues)
# Industry code description (no issues)
# Medical Fee region (UK??)
# OIICS Nature of Injury description (no values)
# WCIO Cause of Injury Code (no issues)
# WCIO Cause of Injury Description(no issues)
# WCIO Nature of Injury Code (no issues)
# WCIO Nature of Injury Description (no issues)
# WCIO Part Of Body Code (n issues)
# WCIO Part Of Body Description ("Insufficient info to properly classify")
# Zip Code (no issues)
# Agreement Reached (no issues)
# WCB Decision (no issues)
# Number of Dependents (no issues)


[1.0, 4.0, 6.0, nan, 5.0, 3.0, 2.0, 0.0]

In [265]:
train_data.info()

## DATA TYPES:
# Convert accident date (to date type)
# convert age (to int)
# Alternative dispute resolution (??)
# Assembly date (to date type)
# birth year (to int)
# C2 date (to date type)
# C3 date (to date type)
# First hearing date (to date)
# IME-4 count (to int)
# Industry code (object)
# OIICS Nature of Injury Description (to object) - all nulls (relevant??)
# WCIO cause of injury code (float to object)
# WCIO Nature of Injury Code (float to object)
# WCIO Part Of Body Code (float to object)
# Agreement reached (??) - boolean/ int(???)
# Number of dependents (to int)

<class 'pandas.core.frame.DataFrame'>
Index: 593471 entries, 5393875 to 818961390
Data columns (total 32 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   Accident Date                       570337 non-null  object 
 1   Age at Injury                       574026 non-null  float64
 2   Alternative Dispute Resolution      574026 non-null  object 
 3   Assembly Date                       593471 non-null  object 
 4   Attorney/Representative             574026 non-null  object 
 5   Average Weekly Wage                 545375 non-null  float64
 6   Birth Year                          544948 non-null  float64
 7   C-2 Date                            559466 non-null  object 
 8   C-3 Date                            187245 non-null  object 
 9   Carrier Name                        574026 non-null  object 
 10  Carrier Type                        574026 non-null  object 
 11  Claim Injury Type     

In [266]:
# Alternative dispute resolution (U ??)
# Attorney representative (no issues)
# avg weekly wage (didnt check)
# Birth year (0)???
# carrier names - check if carrier names are weird (like general business names - walmart??)
# Carrier type: 'UNKNOWN'/ 5D. SPECIAL FUND - UNKNOWN
#County of injury (unknown)
# Covid19 (no issues)
# district name ("Statewide")
# Gender ("U", "X")
# IME-4 Count (no issues - outlier)
# Industry code (no issues)
# Industry code description (no issues)
# Medical Fee region (UK??)
# OIICS Nature of Injury description (no values)
# WCIO Cause of Injury Code (no issues)
# WCIO Cause of Injury Description(no issues)
# WCIO Nature of Injury Code (no issues)
# WCIO Nature of Injury Description (no issues)
# WCIO Part Of Body Code (n issues)
# WCIO Part Of Body Description ("Insufficient info to properly classify")
# Zip Code (no issues)
# Agreement Reached (no issues)
# WCB Decision (no issues)
# Number of Dependents (no issues)



In [267]:
# Fix datatypes for training dataset:

In [268]:
# Use describe() to get the descriptive statistics for all variables for train dataset
train_data.describe(include="all").T

# Min age is 0 (?)
# max age 117 (?)
# Alternative dispute resolution ("U") (??)
# Gender values (???)
# OIICS (No values???)
# Number of dependents (not on metadata)
# Birth year (2018) baby in china (??)
# Agreement reached (binary) (to int)


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Accident Date,570337.0,5539.0,2020-03-01,1245.0,,,,,,,
Age at Injury,574026.0,,,,42.11427,14.256432,0.0,31.0,42.0,54.0,117.0
Alternative Dispute Resolution,574026.0,3.0,N,571412.0,,,,,,,
Assembly Date,593471.0,1096.0,2020-03-06,1422.0,,,,,,,
Attorney/Representative,574026.0,2.0,N,392291.0,,,,,,,
Average Weekly Wage,545375.0,,,,491.088321,6092.91812,0.0,0.0,0.0,841.0,2828079.0
Birth Year,544948.0,,,,1886.767604,414.644423,0.0,1965.0,1977.0,1989.0,2018.0
C-2 Date,559466.0,2475.0,2021-05-11,1847.0,,,,,,,
C-3 Date,187245.0,1648.0,2021-04-21,350.0,,,,,,,
Carrier Name,574026.0,2046.0,STATE INSURANCE FUND,111144.0,,,,,,,


In [269]:
# Use describe() to get the descriptive statistics for all variables for test dataset
test_data.describe(include="all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Accident Date,385531.0,3438.0,2024-01-16,1263.0,,,,,,,
Age at Injury,387975.0,,,,41.414944,14.501056,0.0,30.0,40.0,53.0,114.0
Alternative Dispute Resolution,387975.0,3.0,N,386314.0,,,,,,,
Assembly Date,387975.0,434.0,2023-09-21,1789.0,,,,,,,
Attorney/Representative,387975.0,2.0,N,306476.0,,,,,,,
Average Weekly Wage,368771.0,,,,183.343831,3542.310214,0.0,0.0,0.0,0.0,1950317.0
Birth Year,368505.0,,,,1875.383466,444.659075,0.0,1967.0,1980.0,1992.0,2019.0
C-2 Date,378841.0,1048.0,2023-10-11,1687.0,,,,,,,
C-3 Date,85216.0,626.0,2023-10-04,341.0,,,,,,,
Carrier Name,387975.0,1598.0,STATE INSURANCE FUND,66189.0,,,,,,,


In [270]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 387975 entries, 6165911 to 6553594
Data columns (total 29 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   Accident Date                       385531 non-null  object 
 1   Age at Injury                       387975 non-null  int64  
 2   Alternative Dispute Resolution      387975 non-null  object 
 3   Assembly Date                       387975 non-null  object 
 4   Attorney/Representative             387975 non-null  object 
 5   Average Weekly Wage                 368771 non-null  float64
 6   Birth Year                          368505 non-null  float64
 7   C-2 Date                            378841 non-null  object 
 8   C-3 Date                            85216 non-null   object 
 9   Carrier Name                        387975 non-null  object 
 10  Carrier Type                        387975 non-null  object 
 11  County of Injury        

In [271]:
# check for duplicate values on train and test dataset:
print(f"Train dataset:\n{train_data.duplicated().value_counts()}") # true: nr of duplicated rows in dataset
print(f"\nTest dataset:\n{test_data.duplicated().value_counts()}") # false: nr of unique rows in dataset

# true - nr of duplicated rows 
# false - nr of unique rows

Train dataset:
False    575121
True      18350
Name: count, dtype: int64

Test dataset:
False    387973
True          2
Name: count, dtype: int64


In [272]:
# Check for strange characters by NaN on train dataset:
# Strange values indicative of missing values: 
# Alternative dispute resolution (U??) (replaced)
# Birth year (0) (replaced) ??? ----------------
# Carrier type: 'UNKNOWN'(replaced)/ 5D. SPECIAL FUND - UNKNOWN ----------------
# Claim injury type (cancelled ??) ------------------
# County of injury (unknown) (replaced) 
# Gender ("U") (replaced)
# Medical Fee region (UK??) (replaced) 
# WCIO Part Of Body Description ("Insufficient info to properly classify") (replaced)--------------


In [273]:
# Replace unknown values by NaN on train dataset:
unknown_values_list = ["U", "UNKNOWN", "unknown",""]
train_data.replace(unknown_values_list, np.nan, inplace=True)
train_data["Medical Fee Region"].replace("UK", np.nan)
train_data["WCIO Part Of Body Description"].replace("Insufficient info to properly classify", np.nan)
train_data["Birth Year"].replace(0, np.nan)

Claim Identifier
5393875      1988.0
5393091      1973.0
5393889      1979.0
957648180       NaN
5393887      1958.0
              ...  
327160035       NaN
6165075      1950.0
249875936       NaN
120584215       NaN
818961390       NaN
Name: Birth Year, Length: 593471, dtype: float64

In [274]:
# Check for missing values on train dataset:
missing_values_train = train_data.isna().sum()
missing_values_train_percentage = (train_data.isna().mean()*100).round(2)
print("Missing values\n",missing_values_train,"\n")
print("Percentage of Missing values\n",missing_values_train_percentage)

Missing values
 Accident Date                          23134
Age at Injury                          19445
Alternative Dispute Resolution         19450
Assembly Date                              0
Attorney/Representative                19445
Average Weekly Wage                    48096
Birth Year                             48523
C-2 Date                               34005
C-3 Date                              406226
Carrier Name                           19445
Carrier Type                           21219
Claim Injury Type                      19445
County of Injury                       20642
COVID-19 Indicator                     19445
District Name                          19445
First Hearing Date                    442673
Gender                                 24156
IME-4 Count                           460668
Industry Code                          29403
Industry Code Description              29403
Medical Fee Region                     19445
OIICS Nature of Injury Description    5

In [275]:
# Replace unknown values by NaN on test dataset:
test_data.replace(unknown_values_list, np.nan, inplace=True)
train_data["Medical Fee Region"].replace("UK", np.nan)
train_data["WCIO Part Of Body Description"].replace("Insufficient info to properly classify", np.nan)
train_data["Birth Year"].replace(0, np.nan)

Claim Identifier
5393875      1988.0
5393091      1973.0
5393889      1979.0
957648180       NaN
5393887      1958.0
              ...  
327160035       NaN
6165075      1950.0
249875936       NaN
120584215       NaN
818961390       NaN
Name: Birth Year, Length: 593471, dtype: float64

In [276]:
# Check for missing values on test dataset:
missing_values = test_data.isna().sum()
missing_values_percentage = (test_data.isna().mean()*100).round(2)
print("Missing values\n",missing_values,"\n")
print("Percentage of Missing values\n",missing_values_percentage)

Missing values
 Accident Date                           2444
Age at Injury                              0
Alternative Dispute Resolution             1
Assembly Date                              0
Attorney/Representative                    0
Average Weekly Wage                    19204
Birth Year                             19470
C-2 Date                                9134
C-3 Date                              302759
Carrier Name                               0
Carrier Type                            1642
County of Injury                         915
COVID-19 Indicator                         0
District Name                              0
First Hearing Date                    344947
Gender                                  5340
IME-4 Count                           352726
Industry Code                           7736
Industry Code Description               7736
Medical Fee Region                         0
OIICS Nature of Injury Description    387975
WCIO Cause of Injury Code              

In [None]:
## GROUP BY & DATES (FINISH DESCRIPTIVE ANALYSIS & ADDRESSING MISSING VALUES)