<img src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTaJWG7PzF3toxaRMB1-JicpqMgJuEXATd0fg&" style="width: 150px;margin-top:30px;" align ="center">

<div style= "font-size: 40px;font-weight:bold; font-family: 'Avenir Next LT Pro', sans-serif;"><center>To Grant or Not to Grant</center></div> <br>

<div style= "font-size: 27px;font-weight:bold;line-height: 1.1; margin-top:40px; font-family: 'Avenir Next LT Pro', sans-serif;"><center>Machine Learning Project 2024/2025</center></div> <br>

   <div style= "font-size: 20px;font-weight:bold; font-family: 'Avenir Next LT Pro', sans-serif;"><center> Group 42:</center></div>
   <div><center> Eden da Silva | 20240740 </center></div>
   <div><center> José Cavaco | 20240513 </center></div>
   <div><center> Luana Rocha | 20240111 </center></div>
   <div><center> Maria Radix | 20240687 </center></div>
   <div><center> Tiago Castilho | 20240489 </center></div>

# Notebook 4: Encoding and Standardization
# Index

* [4.0. Imports](#imp)
* [4.1. Encoding](#enc)
  * [4.1.1 Splitting into numerical and categorical features](#split)
  * [4.1.2 One Hot Encoding](#ohc)
  * [4.1.3 Frequency Encoding](#freq)
* [4.2. Scaling](#scaling)
  * [4.2.1. MinMaxScaler](#minmax)
  * [4.2.2. Standard Scaling](#standard)
* [4.3 CSV Formatting, Exports](#export)

<hr>
<div class="alert alert-block alert-info" style="font-size:12px" id="imp">
    
# **4.0. Imports**
 
</div>

In [4]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import warnings
# Suppress every warning for the rest of the session to keep cell outputs uncluttered.
# NOTE(review): this is a blanket filter, so genuinely useful warnings raised by
# pandas / scikit-learn later in the notebook are hidden as well — consider narrowing it.
warnings.filterwarnings('ignore')


In [5]:
# Load the train / validation / test frames exported by the previous notebook
# (03_feature_engineering). The first CSV column is restored as the index.
df_train_fe, df_val_fe, df_test_fe = (
    pd.read_csv(csv_path, index_col=0, low_memory=False)
    for csv_path in ('df_train_fe.csv', 'df_val_fe.csv', 'df_test_fe.csv')
)

# Same three splits, read from the '*_log' exports.
df_train_fe_log, df_val_fe_log, df_test_fe_log = (
    pd.read_csv(csv_path, index_col=0, low_memory=False)
    for csv_path in ('df_train_fe_log.csv', 'df_val_fe_log.csv', 'df_test_fe_log.csv')
)

In [6]:
# Work on copies so the frames loaded from disk stay untouched.
df_train_stand, df_val_stand, df_test_stand = (
    frame.copy() for frame in (df_train_fe, df_val_fe, df_test_fe)
)

# Copies of the frames with log-transformed features.
df_train_stand_log, df_val_stand_log, df_test_stand_log = (
    frame.copy() for frame in (df_train_fe_log, df_val_fe_log, df_test_fe_log)
)

In [7]:
# Target: the 'Claim Injury Type' column of the train and validation frames.
y_train = df_train_stand['Claim Injury Type']
y_val = df_val_stand['Claim Injury Type']

# Predictors: every remaining column.
X_train = df_train_stand.drop(columns='Claim Injury Type')
X_val = df_val_stand.drop(columns='Claim Injury Type')

# Predictors of the log-transformed frames (no separate target is extracted for them here).
X_train_log = df_train_stand_log.drop(columns='Claim Injury Type')
X_val_log = df_val_stand_log.drop(columns='Claim Injury Type')

In [8]:
# Inspect the training target series ('Claim Injury Type').
y_train

Claim Identifier
5785935    4
6090033    3
6136197    2
6019545    2
5792247    3
          ..
5837651    4
5781926    4
5890060    4
6148528    2
6027959    2
Name: Claim Injury Type, Length: 459181, dtype: int64

In [9]:
# Preview the first rows of the training predictors.
X_train.head()

Unnamed: 0_level_0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,...,Part of Body,Accident to Assembly Days,Accident to C-2 Days,Age Group,Accident Season,Accident Date_IsWeekend,Age_Wage_interaction,Accident Date in Days,Assembly Date in Days,C-2 Date in Days
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5785935,2021-08-05,49.0,0,2021-08-10,1,744.06,1971.0,2021-08-10 00:00:00.000000000,1,PROPERTY AND CASUALTY,...,2.0,5,5,Senior,Summer,0,36458.94,21883,21888,21888
6090033,2022-09-14,56.0,0,2022-09-21,1,845.29,1966.0,2022-09-21 00:00:00.000000000,1,EVEREST PREMIER INSURANCE,...,6.0,7,7,Senior,Fall,0,47336.24,22288,22295,22295
6136197,2022-11-07,48.0,0,2022-11-18,0,1070.12,1974.0,2022-11-18 00:00:00.000000000,0,CHARTER OAK FIRE INS CO,...,1.0,11,11,Senior,Fall,0,51365.76,22342,22353,22353
6019545,2020-09-02,55.0,0,2022-06-21,0,1070.12,1965.0,2022-06-21 00:00:00.000000000,1,NEW HAMPSHIRE INSURANCE CO,...,3.0,657,657,Senior,Fall,0,58856.6,21546,22203,22203
5792247,2021-08-13,53.0,0,2021-08-18,1,668.08,1968.0,2021-08-18 00:00:00.000000000,1,STATE INSURANCE FUND,...,6.0,5,5,Senior,Summer,0,35408.24,21891,21896,21896


In [10]:
# Preview the first rows of the validation predictors.
X_val.head()

Unnamed: 0_level_0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,...,Part of Body,Accident to Assembly Days,Accident to C-2 Days,Age Group,Accident Season,Accident Date_IsWeekend,Age_Wage_interaction,Accident Date in Days,Assembly Date in Days,C-2 Date in Days
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5947783,2022-03-14,51.0,0,2022-03-22,0,1070.12,1970.0,2022-03-22 00:00:00.000000000,1,CANTON CENTRAL SCHOOL DISTRICT,...,7.0,8,8,Senior,Spring,0,54576.12,22104,22112,22112
6150876,2022-12-02,61.0,0,2022-12-09,1,1070.12,1961.0,2022-12-09 00:00:00.000000000,0,STATE INSURANCE FUND,...,2.0,7,7,Senior,Winter,0,65277.32,22367,22374,22374
5397365,2011-06-14,50.0,0,2020-01-08,1,1070.12,1961.0,2020-01-07 00:00:00.000000000,0,COMMERCE AND INDUSTRY INS CO,...,4.0,3130,3129,Senior,Summer,0,53506.0,18178,21308,21307
6077399,2022-02-07,54.0,0,2022-09-07,1,1502.5,1967.0,2022-09-13 00:00:00.000000000,1,STATE INSURANCE FUND,...,7.0,212,218,Senior,Winter,0,81135.0,22069,22281,22287
5945251,2022-03-15,35.0,0,2022-03-16,0,1070.12,1986.0,2022-03-16 00:00:00.000000000,0,SAFETY NATIONAL CASUALTY CORP,...,2.0,1,1,Adult,Spring,0,37454.2,22105,22106,22106


In [11]:
# Preview the first rows of the training predictors.
# NOTE(review): same call as the earlier X_train.head() cell, with no transformation applied in between.
X_train.head()

Unnamed: 0_level_0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,...,Part of Body,Accident to Assembly Days,Accident to C-2 Days,Age Group,Accident Season,Accident Date_IsWeekend,Age_Wage_interaction,Accident Date in Days,Assembly Date in Days,C-2 Date in Days
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5785935,2021-08-05,49.0,0,2021-08-10,1,744.06,1971.0,2021-08-10 00:00:00.000000000,1,PROPERTY AND CASUALTY,...,2.0,5,5,Senior,Summer,0,36458.94,21883,21888,21888
6090033,2022-09-14,56.0,0,2022-09-21,1,845.29,1966.0,2022-09-21 00:00:00.000000000,1,EVEREST PREMIER INSURANCE,...,6.0,7,7,Senior,Fall,0,47336.24,22288,22295,22295
6136197,2022-11-07,48.0,0,2022-11-18,0,1070.12,1974.0,2022-11-18 00:00:00.000000000,0,CHARTER OAK FIRE INS CO,...,1.0,11,11,Senior,Fall,0,51365.76,22342,22353,22353
6019545,2020-09-02,55.0,0,2022-06-21,0,1070.12,1965.0,2022-06-21 00:00:00.000000000,1,NEW HAMPSHIRE INSURANCE CO,...,3.0,657,657,Senior,Fall,0,58856.6,21546,22203,22203
5792247,2021-08-13,53.0,0,2021-08-18,1,668.08,1968.0,2021-08-18 00:00:00.000000000,1,STATE INSURANCE FUND,...,6.0,5,5,Senior,Summer,0,35408.24,21891,21896,21896


In [12]:
# Preview the first rows of the validation predictors.
# NOTE(review): same call as the earlier X_val.head() cell, with no transformation applied in between.
X_val.head()

Unnamed: 0_level_0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,...,Part of Body,Accident to Assembly Days,Accident to C-2 Days,Age Group,Accident Season,Accident Date_IsWeekend,Age_Wage_interaction,Accident Date in Days,Assembly Date in Days,C-2 Date in Days
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5947783,2022-03-14,51.0,0,2022-03-22,0,1070.12,1970.0,2022-03-22 00:00:00.000000000,1,CANTON CENTRAL SCHOOL DISTRICT,...,7.0,8,8,Senior,Spring,0,54576.12,22104,22112,22112
6150876,2022-12-02,61.0,0,2022-12-09,1,1070.12,1961.0,2022-12-09 00:00:00.000000000,0,STATE INSURANCE FUND,...,2.0,7,7,Senior,Winter,0,65277.32,22367,22374,22374
5397365,2011-06-14,50.0,0,2020-01-08,1,1070.12,1961.0,2020-01-07 00:00:00.000000000,0,COMMERCE AND INDUSTRY INS CO,...,4.0,3130,3129,Senior,Summer,0,53506.0,18178,21308,21307
6077399,2022-02-07,54.0,0,2022-09-07,1,1502.5,1967.0,2022-09-13 00:00:00.000000000,1,STATE INSURANCE FUND,...,7.0,212,218,Senior,Winter,0,81135.0,22069,22281,22287
5945251,2022-03-15,35.0,0,2022-03-16,0,1070.12,1986.0,2022-03-16 00:00:00.000000000,0,SAFETY NATIONAL CASUALTY CORP,...,2.0,1,1,Adult,Spring,0,37454.2,22105,22106,22106


In [13]:
# Preview the first rows of the test frame.
df_test_stand.head()

Unnamed: 0_level_0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,...,Part of Body,Accident to Assembly Days,Accident to C-2 Days,Age Group,Accident Season,Accident Date_IsWeekend,Age_Wage_interaction,Accident Date in Days,Assembly Date in Days,C-2 Date in Days
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6165911,2022-12-24,19,0,2023-01-02,0,1070.12,2003.0,2023-01-02 00:00:00.000000000,0,INDEMNITY INSURANCE CO OF,...,4.0,9,9,Teen,Winter,1,20332.28,22389,22398,22398
6166141,2022-11-20,19,0,2023-01-02,0,1070.12,2003.0,2023-01-02 00:00:00.000000000,0,A I U INSURANCE COMPANY,...,7.0,43,43,Teen,Fall,1,20332.28,22355,22398,22398
6165907,2022-12-26,59,0,2023-01-02,0,1070.12,1963.0,2022-12-31 00:00:00.000000000,0,AMGUARD INSURANCE COMPANY,...,6.0,7,5,Senior,Winter,0,63137.08,22391,22398,22396
6166047,2022-12-28,55,0,2023-01-02,0,1070.12,1967.0,2023-01-02 00:00:00.000000000,0,INDEMNITY INS. OF N AMERICA,...,4.0,5,5,Senior,Winter,0,58856.6,22393,22398,22398
6166102,2022-12-20,25,0,2023-01-02,0,1070.12,1997.0,2022-12-31 00:00:00.000000000,0,NEW HAMPSHIRE INSURANCE CO,...,3.0,13,11,Teen,Winter,0,26753.0,22385,22398,22396


In [14]:
# Preview the first rows of the training predictors with log-transformed features.
X_train_log.head()

Unnamed: 0_level_0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Birth Year,C-2 Date,C-3 Date,Carrier Name,Carrier Type,...,Age Group,Accident Season,Accident Date_IsWeekend,Accident Date in Days,Assembly Date in Days,C-2 Date in Days,Log Average Weekly Wage,Log Age_Wage_interaction,Log Accident to Assembly Days,Log Accident to C-2 Days
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5785935,2021-08-05,49.0,0,2021-08-10,1,1971.0,2021-08-10 00:00:00.000000000,1,PROPERTY AND CASUALTY,1A. PRIVATE,...,Senior,Summer,0,21883,21888,21888,6.613465,10.503969,1.79176,1.79176
6090033,2022-09-14,56.0,0,2022-09-21,1,1966.0,2022-09-21 00:00:00.000000000,1,EVEREST PREMIER INSURANCE,1A. PRIVATE,...,Senior,Fall,0,22288,22295,22295,6.740862,10.765053,2.079442,2.079442
6136197,2022-11-07,48.0,0,2022-11-18,0,1974.0,2022-11-18 00:00:00.000000000,0,CHARTER OAK FIRE INS CO,1A. PRIVATE,...,Senior,Fall,0,22342,22353,22353,6.97646,10.846747,2.484907,2.484907
6019545,2020-09-02,55.0,0,2022-06-21,0,1965.0,2022-06-21 00:00:00.000000000,1,NEW HAMPSHIRE INSURANCE CO,1A. PRIVATE,...,Senior,Fall,0,21546,22203,22203,6.97646,10.982876,6.489205,6.489205
5792247,2021-08-13,53.0,0,2021-08-18,1,1968.0,2021-08-18 00:00:00.000000000,1,STATE INSURANCE FUND,2A. SIF,...,Senior,Summer,0,21891,21896,21896,6.505904,10.474728,1.79176,1.79176


In [15]:
# Preview the first rows of the validation predictors with log-transformed features.
X_val_log.head()

Unnamed: 0_level_0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Birth Year,C-2 Date,C-3 Date,Carrier Name,Carrier Type,...,Age Group,Accident Season,Accident Date_IsWeekend,Accident Date in Days,Assembly Date in Days,C-2 Date in Days,Log Average Weekly Wage,Log Age_Wage_interaction,Log Accident to Assembly Days,Log Accident to C-2 Days
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5947783,2022-03-14,51.0,0,2022-03-22,0,1970.0,2022-03-22 00:00:00.000000000,1,CANTON CENTRAL SCHOOL DISTRICT,3A. SELF PUBLIC,...,Senior,Spring,0,22104,22112,22112,6.97646,10.90737,2.197225,2.197225
6150876,2022-12-02,61.0,0,2022-12-09,1,1961.0,2022-12-09 00:00:00.000000000,0,STATE INSURANCE FUND,2A. SIF,...,Senior,Winter,0,22367,22374,22374,6.97646,11.086415,2.079442,2.079442
5397365,2011-06-14,50.0,0,2020-01-08,1,1961.0,2020-01-07 00:00:00.000000000,0,COMMERCE AND INDUSTRY INS CO,1A. PRIVATE,...,Senior,Summer,0,18178,21308,21307,6.97646,10.887568,8.049108,8.048788
6077399,2022-02-07,54.0,0,2022-09-07,1,1967.0,2022-09-13 00:00:00.000000000,1,STATE INSURANCE FUND,2A. SIF,...,Senior,Winter,0,22069,22281,22287,7.315551,11.303882,5.361292,5.389072
5945251,2022-03-15,35.0,0,2022-03-16,0,1986.0,2022-03-16 00:00:00.000000000,0,SAFETY NATIONAL CASUALTY CORP,1A. PRIVATE,...,Adult,Spring,0,22105,22106,22106,6.97646,10.530901,0.693148,0.693148


In [16]:
# Preview the first rows of the test frame with log-transformed features.
df_test_stand_log.head()

Unnamed: 0_level_0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Birth Year,C-2 Date,C-3 Date,Carrier Name,Carrier Type,...,Age Group,Accident Season,Accident Date_IsWeekend,Accident Date in Days,Assembly Date in Days,C-2 Date in Days,Log Average Weekly Wage,Log Age_Wage_interaction,Log Accident to Assembly Days,Log Accident to C-2 Days
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6165911,2022-12-24,19,0,2023-01-02,0,2003.0,2023-01-02 00:00:00.000000000,0,INDEMNITY INSURANCE CO OF,1A. PRIVATE,...,Teen,Winter,1,22389,22398,22398,6.97646,9.920014,2.302585,2.302585
6166141,2022-11-20,19,0,2023-01-02,0,2003.0,2023-01-02 00:00:00.000000000,0,A I U INSURANCE COMPANY,1A. PRIVATE,...,Teen,Fall,1,22355,22398,22398,6.97646,9.920014,3.78419,3.78419
6165907,2022-12-26,59,0,2023-01-02,0,1963.0,2022-12-31 00:00:00.000000000,0,AMGUARD INSURANCE COMPANY,1A. PRIVATE,...,Senior,Winter,0,22391,22398,22396,6.97646,11.053079,2.079442,1.79176
6166047,2022-12-28,55,0,2023-01-02,0,1967.0,2023-01-02 00:00:00.000000000,0,INDEMNITY INS. OF N AMERICA,1A. PRIVATE,...,Senior,Winter,0,22393,22398,22398,6.97646,10.982876,1.79176,1.79176
6166102,2022-12-20,25,0,2023-01-02,0,1997.0,2022-12-31 00:00:00.000000000,0,NEW HAMPSHIRE INSURANCE CO,1A. PRIVATE,...,Teen,Winter,0,22385,22398,22396,6.97646,10.194439,2.639057,2.484907


<hr>
<div class="alert alert-block alert-info" style="font-size:12px" id="enc">
    
# **4.1. Encoding**
 
</div>

<hr>
<div class="alert alert-block alert-info" style="font-size:12px" id="split">
    
## 4.1.1 Splitting into numerical and categorical features
 
</div>

Since we created new features such as 'Accident Date in Days', 'Assembly Date in Days' and 'C-2 Date in Days', we will drop the original date features: each original/derived pair is redundant, and the day-count versions give us more insight and are easier for our models to work with.

In [20]:
# Drop the raw date columns 'Accident Date', 'Assembly Date' and 'C-2 Date' from every frame.
# The frames are reassigned instead of mutated with inplace=True, and errors='ignore'
# skips columns that are already absent, so re-running this cell does not raise a KeyError.
raw_date_cols = ['Accident Date', 'Assembly Date', 'C-2 Date']

X_train = X_train.drop(columns=raw_date_cols, errors='ignore')
X_val = X_val.drop(columns=raw_date_cols, errors='ignore')
df_test_stand = df_test_stand.drop(columns=raw_date_cols, errors='ignore')

# Same treatment for the frames with log-transformed features.
X_train_log = X_train_log.drop(columns=raw_date_cols, errors='ignore')
X_val_log = X_val_log.drop(columns=raw_date_cols, errors='ignore')
df_test_stand_log = df_test_stand_log.drop(columns=raw_date_cols, errors='ignore')

In [21]:
# Visual check that 'Accident Date', 'Assembly Date' and 'C-2 Date' are no longer among the training columns.
X_train.head()

Unnamed: 0_level_0,Age at Injury,Alternative Dispute Resolution,Attorney/Representative,Average Weekly Wage,Birth Year,C-3 Date,Carrier Name,Carrier Type,County of Injury,COVID-19 Indicator,...,Part of Body,Accident to Assembly Days,Accident to C-2 Days,Age Group,Accident Season,Accident Date_IsWeekend,Age_Wage_interaction,Accident Date in Days,Assembly Date in Days,C-2 Date in Days
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5785935,49.0,0,1,744.06,1971.0,1,PROPERTY AND CASUALTY,1A. PRIVATE,QUEENS,0,...,2.0,5,5,Senior,Summer,0,36458.94,21883,21888,21888
6090033,56.0,0,1,845.29,1966.0,1,EVEREST PREMIER INSURANCE,1A. PRIVATE,QUEENS,0,...,6.0,7,7,Senior,Fall,0,47336.24,22288,22295,22295
6136197,48.0,0,0,1070.12,1974.0,0,CHARTER OAK FIRE INS CO,1A. PRIVATE,ROCKLAND,0,...,1.0,11,11,Senior,Fall,0,51365.76,22342,22353,22353
6019545,55.0,0,0,1070.12,1965.0,1,NEW HAMPSHIRE INSURANCE CO,1A. PRIVATE,WESTCHESTER,0,...,3.0,657,657,Senior,Fall,0,58856.6,21546,22203,22203
5792247,53.0,0,1,668.08,1968.0,1,STATE INSURANCE FUND,2A. SIF,ALBANY,0,...,6.0,5,5,Senior,Summer,0,35408.24,21891,21896,21896


In [22]:
# Visual check that the raw date columns are no longer among the validation columns.
X_val.head()

Unnamed: 0_level_0,Age at Injury,Alternative Dispute Resolution,Attorney/Representative,Average Weekly Wage,Birth Year,C-3 Date,Carrier Name,Carrier Type,County of Injury,COVID-19 Indicator,...,Part of Body,Accident to Assembly Days,Accident to C-2 Days,Age Group,Accident Season,Accident Date_IsWeekend,Age_Wage_interaction,Accident Date in Days,Assembly Date in Days,C-2 Date in Days
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5947783,51.0,0,0,1070.12,1970.0,1,CANTON CENTRAL SCHOOL DISTRICT,3A. SELF PUBLIC,ST. LAWRENCE,0,...,7.0,8,8,Senior,Spring,0,54576.12,22104,22112,22112
6150876,61.0,0,1,1070.12,1961.0,0,STATE INSURANCE FUND,2A. SIF,GENESEE,0,...,2.0,7,7,Senior,Winter,0,65277.32,22367,22374,22374
5397365,50.0,0,1,1070.12,1961.0,0,COMMERCE AND INDUSTRY INS CO,1A. PRIVATE,SUFFOLK,0,...,4.0,3130,3129,Senior,Summer,0,53506.0,18178,21308,21307
6077399,54.0,0,1,1502.5,1967.0,1,STATE INSURANCE FUND,2A. SIF,ULSTER,0,...,7.0,212,218,Senior,Winter,0,81135.0,22069,22281,22287
5945251,35.0,0,0,1070.12,1986.0,0,SAFETY NATIONAL CASUALTY CORP,1A. PRIVATE,KINGS,0,...,2.0,1,1,Adult,Spring,0,37454.2,22105,22106,22106


In [23]:
# Visual check that the raw date columns are no longer among the test columns.
df_test_stand.head()

Unnamed: 0_level_0,Age at Injury,Alternative Dispute Resolution,Attorney/Representative,Average Weekly Wage,Birth Year,C-3 Date,Carrier Name,Carrier Type,County of Injury,COVID-19 Indicator,...,Part of Body,Accident to Assembly Days,Accident to C-2 Days,Age Group,Accident Season,Accident Date_IsWeekend,Age_Wage_interaction,Accident Date in Days,Assembly Date in Days,C-2 Date in Days
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6165911,19,0,0,1070.12,2003.0,0,INDEMNITY INSURANCE CO OF,1A. PRIVATE,BRONX,0,...,4.0,9,9,Teen,Winter,1,20332.28,22389,22398,22398
6166141,19,0,0,1070.12,2003.0,0,A I U INSURANCE COMPANY,1A. PRIVATE,QUEENS,0,...,7.0,43,43,Teen,Fall,1,20332.28,22355,22398,22398
6165907,59,0,0,1070.12,1963.0,0,AMGUARD INSURANCE COMPANY,1A. PRIVATE,WESTCHESTER,0,...,6.0,7,5,Senior,Winter,0,63137.08,22391,22398,22396
6166047,55,0,0,1070.12,1967.0,0,INDEMNITY INS. OF N AMERICA,1A. PRIVATE,QUEENS,0,...,4.0,5,5,Senior,Winter,0,58856.6,22393,22398,22398
6166102,25,0,0,1070.12,1997.0,0,NEW HAMPSHIRE INSURANCE CO,1A. PRIVATE,KINGS,0,...,3.0,13,11,Teen,Winter,0,26753.0,22385,22398,22396


In [24]:
# Visual check that the raw date columns are no longer among the log-transformed training columns.
X_train_log.head()

Unnamed: 0_level_0,Age at Injury,Alternative Dispute Resolution,Attorney/Representative,Birth Year,C-3 Date,Carrier Name,Carrier Type,County of Injury,COVID-19 Indicator,District Name,...,Age Group,Accident Season,Accident Date_IsWeekend,Accident Date in Days,Assembly Date in Days,C-2 Date in Days,Log Average Weekly Wage,Log Age_Wage_interaction,Log Accident to Assembly Days,Log Accident to C-2 Days
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5785935,49.0,0,1,1971.0,1,PROPERTY AND CASUALTY,1A. PRIVATE,QUEENS,0,NYC,...,Senior,Summer,0,21883,21888,21888,6.613465,10.503969,1.79176,1.79176
6090033,56.0,0,1,1966.0,1,EVEREST PREMIER INSURANCE,1A. PRIVATE,QUEENS,0,NYC,...,Senior,Fall,0,22288,22295,22295,6.740862,10.765053,2.079442,2.079442
6136197,48.0,0,0,1974.0,0,CHARTER OAK FIRE INS CO,1A. PRIVATE,ROCKLAND,0,NYC,...,Senior,Fall,0,22342,22353,22353,6.97646,10.846747,2.484907,2.484907
6019545,55.0,0,0,1965.0,1,NEW HAMPSHIRE INSURANCE CO,1A. PRIVATE,WESTCHESTER,0,NYC,...,Senior,Fall,0,21546,22203,22203,6.97646,10.982876,6.489205,6.489205
5792247,53.0,0,1,1968.0,1,STATE INSURANCE FUND,2A. SIF,ALBANY,0,ALBANY,...,Senior,Summer,0,21891,21896,21896,6.505904,10.474728,1.79176,1.79176


In [25]:
# Compare the column names of the validation predictors and the test frame in both directions.
# Each set holds the names present in one frame but missing from the other.
columns_in_X_val_not_in_df_test_stand = set(X_val.columns).difference(df_test_stand.columns)
columns_in_df_test_stand_not_in_X_val = set(df_test_stand.columns).difference(X_val.columns)

# One set per line; two empty sets mean the frames share exactly the same columns.
print(
    columns_in_X_val_not_in_df_test_stand,
    columns_in_df_test_stand_not_in_X_val,
    sep='\n',
)

set()
set()


These are the results we expect, as we've dropped WCB Decision and Agreement Reached in previous notebooks. Our dataframes should now have identical structure moving forward.

Now we need to split the features into categorical and numerical features.

In [28]:
# Feature groups used for the rest of the notebook:
#   metric_features     -> numerical columns
#   non_metric_features -> categorical columns

metric_features = [
    "Age at Injury", "Average Weekly Wage", "Birth Year",
    "IME-4 Count", "Number of Dependents",
    "Accident to Assembly Days", "Accident to C-2 Days",
    "Accident Date in Days", "Assembly Date in Days",
    "C-2 Date in Days", "Age_Wage_interaction"]


# Every entry must be followed by a comma: Python silently concatenates adjacent
# string literals, which previously merged "Industry Code" and "Gender" into the
# single, non-existent column name "Industry CodeGender".
non_metric_features = [
    "Alternative Dispute Resolution", "Attorney/Representative", "C-3 Date", "Carrier Name",
    "Carrier Type", "County of Injury", "COVID-19 Indicator", "District Name", "Industry Code",
    "Gender", "Medical Fee Region", "WCIO Cause of Injury Code",
    "WCIO Nature of Injury Code", "WCIO Part Of Body Code",
    "Cause of Injury", "Nature of Injury", "Part of Body", "Age Group",
    "Accident Season", "Accident Date_IsWeekend", "First Hearing Date", "Non-Zero Wage"]


Now we can encode our categorical features so that our models can make better use of them.

We decided to do the following encoding:  

### One Hot Encoding
- 'Carrier Type'
- 'District Name'
- 'Gender'
- 'Medical Fee Region'
- 'Cause of Injury'
- 'Nature of Injury'
- 'Part of Body'
- 'Age Group'
- 'Accident Season'

### Frequency Encoding
- 'Carrier Name'
- 'County of Injury'
- 'WCIO Cause of Injury Code'
- 'WCIO Nature of Injury Code'
- 'WCIO Part Of Body Code'
- 'Industry Code'

We use One-Hot Encoding for features with few categories, because applying it to features with many categories results in high dimensionality. For features with a larger number of unique values, we have elected to use Frequency Encoding instead.

<hr>
<div class="alert alert-block alert-info" style="font-size:12px" id="ohc">
    
## 4.1.2 One Hot Encoding
 
</div>

In [32]:
# Low-cardinality categorical columns that will be one-hot encoded
onehot_feats = [
    "Carrier Type",
    "District Name",
    "Gender",
    "Medical Fee Region",
    "Cause of Injury",
    "Nature of Injury",
    "Part of Body",
    "Age Group",
    "Accident Season",
]

In [33]:
# Fit the one-hot encoder on the training predictors only.
# drop="first" removes the first level of each feature; handle_unknown="ignore"
# lets transform() accept categories that were not seen during fitting.
ohc = OneHotEncoder(
    handle_unknown="ignore",
    sparse_output=False,
    drop="first",
)
ohc_feat_train = ohc.fit_transform(X_train[onehot_feats])
ohc_feat_names = ohc.get_feature_names_out(onehot_feats)

# Wrap the dense array in a DataFrame that keeps X_train's index
ohc_df = pd.DataFrame(
    data=ohc_feat_train,
    columns=ohc_feat_names,
    index=X_train.index,
)

# Preview the encoded training features
ohc_df.head()

Unnamed: 0_level_0,Carrier Type_2A. SIF,Carrier Type_3A. SELF PUBLIC,Carrier Type_4A. SELF PRIVATE,Carrier Type_5A. SPECIAL FUND - CONS. COMM. (SECT. 25-A),Carrier Type_5C. SPECIAL FUND - POI CARRIER WCB MENANDS,Carrier Type_5D. SPECIAL FUND - UNKNOWN,Carrier Type_UNKNOWN,District Name_BINGHAMTON,District Name_BUFFALO,District Name_HAUPPAUGE,...,Part of Body_7.0,Part of Body_8.0,Part of Body_9.0,Part of Body_10.0,Age Group_Elderly,Age Group_Senior,Age Group_Teen,Accident Season_Spring,Accident Season_Summer,Accident Season_Winter
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5785935,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
6090033,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6136197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6019545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5792247,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [34]:
# Encode the validation predictors with the encoder fitted above (transform only, no refit)
ohc_feat_val = ohc.transform(X_val[onehot_feats])
ohc_df_val = pd.DataFrame(
    data=ohc_feat_val,
    columns=ohc_feat_names,
    index=X_val.index,
)

# Preview the encoded validation features
ohc_df_val.head()

Unnamed: 0_level_0,Carrier Type_2A. SIF,Carrier Type_3A. SELF PUBLIC,Carrier Type_4A. SELF PRIVATE,Carrier Type_5A. SPECIAL FUND - CONS. COMM. (SECT. 25-A),Carrier Type_5C. SPECIAL FUND - POI CARRIER WCB MENANDS,Carrier Type_5D. SPECIAL FUND - UNKNOWN,Carrier Type_UNKNOWN,District Name_BINGHAMTON,District Name_BUFFALO,District Name_HAUPPAUGE,...,Part of Body_7.0,Part of Body_8.0,Part of Body_9.0,Part of Body_10.0,Age Group_Elderly,Age Group_Senior,Age Group_Teen,Accident Season_Spring,Accident Season_Summer,Accident Season_Winter
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5947783,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
6150876,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
5397365,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
6077399,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
5945251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [35]:
# Encode the test frame with the same fitted encoder (transform only, no refit)
ohc_feat_test = ohc.transform(df_test_stand[onehot_feats])
ohc_df_test = pd.DataFrame(
    data=ohc_feat_test,
    columns=ohc.get_feature_names_out(onehot_feats),
    index=df_test_stand.index,
)

# List the generated one-hot column names
ohc_df_test.columns

Index(['Carrier Type_2A. SIF', 'Carrier Type_3A. SELF PUBLIC',
       'Carrier Type_4A. SELF PRIVATE',
       'Carrier Type_5A. SPECIAL FUND - CONS. COMM. (SECT. 25-A)',
       'Carrier Type_5C. SPECIAL FUND - POI CARRIER WCB MENANDS',
       'Carrier Type_5D. SPECIAL FUND - UNKNOWN', 'Carrier Type_UNKNOWN',
       'District Name_BINGHAMTON', 'District Name_BUFFALO',
       'District Name_HAUPPAUGE', 'District Name_NYC',
       'District Name_ROCHESTER', 'District Name_STATEWIDE',
       'District Name_SYRACUSE', 'Gender_M', 'Gender_U', 'Gender_X',
       'Medical Fee Region_1', 'Medical Fee Region_2', 'Medical Fee Region_3',
       'Medical Fee Region_4', 'Cause of Injury_2.0', 'Cause of Injury_3.0',
       'Cause of Injury_4.0', 'Cause of Injury_5.0', 'Cause of Injury_6.0',
       'Cause of Injury_7.0', 'Cause of Injury_8.0', 'Cause of Injury_9.0',
       'Cause of Injury_10.0', 'Cause of Injury_11.0', 'Nature of Injury_2.0',
       'Nature of Injury_3.0', 'Nature of Injury_4.0', 'Na

In [36]:
# Repeat the one-hot encoding for the log-transformed data with a dedicated encoder.
# `ohc_log` is now the object that gets fitted and queried: the cell previously created
# it but then called `ohc.fit_transform`, which left `ohc_log` unused and silently
# refitted the encoder belonging to the non-log data.
ohc_log = OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore")
ohc_feat_log = ohc_log.fit_transform(X_train_log[onehot_feats])
ohc_feat_names_log = ohc_log.get_feature_names_out(onehot_feats)
ohc_df_log = pd.DataFrame(ohc_feat_log, index=X_train_log.index, columns=ohc_feat_names_log)

# Display the result
ohc_df_log.head()

Unnamed: 0_level_0,Carrier Type_2A. SIF,Carrier Type_3A. SELF PUBLIC,Carrier Type_4A. SELF PRIVATE,Carrier Type_5A. SPECIAL FUND - CONS. COMM. (SECT. 25-A),Carrier Type_5C. SPECIAL FUND - POI CARRIER WCB MENANDS,Carrier Type_5D. SPECIAL FUND - UNKNOWN,Carrier Type_UNKNOWN,District Name_BINGHAMTON,District Name_BUFFALO,District Name_HAUPPAUGE,...,Part of Body_7.0,Part of Body_8.0,Part of Body_9.0,Part of Body_10.0,Age Group_Elderly,Age Group_Senior,Age Group_Teen,Accident Season_Spring,Accident Season_Summer,Accident Season_Winter
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5785935,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
6090033,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6136197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6019545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5792247,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [37]:
# One-hot encode the log-transformed validation set with the fitted encoder and
# label the columns with the feature names obtained from the training step.
# NOTE(review): this transforms with `ohc` rather than `ohc_log` - confirm which
# fitted encoder is intended for the log-transformed data.
ohc_feat_val_log = ohc.transform(X_val_log[onehot_feats])
ohc_df_val_log = pd.DataFrame(
    data=ohc_feat_val_log,
    columns=ohc_feat_names_log,
    index=X_val_log.index,
)

# preview the encoded validation frame
ohc_df_val_log.head()

Unnamed: 0_level_0,Carrier Type_2A. SIF,Carrier Type_3A. SELF PUBLIC,Carrier Type_4A. SELF PRIVATE,Carrier Type_5A. SPECIAL FUND - CONS. COMM. (SECT. 25-A),Carrier Type_5C. SPECIAL FUND - POI CARRIER WCB MENANDS,Carrier Type_5D. SPECIAL FUND - UNKNOWN,Carrier Type_UNKNOWN,District Name_BINGHAMTON,District Name_BUFFALO,District Name_HAUPPAUGE,...,Part of Body_7.0,Part of Body_8.0,Part of Body_9.0,Part of Body_10.0,Age Group_Elderly,Age Group_Senior,Age Group_Teen,Accident Season_Spring,Accident Season_Summer,Accident Season_Winter
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5947783,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
6150876,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
5397365,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
6077399,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
5945251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [38]:
# One-hot encode the log-transformed test set with the fitted encoder; the
# column labels are read back from the encoder itself.
# NOTE(review): this transforms with `ohc` rather than `ohc_log` - confirm which
# fitted encoder is intended for the log-transformed data.
ohc_feat_test_log = ohc.transform(df_test_stand_log[onehot_feats])
ohc_df_test_log = pd.DataFrame(
    data=ohc_feat_test_log,
    columns=ohc.get_feature_names_out(onehot_feats),
    index=df_test_stand_log.index,
)

# preview the encoded test frame
ohc_df_test_log.head()


Unnamed: 0_level_0,Carrier Type_2A. SIF,Carrier Type_3A. SELF PUBLIC,Carrier Type_4A. SELF PRIVATE,Carrier Type_5A. SPECIAL FUND - CONS. COMM. (SECT. 25-A),Carrier Type_5C. SPECIAL FUND - POI CARRIER WCB MENANDS,Carrier Type_5D. SPECIAL FUND - UNKNOWN,Carrier Type_UNKNOWN,District Name_BINGHAMTON,District Name_BUFFALO,District Name_HAUPPAUGE,...,Part of Body_7.0,Part of Body_8.0,Part of Body_9.0,Part of Body_10.0,Age Group_Elderly,Age Group_Senior,Age Group_Teen,Accident Season_Spring,Accident Season_Summer,Accident Season_Winter
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6165911,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
6166141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
6165907,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
6166047,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
6166102,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


<hr>
<div class="alert alert-block alert-info" style="font-size:12px" id="freq">
    
# 4.1.3 Frequency Encoding
 
</a>

We chose the following features for frequency encoding for two reasons. Firstly, they have too many unique values for One-Hot encoding: applying it would result in extremely high dimensionality and sparsity in our data, which could negatively affect our models. Secondly, we suppose a relationship may exist between how frequently (or infrequently) the values in these features occur and the extremity of their associated claim injury type. For example, some of the most uncommon Parts of Body or Causes of Injury may lead to the most extreme claim types, which are also rare in our dataset. 

In [41]:
# Columns that are replaced by the number of times each of their values occurs
frequency_features = [
    'Carrier Name',
    'County of Injury',
    'WCIO Cause of Injury Code',
    'WCIO Nature of Injury Code',
    'WCIO Part Of Body Code',
    'Industry Code',
]

# X_train: map every value to its own occurrence count within X_train and
# store the result in a new "<feature>_freq" column
for feature in frequency_features:
    train_counts = X_train[feature].value_counts()
    freq_encoding_train = X_train[feature].map(train_counts)
    X_train[f"{feature}_freq"] = freq_encoding_train

# show the first rows, including the new *_freq columns
X_train.head()

Unnamed: 0_level_0,Age at Injury,Alternative Dispute Resolution,Attorney/Representative,Average Weekly Wage,Birth Year,C-3 Date,Carrier Name,Carrier Type,County of Injury,COVID-19 Indicator,...,Age_Wage_interaction,Accident Date in Days,Assembly Date in Days,C-2 Date in Days,Carrier Name_freq,County of Injury_freq,WCIO Cause of Injury Code_freq,WCIO Nature of Injury Code_freq,WCIO Part Of Body Code_freq,Industry Code_freq
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5785935,49.0,0,1,744.06,1971.0,1,PROPERTY AND CASUALTY,1A. PRIVATE,QUEENS,0,...,36458.94,21883,21888,21888,1766,46629,23432,28947,41394,8092
6090033,56.0,0,1,845.29,1966.0,1,EVEREST PREMIER INSURANCE,1A. PRIVATE,QUEENS,0,...,47336.24,22288,22295,22295,379,46629,13355,122775,4779,29870
6136197,48.0,0,0,1070.12,1974.0,0,CHARTER OAK FIRE INS CO,1A. PRIVATE,ROCKLAND,0,...,51365.76,22342,22353,22353,13681,7307,8696,88332,1587,29870
6019545,55.0,0,0,1070.12,1965.0,1,NEW HAMPSHIRE INSURANCE CO,1A. PRIVATE,WESTCHESTER,0,...,58856.6,21546,22203,22203,10220,20627,26496,2241,15052,24710
5792247,53.0,0,1,668.08,1968.0,1,STATE INSURANCE FUND,2A. SIF,ALBANY,0,...,35408.24,21891,21896,21896,88785,10946,23432,88332,10996,34843


In [42]:
# X_val: encode with the counts learned from X_train (never from X_val itself).
# Values that do not appear in X_train map to NaN and are filled with the mean
# of the X_train counts for that feature.
for feature in frequency_features:
    train_counts = X_train[feature].value_counts()
    freq_encoding_val = X_val[feature].map(train_counts).fillna(train_counts.mean())
    X_val[f"{feature}_freq"] = freq_encoding_val

X_val.head()

Unnamed: 0_level_0,Age at Injury,Alternative Dispute Resolution,Attorney/Representative,Average Weekly Wage,Birth Year,C-3 Date,Carrier Name,Carrier Type,County of Injury,COVID-19 Indicator,...,Age_Wage_interaction,Accident Date in Days,Assembly Date in Days,C-2 Date in Days,Carrier Name_freq,County of Injury_freq,WCIO Cause of Injury Code_freq,WCIO Nature of Injury Code_freq,WCIO Part Of Body Code_freq,Industry Code_freq
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5947783,51.0,0,0,1070.12,1970.0,1,CANTON CENTRAL SCHOOL DISTRICT,3A. SELF PUBLIC,ST. LAWRENCE,0,...,54576.12,22104,22112,22112,3.0,2654,9611,88332,33567,35509
6150876,61.0,0,1,1070.12,1961.0,0,STATE INSURANCE FUND,2A. SIF,GENESEE,0,...,65277.32,22367,22374,22374,88785.0,2180,37239,122775,30192,29870
5397365,50.0,0,1,1070.12,1961.0,0,COMMERCE AND INDUSTRY INS CO,1A. PRIVATE,SUFFOLK,0,...,53506.0,18178,21308,21307,101.0,48372,9611,44556,19942,96677
6077399,54.0,0,1,1502.5,1967.0,1,STATE INSURANCE FUND,2A. SIF,ULSTER,0,...,81135.0,22069,22281,22287,88785.0,4141,9646,88332,33567,96677
5945251,35.0,0,0,1070.12,1986.0,0,SAFETY NATIONAL CASUALTY CORP,1A. PRIVATE,KINGS,0,...,37454.2,22105,22106,22106,11119.0,42675,26496,122775,6505,96677


In [43]:
# df_test_stand: encode with the counts learned from X_train.
# Values that do not appear in X_train map to NaN and are filled with the mean
# of the X_train counts for that feature.
for feature in frequency_features:
    train_counts = X_train[feature].value_counts()
    freq_encoding_test = df_test_stand[feature].map(train_counts).fillna(train_counts.mean())
    df_test_stand[f"{feature}_freq"] = freq_encoding_test

df_test_stand.head()

Unnamed: 0_level_0,Age at Injury,Alternative Dispute Resolution,Attorney/Representative,Average Weekly Wage,Birth Year,C-3 Date,Carrier Name,Carrier Type,County of Injury,COVID-19 Indicator,...,Age_Wage_interaction,Accident Date in Days,Assembly Date in Days,C-2 Date in Days,Carrier Name_freq,County of Injury_freq,WCIO Cause of Injury Code_freq,WCIO Nature of Injury Code_freq,WCIO Part Of Body Code_freq,Industry Code_freq
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6165911,19,0,0,1070.12,2003.0,0,INDEMNITY INSURANCE CO OF,1A. PRIVATE,BRONX,0,...,20332.28,22389,22398,22398,7250.0,31653,25473,88332,9297,29870
6166141,19,0,0,1070.12,2003.0,0,A I U INSURANCE COMPANY,1A. PRIVATE,QUEENS,0,...,20332.28,22355,22398,22398,8878.0,46629,14617,88332,6330,14494
6165907,59,0,0,1070.12,1963.0,0,AMGUARD INSURANCE COMPANY,1A. PRIVATE,WESTCHESTER,0,...,63137.08,22391,22398,22396,151.0,20627,8696,44556,929,16950
6166047,55,0,0,1070.12,1967.0,0,INDEMNITY INS. OF N AMERICA,1A. PRIVATE,QUEENS,0,...,58856.6,22393,22398,22398,11545.0,46629,9611,88332,38455,29870
6166102,25,0,0,1070.12,1997.0,0,NEW HAMPSHIRE INSURANCE CO,1A. PRIVATE,KINGS,0,...,26753.0,22385,22398,22396,10220.0,42675,9934,37606,9962,287


In [44]:
# Same frequency encoding, now for the log-transformed data frames.
# X_train_log: map every value to its occurrence count within X_train_log.
for feature in frequency_features:
    train_log_counts = X_train_log[feature].value_counts()
    freq_encoding_train_log = X_train_log[feature].map(train_log_counts)
    X_train_log[f"{feature}_freq"] = freq_encoding_train_log

# show the first rows, including the new *_freq columns
X_train_log.head()

Unnamed: 0_level_0,Age at Injury,Alternative Dispute Resolution,Attorney/Representative,Birth Year,C-3 Date,Carrier Name,Carrier Type,County of Injury,COVID-19 Indicator,District Name,...,Log Average Weekly Wage,Log Age_Wage_interaction,Log Accident to Assembly Days,Log Accident to C-2 Days,Carrier Name_freq,County of Injury_freq,WCIO Cause of Injury Code_freq,WCIO Nature of Injury Code_freq,WCIO Part Of Body Code_freq,Industry Code_freq
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5785935,49.0,0,1,1971.0,1,PROPERTY AND CASUALTY,1A. PRIVATE,QUEENS,0,NYC,...,6.613465,10.503969,1.79176,1.79176,1766,46629,23432,28947,41394,8092
6090033,56.0,0,1,1966.0,1,EVEREST PREMIER INSURANCE,1A. PRIVATE,QUEENS,0,NYC,...,6.740862,10.765053,2.079442,2.079442,379,46629,13355,122775,4779,29870
6136197,48.0,0,0,1974.0,0,CHARTER OAK FIRE INS CO,1A. PRIVATE,ROCKLAND,0,NYC,...,6.97646,10.846747,2.484907,2.484907,13681,7307,8696,88332,1587,29870
6019545,55.0,0,0,1965.0,1,NEW HAMPSHIRE INSURANCE CO,1A. PRIVATE,WESTCHESTER,0,NYC,...,6.97646,10.982876,6.489205,6.489205,10220,20627,26496,2241,15052,24710
5792247,53.0,0,1,1968.0,1,STATE INSURANCE FUND,2A. SIF,ALBANY,0,ALBANY,...,6.505904,10.474728,1.79176,1.79176,88785,10946,23432,88332,10996,34843


In [45]:
# X_val_log: encode with the counts learned from X_train_log.
# Values that do not appear in X_train_log map to NaN and are filled with the
# mean of the X_train_log counts for that feature.
for feature in frequency_features:
    train_log_counts = X_train_log[feature].value_counts()
    freq_encoding_val_log = X_val_log[feature].map(train_log_counts).fillna(train_log_counts.mean())
    X_val_log[f"{feature}_freq"] = freq_encoding_val_log

X_val_log.head()

Unnamed: 0_level_0,Age at Injury,Alternative Dispute Resolution,Attorney/Representative,Birth Year,C-3 Date,Carrier Name,Carrier Type,County of Injury,COVID-19 Indicator,District Name,...,Log Average Weekly Wage,Log Age_Wage_interaction,Log Accident to Assembly Days,Log Accident to C-2 Days,Carrier Name_freq,County of Injury_freq,WCIO Cause of Injury Code_freq,WCIO Nature of Injury Code_freq,WCIO Part Of Body Code_freq,Industry Code_freq
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5947783,51.0,0,0,1970.0,1,CANTON CENTRAL SCHOOL DISTRICT,3A. SELF PUBLIC,ST. LAWRENCE,0,SYRACUSE,...,6.97646,10.90737,2.197225,2.197225,3.0,2654,9611,88332,33567,35509
6150876,61.0,0,1,1961.0,0,STATE INSURANCE FUND,2A. SIF,GENESEE,0,ROCHESTER,...,6.97646,11.086415,2.079442,2.079442,88785.0,2180,37239,122775,30192,29870
5397365,50.0,0,1,1961.0,0,COMMERCE AND INDUSTRY INS CO,1A. PRIVATE,SUFFOLK,0,HAUPPAUGE,...,6.97646,10.887568,8.049108,8.048788,101.0,48372,9611,44556,19942,96677
6077399,54.0,0,1,1967.0,1,STATE INSURANCE FUND,2A. SIF,ULSTER,0,NYC,...,7.315551,11.303882,5.361292,5.389072,88785.0,4141,9646,88332,33567,96677
5945251,35.0,0,0,1986.0,0,SAFETY NATIONAL CASUALTY CORP,1A. PRIVATE,KINGS,0,NYC,...,6.97646,10.530901,0.693148,0.693148,11119.0,42675,26496,122775,6505,96677


In [46]:
# df_test_stand_log: encode with the counts learned from X_train_log.
# Values that do not appear in X_train_log map to NaN and are filled with the
# mean of the X_train_log counts for that feature.
for feature in frequency_features:
    train_log_counts = X_train_log[feature].value_counts()
    freq_encoding_test_log = df_test_stand_log[feature].map(train_log_counts).fillna(train_log_counts.mean())
    df_test_stand_log[f"{feature}_freq"] = freq_encoding_test_log

df_test_stand_log.head()

Unnamed: 0_level_0,Age at Injury,Alternative Dispute Resolution,Attorney/Representative,Birth Year,C-3 Date,Carrier Name,Carrier Type,County of Injury,COVID-19 Indicator,District Name,...,Log Average Weekly Wage,Log Age_Wage_interaction,Log Accident to Assembly Days,Log Accident to C-2 Days,Carrier Name_freq,County of Injury_freq,WCIO Cause of Injury Code_freq,WCIO Nature of Injury Code_freq,WCIO Part Of Body Code_freq,Industry Code_freq
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6165911,19,0,0,2003.0,0,INDEMNITY INSURANCE CO OF,1A. PRIVATE,BRONX,0,NYC,...,6.97646,9.920014,2.302585,2.302585,7250.0,31653,25473,88332,9297,29870
6166141,19,0,0,2003.0,0,A I U INSURANCE COMPANY,1A. PRIVATE,QUEENS,0,NYC,...,6.97646,9.920014,3.78419,3.78419,8878.0,46629,14617,88332,6330,14494
6165907,59,0,0,1963.0,0,AMGUARD INSURANCE COMPANY,1A. PRIVATE,WESTCHESTER,0,NYC,...,6.97646,11.053079,2.079442,1.79176,151.0,20627,8696,44556,929,16950
6166047,55,0,0,1967.0,0,INDEMNITY INS. OF N AMERICA,1A. PRIVATE,QUEENS,0,NYC,...,6.97646,10.982876,1.79176,1.79176,11545.0,46629,9611,88332,38455,29870
6166102,25,0,0,1997.0,0,NEW HAMPSHIRE INSURANCE CO,1A. PRIVATE,KINGS,0,NYC,...,6.97646,10.194439,2.639057,2.484907,10220.0,42675,9934,37606,9962,287


In [47]:
# Remove the raw (un-encoded) frequency features from every data frame, in place:
# first the regular frames, then their log-transformed counterparts.
for frame in (X_train, X_val, df_test_stand,
              X_train_log, X_val_log, df_test_stand_log):
    frame.drop(columns=frequency_features, inplace=True)

In [48]:
#checking to make sure the un-encoded frequency features were dropped correctly from X_train
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 459181 entries, 5785935 to 6027959
Data columns (total 33 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   Age at Injury                    459181 non-null  float64
 1   Alternative Dispute Resolution   459181 non-null  int64  
 2   Attorney/Representative          459181 non-null  int64  
 3   Average Weekly Wage              459181 non-null  float64
 4   Birth Year                       459181 non-null  float64
 5   C-3 Date                         459181 non-null  int64  
 6   Carrier Type                     459181 non-null  object 
 7   COVID-19 Indicator               459181 non-null  int64  
 8   District Name                    459181 non-null  object 
 9   First Hearing Date               459181 non-null  int64  
 10  Gender                           459181 non-null  object 
 11  IME-4 Count                      459181 non-null  float64
 12  

In [49]:
#same check for X_val: the un-encoded frequency features should no longer be listed
X_val.info()

<class 'pandas.core.frame.DataFrame'>
Index: 114796 entries, 5947783 to 6076050
Data columns (total 33 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   Age at Injury                    114796 non-null  float64
 1   Alternative Dispute Resolution   114796 non-null  int64  
 2   Attorney/Representative          114796 non-null  int64  
 3   Average Weekly Wage              114796 non-null  float64
 4   Birth Year                       114796 non-null  float64
 5   C-3 Date                         114796 non-null  int64  
 6   Carrier Type                     114796 non-null  object 
 7   COVID-19 Indicator               114796 non-null  int64  
 8   District Name                    114796 non-null  object 
 9   First Hearing Date               114796 non-null  int64  
 10  Gender                           114796 non-null  object 
 11  IME-4 Count                      114796 non-null  float64
 12  

In [50]:
#same check for df_test_stand: the un-encoded frequency features should no longer be listed
df_test_stand.info()

<class 'pandas.core.frame.DataFrame'>
Index: 387975 entries, 6165911 to 6553594
Data columns (total 33 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   Age at Injury                    387975 non-null  int64  
 1   Alternative Dispute Resolution   387975 non-null  int64  
 2   Attorney/Representative          387975 non-null  int64  
 3   Average Weekly Wage              387975 non-null  float64
 4   Birth Year                       387975 non-null  float64
 5   C-3 Date                         387975 non-null  int64  
 6   Carrier Type                     387975 non-null  object 
 7   COVID-19 Indicator               387975 non-null  int64  
 8   District Name                    387975 non-null  object 
 9   First Hearing Date               387975 non-null  int64  
 10  Gender                           387975 non-null  object 
 11  IME-4 Count                      387975 non-null  float64
 12  

In [51]:
# Append the one-hot encoded columns to the remaining columns (column-wise,
# aligned on the index) so that each data frame is fully encoded.
X_train = pd.concat((X_train, ohc_df), axis="columns")
X_val = pd.concat((X_val, ohc_df_val), axis="columns")
df_test_stand = pd.concat((df_test_stand, ohc_df_test), axis="columns")

# same step for the log-transformed data frames
X_train_log = pd.concat((X_train_log, ohc_df_log), axis="columns")
X_val_log = pd.concat((X_val_log, ohc_df_val_log), axis="columns")
df_test_stand_log = pd.concat((df_test_stand_log, ohc_df_test_log), axis="columns")

In [52]:
# Remove the original (non-encoded) one-hot source columns from every data
# frame, in place: first the regular frames, then the log-transformed ones.
for frame in (X_train, X_val, df_test_stand,
              X_train_log, X_val_log, df_test_stand_log):
    frame.drop(columns=onehot_feats, inplace=True)

In [53]:
#inspecting X_train after joining the one-hot columns and dropping the original categorical features
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 459181 entries, 5785935 to 6027959
Data columns (total 82 columns):
 #   Column                                                    Non-Null Count   Dtype  
---  ------                                                    --------------   -----  
 0   Age at Injury                                             459181 non-null  float64
 1   Alternative Dispute Resolution                            459181 non-null  int64  
 2   Attorney/Representative                                   459181 non-null  int64  
 3   Average Weekly Wage                                       459181 non-null  float64
 4   Birth Year                                                459181 non-null  float64
 5   C-3 Date                                                  459181 non-null  int64  
 6   COVID-19 Indicator                                        459181 non-null  int64  
 7   First Hearing Date                                        459181 non-null  int64  
 8   IM

In [54]:
#inspecting X_val after joining the one-hot columns and dropping the original categorical features
X_val.info()

<class 'pandas.core.frame.DataFrame'>
Index: 114796 entries, 5947783 to 6076050
Data columns (total 82 columns):
 #   Column                                                    Non-Null Count   Dtype  
---  ------                                                    --------------   -----  
 0   Age at Injury                                             114796 non-null  float64
 1   Alternative Dispute Resolution                            114796 non-null  int64  
 2   Attorney/Representative                                   114796 non-null  int64  
 3   Average Weekly Wage                                       114796 non-null  float64
 4   Birth Year                                                114796 non-null  float64
 5   C-3 Date                                                  114796 non-null  int64  
 6   COVID-19 Indicator                                        114796 non-null  int64  
 7   First Hearing Date                                        114796 non-null  int64  
 8   IM

In [55]:
#inspecting df_test_stand after joining the one-hot columns and dropping the original categorical features
df_test_stand.info()

<class 'pandas.core.frame.DataFrame'>
Index: 387975 entries, 6165911 to 6553594
Data columns (total 82 columns):
 #   Column                                                    Non-Null Count   Dtype  
---  ------                                                    --------------   -----  
 0   Age at Injury                                             387975 non-null  int64  
 1   Alternative Dispute Resolution                            387975 non-null  int64  
 2   Attorney/Representative                                   387975 non-null  int64  
 3   Average Weekly Wage                                       387975 non-null  float64
 4   Birth Year                                                387975 non-null  float64
 5   C-3 Date                                                  387975 non-null  int64  
 6   COVID-19 Indicator                                        387975 non-null  int64  
 7   First Hearing Date                                        387975 non-null  int64  
 8   IM

In [56]:
#inspecting X_train_log after joining the one-hot columns and dropping the original categorical features
X_train_log.info()

<class 'pandas.core.frame.DataFrame'>
Index: 459181 entries, 5785935 to 6027959
Data columns (total 82 columns):
 #   Column                                                    Non-Null Count   Dtype  
---  ------                                                    --------------   -----  
 0   Age at Injury                                             459181 non-null  float64
 1   Alternative Dispute Resolution                            459181 non-null  int64  
 2   Attorney/Representative                                   459181 non-null  int64  
 3   Birth Year                                                459181 non-null  float64
 4   C-3 Date                                                  459181 non-null  int64  
 5   COVID-19 Indicator                                        459181 non-null  int64  
 6   First Hearing Date                                        459181 non-null  int64  
 7   IME-4 Count                                               459181 non-null  float64
 8   Nu

In [57]:
#inspecting X_val_log after joining the one-hot columns and dropping the original categorical features
X_val_log.info()

<class 'pandas.core.frame.DataFrame'>
Index: 114796 entries, 5947783 to 6076050
Data columns (total 82 columns):
 #   Column                                                    Non-Null Count   Dtype  
---  ------                                                    --------------   -----  
 0   Age at Injury                                             114796 non-null  float64
 1   Alternative Dispute Resolution                            114796 non-null  int64  
 2   Attorney/Representative                                   114796 non-null  int64  
 3   Birth Year                                                114796 non-null  float64
 4   C-3 Date                                                  114796 non-null  int64  
 5   COVID-19 Indicator                                        114796 non-null  int64  
 6   First Hearing Date                                        114796 non-null  int64  
 7   IME-4 Count                                               114796 non-null  float64
 8   Nu

In [58]:
#inspecting df_test_stand_log after joining the one-hot columns and dropping the original categorical features
df_test_stand_log.info()

<class 'pandas.core.frame.DataFrame'>
Index: 387975 entries, 6165911 to 6553594
Data columns (total 82 columns):
 #   Column                                                    Non-Null Count   Dtype  
---  ------                                                    --------------   -----  
 0   Age at Injury                                             387975 non-null  int64  
 1   Alternative Dispute Resolution                            387975 non-null  int64  
 2   Attorney/Representative                                   387975 non-null  int64  
 3   Birth Year                                                387975 non-null  float64
 4   C-3 Date                                                  387975 non-null  int64  
 5   COVID-19 Indicator                                        387975 non-null  int64  
 6   First Hearing Date                                        387975 non-null  int64  
 7   IME-4 Count                                               387975 non-null  float64
 8   Nu

We now have all datasets with fully numerical datatypes. We can proceed with scaling.

<hr>
<div class="alert alert-block alert-info" style="font-size:12px" id="scaling">
    
# 4.2 Scaling
 
</a>

We now have encoded all categorical features into numerical datatypes. We will include the features we used Frequency Encoding for in our scaling. The One-Hot encoded features do not need to be scaled, as they are already either 0 or 1. Below we specify the columns to be scaled.

In [62]:
# Columns that are passed to the scalers for the regular (non-log) data frames:
# the numerical features followed by the frequency-encoded features.
feats_to_scale = [
    "Age at Injury",
    "Average Weekly Wage",
    "Birth Year",
    "IME-4 Count",
    "Number of Dependents",
    "Accident to Assembly Days",
    "Accident to C-2 Days",
    "Accident Date in Days",
    "Assembly Date in Days",
    "C-2 Date in Days",
    "Age_Wage_interaction",
    'Carrier Name_freq',
    'County of Injury_freq',
    'WCIO Cause of Injury Code_freq',
    'WCIO Nature of Injury Code_freq',
    'WCIO Part Of Body Code_freq',
    'Industry Code_freq',
]

# Features whose column name carries a "Log " prefix in the log-transformed frames
log_prefixed_feats = {
    "Average Weekly Wage",
    "Accident to Assembly Days",
    "Accident to C-2 Days",
    "Age_Wage_interaction",
}

# Same columns, in the same order, for the log-transformed data frames:
# only the four features above are renamed, everything else is shared.
feats_to_scale_log = [
    "Log " + col if col in log_prefixed_feats else col
    for col in feats_to_scale
]

In [63]:
# Keep only the columns that need scaling, for each regular data frame
X_train_scale = X_train.loc[:, feats_to_scale]
X_val_scale = X_val.loc[:, feats_to_scale]
df_test_stand_scale = df_test_stand.loc[:, feats_to_scale]

# ...and the corresponding columns for each log-transformed data frame
X_train_log_scale = X_train_log.loc[:, feats_to_scale_log]
X_val_log_scale = X_val_log.loc[:, feats_to_scale_log]
df_test_stand_log_scale = df_test_stand_log.loc[:, feats_to_scale_log]

<hr>
<div class="alert alert-block alert-info" style="font-size:12px" id="minmax">
    
## 4.2.1 MinMaxScaler
 
</div>

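MinMaxScaler rescales each feature to the [0, 1] range using the minimum and maximum observed in the training data. It can also be applied to categorical features that have been encoded numerically, for example ordinal features.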

In [66]:
# The scaler is fitted on the training split only; validation and test are
# transformed with those same training-derived parameters.
scaler = MinMaxScaler()

# Training split: fit + transform, then rebuild a DataFrame on the original row index
X_train_scale_minmax = pd.DataFrame(
    scaler.fit_transform(X_train_scale),
    index=X_train.index,
    columns=X_train_scale.columns,
)

# Validation split: put the columns in the training order before transforming
X_val_scale = X_val_scale[X_train_scale.columns]
X_val_scale_minmax = pd.DataFrame(
    scaler.transform(X_val_scale),
    index=X_val.index,
    columns=X_val_scale.columns,
)

# Test split: same column alignment, same fitted scaler
df_test_stand_scale = df_test_stand_scale[X_train_scale.columns]
df_test_stand_scale_minmax = pd.DataFrame(
    scaler.transform(df_test_stand_scale),
    index=df_test_stand.index,
    columns=df_test_stand_scale.columns,
)

In [67]:
# Preview the first rows of the MinMax-scaled training features (original data)
X_train_scale_minmax.head()

Unnamed: 0_level_0,Age at Injury,Average Weekly Wage,Birth Year,IME-4 Count,Number of Dependents,Accident to Assembly Days,Accident to C-2 Days,Accident Date in Days,Assembly Date in Days,C-2 Date in Days,Age_Wage_interaction,Carrier Name_freq,County of Injury_freq,WCIO Cause of Injury Code_freq,WCIO Nature of Injury Code_freq,WCIO Part Of Body Code_freq,Industry Code_freq
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
5785935,0.523077,0.104511,0.5,0.057143,1.0,0.000595,0.000586,0.91137,0.431935,0.666233,0.064553,0.01988,0.963886,0.629123,0.235767,1.0,0.080973
6090033,0.630769,0.119001,0.430556,0.2,0.833333,0.000833,0.00082,0.95817,0.73142,0.798634,0.083988,0.004258,0.963886,0.35844,1.0,0.114575,0.306909
6136197,0.507692,0.151184,0.541667,0.0,0.333333,0.00131,0.001289,0.96441,0.774099,0.817502,0.091188,0.154082,0.149159,0.233292,0.71946,0.037385,0.306909
6019545,0.615385,0.151184,0.416667,0.0,0.333333,0.078224,0.077004,0.872429,0.663723,0.768705,0.104572,0.1151,0.425141,0.711427,0.018245,0.362997,0.253377
5792247,0.584615,0.093635,0.458333,0.0,1.0,0.000595,0.000586,0.912295,0.437822,0.668835,0.062675,1.0,0.224557,0.629123,0.71946,0.264914,0.358502


In [68]:
# Preview the first rows of the MinMax-scaled validation features (original data)
X_val_scale_minmax.head()

Unnamed: 0_level_0,Age at Injury,Average Weekly Wage,Birth Year,IME-4 Count,Number of Dependents,Accident to Assembly Days,Accident to C-2 Days,Accident Date in Days,Assembly Date in Days,C-2 Date in Days,Age_Wage_interaction,Carrier Name_freq,County of Injury_freq,WCIO Cause of Injury Code_freq,WCIO Nature of Injury Code_freq,WCIO Part Of Body Code_freq,Industry Code_freq
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
5947783,0.553846,0.151184,0.486111,0.0,0.333333,0.000952,0.000938,0.936908,0.596762,0.739102,0.096924,2.3e-05,0.052752,0.25787,0.71946,0.810727,0.365411
6150876,0.707692,0.151184,0.361111,0.028571,0.333333,0.000833,0.00082,0.967298,0.789551,0.824333,0.116044,1.0,0.042931,1.0,1.0,0.729113,0.306909
5397365,0.538462,0.151184,0.361111,0.0,0.166667,0.372663,0.366737,0.483245,0.005151,0.477228,0.095012,0.001126,1.0,0.25787,0.362903,0.481247,1.0
6077399,0.6,0.213076,0.444444,0.142857,0.0,0.025241,0.025551,0.932863,0.721118,0.796031,0.144379,1.0,0.083561,0.258811,0.71946,0.810727,1.0
5945251,0.307692,0.151184,0.708333,0.0,0.0,0.000119,0.000117,0.937023,0.592347,0.73715,0.066331,0.125225,0.881962,0.711427,1.0,0.156313,1.0


In [69]:
# Preview the first rows of the MinMax-scaled test features (original data)
df_test_stand_scale_minmax.head()

Unnamed: 0_level_0,Age at Injury,Average Weekly Wage,Birth Year,IME-4 Count,Number of Dependents,Accident to Assembly Days,Accident to C-2 Days,Accident Date in Days,Assembly Date in Days,C-2 Date in Days,Age_Wage_interaction,Carrier Name_freq,County of Injury_freq,WCIO Cause of Injury Code_freq,WCIO Nature of Injury Code_freq,WCIO Part Of Body Code_freq,Industry Code_freq
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
6165911,0.061538,0.151184,0.944444,0.0,0.166667,0.001072,0.001055,0.969841,0.807211,0.832141,0.035738,0.081648,0.653593,0.683948,0.71946,0.223829,0.306909
6166141,0.061538,0.151184,0.944444,0.0,0.166667,0.00512,0.00504,0.965912,0.807211,0.832141,0.035738,0.099984,0.963886,0.392339,0.71946,0.152081,0.147391
6165907,0.676923,0.151184,0.388889,0.0,0.0,0.000833,0.000586,0.970072,0.807211,0.83149,0.11222,0.001689,0.425141,0.233292,0.362903,0.021474,0.172871
6166047,0.615385,0.151184,0.444444,0.0,1.0,0.000595,0.000586,0.970303,0.807211,0.832141,0.104572,0.130023,0.963886,0.25787,0.71946,0.928929,0.306909
6166102,0.153846,0.151184,0.861111,0.0,0.833333,0.001548,0.001289,0.969378,0.807211,0.83149,0.04721,0.1151,0.881962,0.266547,0.306294,0.23991,0.0


In [70]:
# Refit the existing `scaler` on the log-transformed training features.
# This replaces whatever the scaler had learned before, so from here on
# `scaler` holds the parameters of the log feature set.
X_train_log_scale_minmax = pd.DataFrame(
    scaler.fit_transform(X_train_log_scale),
    index=X_train_log.index,
    columns=X_train_log_scale.columns,
)

# Validation split: put the columns in the training order before transforming
X_val_log_scale = X_val_log_scale[X_train_log_scale.columns]
X_val_log_scale_minmax = pd.DataFrame(
    scaler.transform(X_val_log_scale),
    index=X_val_log.index,
    columns=X_val_log_scale.columns,
)

# Test split: same column alignment, same fitted scaler
df_test_stand_log_scale = df_test_stand_log_scale[X_train_log_scale.columns]
df_test_stand_log_scale_minmax = pd.DataFrame(
    scaler.transform(df_test_stand_log_scale),
    index=df_test_stand_log.index,
    columns=df_test_stand_log_scale.columns,
)

In [71]:
# Preview the first rows of the MinMax-scaled training features (log-transformed data)
X_train_log_scale_minmax.head()

Unnamed: 0_level_0,Age at Injury,Log Average Weekly Wage,Birth Year,IME-4 Count,Number of Dependents,Log Accident to Assembly Days,Log Accident to C-2 Days,Accident Date in Days,Assembly Date in Days,C-2 Date in Days,Log Age_Wage_interaction,Carrier Name_freq,County of Injury_freq,WCIO Cause of Injury Code_freq,WCIO Nature of Injury Code_freq,WCIO Part Of Body Code_freq,Industry Code_freq
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
5785935,0.523077,0.635703,0.5,0.057143,1.0,0.198291,0.197947,0.91137,0.431935,0.666233,0.632398,0.01988,0.963886,0.629123,0.235767,1.0,0.080973
6090033,0.630769,0.656418,0.430556,0.2,0.833333,0.230129,0.229729,0.95817,0.73142,0.798634,0.667531,0.004258,0.963886,0.35844,1.0,0.114575,0.306909
6136197,0.507692,0.694729,0.541667,0.0,0.333333,0.275001,0.274524,0.96441,0.774099,0.817502,0.678524,0.154082,0.149159,0.233292,0.71946,0.037385,0.306909
6019545,0.615385,0.694729,0.416667,0.0,0.333333,0.718151,0.716905,0.872429,0.663723,0.768705,0.696843,0.1151,0.425141,0.711427,0.018245,0.362997,0.253377
5792247,0.584615,0.618212,0.458333,0.0,1.0,0.198291,0.197947,0.912295,0.437822,0.668835,0.628463,1.0,0.224557,0.629123,0.71946,0.264914,0.358502


In [72]:
# Preview the first rows of the MinMax-scaled validation features (log-transformed data)
X_val_log_scale_minmax.head()

Unnamed: 0_level_0,Age at Injury,Log Average Weekly Wage,Birth Year,IME-4 Count,Number of Dependents,Log Accident to Assembly Days,Log Accident to C-2 Days,Accident Date in Days,Assembly Date in Days,C-2 Date in Days,Log Age_Wage_interaction,Carrier Name_freq,County of Injury_freq,WCIO Cause of Injury Code_freq,WCIO Nature of Injury Code_freq,WCIO Part Of Body Code_freq,Industry Code_freq
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
5947783,0.553846,0.694729,0.486111,0.0,0.333333,0.243164,0.242742,0.936908,0.596762,0.739102,0.686682,2.3e-05,0.052752,0.25787,0.71946,0.810727,0.365411
6150876,0.707692,0.694729,0.361111,0.028571,0.333333,0.230129,0.229729,0.967298,0.789551,0.824333,0.710776,1.0,0.042931,1.0,1.0,0.729113,0.306909
5397365,0.538462,0.694729,0.361111,0.0,0.166667,0.890783,0.889202,0.483245,0.005151,0.477228,0.684018,0.001126,1.0,0.25787,0.362903,0.481247,1.0
6077399,0.6,0.749867,0.444444,0.142857,0.0,0.593327,0.595366,0.932863,0.721118,0.796031,0.74004,1.0,0.083561,0.258811,0.71946,0.810727,1.0
5945251,0.307692,0.694729,0.708333,0.0,0.0,0.07671,0.076576,0.937023,0.592347,0.73715,0.636022,0.125225,0.881962,0.711427,1.0,0.156313,1.0


In [73]:
# Preview the first rows of the MinMax-scaled test features (log-transformed data)
df_test_stand_log_scale_minmax.head()

Unnamed: 0_level_0,Age at Injury,Log Average Weekly Wage,Birth Year,IME-4 Count,Number of Dependents,Log Accident to Assembly Days,Log Accident to C-2 Days,Accident Date in Days,Assembly Date in Days,C-2 Date in Days,Log Age_Wage_interaction,Carrier Name_freq,County of Injury_freq,WCIO Cause of Injury Code_freq,WCIO Nature of Injury Code_freq,WCIO Part Of Body Code_freq,Industry Code_freq
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
6165911,0.061538,0.694729,0.944444,0.0,0.166667,0.254824,0.254382,0.969841,0.807211,0.832141,0.553816,0.081648,0.653593,0.683948,0.71946,0.223829,0.306909
6166141,0.061538,0.694729,0.944444,0.0,0.166667,0.418791,0.418064,0.965912,0.807211,0.832141,0.553816,0.099984,0.963886,0.392339,0.71946,0.152081,0.147391
6165907,0.676923,0.694729,0.388889,0.0,0.0,0.230129,0.197947,0.970072,0.807211,0.83149,0.70629,0.001689,0.425141,0.233292,0.362903,0.021474,0.172871
6166047,0.615385,0.694729,0.444444,0.0,1.0,0.198291,0.197947,0.970303,0.807211,0.832141,0.696843,0.130023,0.963886,0.25787,0.71946,0.928929,0.306909
6166102,0.153846,0.694729,0.861111,0.0,0.833333,0.292061,0.274524,0.969378,0.807211,0.83149,0.590745,0.1151,0.881962,0.266547,0.306294,0.23991,0.0


<hr>
<div class="alert alert-block alert-info" style="font-size:12px" id="standard">
    
## 4.2.2 Standard Scaling
 
</div>

In [75]:
# The standard scaler is fitted on the training split only; validation and
# test are transformed with those same training-derived statistics.
scaler2 = StandardScaler()

# Training split: fit + transform, then rebuild a DataFrame on the original row index
X_train_scale_standard = pd.DataFrame(
    scaler2.fit_transform(X_train_scale),
    index=X_train.index,
    columns=X_train_scale.columns,
)

# Validation split: put the columns in the training order before transforming
X_val_scale = X_val_scale[X_train_scale.columns]
X_val_scale_standard = pd.DataFrame(
    scaler2.transform(X_val_scale),
    index=X_val.index,
    columns=X_val_scale.columns,
)

# Test split: same column alignment, same fitted scaler
df_test_stand_scale = df_test_stand_scale[X_train_scale.columns]
df_test_stand_scale_standard = pd.DataFrame(
    scaler2.transform(df_test_stand_scale),
    index=df_test_stand.index,
    columns=df_test_stand_scale.columns,
)

In [76]:
# Preview the first rows of the standard-scaled training features (original data)
X_train_scale_standard.head()

Unnamed: 0_level_0,Age at Injury,Average Weekly Wage,Birth Year,IME-4 Count,Number of Dependents,Accident to Assembly Days,Accident to C-2 Days,Accident Date in Days,Assembly Date in Days,C-2 Date in Days,Age_Wage_interaction,Carrier Name_freq,County of Injury_freq,WCIO Cause of Injury Code_freq,WCIO Nature of Injury Code_freq,WCIO Part Of Body Code_freq,Industry Code_freq
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
5785935,0.470015,-0.849727,-0.495021,0.654832,1.496936,-0.140736,-0.154959,0.152809,0.063685,0.052653,-0.442276,-0.574925,1.278489,0.700865,-0.734805,1.619014,-1.115865
6090033,0.981947,-0.623731,-0.857547,3.257614,0.997089,-0.136899,-0.150888,0.815637,1.358342,1.295032,-0.041366,-0.61633,1.278489,-0.255519,1.332373,-1.194773,-0.475842
6136197,0.396882,-0.121798,-0.277506,-0.386281,-0.50245,-0.129224,-0.142745,0.904014,1.542838,1.472078,0.107152,-0.219231,-1.071316,-0.697693,0.573539,-1.440072,-0.475842
6019545,0.908814,-0.121798,-0.930052,-0.386281,-0.50245,1.110281,1.172337,-0.39873,1.065692,1.014199,0.383245,-0.322551,-0.275339,0.991661,-1.32318,-0.405314,-0.627487
5792247,0.762548,-1.019353,-0.712536,-0.386281,1.496936,-0.140736,-0.154959,0.165902,0.089133,0.077073,-0.481002,2.022815,-0.853856,0.700865,0.573539,-0.717009,-0.329693


In [77]:
# Preview the first rows of the standard-scaled validation features (original data)
X_val_scale_standard.head()

Unnamed: 0_level_0,Age at Injury,Average Weekly Wage,Birth Year,IME-4 Count,Number of Dependents,Accident to Assembly Days,Accident to C-2 Days,Accident Date in Days,Assembly Date in Days,C-2 Date in Days,Age_Wage_interaction,Carrier Name_freq,County of Injury_freq,WCIO Cause of Injury Code_freq,WCIO Nature of Injury Code_freq,WCIO Part Of Body Code_freq,Industry Code_freq
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
5947783,0.616281,-0.121798,-0.567526,-0.386281,-0.50245,-0.13498,-0.148852,0.5145,0.776223,0.736419,0.225478,-0.627555,-1.34937,-0.610852,0.573539,1.017525,-0.31012
6150876,1.347612,-0.121798,-1.220073,0.134276,-0.50245,-0.136899,-0.150888,0.944929,1.609639,1.536181,0.619897,2.022815,-1.377695,2.011253,1.332373,0.758164,-0.475842
5397365,0.543148,-0.121798,-1.220073,-0.386281,-1.002297,5.85532,6.204664,-5.910842,-1.781281,-1.720865,0.186036,-0.624629,1.382648,-0.610852,-0.390914,-0.029528,1.487517
6077399,0.835681,0.843491,-0.785042,2.216502,-1.502144,0.256443,0.278651,0.457219,1.313808,1.270612,1.204371,2.022815,-1.26051,-0.607531,0.573539,1.017525,1.487517
5945251,-0.553848,-0.121798,0.592556,-0.386281,-1.502144,-0.148411,-0.163102,0.516137,0.757138,0.718104,-0.405593,-0.295714,1.042206,0.991661,1.332373,-1.062133,1.487517


In [78]:
# Preview the first rows of the standard-scaled test features (original data)
df_test_stand_scale_standard.head()

Unnamed: 0_level_0,Age at Injury,Average Weekly Wage,Birth Year,IME-4 Count,Number of Dependents,Accident to Assembly Days,Accident to C-2 Days,Accident Date in Days,Assembly Date in Days,C-2 Date in Days,Age_Wage_interaction,Carrier Name_freq,County of Injury_freq,WCIO Cause of Injury Code_freq,WCIO Nature of Injury Code_freq,WCIO Part Of Body Code_freq,Industry Code_freq
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
6165911,-1.723978,-0.121798,1.825144,-0.386281,-1.002297,-0.133061,-0.146817,0.980935,1.685982,1.609442,-1.036664,-0.411213,0.383553,0.894571,0.573539,-0.847574,-0.475842
6166141,-1.723978,-0.121798,1.825144,-0.386281,-1.002297,-0.067824,-0.077602,0.92529,1.685982,1.609442,-1.036664,-0.362613,1.278489,-0.135745,0.573539,-1.075582,-0.92772
6165907,1.201346,-0.121798,-1.075062,-0.386281,-1.502144,-0.136899,-0.154959,0.984208,1.685982,1.603337,0.541013,-0.623136,-0.275339,-0.697693,-0.390914,-1.490638,-0.855541
6166047,0.908814,-0.121798,-0.785042,-0.386281,1.496936,-0.140736,-0.154959,0.987481,1.685982,1.609442,0.383245,-0.282996,1.278489,-0.610852,0.573539,1.393158,-0.475842
6166102,-1.285179,-0.121798,1.390113,-0.386281,0.997089,-0.125386,-0.142745,0.974388,1.685982,1.603337,-0.800013,-0.322551,1.042206,-0.580197,-0.544034,-0.79647,-1.345242


In [79]:
# Refit the existing `scaler2` on the log-transformed training features.
# This replaces whatever the scaler had learned before, so from here on
# `scaler2` holds the statistics of the log feature set.
X_train_log_scale_standard = pd.DataFrame(
    scaler2.fit_transform(X_train_log_scale),
    index=X_train_log.index,
    columns=X_train_log_scale.columns,
)

# Validation split: put the columns in the training order before transforming
X_val_log_scale = X_val_log_scale[X_train_log_scale.columns]
X_val_log_scale_standard = pd.DataFrame(
    scaler2.transform(X_val_log_scale),
    index=X_val_log.index,
    columns=X_val_log_scale.columns,
)

# Test split: same column alignment, same fitted scaler
df_test_stand_log_scale = df_test_stand_log_scale[X_train_log_scale.columns]
df_test_stand_log_scale_standard = pd.DataFrame(
    scaler2.transform(df_test_stand_log_scale),
    index=df_test_stand_log.index,
    columns=df_test_stand_log_scale.columns,
)

In [80]:
# Preview the first rows of the standard-scaled training features (log-transformed data)
X_train_log_scale_standard.head()

Unnamed: 0_level_0,Age at Injury,Log Average Weekly Wage,Birth Year,IME-4 Count,Number of Dependents,Log Accident to Assembly Days,Log Accident to C-2 Days,Accident Date in Days,Assembly Date in Days,C-2 Date in Days,Log Age_Wage_interaction,Carrier Name_freq,County of Injury_freq,WCIO Cause of Injury Code_freq,WCIO Nature of Injury Code_freq,WCIO Part Of Body Code_freq,Industry Code_freq
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
5785935,0.470015,-0.932094,-0.495021,0.654832,1.496936,-0.651156,-0.651252,0.152809,0.063685,0.052653,-0.283667,-0.574925,1.278489,0.700865,-0.734805,1.619014,-1.115865
6090033,0.981947,-0.58988,-0.857547,3.257614,0.997089,-0.431379,-0.445582,0.815637,1.358342,1.295032,0.208633,-0.61633,1.278489,-0.255519,1.332373,-1.194773,-0.475842
6136197,0.396882,0.042982,-0.277506,-0.386281,-0.50245,-0.121621,-0.155706,0.904014,1.542838,1.472078,0.362676,-0.219231,-1.071316,-0.697693,0.573539,-1.440072,-0.475842
6019545,0.908814,0.042982,-0.930052,-0.386281,-0.50245,2.937493,2.707055,-0.39873,1.065692,1.014199,0.619363,-0.322551,-0.275339,0.991661,-1.32318,-0.405314,-0.627487
5792247,0.762548,-1.221024,-0.712536,-0.386281,1.496936,-0.651156,-0.651252,0.165902,0.089133,0.077073,-0.338805,2.022815,-0.853856,0.700865,0.573539,-0.717009,-0.329693


In [81]:
# Preview the first rows of the standard-scaled validation features (log-transformed data)
X_val_log_scale_standard.head()

Unnamed: 0_level_0,Age at Injury,Log Average Weekly Wage,Birth Year,IME-4 Count,Number of Dependents,Log Accident to Assembly Days,Log Accident to C-2 Days,Accident Date in Days,Assembly Date in Days,C-2 Date in Days,Log Age_Wage_interaction,Carrier Name_freq,County of Injury_freq,WCIO Cause of Injury Code_freq,WCIO Nature of Injury Code_freq,WCIO Part Of Body Code_freq,Industry Code_freq
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
5947783,0.616281,0.042982,-0.567526,-0.386281,-0.50245,-0.341398,-0.361376,0.5145,0.776223,0.736419,0.476988,-0.627555,-1.34937,-0.610852,0.573539,1.017525,-0.31012
6150876,1.347612,0.042982,-1.220073,0.134276,-0.50245,-0.431379,-0.445582,0.944929,1.609639,1.536181,0.814597,2.022815,-1.377695,2.011253,1.332373,0.758164,-0.475842
5397365,0.543148,0.042982,-1.220073,-0.386281,-1.002297,4.129192,3.822036,-5.910842,-1.781281,-1.720865,0.439649,-0.624629,1.382648,-0.610852,-0.390914,-0.029528,1.487517
6077399,0.835681,0.953847,-0.785042,2.216502,-1.502144,2.075815,1.920546,0.457219,1.313808,1.270612,1.224653,2.022815,-1.26051,-0.607531,0.573539,1.017525,1.487517
5945251,-0.553848,0.042982,0.592556,-0.386281,-1.502144,-1.490449,-1.436674,0.516137,0.757138,0.718104,-0.232885,-0.295714,1.042206,0.991661,1.332373,-1.062133,1.487517


In [82]:
# Preview the first rows of the standard-scaled test features (log-transformed data)
df_test_stand_log_scale_standard.head()

Unnamed: 0_level_0,Age at Injury,Log Average Weekly Wage,Birth Year,IME-4 Count,Number of Dependents,Log Accident to Assembly Days,Log Accident to C-2 Days,Accident Date in Days,Assembly Date in Days,C-2 Date in Days,Log Age_Wage_interaction,Carrier Name_freq,County of Injury_freq,WCIO Cause of Injury Code_freq,WCIO Nature of Injury Code_freq,WCIO Part Of Body Code_freq,Industry Code_freq
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
6165911,-1.723978,0.042982,1.825144,-0.386281,-1.002297,-0.260907,-0.286052,0.980935,1.685982,1.609442,-1.384777,-0.411213,0.383553,0.894571,0.573539,-0.847574,-0.475842
6166141,-1.723978,0.042982,1.825144,-0.386281,-1.002297,0.870976,0.77318,0.92529,1.685982,1.609442,-1.384777,-0.362613,1.278489,-0.135745,0.573539,-1.075582,-0.92772
6165907,1.201346,0.042982,-1.075062,-0.386281,-1.502144,-0.431379,-0.651252,0.984208,1.685982,1.603337,0.751738,-0.623136,-0.275339,-0.697693,-0.390914,-1.490638,-0.855541
6166047,0.908814,0.042982,-0.785042,-0.386281,1.496936,-0.651156,-0.651252,0.987481,1.685982,1.609442,0.619363,-0.282996,1.278489,-0.610852,0.573539,1.393158,-0.475842
6166102,-1.285179,0.042982,1.390113,-0.386281,0.997089,-0.003857,-0.155706,0.974388,1.685982,1.603337,-0.867319,-0.322551,1.042206,-0.580197,-0.544034,-0.79647,-1.345242


We will now replace the unscaled features with their scaled counterparts in copies of the original dataframes, so that the unscaled dataframes remain available.

In [84]:
# MinMax variants: build a new frame from each unscaled frame in which the
# scaled columns are overwritten by their MinMax-scaled values.
# `assign(**frame)` unpacks the scaled frame column by column; each column is
# matched to the target rows through the shared index, and the source frame
# itself is left untouched.

# original features
X_train_minmax = X_train.assign(**X_train_scale_minmax)
X_val_minmax = X_val.assign(**X_val_scale_minmax)
df_test_stand_minmax = df_test_stand.assign(**df_test_stand_scale_minmax)

# log-transformed features
X_train_log_minmax = X_train_log.assign(**X_train_log_scale_minmax)
X_val_log_minmax = X_val_log.assign(**X_val_log_scale_minmax)
df_test_stand_log_minmax = df_test_stand_log.assign(**df_test_stand_log_scale_minmax)


In [85]:
# Preview the full training frame with the MinMax-scaled columns substituted in
X_train_minmax.head()

Unnamed: 0_level_0,Age at Injury,Alternative Dispute Resolution,Attorney/Representative,Average Weekly Wage,Birth Year,C-3 Date,COVID-19 Indicator,First Hearing Date,IME-4 Count,Number of Dependents,...,Part of Body_7.0,Part of Body_8.0,Part of Body_9.0,Part of Body_10.0,Age Group_Elderly,Age Group_Senior,Age Group_Teen,Accident Season_Spring,Accident Season_Summer,Accident Season_Winter
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5785935,0.523077,0,1,0.104511,0.5,1,0,0,0.057143,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
6090033,0.630769,0,1,0.119001,0.430556,1,0,1,0.2,0.833333,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6136197,0.507692,0,0,0.151184,0.541667,0,0,0,0.0,0.333333,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6019545,0.615385,0,0,0.151184,0.416667,1,0,0,0.0,0.333333,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5792247,0.584615,0,1,0.093635,0.458333,1,0,1,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [86]:
# Column / dtype / non-null summary of X_train.
# NOTE(review): this inspects the unscaled X_train rather than X_train_minmax,
# although it sits between the MinMax previews — confirm this is the intended frame.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 459181 entries, 5785935 to 6027959
Data columns (total 82 columns):
 #   Column                                                    Non-Null Count   Dtype  
---  ------                                                    --------------   -----  
 0   Age at Injury                                             459181 non-null  float64
 1   Alternative Dispute Resolution                            459181 non-null  int64  
 2   Attorney/Representative                                   459181 non-null  int64  
 3   Average Weekly Wage                                       459181 non-null  float64
 4   Birth Year                                                459181 non-null  float64
 5   C-3 Date                                                  459181 non-null  int64  
 6   COVID-19 Indicator                                        459181 non-null  int64  
 7   First Hearing Date                                        459181 non-null  int64  
 8   IM

In [87]:
# Preview the full validation frame with the MinMax-scaled columns substituted in
X_val_minmax.head()

Unnamed: 0_level_0,Age at Injury,Alternative Dispute Resolution,Attorney/Representative,Average Weekly Wage,Birth Year,C-3 Date,COVID-19 Indicator,First Hearing Date,IME-4 Count,Number of Dependents,...,Part of Body_7.0,Part of Body_8.0,Part of Body_9.0,Part of Body_10.0,Age Group_Elderly,Age Group_Senior,Age Group_Teen,Accident Season_Spring,Accident Season_Summer,Accident Season_Winter
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5947783,0.553846,0,0,0.151184,0.486111,1,0,0,0.0,0.333333,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
6150876,0.707692,0,1,0.151184,0.361111,0,0,1,0.028571,0.333333,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
5397365,0.538462,0,1,0.151184,0.361111,0,0,0,0.0,0.166667,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
6077399,0.6,0,1,0.213076,0.444444,1,0,1,0.142857,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
5945251,0.307692,0,0,0.151184,0.708333,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [88]:
# Preview the full test frame with the MinMax-scaled columns substituted in
df_test_stand_minmax.head()

Unnamed: 0_level_0,Age at Injury,Alternative Dispute Resolution,Attorney/Representative,Average Weekly Wage,Birth Year,C-3 Date,COVID-19 Indicator,First Hearing Date,IME-4 Count,Number of Dependents,...,Part of Body_7.0,Part of Body_8.0,Part of Body_9.0,Part of Body_10.0,Age Group_Elderly,Age Group_Senior,Age Group_Teen,Accident Season_Spring,Accident Season_Summer,Accident Season_Winter
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6165911,0.061538,0,0,0.151184,0.944444,0,0,0,0.0,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
6166141,0.061538,0,0,0.151184,0.944444,0,0,0,0.0,0.166667,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
6165907,0.676923,0,0,0.151184,0.388889,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
6166047,0.615385,0,0,0.151184,0.444444,0,0,0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
6166102,0.153846,0,0,0.151184,0.861111,0,0,0,0.0,0.833333,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [89]:
# Preview the full log-transformed training frame with the MinMax-scaled columns substituted in
X_train_log_minmax.head()

Unnamed: 0_level_0,Age at Injury,Alternative Dispute Resolution,Attorney/Representative,Birth Year,C-3 Date,COVID-19 Indicator,First Hearing Date,IME-4 Count,Number of Dependents,Non-Zero Wage,...,Part of Body_7.0,Part of Body_8.0,Part of Body_9.0,Part of Body_10.0,Age Group_Elderly,Age Group_Senior,Age Group_Teen,Accident Season_Spring,Accident Season_Summer,Accident Season_Winter
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5785935,0.523077,0,1,0.5,1,0,0,0.057143,1.0,1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
6090033,0.630769,0,1,0.430556,1,0,1,0.2,0.833333,1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6136197,0.507692,0,0,0.541667,0,0,0,0.0,0.333333,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6019545,0.615385,0,0,0.416667,1,0,0,0.0,0.333333,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5792247,0.584615,0,1,0.458333,1,0,1,0.0,1.0,1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [90]:
# Preview the full log-transformed validation frame with the MinMax-scaled columns substituted in
X_val_log_minmax.head()

Unnamed: 0_level_0,Age at Injury,Alternative Dispute Resolution,Attorney/Representative,Birth Year,C-3 Date,COVID-19 Indicator,First Hearing Date,IME-4 Count,Number of Dependents,Non-Zero Wage,...,Part of Body_7.0,Part of Body_8.0,Part of Body_9.0,Part of Body_10.0,Age Group_Elderly,Age Group_Senior,Age Group_Teen,Accident Season_Spring,Accident Season_Summer,Accident Season_Winter
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5947783,0.553846,0,0,0.486111,1,0,0,0.0,0.333333,0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
6150876,0.707692,0,1,0.361111,0,0,1,0.028571,0.333333,1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
5397365,0.538462,0,1,0.361111,0,0,0,0.0,0.166667,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
6077399,0.6,0,1,0.444444,1,0,1,0.142857,0.0,1,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
5945251,0.307692,0,0,0.708333,0,0,0,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [91]:
# Preview the full log-transformed test frame with the MinMax-scaled columns substituted in
df_test_stand_log_minmax.head()

Unnamed: 0_level_0,Age at Injury,Alternative Dispute Resolution,Attorney/Representative,Birth Year,C-3 Date,COVID-19 Indicator,First Hearing Date,IME-4 Count,Number of Dependents,Non-Zero Wage,...,Part of Body_7.0,Part of Body_8.0,Part of Body_9.0,Part of Body_10.0,Age Group_Elderly,Age Group_Senior,Age Group_Teen,Accident Season_Spring,Accident Season_Summer,Accident Season_Winter
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6165911,0.061538,0,0,0.944444,0,0,0,0.0,0.166667,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
6166141,0.061538,0,0,0.944444,0,0,0,0.0,0.166667,1,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
6165907,0.676923,0,0,0.388889,0,0,0,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
6166047,0.615385,0,0,0.444444,0,0,0,0.0,1.0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
6166102,0.153846,0,0,0.861111,0,0,0,0.0,0.833333,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [92]:
#same assembly step as for minmax, now with the standard-scaler outputs
def _overlay_scaled(base, scaled):
    """Return a copy of `base` in which the columns present in `scaled` hold the scaled values."""
    merged = base.copy()
    #only the columns named in `scaled` are overwritten; every other column keeps its value from `base`
    merged[scaled.columns] = scaled
    return merged


#original
X_train_standard = _overlay_scaled(X_train, X_train_scale_standard)
X_val_standard = _overlay_scaled(X_val, X_val_scale_standard)
df_test_stand_standard = _overlay_scaled(df_test_stand, df_test_stand_scale_standard)

#log-transformed
X_train_log_standard = _overlay_scaled(X_train_log, X_train_log_scale_standard)
X_val_log_standard = _overlay_scaled(X_val_log, X_val_log_scale_standard)
df_test_stand_log_standard = _overlay_scaled(df_test_stand_log, df_test_stand_log_scale_standard)

In [93]:
#previewing the training features after the standard-scaled columns were written back
X_train_standard.head()

Unnamed: 0_level_0,Age at Injury,Alternative Dispute Resolution,Attorney/Representative,Average Weekly Wage,Birth Year,C-3 Date,COVID-19 Indicator,First Hearing Date,IME-4 Count,Number of Dependents,...,Part of Body_7.0,Part of Body_8.0,Part of Body_9.0,Part of Body_10.0,Age Group_Elderly,Age Group_Senior,Age Group_Teen,Accident Season_Spring,Accident Season_Summer,Accident Season_Winter
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5785935,0.470015,0,1,-0.849727,-0.495021,1,0,0,0.654832,1.496936,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
6090033,0.981947,0,1,-0.623731,-0.857547,1,0,1,3.257614,0.997089,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6136197,0.396882,0,0,-0.121798,-0.277506,0,0,0,-0.386281,-0.50245,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6019545,0.908814,0,0,-0.121798,-0.930052,1,0,0,-0.386281,-0.50245,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5792247,0.762548,0,1,-1.019353,-0.712536,1,0,1,-0.386281,1.496936,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [94]:
#previewing the validation features after the standard-scaled columns were written back
X_val_standard.head()

Unnamed: 0_level_0,Age at Injury,Alternative Dispute Resolution,Attorney/Representative,Average Weekly Wage,Birth Year,C-3 Date,COVID-19 Indicator,First Hearing Date,IME-4 Count,Number of Dependents,...,Part of Body_7.0,Part of Body_8.0,Part of Body_9.0,Part of Body_10.0,Age Group_Elderly,Age Group_Senior,Age Group_Teen,Accident Season_Spring,Accident Season_Summer,Accident Season_Winter
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5947783,0.616281,0,0,-0.121798,-0.567526,1,0,0,-0.386281,-0.50245,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
6150876,1.347612,0,1,-0.121798,-1.220073,0,0,1,0.134276,-0.50245,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
5397365,0.543148,0,1,-0.121798,-1.220073,0,0,0,-0.386281,-1.002297,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
6077399,0.835681,0,1,0.843491,-0.785042,1,0,1,2.216502,-1.502144,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
5945251,-0.553848,0,0,-0.121798,0.592556,0,0,0,-0.386281,-1.502144,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [95]:
#previewing the test set after the standard-scaled columns were written back
df_test_stand_standard.head()

Unnamed: 0_level_0,Age at Injury,Alternative Dispute Resolution,Attorney/Representative,Average Weekly Wage,Birth Year,C-3 Date,COVID-19 Indicator,First Hearing Date,IME-4 Count,Number of Dependents,...,Part of Body_7.0,Part of Body_8.0,Part of Body_9.0,Part of Body_10.0,Age Group_Elderly,Age Group_Senior,Age Group_Teen,Accident Season_Spring,Accident Season_Summer,Accident Season_Winter
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6165911,-1.723978,0,0,-0.121798,1.825144,0,0,0,-0.386281,-1.002297,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
6166141,-1.723978,0,0,-0.121798,1.825144,0,0,0,-0.386281,-1.002297,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
6165907,1.201346,0,0,-0.121798,-1.075062,0,0,0,-0.386281,-1.502144,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
6166047,0.908814,0,0,-0.121798,-0.785042,0,0,0,-0.386281,1.496936,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
6166102,-1.285179,0,0,-0.121798,1.390113,0,0,0,-0.386281,0.997089,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [96]:
#previewing the log-transformed training features after the standard-scaled columns were written back
X_train_log_standard.head()

Unnamed: 0_level_0,Age at Injury,Alternative Dispute Resolution,Attorney/Representative,Birth Year,C-3 Date,COVID-19 Indicator,First Hearing Date,IME-4 Count,Number of Dependents,Non-Zero Wage,...,Part of Body_7.0,Part of Body_8.0,Part of Body_9.0,Part of Body_10.0,Age Group_Elderly,Age Group_Senior,Age Group_Teen,Accident Season_Spring,Accident Season_Summer,Accident Season_Winter
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5785935,0.470015,0,1,-0.495021,1,0,0,0.654832,1.496936,1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
6090033,0.981947,0,1,-0.857547,1,0,1,3.257614,0.997089,1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6136197,0.396882,0,0,-0.277506,0,0,0,-0.386281,-0.50245,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6019545,0.908814,0,0,-0.930052,1,0,0,-0.386281,-0.50245,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5792247,0.762548,0,1,-0.712536,1,0,1,-0.386281,1.496936,1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [97]:
#previewing the log-transformed validation features after the standard-scaled columns were written back
X_val_log_standard.head()

Unnamed: 0_level_0,Age at Injury,Alternative Dispute Resolution,Attorney/Representative,Birth Year,C-3 Date,COVID-19 Indicator,First Hearing Date,IME-4 Count,Number of Dependents,Non-Zero Wage,...,Part of Body_7.0,Part of Body_8.0,Part of Body_9.0,Part of Body_10.0,Age Group_Elderly,Age Group_Senior,Age Group_Teen,Accident Season_Spring,Accident Season_Summer,Accident Season_Winter
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5947783,0.616281,0,0,-0.567526,1,0,0,-0.386281,-0.50245,0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
6150876,1.347612,0,1,-1.220073,0,0,1,0.134276,-0.50245,1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
5397365,0.543148,0,1,-1.220073,0,0,0,-0.386281,-1.002297,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
6077399,0.835681,0,1,-0.785042,1,0,1,2.216502,-1.502144,1,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
5945251,-0.553848,0,0,0.592556,0,0,0,-0.386281,-1.502144,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [98]:
#previewing the log-transformed test set after the standard-scaled columns were written back
df_test_stand_log_standard.head()

Unnamed: 0_level_0,Age at Injury,Alternative Dispute Resolution,Attorney/Representative,Birth Year,C-3 Date,COVID-19 Indicator,First Hearing Date,IME-4 Count,Number of Dependents,Non-Zero Wage,...,Part of Body_7.0,Part of Body_8.0,Part of Body_9.0,Part of Body_10.0,Age Group_Elderly,Age Group_Senior,Age Group_Teen,Accident Season_Spring,Accident Season_Summer,Accident Season_Winter
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6165911,-1.723978,0,0,1.825144,0,0,0,-0.386281,-1.002297,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
6166141,-1.723978,0,0,1.825144,0,0,0,-0.386281,-1.002297,1,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
6165907,1.201346,0,0,-1.075062,0,0,0,-0.386281,-1.502144,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
6166047,0.908814,0,0,-0.785042,0,0,0,-0.386281,1.496936,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
6166102,-1.285179,0,0,1.390113,0,0,0,-0.386281,0.997089,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


<hr>
<div class="alert alert-block alert-info" style="font-size:12px" id="export">
    
# 4.3 CSV Formatting, Exports
 
</div>

In [100]:
#checking the shapes of every X, y and test variant and asserting that they actually match
train_frames = {
    "X_train minmax": X_train_minmax,
    "X_train standard": X_train_standard,
    "X_train log minmax": X_train_log_minmax,
    "X_train log standard": X_train_log_standard,
}
val_frames = {
    "X_val minmax": X_val_minmax,
    "X_val standard": X_val_standard,
    "X_val log minmax": X_val_log_minmax,
    "X_val log standard": X_val_log_standard,
}
test_frames = {
    "df_test_stand minmax": df_test_stand_minmax,
    "df_test_stand standard": df_test_stand_standard,
    "df_test_stand log minmax": df_test_stand_log_minmax,
    "df_test_stand log standard": df_test_stand_log_standard,
}

for label, frame in train_frames.items():
    print(f"{label} shape:", frame.shape)
#there is a single y_train for all four training variants, so it gets a plain label
print("y_train shape:", y_train.shape)

for label, frame in val_frames.items():
    print(f"{label} shape:", frame.shape)
print("y_val shape:", y_val.shape)

for label, frame in test_frames.items():
    print(f"{label} shape:", frame.shape)

#the prints above are only informative; these checks stop the notebook if something drifted
#every feature frame needs exactly one row per target value
assert all(len(frame) == len(y_train) for frame in train_frames.values()), \
    "a training feature frame and y_train have different row counts"
assert all(len(frame) == len(y_val) for frame in val_frames.values()), \
    "a validation feature frame and y_val have different row counts"

#each scaling variant must have the same number of features in train, validation and test
for train_frame, val_frame, test_frame in zip(
    train_frames.values(), val_frames.values(), test_frames.values()
):
    assert train_frame.shape[1] == val_frame.shape[1] == test_frame.shape[1], \
        "train, validation and test do not have the same number of columns"


X_train minmax shape: (459181, 82)
X_train standard shape: (459181, 82)
X_train log minmax shape: (459181, 82)
X_train log standard shape: (459181, 82)
y_train log standard shape: (459181,)
X_val minmax shape: (114796, 82)
X_val standard shape: (114796, 82)
X_val log minmax shape: (114796, 82)
X_val log standard shape: (114796, 82)
y_val shape: (114796,)
df_test_stand minmax shape: (387975, 82)
df_test_stand standard shape: (387975, 82)
df_test_stand log minmax shape: (387975, 82)
df_test_stand log standard shape: (387975, 82)


In [101]:
#re-attaching the target to every feature set so X and y stay index-matched when the CSVs are re-imported
#wrapping both targets in pd.Series before concatenating
y_train = pd.Series(y_train)
y_val = pd.Series(y_val)


def _with_target(features, target):
    """Return `features` with `target` appended as an extra column (column-wise concat)."""
    return pd.concat([features, target], axis=1)


#minmax
df_train_minmax = _with_target(X_train_minmax, y_train)
df_val_minmax = _with_target(X_val_minmax, y_val)

#standard
df_train_standard = _with_target(X_train_standard, y_train)
df_val_standard = _with_target(X_val_standard, y_val)

#log-transformed: minmax
df_train_log_minmax = _with_target(X_train_log_minmax, y_train)
df_val_log_minmax = _with_target(X_val_log_minmax, y_val)

#log-transformed: standard
df_train_log_standard = _with_target(X_train_log_standard, y_train)
df_val_log_standard = _with_target(X_val_log_standard, y_val)

In [102]:
#previewing the min-max training frame after the target column was appended
df_train_minmax.head()

Unnamed: 0_level_0,Age at Injury,Alternative Dispute Resolution,Attorney/Representative,Average Weekly Wage,Birth Year,C-3 Date,COVID-19 Indicator,First Hearing Date,IME-4 Count,Number of Dependents,...,Part of Body_8.0,Part of Body_9.0,Part of Body_10.0,Age Group_Elderly,Age Group_Senior,Age Group_Teen,Accident Season_Spring,Accident Season_Summer,Accident Season_Winter,Claim Injury Type
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5785935,0.523077,0,1,0.104511,0.5,1,0,0,0.057143,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,4
6090033,0.630769,0,1,0.119001,0.430556,1,0,1,0.2,0.833333,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3
6136197,0.507692,0,0,0.151184,0.541667,0,0,0,0.0,0.333333,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2
6019545,0.615385,0,0,0.151184,0.416667,1,0,0,0.0,0.333333,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2
5792247,0.584615,0,1,0.093635,0.458333,1,0,1,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,3


In [103]:
#previewing the min-max validation frame after the target column was appended
df_val_minmax.head()

Unnamed: 0_level_0,Age at Injury,Alternative Dispute Resolution,Attorney/Representative,Average Weekly Wage,Birth Year,C-3 Date,COVID-19 Indicator,First Hearing Date,IME-4 Count,Number of Dependents,...,Part of Body_8.0,Part of Body_9.0,Part of Body_10.0,Age Group_Elderly,Age Group_Senior,Age Group_Teen,Accident Season_Spring,Accident Season_Summer,Accident Season_Winter,Claim Injury Type
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5947783,0.553846,0,0,0.151184,0.486111,1,0,0,0.0,0.333333,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,3
6150876,0.707692,0,1,0.151184,0.361111,0,0,1,0.028571,0.333333,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,4
5397365,0.538462,0,1,0.151184,0.361111,0,0,0,0.0,0.166667,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,4
6077399,0.6,0,1,0.213076,0.444444,1,0,1,0.142857,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,5
5945251,0.307692,0,0,0.151184,0.708333,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2


In [104]:
#previewing the standard-scaled training frame after the target column was appended
df_train_standard.head()

Unnamed: 0_level_0,Age at Injury,Alternative Dispute Resolution,Attorney/Representative,Average Weekly Wage,Birth Year,C-3 Date,COVID-19 Indicator,First Hearing Date,IME-4 Count,Number of Dependents,...,Part of Body_8.0,Part of Body_9.0,Part of Body_10.0,Age Group_Elderly,Age Group_Senior,Age Group_Teen,Accident Season_Spring,Accident Season_Summer,Accident Season_Winter,Claim Injury Type
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5785935,0.470015,0,1,-0.849727,-0.495021,1,0,0,0.654832,1.496936,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,4
6090033,0.981947,0,1,-0.623731,-0.857547,1,0,1,3.257614,0.997089,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3
6136197,0.396882,0,0,-0.121798,-0.277506,0,0,0,-0.386281,-0.50245,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2
6019545,0.908814,0,0,-0.121798,-0.930052,1,0,0,-0.386281,-0.50245,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2
5792247,0.762548,0,1,-1.019353,-0.712536,1,0,1,-0.386281,1.496936,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,3


In [105]:
#previewing the standard-scaled validation frame after the target column was appended
df_val_standard.head()

Unnamed: 0_level_0,Age at Injury,Alternative Dispute Resolution,Attorney/Representative,Average Weekly Wage,Birth Year,C-3 Date,COVID-19 Indicator,First Hearing Date,IME-4 Count,Number of Dependents,...,Part of Body_8.0,Part of Body_9.0,Part of Body_10.0,Age Group_Elderly,Age Group_Senior,Age Group_Teen,Accident Season_Spring,Accident Season_Summer,Accident Season_Winter,Claim Injury Type
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5947783,0.616281,0,0,-0.121798,-0.567526,1,0,0,-0.386281,-0.50245,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,3
6150876,1.347612,0,1,-0.121798,-1.220073,0,0,1,0.134276,-0.50245,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,4
5397365,0.543148,0,1,-0.121798,-1.220073,0,0,0,-0.386281,-1.002297,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,4
6077399,0.835681,0,1,0.843491,-0.785042,1,0,1,2.216502,-1.502144,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,5
5945251,-0.553848,0,0,-0.121798,0.592556,0,0,0,-0.386281,-1.502144,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2


In [106]:
#writing every assembled data frame to its own CSV file, index included
frames_to_export = {
    #minmax
    'df_train_minmax.csv': df_train_minmax,
    'df_val_minmax.csv': df_val_minmax,
    'df_test_stand_minmax.csv': df_test_stand_minmax,

    #standard
    'df_train_standard.csv': df_train_standard,
    'df_val_standard.csv': df_val_standard,
    'df_test_stand_standard.csv': df_test_stand_standard,

    #log dfs: minmax
    'df_train_log_minmax.csv': df_train_log_minmax,
    'df_val_log_minmax.csv': df_val_log_minmax,
    #NOTE(review): this file name says "log_stand" while the variable and the other test exports say "stand_log";
    #kept unchanged because later notebooks may load it by this exact name - confirm before renaming
    'df_test_log_stand_minmax.csv': df_test_stand_log_minmax,

    #log dfs: standard
    'df_train_log_standard.csv': df_train_log_standard,
    'df_val_log_standard.csv': df_val_log_standard,
    'df_test_stand_log_standard.csv': df_test_stand_log_standard,
}

#dicts keep insertion order, so the files are written in the order listed above
for csv_name, frame_to_save in frames_to_export.items():
    frame_to_save.to_csv(csv_name, index=True)