In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
from sklearn.model_selection import train_test_split
import os
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import joblib

Read data

In [2]:
data_path = Path("../data/raw/database.csv")
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,accident_date,call_source,call_source_other,call_sub_source,creation_date,description_of_the_accident,did_the_ambulance_arrive_at_the_scene,did_the_ambulance_take_you_toahospital,did_the_police_arrived_at_the_scene,did_the_police_fileareport,did_you_get_injured_byaslip_or_fall_accident,did_you_have_an_accident_at_work,how_you_were_involved,injury_complaints,lawyer_status,state_accident_occur,were_the_police_call,were_you_affected_by_possible_malpractice,were_you_involved_in_an_automobile_accident,can_you_briefly_describe_how_the_accident_occur
0,,Cantaso,,CANTASO LLAMADA,2025-06-13 21:51:18,"she was at a costco supermarket, when another ...",,,,,FALSO,FALSO,,,Rejected - No case,,,FALSO,FALSO,"she was at a costco supermarket, when another ..."
1,2025-03-10,Trabajo,,800 TRABAJO LEADGEN MAX FACEBOOK,2025-06-12 16:06:10,The client was working at a nursing home along...,FALSO,,,,,VERDADERO,,"Right leg pain radiating,Right shoulder pain,L...",Retained,New York,FALSO,,FALSO,The client was working at a nursing home along...
2,2024-11-24,911,,911AUTO-WHATSAPP,2025-06-13 0:10:17,The client was driving home from work when a 2...,FALSO,,VERDADERO,VERDADERO,,,,"Right leg pain radiating,Neck pain,Other,Back ...",Rejected - No case,New Jersey,VERDADERO,,VERDADERO,The client was driving home from work when a 2...
3,,Cantaso,,CANTASO ALEX WHATSAPP,2025-06-25 22:44:43,,,,,,,,,,Client non-compliant,,,,,
4,,Cantaso,,CANTASO ALEX OLD FACEBOOK,2025-09-04 13:42:20,,,,,,,,,,Client non-compliant,,,,,


In [3]:
df.describe()

Unnamed: 0,accident_date,call_source,call_source_other,call_sub_source,creation_date,description_of_the_accident,did_the_ambulance_arrive_at_the_scene,did_the_ambulance_take_you_toahospital,did_the_police_arrived_at_the_scene,did_the_police_fileareport,did_you_get_injured_byaslip_or_fall_accident,did_you_have_an_accident_at_work,how_you_were_involved,injury_complaints,lawyer_status,state_accident_occur,were_the_police_call,were_you_affected_by_possible_malpractice,were_you_involved_in_an_automobile_accident,can_you_briefly_describe_how_the_accident_occur
count,5496,10150,44,10127,10151,4634,3339,1564,2192,1918,2164,3688,811,3512,10151,5576,3938,1571,6246,4634
unique,877,9,33,285,9774,3798,2,2,2,2,2,2,7,1080,13,44,2,2,2,3798
top,2025-11-21,Cantaso,LA SPANISH,CANTASO LLAMADA,2025-10-05 23:17:02,"\nThe client, Mrs. Erika, was a passenger in t...",FALSO,VERDADERO,VERDADERO,VERDADERO,FALSO,FALSO,Passenger,Other,Rejected - No case,New York,VERDADERO,FALSO,FALSO,"\nThe client, Mrs. Erika, was a passenger in t..."
freq,42,6094,6,2438,5,6,1748,1340,1992,1734,1364,1865,437,387,5910,3792,2174,1161,3163,6


In [4]:
df = df.drop(df.index[-1])

Drop unnecessary columns

In [5]:
df = df.drop(columns = [
    'call_source', 
    'call_source_other', 
    'call_sub_source', 
    'creation_date', 
    'description_of_the_accident', 
    'injury_complaints', 
    'can_you_briefly_describe_how_the_accident_occur',
    'did_the_police_arrived_at_the_scene',
    'did_the_ambulance_take_you_toahospital',
    'did_the_ambulance_arrive_at_the_scene',
    'did_the_police_fileareport',
    'were_the_police_call'
])

Drop null values from state_accident_occur

In [6]:
df = df.dropna(subset=['state_accident_occur'])

Format accident_date

In [7]:
df = df[
    (df['accident_date'] != 'aN/aN/NaN') &
    (df['accident_date'].notna())
]

In [8]:
df = df.replace(r'^\s*$', np.nan, regex=True).infer_objects(copy=False).reset_index(drop=True)
df.head()

Unnamed: 0,accident_date,did_you_get_injured_byaslip_or_fall_accident,did_you_have_an_accident_at_work,how_you_were_involved,lawyer_status,state_accident_occur,were_you_affected_by_possible_malpractice,were_you_involved_in_an_automobile_accident
0,2025-03-10,,VERDADERO,,Retained,New York,,FALSO
1,2024-11-24,,,,Rejected - No case,New Jersey,,VERDADERO
2,2023-08-10,VERDADERO,FALSO,,Rejected - No case,New York,,FALSO
3,2025-06-20,,,,Client non-compliant,New York,,VERDADERO
4,2025-05-21,,VERDADERO,,Rejected - No case,New York,,FALSO


Correct row value 

In [9]:
df.loc[df['accident_date'] == '0225-03-02', 'accident_date'] = '2025-03-02'
df.iloc[27]

accident_date                                            2025-03-02
did_you_get_injured_byaslip_or_fall_accident                  FALSO
did_you_have_an_accident_at_work                              FALSO
how_you_were_involved                                           NaN
lawyer_status                                   Pending Full Intake
state_accident_occur                                       New York
were_you_affected_by_possible_malpractice                 VERDADERO
were_you_involved_in_an_automobile_accident                   FALSO
Name: 27, dtype: object

Obtain days_since_accident from accident_date

In [10]:
df['accident_date'] = pd.to_datetime(df['accident_date'])
df['accident_date'] = (pd.Timestamp.today().normalize() - df['accident_date']).dt.days
df = df.rename(columns={'accident_date': 'days_since_accident'})

Discard rows where state_accident_occur is not New York or New Jersey

In [11]:
df = df.loc[df['state_accident_occur'].isin(['New York', 'New Jersey'])]

Compare each value to Retained and return 0 or 1

In [12]:
df['lawyer_status'] = (df['lawyer_status'] == 'Retained').astype(int)

Change column type

In [13]:
df['were_you_involved_in_an_automobile_accident'] = df['were_you_involved_in_an_automobile_accident'].astype(object)

In [14]:
df = df.reset_index(drop = True)

Change values from true to 1 and false to 0

In [15]:
#df['did_the_ambulance_arrive_at_the_scene'] = (df['did_the_ambulance_arrive_at_the_scene'].map({True: 1, False: 0}))

In [16]:
#df['did_the_ambulance_take_you_toahospital'] = (df['did_the_ambulance_take_you_toahospital'].map({True: 1, False: 0}))

In [17]:
#df['did_the_police_arrived_at_the_scene'] = (df['did_the_police_arrived_at_the_scene'].map({True: 1, False: 0}))

In [18]:
#df['did_the_police_fileareport'] = (df['did_the_police_fileareport'].map({True: 1, False: 0}))

In [19]:
df['did_you_get_injured_byaslip_or_fall_accident'] = (df['did_you_get_injured_byaslip_or_fall_accident'].map({True: 1, False: 0}))

In [20]:
df['did_you_have_an_accident_at_work'] = (df['did_you_have_an_accident_at_work'].map({True: 1, False: 0}))

In [21]:
#df['were_the_police_call'] = (df['were_the_police_call'].map({True: 1, False: 0}))

In [22]:
df['were_you_affected_by_possible_malpractice'] = (df['were_you_affected_by_possible_malpractice'].map({True: 1, False: 0}))

In [23]:
df['were_you_involved_in_an_automobile_accident'] = (df['were_you_involved_in_an_automobile_accident'].map({True: 1, False: 0}))

Change null values on how_you_were_involved to Not_involved

In [24]:
df["how_you_were_involved"] = df["how_you_were_involved"].fillna("Not_involved")

In [25]:
df.head()

Unnamed: 0,days_since_accident,did_you_get_injured_byaslip_or_fall_accident,did_you_have_an_accident_at_work,how_you_were_involved,lawyer_status,state_accident_occur,were_you_affected_by_possible_malpractice,were_you_involved_in_an_automobile_accident
0,346,,,Not_involved,1,New York,,
1,452,,,Not_involved,0,New Jersey,,
2,924,,,Not_involved,0,New York,,
3,244,,,Not_involved,0,New York,,
4,274,,,Not_involved,0,New York,,


Encode categorial columns with One Hot Encoder

In [26]:
categorical_columns = ['how_you_were_involved', 'state_accident_occur']
encoder = OneHotEncoder(sparse_output = False)
one_hot_encoded = encoder.fit_transform(df[categorical_columns])
one_hot_df = pd.DataFrame(one_hot_encoded, columns = encoder.get_feature_names_out(categorical_columns))
encoded_df = pd.concat([df, one_hot_df], axis = 1)
encoded_df = encoded_df.drop(categorical_columns, axis = 1)
encoded_df.head()

Unnamed: 0,days_since_accident,did_you_get_injured_byaslip_or_fall_accident,did_you_have_an_accident_at_work,lawyer_status,were_you_affected_by_possible_malpractice,were_you_involved_in_an_automobile_accident,how_you_were_involved_Bicyclist,how_you_were_involved_E-Bike,how_you_were_involved_Ebike with pedal assist,how_you_were_involved_Motorcyclist,how_you_were_involved_Not_involved,how_you_were_involved_Passenger,how_you_were_involved_Pedestrian,how_you_were_involved_Standing Scooter,state_accident_occur_New Jersey,state_accident_occur_New York
0,346,,,1,,,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,452,,,0,,,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,924,,,0,,,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,244,,,0,,,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,274,,,0,,,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


Save encoder on artifacts

In [27]:
ARTIFACTS_PATH = Path("../model/artifacts")
ARTIFACTS_PATH.mkdir(parents = True, exist_ok = True)
joblib.dump(encoder, ARTIFACTS_PATH / "onehot_encoder.joblib")

['../model/artifacts/onehot_encoder.joblib']

Scale numeric columns

In [28]:
scaler = StandardScaler() 
scaled_days_since_accident = encoded_df[['days_since_accident']]
scaled_array = scaler.fit_transform(scaled_days_since_accident)
encoded_df['days_since_accident'] = scaled_array
encoded_df.head()

Unnamed: 0,days_since_accident,did_you_get_injured_byaslip_or_fall_accident,did_you_have_an_accident_at_work,lawyer_status,were_you_affected_by_possible_malpractice,were_you_involved_in_an_automobile_accident,how_you_were_involved_Bicyclist,how_you_were_involved_E-Bike,how_you_were_involved_Ebike with pedal assist,how_you_were_involved_Motorcyclist,how_you_were_involved_Not_involved,how_you_were_involved_Passenger,how_you_were_involved_Pedestrian,how_you_were_involved_Standing Scooter,state_accident_occur_New Jersey,state_accident_occur_New York
0,0.095197,,,1,,,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.287891,,,0,,,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,1.145921,,,0,,,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,-0.090224,,,0,,,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,-0.035689,,,0,,,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


Save scaler on artifacts

In [29]:
joblib.dump(scaler, ARTIFACTS_PATH / "scaler.joblib")

['../model/artifacts/scaler.joblib']

Reorder columns

In [30]:
columns = encoded_df.columns.tolist()
columns[0], columns[8] = columns[8], columns[0]
final_df = encoded_df[columns]
final_df.head(10)

Unnamed: 0,how_you_were_involved_Ebike with pedal assist,did_you_get_injured_byaslip_or_fall_accident,did_you_have_an_accident_at_work,lawyer_status,were_you_affected_by_possible_malpractice,were_you_involved_in_an_automobile_accident,how_you_were_involved_Bicyclist,how_you_were_involved_E-Bike,days_since_accident,how_you_were_involved_Motorcyclist,how_you_were_involved_Not_involved,how_you_were_involved_Passenger,how_you_were_involved_Pedestrian,how_you_were_involved_Standing Scooter,state_accident_occur_New Jersey,state_accident_occur_New York
0,0.0,,,1,,,0.0,0.0,0.095197,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.0,,,0,,,0.0,0.0,0.287891,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.0,,,0,,,0.0,0.0,1.145921,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0.0,,,0,,,0.0,0.0,-0.090224,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,0.0,,,0,,,0.0,0.0,-0.035689,0.0,1.0,0.0,0.0,0.0,0.0,1.0
5,0.0,,,0,,,0.0,0.0,-0.061139,0.0,1.0,0.0,0.0,0.0,0.0,1.0
6,0.0,,,0,,,0.0,0.0,-0.3829,0.0,0.0,1.0,0.0,0.0,1.0,0.0
7,0.0,,,0,,,0.0,0.0,2.874706,0.0,1.0,0.0,0.0,0.0,0.0,1.0
8,0.0,,,0,,,0.0,0.0,0.075201,0.0,1.0,0.0,0.0,0.0,0.0,1.0
9,0.0,,,0,,,0.0,0.0,21.720465,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [31]:
print(final_df.columns.tolist())

['how_you_were_involved_Ebike with pedal assist', 'did_you_get_injured_byaslip_or_fall_accident', 'did_you_have_an_accident_at_work', 'lawyer_status', 'were_you_affected_by_possible_malpractice', 'were_you_involved_in_an_automobile_accident', 'how_you_were_involved_Bicyclist', 'how_you_were_involved_E-Bike', 'days_since_accident', 'how_you_were_involved_Motorcyclist', 'how_you_were_involved_Not_involved', 'how_you_were_involved_Passenger', 'how_you_were_involved_Pedestrian', 'how_you_were_involved_Standing Scooter', 'state_accident_occur_New Jersey', 'state_accident_occur_New York']


Split data

In [32]:
X = final_df.drop('lawyer_status', axis=1)
y = final_df['lawyer_status']

In [33]:
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size = 0.30,
    random_state = 42,
    shuffle = True,
    stratify = y
)

In [34]:
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size = 0.50,
    random_state = 42,
    shuffle = True,
    stratify = y_temp
)

In [35]:
train_df = pd.concat([y_train.reset_index(drop = True), X_train.reset_index(drop = True)], axis = 1)
validation_df = pd.concat([y_val.reset_index(drop = True), X_val.reset_index(drop = True)], axis = 1)

Save feature columns on artifacts

In [36]:
feature_columns = X_train.columns.tolist()
joblib.dump(feature_columns, ARTIFACTS_PATH / "feature_columns.joblib")
print(feature_columns)

['how_you_were_involved_Ebike with pedal assist', 'did_you_get_injured_byaslip_or_fall_accident', 'did_you_have_an_accident_at_work', 'were_you_affected_by_possible_malpractice', 'were_you_involved_in_an_automobile_accident', 'how_you_were_involved_Bicyclist', 'how_you_were_involved_E-Bike', 'days_since_accident', 'how_you_were_involved_Motorcyclist', 'how_you_were_involved_Not_involved', 'how_you_were_involved_Passenger', 'how_you_were_involved_Pedestrian', 'how_you_were_involved_Standing Scooter', 'state_accident_occur_New Jersey', 'state_accident_occur_New York']


Upload datsets to S3 bucket

In [38]:
PROCESSED_PATH = Path("../data/processed")
PROCESSED_PATH.mkdir(parents = True, exist_ok = True)
train_df.to_csv(PROCESSED_PATH/'train.csv', index=False, header=False)
validation_df.to_csv(PROCESSED_PATH/'validation.csv', index=False, header=False)
X_test.to_csv(PROCESSED_PATH/'X_test.csv', index=False)
y_test.to_csv(PROCESSED_PATH/'y_test.csv', index=False)