In [1]:
import pandas as pd

# Location of the TLC new-driver application status CSV.
TLC = '/kaggle/input/driver-application-status/TLC_New_Driver_Application_Status.csv'

# Read the applications indexed by 'App No', discard 'Last Updated', and derive
# the calendar year of each application date in a single chain.
df = (
    pd.read_csv(filepath_or_buffer=TLC, parse_dates=['App Date'], index_col=['App No'])
    .drop(columns=['Last Updated'])
    .assign(year=lambda frame: frame['App Date'].dt.year)
)
df.head()

Unnamed: 0_level_0,Type,App Date,Status,FRU Interview Scheduled,Drug Test,WAV Course,Defensive Driving,Driver Exam,Medical Clearance Form,Other Requirements,year
App No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
6078403,HDR,2024-07-16,Incomplete,Not Applicable,Needed,Needed,Needed,Needed,Needed,Fingerprints needed; Copy of DMV license needed,2024
6077989,HDR,2024-07-09,Incomplete,Not Applicable,Needed,Needed,Complete,Needed,Needed,Fingerprints needed; Copy of DMV license needed,2024
6074279,HDR,2024-05-09,Incomplete,Not Applicable,Needed,Needed,Complete,Needed,Needed,Fingerprints needed; Copy of DMV license needed,2024
5735329,HDR,2016-10-17,Incomplete,Not Applicable,Complete,Complete,Complete,Needed,Complete,Copy of DMV license needed,2016
6077157,HDR,2024-06-24,Incomplete,Not Applicable,Complete,Complete,Complete,Needed,Complete,Not Applicable,2024


In [2]:
# Tally applications per calendar year (derived from 'App Date') and show the result as a plain dict.
df['year'].value_counts().to_dict()

{2024: 3481, 2018: 8, 1997: 3, 2017: 2, 2016: 1, 2023: 1}

We have almost no records from any year other than 2024. Maybe the 1997 records are just noise.

In [3]:
# Inspect the handful of applications dated 1997.
df.loc[df['year'].eq(1997)]

Unnamed: 0_level_0,Type,App Date,Status,FRU Interview Scheduled,Drug Test,WAV Course,Defensive Driving,Driver Exam,Medical Clearance Form,Other Requirements,year
App No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
5921774,PDR,1997-04-28,Incomplete,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Fingerprints & other items needed. Visit www.n...,1997
5921772,PDR,1997-04-28,Incomplete,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Fingerprints & other items needed. Visit www.n...,1997
5921773,PDR,1997-04-28,Incomplete,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Fingerprints & other items needed. Visit www.n...,1997


In [4]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3496 entries, 6078403 to 6078293
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   Type                     3496 non-null   object        
 1   App Date                 3496 non-null   datetime64[ns]
 2   Status                   3496 non-null   object        
 3   FRU Interview Scheduled  3496 non-null   object        
 4   Drug Test                3496 non-null   object        
 5   WAV Course               3496 non-null   object        
 6   Defensive Driving        3496 non-null   object        
 7   Driver Exam              3496 non-null   object        
 8   Medical Clearance Form   3496 non-null   object        
 9   Other Requirements       3496 non-null   object        
 10  year                     3496 non-null   int32         
dtypes: datetime64[ns](1), int32(1), object(9)
memory usage: 314.1+ KB


In [5]:
# Number of distinct values in each column.
df.nunique()

Type                         3
App Date                   135
Status                       5
FRU Interview Scheduled     27
Drug Test                    3
WAV Course                   3
Defensive Driving            3
Driver Exam                  3
Medical Clearance Form       3
Other Requirements           8
year                         6
dtype: int64

I like pie charts so I'm going to make a pie chart.

In [6]:
import warnings
from plotly import express

# Suppress FutureWarning messages from here on.
warnings.filterwarnings(action='ignore', category=FutureWarning)

# One row per status value with its frequency in the 'count' column.
status_counts = df['Status'].value_counts().to_frame().reset_index()

# Share of applications in each status.
express.pie(data_frame=status_counts, names='Status', values='count', color='count')

If we are going to choose the status as our target class, we will have to deal with the fact that it is heavily imbalanced: almost all records fall into just two classes, "Approved - License Issued" and "Incomplete".

In [7]:
# Tally applications by 'Type' and show the result as a plain dict.
df['Type'].value_counts().to_dict()

{'HDR': 3483, 'PDR': 11, 'VDR': 2}

A cursory web search suggests that:

* HDR stands for Medallion/For-Hire Vehicle Operator
* PDR stands for Paratransit Vehicle Operator
* VDR stands for Commuter Van Vehicle Operator

So we know that none of these are ordinary driver's licenses.

In [8]:
from plotly import express

# Distribution of application dates; the count axis is logarithmic.
app_date_histogram = express.histogram(data_frame=df, x='App Date', log_y=True)
app_date_histogram

In [9]:
# Show the column labels currently in the frame.
df.columns

Index(['Type', 'App Date', 'Status', 'FRU Interview Scheduled', 'Drug Test',
       'WAV Course', 'Defensive Driving', 'Driver Exam',
       'Medical Clearance Form', 'Other Requirements', 'year'],
      dtype='object')

In [10]:
# Label column for the classifier.
TARGET = 'Status'
# Requirement columns to one-hot encode as model inputs.
COLUMNS = [
    'Drug Test',
    'WAV Course',
    'Defensive Driving',
    'Driver Exam',
    'Medical Clearance Form',
]

# Keep only the chosen requirement columns plus the label, then expand each
# requirement column into one indicator column per distinct value.
selected = df[[*COLUMNS, TARGET]]
model_df = pd.get_dummies(data=selected, columns=COLUMNS)
model_df.head()

Unnamed: 0_level_0,Status,Drug Test_Complete,Drug Test_Needed,Drug Test_Not Applicable,WAV Course_Complete,WAV Course_Needed,WAV Course_Not Applicable,Defensive Driving_Complete,Defensive Driving_Needed,Defensive Driving_Not Applicable,Driver Exam_Complete,Driver Exam_Needed,Driver Exam_Not Applicable,Medical Clearance Form_Complete,Medical Clearance Form_Needed,Medical Clearance Form_Not Applicable
App No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
6078403,Incomplete,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False
6077989,Incomplete,False,True,False,False,True,False,True,False,False,False,True,False,False,True,False
6074279,Incomplete,False,True,False,False,True,False,True,False,False,False,True,False,False,True,False
5735329,Incomplete,True,False,False,True,False,False,True,False,False,False,True,False,True,False,False
6077157,Incomplete,True,False,False,True,False,False,True,False,False,False,True,False,True,False,False


In [11]:
import arrow
from umap import UMAP

time_start = arrow.now()

# Embed the one-hot encoded requirement columns in two dimensions.
# 'x' and 'y' are excluded from the input when they already exist, so re-running
# this cell does not feed a previous embedding back into UMAP as extra features;
# errors='ignore' keeps the first run (where they are absent) unchanged.
umap_input = model_df.drop(columns=[TARGET, 'x', 'y'], errors='ignore')

umap = UMAP(random_state=2024, verbose=True, n_jobs=1, low_memory=False, n_epochs=500)
# Store the two embedding coordinates alongside the encoded columns.
model_df[['x', 'y']] = umap.fit_transform(X=umap_input)
print('done with UMAP in {}'.format(arrow.now() - time_start))

2024-08-03 15:10:12.908919: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-03 15:10:12.909053: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-03 15:10:13.071044: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=500, n_jobs=1, random_state=2024, verbose=True)
Sat Aug  3 15:10:24 2024 Construct fuzzy simplicial set
Sat Aug  3 15:10:31 2024 Finding Nearest Neighbors
Sat Aug  3 15:10:35 2024 Finished Nearest Neighbor Search
Sat Aug  3 15:10:39 2024 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Sat Aug  3 15:11:04 2024 Finished embedding
done with UMAP in 0:00:40.369748


In [12]:
import warnings
from plotly import express

# Suppress FutureWarning messages from here on.
warnings.filterwarnings(action='ignore', category=FutureWarning)

# Plot the two UMAP coordinates, colouring each application by its status.
embedding_scatter = express.scatter(data_frame=model_df, x='x', y='y', color=TARGET)
embedding_scatter

We see a lot of local clustering. Let's build a model.

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Every column except the label is used as a feature.
# NOTE(review): at this point model_df also holds the UMAP 'x'/'y' columns, which
# were fitted on all rows before the split — confirm that using them here is intended.
features = model_df.drop(columns=[TARGET])
labels = model_df[TARGET]

# Hold out 20% of the rows, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=2024,
    stratify=labels,
)

model = LogisticRegression(max_iter=10000, tol=1e-12)
model.fit(X_train, y_train)
print('model fit in {} iterations'.format(model.n_iter_[0]))

# Accuracy on the held-out rows.
test_accuracy = accuracy_score(y_true=y_test, y_pred=model.predict(X=X_test))
print('accuracy: {:5.4f}'.format(test_accuracy))

model fit in 2108 iterations
accuracy: 0.9100


In [14]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 on the held-out rows; zero_division=0 reports 0
# for classes the model never predicts.
test_predictions = model.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=test_predictions, zero_division=0))

                           precision    recall  f1-score   support

Approved - License Issued       0.78      1.00      0.88       105
                   Denied       0.00      0.00      0.00        34
               Incomplete       0.94      0.97      0.96       546
Pending Fitness Interview       0.00      0.00      0.00         4
             Under Review       0.00      0.00      0.00        11

                 accuracy                           0.91       700
                macro avg       0.34      0.39      0.37       700
             weighted avg       0.85      0.91      0.88       700



Not surprisingly, our model can find the two large classes but not the other three.