In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
import sklearn.metrics as skm

## Load Clean Data

In [43]:
# Load the cleaned TSA claims table.
# index_col=0: the first CSV column is a saved row index (it otherwise appears as
#   a data column named "Unnamed: 0"), so read it back as the index.
# low_memory=False: parse the file in a single pass so each column gets one
#   inferred dtype. NOTE(review): the warning tail in this cell's old output looks
#   like a mixed-type DtypeWarning from chunked parsing -- confirm it is gone.
df = pd.read_csv("tsa_claims_clean.csv", index_col=0, low_memory=False)

df.head(2)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Claim_Number,Airline_Name,Claim_Type,Claim_Site,Item,Status,Date_Received,Incident_Date,Airport_Code_Group,Airport_Name_Group,Claim_Value,Close_Value
0,0909802M,ContinentalAirlines,Property Damage,Checkpoint,Other,Approved,2002-01-04,2002-12-12,EWR,Newark International Airport,350.0,350.0
1,0202417M,,Property Damage,Checked Baggage,Luggage (all types including footlockers),Settled,2002-02-02,2004-01-16,SEA,Seattle-Tacoma International,100.0,50.0


## MVP Feature Engineering

In [44]:
# Convert both date columns from ISO "YYYY-MM-DD" strings to datetimes.
for date_col in ("Date_Received", "Incident_Date"):
    df[date_col] = pd.to_datetime(df[date_col], format="%Y-%m-%d")

# Report_Delay = whole days from the incident to the date the claim was received.
# NOTE(review): the sample rows displayed below have negative delays (received
# before the incident) -- verify the upstream date cleaning.
received_minus_incident = df["Date_Received"] - df["Incident_Date"]
df["Report_Delay"] = received_minus_incident.dt.days

df.head(2)

Unnamed: 0,Claim_Number,Airline_Name,Claim_Type,Claim_Site,Item,Status,Date_Received,Incident_Date,Airport_Code_Group,Airport_Name_Group,Claim_Value,Close_Value,Report_Delay
0,0909802M,ContinentalAirlines,Property Damage,Checkpoint,Other,Approved,2002-01-04,2002-12-12,EWR,Newark International Airport,350.0,350.0,-342
1,0202417M,,Property Damage,Checked Baggage,Luggage (all types including footlockers),Settled,2002-02-02,2004-01-16,SEA,Seattle-Tacoma International,100.0,50.0,-713


## MVP Baseline Model

Baseline RF model accuracy: ~45.7% (4-fold cross-validation)

This is slightly worse than naively guessing the most common class "Denied" (46.7%). However, looking at precision and recall scores, we are picking up signal on the minority classes (Approved / Settled) so this is more *useful* than the naive model.

Adding additional variables / trying more models should help!

In [42]:
# Class balance of the target: [row count, share of all rows rounded to 3 d.p.].
total_rows = len(df)
status_counts = df["Status"].value_counts()
status_counts.map(lambda count: [count, round(count / total_rows, 3)])

Denied      [67828, 0.467]
Approved     [45090, 0.31]
Settled     [32374, 0.223]
Name: Status, dtype: object

In [35]:
# Baseline model: predict Status from Claim_Value and Report_Delay only,
# scored with shuffled K-fold cross-validation.
SEED = 1      # shared by the fold splitter and the forest so re-runs reproduce
N_FOLDS = 4
METRIC_NAMES = ["precision", "recall", "f1-score", "support"]

# Drop rows missing the target or either feature.
mvp_df = df[["Status", "Claim_Value", "Report_Delay"]].dropna()

X = np.array(mvp_df[["Claim_Value", "Report_Delay"]])
Y = np.array(mvp_df.Status)
class_labels = np.unique(Y)

# Seed the forest too; without random_state every run yields different scores.
model = RandomForestClassifier(n_estimators=500, random_state=SEED)

accuracies = []
scores = []

for train_ind, test_ind in KFold(N_FOLDS, shuffle=True, random_state=SEED).split(X, Y):
    # fit() retrains the forest from scratch, so reusing `model` across folds is safe.
    model.fit(X[train_ind], Y[train_ind])
    pred = model.predict(X[test_ind])

    acc = skm.accuracy_score(Y[test_ind], pred)
    print(acc)
    accuracies.append(acc)

    print(skm.classification_report(Y[test_ind], pred))
    # Pin the label order so every fold returns arrays with the same class
    # order and length, even if a fold happens to miss a class.
    score = skm.precision_recall_fscore_support(Y[test_ind], pred, labels=class_labels)
    scores.append(score)

# Mean accuracy across folds.
print(np.average(accuracies))

# `scores` stacks to (fold, metric, class). Average over folds (axis=0) to get
# one value per metric per class; the previous axis=2 averaged across classes
# (including the support counts), which is not a per-class summary.
mean_scores = np.average(scores, axis=0)
print(pd.DataFrame(mean_scores, index=METRIC_NAMES, columns=class_labels))


0.40151159904
             precision    recall  f1-score   support

   Approved       0.53      0.44      0.48     13217
     Denied       0.34      0.67      0.45     10649
    Settled       0.39      0.13      0.20     12387

avg / total       0.43      0.40      0.37     36253

0.487849281439
             precision    recall  f1-score   support

   Approved       0.52      0.52      0.52     11626
     Denied       0.54      0.59      0.56     16957
    Settled       0.27      0.22      0.24      7670

avg / total       0.48      0.49      0.48     36253

0.484346123079
             precision    recall  f1-score   support

   Approved       0.33      0.57      0.41      6929
     Denied       0.72      0.52      0.60     23610
    Settled       0.19      0.23      0.21      5714

avg / total       0.56      0.48      0.50     36253

0.458031059499
             precision    recall  f1-score   support

   Approved       0.51      0.44      0.47     13249
     Denied       0.51      0.