In [None]:
!pip install --q "snowflake-connector-python[pandas]"
!sudo pip install --q  snowflake-ml-python==1.0.11 -U
!pip install --upgrade --q snowflake-snowpark-python==1.9.0
!pip install --q seaborn
!pip install --upgrade --q xgboost==1.7.3
!pip install --upgrade --q numpy==1.24.3
!pip install --upgrade --q pandas==1.5.3
!pip install --upgrade --q anyio==3.5.0
!pip install --upgrade --q packaging==23.1
!pip install --upgrade --q scikit-learn==1.3.0
!pip install --upgrade --q typing-extensions==4.7.1
!pip install --upgrade --q cryptography==39.0.0
!pip install --upgrade --q fsspec==2023.9.2
!pip install --q xgboost
!pip install --q catboost
!pip install --q refractml

^C
[31mERROR: Operation cancelled by user[0m
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
from snowflake.snowpark import Session

# Data Science Libs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# create_temp_table warning suppresion
import warnings; warnings.simplefilter('ignore')
import configparser

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier


from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [3]:
config = configparser.ConfigParser()
config.read("snowflake_connection.ini")

['snowflake_connection.ini']

In [4]:
connection_parameters = {
    "user": f'{config["Snowflake"]["user"]}',
    "password": f'{config["Snowflake"]["password"]}',
    #"password": os.getenv('snowflake_password'),
    "account": f'{config["Snowflake"]["account"]}',
    #"account": os.getenv('snowflake_account'),
    "WAREHOUSE": f'{config["Snowflake"]["WAREHOUSE"]}',
    "DATABASE": f'{config["Snowflake"]["DATABASE"]}',
    "SCHEMA": f'{config["Snowflake"]["SCHEMA"]}'
}

In [5]:
def snowflake_connector(conn):
    try:
        session = Session.builder.configs(conn).create()
        print("connection successful!")
    except:
        raise ValueError("error while connecting with db")
    return session

session = snowflake_connector(connection_parameters)

connection successful!


In [6]:
df = session.table("AUTO_INSURANCE_CLAIMS_DATA_PRODUCT")
insurance_claim = df.to_pandas()

In [7]:
insurance_claim.columns = insurance_claim.columns.str.lower()

In [8]:
insurance_claim.columns

Index(['months_as_customer', 'customer_age', 'policy_number',
       'policy_bind_date', 'policy_state', 'policy_csl', 'policy_deductable',
       'policy_annual_premium', 'umbrella_limit', 'insured_zip', 'insured_sex',
       'insured_education_level', 'insured_occupation', 'insured_hobbies',
       'insured_relationship', 'capital_gains', 'capital_loss',
       'incident_date', 'incident_type', 'collision_type', 'incident_severity',
       'authorities_contacted', 'incident_state', 'incident_city',
       'incident_location', 'incident_hour_of_the_day', 'incident_time_of_day',
       'number_of_vehicles_involved', 'property_damage', 'bodily_injuries',
       'witnesses', 'police_report_available', 'total_claim_amount_paid',
       'injury_claim', 'property_claim', 'vehicle_claim', 'auto_make',
       'auto_model', 'auto_year', 'fraud_reported'],
      dtype='object')

In [9]:
insurance_claim.rename(columns = {'total_claim_amount_paid': 'total_claim_amount'}, inplace=True)

In [10]:
insurance_claim=insurance_claim.replace("?",np.NaN)

In [11]:
df = insurance_claim.copy()

In [12]:
# dropping columns which are not necessary for prediction

to_drop = ['policy_number','policy_bind_date','policy_state','insured_zip','incident_location','incident_date',
           'incident_state','incident_city','insured_hobbies','auto_make','auto_model','auto_year']

df.drop(to_drop, inplace=True, axis=1)

In [13]:
# separating the feature and target columns

X = df.drop('fraud_reported', axis = 1)
y = df['fraud_reported']

In [14]:
# extracting categorical columns
cat_df = X.select_dtypes(include = ['object'])

In [15]:
cat_df.columns

Index(['customer_age', 'policy_csl', 'insured_sex', 'insured_education_level',
       'insured_occupation', 'insured_relationship', 'incident_type',
       'collision_type', 'incident_severity', 'authorities_contacted',
       'incident_time_of_day', 'property_damage', 'police_report_available'],
      dtype='object')

In [16]:
# printing unique values of each column
for col in cat_df.columns:
    print(f"{col}: \n{cat_df[col].unique()}\n")

customer_age: 
['47' '58' '35' '28' '55' '65' '53' '60' '32' '36' '46' '40' '22' '26'
 '44' '38' '43' '64' '30' '31' '54' '34' '49' '68' '33' '51' '56' '25'
 '29' '69' '37' '24' '57' '61' '50' '45' '21' '62' '66' '67' '63' '52'
 '48' '41' '59' '39' '19' '27' '20' '23' '42']

policy_csl: 
['500/1000' '250/500' '100/300']

insured_sex: 
['FEMALE' 'MALE']

insured_education_level: 
['JD' 'Associate' 'High School' 'PhD' 'MD' 'Masters' 'College']

insured_occupation: 
['exec-managerial' 'craft-repair' 'tech-support' 'other-service'
 'farming-fishing' 'adm-clerical' 'prof-specialty' 'priv-house-serv'
 'machine-op-inspct' 'armed-forces' 'sales' 'protective-serv'
 'transport-moving' 'handlers-cleaners']

insured_relationship: 
['husband' 'not-in-family' 'own-child' 'wife' 'unmarried' 'other-relative']

incident_type: 
['Single Vehicle Collision' 'Multi-vehicle Collision' 'Parked Car'
 'Vehicle Theft']

collision_type: 
['Details not Available' 'Front Collision' 'Rear Collision'
 'Side Collisio

In [17]:
cat_df = pd.get_dummies(cat_df, drop_first = True)

In [18]:
num_df = df.select_dtypes(include = ['int16','int8','int32','float64'])

In [19]:
num_df.columns

Index(['months_as_customer', 'policy_deductable', 'policy_annual_premium',
       'umbrella_limit', 'capital_gains', 'capital_loss',
       'incident_hour_of_the_day', 'number_of_vehicles_involved',
       'bodily_injuries', 'witnesses', 'total_claim_amount', 'injury_claim',
       'property_claim', 'vehicle_claim'],
      dtype='object')

In [20]:
# combining the Numerical and Categorical dataframes to get the final dataset

X = pd.concat([num_df, cat_df], axis = 1)

In [21]:
# splitting data into training set and test set

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

In [22]:
num_df = X_train[['months_as_customer', 'policy_deductable', 'umbrella_limit',
       'capital_gains', 'capital_loss', 'incident_hour_of_the_day',
       'number_of_vehicles_involved', 'bodily_injuries', 'witnesses', 'injury_claim', 'property_claim',
       'vehicle_claim']]

In [23]:
num_df_test = X_test[['months_as_customer', 'policy_deductable', 'umbrella_limit',
       'capital_gains', 'capital_loss', 'incident_hour_of_the_day',
       'number_of_vehicles_involved', 'bodily_injuries', 'witnesses', 'injury_claim', 'property_claim',
       'vehicle_claim']]

In [24]:
# Scaling the numeric values in the dataset

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_data = scaler.fit_transform(num_df)

In [25]:
scaled_data_test = scaler.transform(num_df_test)

In [26]:
scaled_num_df = pd.DataFrame(data = scaled_data, columns = num_df.columns, index = X_train.index)
scaled_num_df.head()

Unnamed: 0,months_as_customer,policy_deductable,umbrella_limit,capital_gains,capital_loss,incident_hour_of_the_day,number_of_vehicles_involved,bodily_injuries,witnesses,injury_claim,property_claim,vehicle_claim
164105,-1.196353,-0.693595,1.25094,0.897808,0.089733,0.334628,2.175581,0.004458,1.343286,0.013726,0.980434,-0.83509
217052,2.089671,0.472819,1.142111,-0.379151,-0.777154,-0.176894,-0.677336,0.004458,0.446288,1.014866,0.25437,0.122605
30287,-0.65784,1.181265,-0.314274,0.522708,-0.697694,-0.006387,-0.677336,0.004458,-1.347708,-0.515548,-0.977701,-0.233739
207584,-0.130318,0.798068,0.493379,-1.139925,1.312325,-2.563995,-0.677336,-1.219328,0.446288,-1.003376,1.402233,-0.784877
144453,-0.295169,-0.20572,1.081388,-1.367791,0.296927,-0.176894,-0.677336,-1.219328,0.446288,-1.008557,-0.192233,-1.328485


In [27]:
scaled_num_df_test = pd.DataFrame(data = scaled_data_test, columns = num_df_test.columns, index = X_test.index)
scaled_num_df_test.head()

Unnamed: 0,months_as_customer,policy_deductable,umbrella_limit,capital_gains,capital_loss,incident_hour_of_the_day,number_of_vehicles_involved,bodily_injuries,witnesses,injury_claim,property_claim,vehicle_claim
136124,-1.053482,1.04294,-0.961804,1.631394,-1.519457,0.334628,-0.677336,0.004458,-0.45071,-0.645906,0.901697,-1.109893
101818,-0.987542,0.955086,0.428933,-0.528328,-1.547497,1.187164,2.175581,1.228244,1.343286,1.262981,-0.621029,0.898587
79741,0.979676,-0.201981,-0.917624,-0.264392,-1.658691,-0.688415,-0.677336,1.228244,-0.45071,0.735527,-0.135883,-1.285035
152747,-1.009522,-0.192635,-0.491553,1.397666,1.063492,-1.02943,-0.677336,-1.219328,-1.347708,-1.026199,-0.936742,-1.170826
63685,-0.844671,-0.878651,-1.754924,-1.235589,-0.660587,0.164121,-0.677336,-1.219328,-0.45071,-1.023119,0.143578,0.378394


In [28]:
X_train.drop(columns = scaled_num_df.columns, inplace = True)

In [29]:
X_train = pd.concat([scaled_num_df, X_train], axis = 1)

In [30]:
X_test.drop(columns = scaled_num_df_test.columns, inplace = True)

In [31]:
X_test = pd.concat([scaled_num_df_test, X_test], axis = 1)

# Decision Tree

In [32]:
#from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)

In [33]:
y_pred = dtc.predict(X_test)

In [34]:
dtc_train_acc = accuracy_score(y_train, dtc.predict(X_train))
dtc_test_acc = accuracy_score(y_test, y_pred)

print(f"Training accuracy of Decision Tree is : {dtc_train_acc}")
print(f"Test accuracy of Decision Tree is : {dtc_test_acc}")

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Training accuracy of Decision Tree is : 1.0
Test accuracy of Decision Tree is : 0.9946044700940803
[[ 8649   125]
 [  191 49602]]
                   precision    recall  f1-score   support

   Fraud Reported       0.98      0.99      0.98      8774
No Fraud Reported       1.00      1.00      1.00     49793

         accuracy                           0.99     58567
        macro avg       0.99      0.99      0.99     58567
     weighted avg       0.99      0.99      0.99     58567



In [35]:
from refractml import *
from refractml.constants import MLModelFlavours
import requests

In [36]:
actual_model_columns = ['months_as_customer',
 'policy_deductable',
 'umbrella_limit',
 'capital_gains',
 'capital_loss',
 'incident_hour_of_the_day',
 'number_of_vehicles_involved',
 'bodily_injuries',
 'witnesses',
 'injury_claim',
 'property_claim',
 'vehicle_claim',
 'policy_annual_premium',
 'total_claim_amount',
 'customer_age_20',
 'customer_age_21',
 'customer_age_22',
 'customer_age_23',
 'customer_age_24',
 'customer_age_25',
 'customer_age_26',
 'customer_age_27',
 'customer_age_28',
 'customer_age_29',
 'customer_age_30',
 'customer_age_31',
 'customer_age_32',
 'customer_age_33',
 'customer_age_34',
 'customer_age_35',
 'customer_age_36',
 'customer_age_37',
 'customer_age_38',
 'customer_age_39',
 'customer_age_40',
 'customer_age_41',
 'customer_age_42',
 'customer_age_43',
 'customer_age_44',
 'customer_age_45',
 'customer_age_46',
 'customer_age_47',
 'customer_age_48',
 'customer_age_49',
 'customer_age_50',
 'customer_age_51',
 'customer_age_52',
 'customer_age_53',
 'customer_age_54',
 'customer_age_55',
 'customer_age_56',
 'customer_age_57',
 'customer_age_58',
 'customer_age_59',
 'customer_age_60',
 'customer_age_61',
 'customer_age_62',
 'customer_age_63',
 'customer_age_64',
 'customer_age_65',
 'customer_age_66',
 'customer_age_67',
 'customer_age_68',
 'customer_age_69',
 'policy_csl_250/500',
 'policy_csl_500/1000',
 'insured_sex_MALE',
 'insured_education_level_College',
 'insured_education_level_High School',
 'insured_education_level_JD',
 'insured_education_level_MD',
 'insured_education_level_Masters',
 'insured_education_level_PhD',
 'insured_occupation_armed-forces',
 'insured_occupation_craft-repair',
 'insured_occupation_exec-managerial',
 'insured_occupation_farming-fishing',
 'insured_occupation_handlers-cleaners',
 'insured_occupation_machine-op-inspct',
 'insured_occupation_other-service',
 'insured_occupation_priv-house-serv',
 'insured_occupation_prof-specialty',
 'insured_occupation_protective-serv',
 'insured_occupation_sales',
 'insured_occupation_tech-support',
 'insured_occupation_transport-moving',
 'insured_relationship_not-in-family',
 'insured_relationship_other-relative',
 'insured_relationship_own-child',
 'insured_relationship_unmarried',
 'insured_relationship_wife',
 'incident_type_Parked Car',
 'incident_type_Single Vehicle Collision',
 'incident_type_Vehicle Theft',
 'collision_type_Front Collision',
 'collision_type_Rear Collision',
 'collision_type_Side Collision',
 'incident_severity_Minor Damage',
 'incident_severity_Total Loss',
 'authorities_contacted_Fire',
 'authorities_contacted_None',
 'authorities_contacted_Other',
 'authorities_contacted_Police',
 'incident_time_of_day_Early Morning Hours',
 'incident_time_of_day_Morning to Noon',
 'incident_time_of_day_Night Time',
 'property_damage_No Property Damage',
 'property_damage_Property Damage',
 'police_report_available_Police Report Available',
 'police_report_available_Unknown']

In [37]:
@scoring_func
def score(model, request):
    payload_dict = request.json["payload"]
    df = pd.DataFrame(payload_dict,index=[0])
    df = df.fillna(0)
    df.columns = df.columns.str.lower()
    df.drop(to_drop, inplace=True, axis=1)
    
    # extracting categorical columns
    cat_df = df[['customer_age','policy_csl','insured_sex','insured_education_level','insured_occupation','insured_relationship','incident_type','collision_type','incident_severity','authorities_contacted','incident_time_of_day','property_damage','police_report_available']]
    cat_df = pd.get_dummies(cat_df, drop_first = True)

    num_df = df[['months_as_customer', 'policy_deductable', 'policy_annual_premium',
       'umbrella_limit', 'capital_gains', 'capital_loss',
       'incident_hour_of_the_day', 'number_of_vehicles_involved',
       'bodily_injuries', 'witnesses', 'total_claim_amount', 'injury_claim',
       'property_claim', 'vehicle_claim']]
    
    X = pd.concat([num_df, cat_df], axis = 1)
    
    num_df_test = X[['months_as_customer', 'policy_deductable', 'umbrella_limit',
       'capital_gains', 'capital_loss', 'incident_hour_of_the_day',
       'number_of_vehicles_involved', 'bodily_injuries', 'witnesses', 'injury_claim', 'property_claim',
       'vehicle_claim']]

    scaled_data_test = scaler.transform(num_df_test)
    scaled_num_df_test = pd.DataFrame(data = scaled_data_test, columns = num_df_test.columns, index = X.index)
    
    X.drop(columns = scaled_num_df_test.columns, inplace = True)
    X = pd.concat([scaled_num_df_test, X], axis = 1)
    
    missing_features = [missing_col for missing_col in actual_model_columns if missing_col not in X.columns]
    X[missing_features] = 0
    
    prediction = model.predict(X[actual_model_columns])
    probability = model.predict_proba(X)[:,1]
    return {"prediction" : prediction, "probability" : probability}

In [38]:
insurance_claim.columns = insurance_claim.columns.str.upper()
insurance_claims = insurance_claim.copy()
insurance_claims.drop('FRAUD_REPORTED', axis = 1, inplace=True)
payload = insurance_claims.iloc[0].to_dict()

In [39]:
payload

{'MONTHS_AS_CUSTOMER': 181,
 'CUSTOMER_AGE': '47',
 'POLICY_NUMBER': 430602,
 'POLICY_BIND_DATE': '09-11-1992',
 'POLICY_STATE': 'CT',
 'POLICY_CSL': '500/1000',
 'POLICY_DEDUCTABLE': 702,
 'POLICY_ANNUAL_PREMIUM': 856,
 'UMBRELLA_LIMIT': 0,
 'INSURED_ZIP': '449352',
 'INSURED_SEX': 'FEMALE',
 'INSURED_EDUCATION_LEVEL': 'JD',
 'INSURED_OCCUPATION': 'exec-managerial',
 'INSURED_HOBBIES': 'polo',
 'INSURED_RELATIONSHIP': 'husband',
 'CAPITAL_GAINS': 74623,
 'CAPITAL_LOSS': -68050,
 'INCIDENT_DATE': datetime.date(2023, 8, 18),
 'INCIDENT_TYPE': 'Single Vehicle Collision',
 'COLLISION_TYPE': 'Details not Available',
 'INCIDENT_SEVERITY': 'Major Damage',
 'AUTHORITIES_CONTACTED': 'Fire',
 'INCIDENT_STATE': 'CT',
 'INCIDENT_CITY': 'Hartford',
 'INCIDENT_LOCATION': 'Oak',
 'INCIDENT_HOUR_OF_THE_DAY': 23,
 'INCIDENT_TIME_OF_DAY': 'Night Time',
 'NUMBER_OF_VEHICLES_INVOLVED': 1,
 'PROPERTY_DAMAGE': 'Property Damage',
 'BODILY_INJURIES': 1,
 'WITNESSES': 3,
 'POLICE_REPORT_AVAILABLE': 'Police Re

In [40]:
req = requests.Request()
req.json = {"payload":payload}
y_req = req
y_out = score(dtc, y_req)
y_out

{'prediction': array(['No Fraud Reported'], dtype=object),
 'probability': array([1.])}

In [49]:
insurance_claim.tail(100)

Unnamed: 0,MONTHS_AS_CUSTOMER,CUSTOMER_AGE,POLICY_NUMBER,POLICY_BIND_DATE,POLICY_STATE,POLICY_CSL,POLICY_DEDUCTABLE,POLICY_ANNUAL_PREMIUM,UMBRELLA_LIMIT,INSURED_ZIP,...,WITNESSES,POLICE_REPORT_AVAILABLE,TOTAL_CLAIM_AMOUNT,INJURY_CLAIM,PROPERTY_CLAIM,VEHICLE_CLAIM,AUTO_MAKE,AUTO_MODEL,AUTO_YEAR,FRAUD_REPORTED
234165,169,66,483447,18-11-2003,CT,100/300,604,686,3199228,464652,...,0,Unknown,77644.0,6993.0,4148.0,66503.0,Nissan,Maxima,2012,No Fraud Reported
234166,202,53,427455,02-04-1991,CT,500/1000,996,1383,2829397,458237,...,3,Police Report Available,94954.0,18311.0,16078.0,60565.0,Ford,F150,2013,No Fraud Reported
234167,47,44,466915,05-10-1994,CT,250/500,575,676,7946053,468232,...,2,Police Report Available,59171.0,8446.0,1054.0,49671.0,Volkswagen,Jetta,2009,No Fraud Reported
234168,100,55,452953,03-12-1996,CT,100/300,1991,2488,3068836,601425,...,3,Police Report Available,53853.0,515.0,913.0,52425.0,Chevrolet,Malibu,2001,No Fraud Reported
234169,61,38,463686,25-08-2001,CT,100/300,656,862,0,447300,...,2,No Police Report Available,69185.0,524.0,916.0,67745.0,Dodge,RAM,2010,No Fraud Reported
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234260,140,48,453291,04-12-2006,CT,250/500,1426,1638,8572585,458237,...,2,No Police Report Available,53439.0,18424.0,1086.0,33929.0,Saab,95,1995,No Fraud Reported
234261,53,26,470896,28-05-2006,CT,100/300,1048,1262,2784522,432399,...,0,Police Report Available,68728.0,9934.0,990.0,57804.0,BMW,X6,2014,Fraud Reported
234262,83,35,485806,27-12-1999,CT,500/1000,1817,2559,6042534,476203,...,3,Police Report Available,96213.0,6129.0,18861.0,71223.0,BMW,X5,2013,No Fraud Reported
234263,179,39,482814,07-06-2002,CT,100/300,947,1315,3707792,442142,...,1,Unknown,54810.0,600.0,9370.0,44840.0,Suburu,Legacy,2012,No Fraud Reported


In [59]:
y_pred = dtc.predict(X_test)
y_prob = dtc.predict_proba(X_test)[:,1]

In [60]:
y_pred

array(['No Fraud Reported', 'No Fraud Reported', 'No Fraud Reported', ...,
       'Fraud Reported', 'No Fraud Reported', 'No Fraud Reported'],
      dtype=object)

In [61]:
y_prob

array([1., 1., 1., ..., 0., 1., 1.])

In [41]:
## registering the model in refract.
model_reg = register_model(dtc, 
               score, 
               name="ILF_DecisionTree_FraudClassifier", 
               description="Insurance claim's model trained for Fraud Classification",
               flavour=MLModelFlavours.sklearn,
               model_type="regression",
               init_script="pip install snowflake-ml-python==1.0.11",
               y_true=y_test,
               y_pred=y_pred, 
               features=X_train.columns,
               input_type="json", 
               explain_ai=True,
               x_train=X_train, 
               x_test=X_test, 
               y_train=y_train.tolist(),
               y_test=y_test.tolist(),
               feature_names=X_train.columns.tolist(),
               original_features=X_train.columns.tolist(),
               feature_ids=X_train.columns,
               kyd=True, kyd_score = True)

IndentationError: unexpected indent (<ipython-input-41-e4616a1d3fee>, line 3)