In [1]:
import pandas as pd
import datetime as dt
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.metrics import confusion_matrix,classification_report, auc, roc_auc_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')

In [2]:
def read_prepared_data(path="/data/prepared.csv"):
    """Load the prepared transactions table from a CSV file.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the prepared CSV. Defaults to ``"/data/prepared.csv"`` so
        existing zero-argument calls keep working; pass another location to
        run the notebook against a different copy of the data.

    Returns
    -------
    pandas.DataFrame
        The file contents exactly as parsed by ``pandas.read_csv``.
    """
    return pd.read_csv(path)

In [3]:
def _build_category_encoder():
    """Create the dense, drop-first one-hot encoder for the ``category`` column.

    Newer scikit-learn releases name the dense-output flag ``sparse_output``;
    older ones only accept ``sparse``. Try the new spelling first and fall
    back so the notebook runs on either.
    """
    try:
        return OneHotEncoder(sparse_output=False, drop='first')
    except TypeError:
        return OneHotEncoder(sparse=False, drop='first')


def _encode_category(features, fitted_encoder):
    """Return a copy of ``features`` with ``category`` replaced by one-hot columns.

    The index is reset to 0..n-1 so that the positional output of the encoder
    lines up with the remaining feature columns when they are concatenated.
    """
    # ``get_feature_names`` was replaced by ``get_feature_names_out`` in newer
    # scikit-learn; use whichever this installation provides.
    names_of = getattr(fitted_encoder, "get_feature_names_out", None)
    if names_of is None:
        names_of = fitted_encoder.get_feature_names

    features = features.reset_index(drop=True)
    one_hot = pd.DataFrame(
        fitted_encoder.transform(features[['category']]),
        columns=names_of(['category']),
    )
    return pd.concat([features.drop(columns='category'), one_hot], axis=1)


def train_test(df_final):
    """Split the prepared data and one-hot encode the ``category`` feature.

    The frame is split 80/20, stratified on ``is_fraud`` with a fixed seed.
    The encoder is fitted on the training rows only and then applied to both
    partitions, so no information from the test rows reaches the encoder.

    Side effect: the fitted encoder is stored in the module-level name
    ``encoder`` so that later cells can encode raw payloads the same way.

    Parameters
    ----------
    df_final : pandas.DataFrame
        Must contain the target column ``is_fraud`` and a ``category`` column.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Feature frames with ``category`` replaced by its one-hot columns.
    y_train, y_test : pandas.Series
        Targets, re-indexed 0..n-1 so they share an index with the matching
        feature frame (previously only the features were re-indexed, which
        silently misaligned any label-based pairing of X and y).
    """
    global encoder

    X = df_final.drop(columns='is_fraud')
    y = df_final['is_fraud']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Fit on training rows only; the test partition is transformed, never fitted.
    encoder = _build_category_encoder()
    encoder.fit(X_train[['category']])

    X_train = _encode_category(X_train, encoder)
    X_test = _encode_category(X_test, encoder)
    y_train = y_train.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    return X_train, X_test, y_train, y_test
    

In [4]:
def hyper_parameter_tunning(X_train, X_test, y_train, y_test):
    """Randomized hyper-parameter search for a random forest, plus a test report.

    Runs ``RandomizedSearchCV`` (5-fold CV, default number of sampled
    candidates) over ``n_estimators``, ``max_features``, ``max_depth`` and
    ``bootstrap``. Prints the best parameters and CV score, then the
    classification report and confusion matrix of the refitted best model on
    the held-out test set.

    Both the candidate sampling and the forests themselves are seeded so a
    re-run reproduces the same search result.

    Parameters
    ----------
    X_train, y_train
        Training features and labels used for the cross-validated search.
    X_test, y_test
        Held-out features and labels used only for the printed evaluation.

    Returns
    -------
    RandomizedSearchCV
        The fitted search object.
    """
    # Candidate order is kept as-is: with a fixed random_state the sampled
    # combinations depend on the order of these lists.
    grid_param = {
        'n_estimators': [100, 200, 300, 50, 500, 400],
        "max_features": [0.3, 0.2, 0.7, 1.0, 0.6, 0.8, 0.9],
        "max_depth": [10, 5, 8, None],
        "bootstrap": [True, False],
    }
    random_forest_grid = RandomizedSearchCV(
        # Seed the estimator too; otherwise each fit grows different trees and
        # the reported best score changes from run to run.
        estimator=RandomForestClassifier(random_state=42),
        param_distributions=grid_param,
        cv=5,
        verbose=2,
        n_jobs=-1,
        random_state=42,
    )
    random_forest_grid.fit(X_train, y_train)

    print("best param\n",random_forest_grid.best_params_)
    print("best score\n",random_forest_grid.best_score_)

    # predict() on the search object delegates to the refitted best estimator.
    predicted = random_forest_grid.predict(X_test)
    print('Classification report:\n', classification_report(y_test, predicted))
    conf_mat = confusion_matrix(y_true=y_test, y_pred=predicted)
    print('Confusion matrix:\n', conf_mat)

    return random_forest_grid
    


In [5]:
# Run the pipeline end to end: load the prepared CSV, split and one-hot encode
# it, then run the randomized random-forest search (which also prints the
# test-set report shown below).
df_final = read_prepared_data()
X_train, X_test, y_train, y_test = train_test(df_final)
random_forest_grid = hyper_parameter_tunning(X_train, X_test, y_train, y_test)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
best param
 {'n_estimators': 300, 'max_features': 0.7, 'max_depth': None, 'bootstrap': True}
best score
 0.9770741952473847
Classification report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98      1931
           1       0.98      0.98      0.98      1930

    accuracy                           0.98      3861
   macro avg       0.98      0.98      0.98      3861
weighted avg       0.98      0.98      0.98      3861

Confusion matrix:
 [[1901   30]
 [  29 1901]]


In [6]:
# Deliberately small forest (3 trees, 1% of the features per split) kept as a
# "V1" comparison model. random_state is fixed so the metrics printed below
# are reproducible on a fresh kernel.
randforest_model_V1 = RandomForestClassifier(
    n_estimators=3,
    max_features=0.01,
    max_depth=None,
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
)
randforest_model_V1.fit(X_train,y_train)

# Evaluate on the held-out test split.
predicted=randforest_model_V1.predict(X_test)
print('Classification report:\n', classification_report(y_test, predicted))
conf_mat = confusion_matrix(y_true=y_test, y_pred=predicted)
print('Confusion matrix:\n', conf_mat)

Classification report:
               precision    recall  f1-score   support

           0       0.92      0.93      0.92      1931
           1       0.93      0.92      0.92      1930

    accuracy                           0.92      3861
   macro avg       0.92      0.92      0.92      3861
weighted avg       0.92      0.92      0.92      3861

Confusion matrix:
 [[1792  139]
 [ 153 1777]]


In [7]:
# Compact forest (5 trees, 20% of the features per split). random_state is
# fixed so the fitted model and the metrics printed below are reproducible.
# NOTE(review): these hyper-parameters are hand-picked literals rather than
# taken from the randomized search's best_params_ - confirm that is intended.
randforest_model = RandomForestClassifier(
    n_estimators=5,
    max_features=0.2,
    max_depth=None,
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
)
randforest_model.fit(X_train,y_train)

# Evaluate on the held-out test split.
predicted=randforest_model.predict(X_test)
print('Classification report:\n', classification_report(y_test, predicted))
conf_mat = confusion_matrix(y_true=y_test, y_pred=predicted)
print('Confusion matrix:\n', conf_mat)

Classification report:
               precision    recall  f1-score   support

           0       0.96      0.97      0.97      1931
           1       0.97      0.96      0.97      1930

    accuracy                           0.97      3861
   macro avg       0.97      0.97      0.97      3861
weighted avg       0.97      0.97      0.97      3861

Confusion matrix:
 [[1876   55]
 [  72 1858]]


In [8]:
from mosaicml import *
from mosaicml.constants import MLModelFlavours

In [42]:
# Build one example scoring payload from the first row of the prepared data.
# 'category' is selected last so it can be split off and one-hot encoded below.
# NOTE(review): assumes this column order matches the non-category column
# order of X_train - TODO confirm, since the model receives a positional list.
df_payload = df_final[['amt','zip','lat','long','city_pop','merch_lat','merch_long','age','hour','day','month','category']].copy()
# Flatten the single row into a plain Python list (raw, un-encoded form).
payload_list = df_payload.head(1).values.flatten().tolist()
print("Raw Payload\n",payload_list)
# Encode the trailing category value with the encoder fitted on the training set.
encoding_cat = encoder.transform([[payload_list[-1]]])
# Final feature vector: the raw values without the category, followed by its one-hot columns.
payload = payload_list[:-1]+list(encoding_cat.flat)
payload

Raw Payload
 [281.06, 28611, 35.9946, -81.7266, 885, 36.430124, -81.17948299999998, 31, 1, 2, 1, 'grocery_pos']


[281.06,
 28611,
 35.9946,
 -81.7266,
 885,
 36.430124,
 -81.17948299999998,
 31,
 1,
 2,
 1,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0]

In [43]:
@scoring_func
def score(model, request):
    """Classify the single transaction carried in ``request.json["payload"]``.

    The payload is a flat list whose last element is the raw category value
    and whose preceding elements are passed to the model unchanged. The
    category is one-hot encoded with the module-level ``encoder`` and the
    resulting columns are appended after the other values before predicting.

    Returns the string "Non Fraud Transcation" when the model predicts 0 and
    "Fraud Transcation" otherwise.
    """
    raw_values = request.json["payload"]

    # Split off the trailing category and expand it into its one-hot columns.
    category_value = raw_values[-1]
    one_hot = encoder.transform([[category_value]])
    feature_row = raw_values[:-1] + list(one_hot.flat)

    # The model expects a 2-D input, so wrap the single row in a list.
    predicted_label = pd.Series(model.predict([feature_row]))[0]
    if predicted_label == 0:
        return "Non Fraud Transcation"
    return "Fraud Transcation"

In [44]:
# Build a stand-in request object for a local test of score(): the scoring
# function only reads `request.json["payload"]`, so a bare requests.Request
# with its `json` attribute set to the raw (un-encoded) payload is enough.
import requests
req = requests.Request()
req.json = {"payload":payload_list}

In [45]:
# Local smoke test: score the example payload with the trained model.
score(randforest_model,req)

'Fraud Transcation'

In [46]:
# Test-set outputs of randforest_model: hard labels and the second
# predict_proba column.
# NOTE(review): column 1 is presumably the fraud class (label 1) - verify
# against randforest_model.classes_.
Y_pred = randforest_model.predict(X_test)
y_prob = randforest_model.predict_proba(X_test)[:,1]

In [47]:
# Same test-set outputs for the V1 comparison model: hard labels and the
# second predict_proba column.
# NOTE(review): column 1 is presumably the fraud class (label 1) - verify
# against randforest_model_V1.classes_.
Y_pred_v1 = randforest_model_V1.predict(X_test)
y_prob_v1 = randforest_model_V1.predict_proba(X_test)[:,1]

In [48]:
# Register the trained model together with its scoring function and the
# evaluation artefacts computed above.
abc = register_model(
    randforest_model,
    score,
    "Credit_Card_Fraud_Detection_Model",
    "Classification Undersampling",
    MLModelFlavours.sklearn,
    model_type='classification',
    # Pin the serving environment to the library versions named here.
    init_script="pip install scikit-learn==0.24.2 && pip install pandas==1.3.5",
    y_true=y_test,
    y_pred=Y_pred,
    prob=y_prob,
    x_test=X_test,
    # Ground-truth test labels. This argument previously received Y_pred (the
    # model's own predictions), which would make anything computed from
    # x_test/y_test compare the predictions with themselves.
    y_test=y_test,
    features=X_train.columns.tolist(),
    original_features=X_train.columns.tolist(),
    model_display=True,
    kyd=True,
    kyd_score=True,
    explain_ai=True,
    x_train=X_train,
    y_train=y_train,
    feature_names=X_train.columns.tolist(),
    feature_ids=X_train.columns.tolist(),
    target_names=[0,1]
)

Calculating build time metrics

Progress: ██████████████████████████████████████████████████████████████████████ 100.0%


VBox(children=(HTML(value='<style>.grad_1{background: #2468a4;} .grad_2{ color:white; background: #2468a4;}</s…

#### No Fraud
{"payload":[4.97,28654,36.0788,-81.1781,3495,3.0112,-82.048315,35,0,1,1,"misc_net"]}

#### Fraud
{"payload": [281.06,28611,35.9946,-81.7266,885,36.430124,-81.17948299999998,31,1,2,1,"grocery_pos"]}