In [27]:
import pandas as pd
import sqlite3
import onnx # not used but can be used to inspect and manipulate saved onnx model
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
from sklearn.ensemble import RandomForestClassifier
conn = sqlite3.connect('intex2.db')
cursor = conn.cursor()
read_sql = "SELECT * from Orders"
df = pd.read_sql_query(read_sql, conn)
df


#This has all the functions
def bin_categories(df, features=[], cutoff=0.05, replace_with='Other', messages=False):

  import pandas as pd

  if len(features) == 0: features = df.columns

  for feat in features:
    if feat in df.columns:
      if not pd.api.types.is_numeric_dtype(df[feat]):
        other_list = df[feat].value_counts()[df[feat].value_counts() / df.shape[0] < cutoff].index
        df.loc[df[feat].isin(other_list), feat] = replace_with
        if messages: print(f'{feat} has been binned by setting {other_list} to {replace_with}')
    else:
      if messages: print(f'{feat} not found in the DataFrame provided. No binning performed')

  return df

def Xandy(df, label):
    import pandas as pd
    y = df[label]
    X = df.drop(columns=[label])
    return X, y

def dummy_code(X):
    import pandas as pd
    X = pd.get_dummies(X.copy(), drop_first=True)
    return X

def missing_data(df, label, row_thresh = 0.7, col_thresh = 0.9, random=False, random_state=3):
    import pandas as pd
    df.dropna(axis='rows', subset=[label], inplace=True)
    df.dropna(axis='columns', thresh=1, inplace=True)
    df.dropna(axis='rows', thresh=1, inplace=True)
    df.dropna(axis='columns', thresh=round(df.shape[0] * row_thresh), inplace=True)
    df.dropna(axis='rows', thresh=round(df.shape[1] * col_thresh), inplace=True)
    #impute values
    if df.isna().sum().sum() > 0:
        from sklearn.experimental import enable_iterative_imputer
        from sklearn.impute import IterativeImputer, KNNImputer
        X, y = Xandy(df, label)
        X = dummy_code(X.copy())
        if random: random_state = 0
        imp = IterativeImputer(max_iter=10, random_state=random_state)
        X = pd.DataFrame(imp.fit_transform(X), columns = X.columns, index = X.index)
        df = X.merge(y, left_index=True, right_index=True)
    else:
        X, y = Xandy(df, label)
        X = dummy_code(X.copy())
        df = X.merge(y, left_index=True, right_index=True)
    return df

def fit_cv_model(df, label, k=5, repeat=True, random=False, random_state=3, messages=False):
    from sklearn.model_selection import KFold, RepeatedKFold, cross_val_score
    import pandas as pd
    import numpy as np
    X, y = Xandy(df, label)
    X = dummy_code(X.copy())
    if repeat:
        cv = RepeatedKFold(n_splits=k, n_repeats=5)
    else:
        cv = KFold(n_splits=k)
    if random==True: random_state = 0
    
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
    from sklearn.linear_model import RidgeClassifier, LogisticRegression
    model_rfc = RandomForestClassifier(random_state=random_state)
    model_ridge = RidgeClassifier(random_state=random_state)
    model_gbc = GradientBoostingClassifier(random_state=random_state)
    model_log = LogisticRegression(random_state=random_state, max_iter=10000)
    scores_rfc = cross_val_score(model_rfc, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    scores_ridge = cross_val_score(model_ridge, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    scores_gbc = cross_val_score(model_gbc, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    scores_log = cross_val_score(model_log, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    if messages == True:
        print(f'Accuracy (RandomForest):\t{np.mean(scores_rfc)}')
        print(f'Accuracy (Ridge):\t{np.mean(scores_ridge)}')
        print(f'Accuracy (GradientBoosting):\t{np.mean(scores_gbc)}')
        print(f'Accuracy (Log):\t{np.mean(scores_log)}')
    scores = {np.mean(scores_rfc):model_rfc, 
            np.mean(scores_gbc):model_gbc, 
            np.mean(scores_ridge):model_ridge, 
            np.mean(scores_log):model_log }    
    return scores[max(scores.keys())].fit(X, y)

def select_features(df, label, model, messages=True):
  from sklearn.feature_selection import SelectFromModel
  import pandas as pd

  y = df[label]
  X = df.drop(columns=[label])
  X = pd.get_dummies(X.copy(), drop_first=True)

  sel = SelectFromModel(model, prefit=True)
  sel.transform(X)

  columns = list(X.columns[sel.get_support()])
  new_df = X[columns]
  new_df[label] = y
  return new_df

def fit_cv_model_expanded(df, label, k=10, r=5, repeat=True, random_state=0):
    import pandas as pd
    import numpy as np
    from sklearn.model_selection import KFold, RepeatedKFold, cross_val_score
    from sklearn.metrics import accuracy_score
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
    from sklearn.svm import SVC, LinearSVC
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.neural_network import MLPClassifier
    from xgboost import XGBClassifier

    # Assuming Xandy is a function to split features (X) and target (y)
    X, y = Xandy(df, label)

    if repeat:
        cv = RepeatedKFold(n_splits=k, n_repeats=r, random_state=random_state)
    else:
        cv = KFold(n_splits=k, random_state=random_state, shuffle=True)
    
    fit = {}    # Use this to store each of the fit metrics
    models = {} # Use this to store each of the models

    # Create the model objects
    model_logistic = LogisticRegression(random_state=random_state)
    model_rf = RandomForestClassifier(random_state=random_state)
    model_gb = GradientBoostingClassifier(random_state=random_state)
    model_ab = AdaBoostClassifier(n_estimators=100, random_state=random_state)
    model_svc = SVC(random_state=random_state)
    model_lsvc = LinearSVC(random_state=random_state)
    model_knn = KNeighborsClassifier(n_neighbors=5)
    model_xgb = XGBClassifier(n_estimators=1000, max_depth=7, learning_rate=0.1, subsample=0.7, colsample_bytree=0.8, random_state=random_state)
    model_mlp = MLPClassifier(max_iter=1000, random_state=random_state)

    # Fit a cross-validated accuracy score and add it to the dict
    fit['Logistic Regression'] = np.mean(cross_val_score(model_logistic, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
    fit['Random Forest'] = np.mean(cross_val_score(model_rf, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
    fit['Gradient Boosting'] = np.mean(cross_val_score(model_gb, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
    fit['AdaBoost'] = np.mean(cross_val_score(model_ab, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
    fit['SVM'] = np.mean(cross_val_score(model_svc, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
    fit['Linear SVM'] = np.mean(cross_val_score(model_lsvc, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
    fit['KNN'] = np.mean(cross_val_score(model_knn, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
    fit['XGBoost'] = np.mean(cross_val_score(model_xgb, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
    fit['Neural Network'] = np.mean(cross_val_score(model_mlp, X, y, scoring='accuracy', cv=cv, n_jobs=-1))

    # Add the model to another dict; make sure the keys have the same names as the list above
    models['Logistic Regression'] = model_logistic
    models['Random Forest'] = model_rf
    models['Gradient Boosting'] = model_gb
    models['AdaBoost'] = model_ab
    models['SVM'] = model_svc
    models['Linear SVM'] = model_lsvc
    models['KNN'] = model_knn
    models['XGBoost'] = model_xgb
    models['Neural Network'] = model_mlp

    # Add the fit dictionary to a new DataFrame, sort, extract the top row, use it to retrieve the model object from the models dictionary
    df_fit = pd.DataFrame({'Accuracy': fit})
    df_fit.sort_values(by=['Accuracy'], ascending=False, inplace=True)
    best_model = df_fit.index[0]
    

    return models[best_model].fit(X, y)


df = df.sample(n=1000)
#We are not limiting down to CVC and Online only because if there is other types of transactions those need to be known
new_5_percent_df = bin_categories(df.copy(), cutoff=0.05)
label = 'Fraud'
new_5_percent_df = missing_data(new_5_percent_df, label)
model = fit_cv_model(new_5_percent_df, k=5, label=label, messages=False)
df_reduced = select_features(new_5_percent_df.copy(), label=label, model=model, messages=False)
model = fit_cv_model_expanded(df_reduced, k=5, label=label)
X, y = Xandy(df, label)
initial_type = [('float_input', FloatTensorType([None, X.shape[1]]))]
onnx_model = convert_sklearn(model, initial_types=initial_type)

with open("fraud_onnx_model.onnx", "wb")as f:
    f.write(onnx_model.SerializeToString())


cursor.close()
conn.close()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df[label] = y


                     Accuracy
Random Forest          0.9626
Gradient Boosting      0.9618
XGBoost                0.9562
AdaBoost               0.9542
Logistic Regression    0.9510
SVM                    0.9380
KNN                    0.9342
Linear SVM             0.9004
Neural Network         0.7956
RandomForestClassifier(random_state=0)
