In [15]:
pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.10.1-py3-none-any.whl (226 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.0/226.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting joblib>=1.1.1
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: joblib, imbalanced-learn, imblearn
  Attempting uninstall: joblib
    Found existing installation: joblib 1.1.0
    Uninstalling joblib-1.1.0:
      Successfully uninstalled joblib-1.1.0
Successfully installed imbalanced-learn-0.10.1 imblearn-0.0 joblib-1.2.0
[0m--- Logging error ---
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/pip/_internal/utils/logging.py", line 177, in emit
    self.console.print(re

In [9]:
#Carga de datos

def load_data():
    
    from preparation.preparation import load_file_card
    
    ''' Defined Load File '''
    model_data = load_file_card().copy()
    
    ''' Preparin data for analytic model '''
    x = model_data.drop("fraud", axis = 1).values
    y = model_data["fraud"].values
    
    return x, y

In [10]:
#Particionamiento de datos

def make_train_test_split(x, y):
    
    import pandas as pd
    from imblearn.over_sampling import SMOTE
    from sklearn.model_selection import train_test_split
    
    smote = SMOTE(random_state=39)
    non_fraud_over, fraud_over = smote.fit_resample(x, y)

    non_fraud_over_df = pd.DataFrame(non_fraud_over, columns=["distance_from_home", "distance_from_last_transaction",
        "ratio_to_median_purchase_price", "repeat_retailer", "used_chip",
        "used_pin_number", "online_order"])

    non_fraud_over_df["fraud"] = fraud_over
    df3 = non_fraud_over_df

    feature_columns = ["distance_from_home", "distance_from_last_transaction",
    "ratio_to_median_purchase_price", "repeat_retailer", "used_chip", "used_pin_number", "online_order"]

    X_smote = df3[feature_columns]
    y_smote = df3.fraud

    X_train_smote, X_test_smote, y_train_smote, y_test_smote = train_test_split(X_smote, y_smote, test_size=0.2, random_state=39)
    
    return X_train_smote, X_test_smote, y_train_smote, y_test_smote


In [43]:
#Calculo de metricas

def eval_metrics(y_test_smote, y_pred_logreg_smote):
    
    from sklearn.metrics import classification_report
    from sklearn.metrics import confusion_matrix
    
    #confusion matrix
    confusion_matrix_logreg = confusion_matrix(y_test_smote, y_pred_logreg_smote)
    
    #classification report
    classification_report_logreg = classification_report(y_test_smote, y_pred_logreg_smote, digits=6)
    
    return confusion_matrix_logreg, classification_report_logreg    

In [44]:
#Reporte de métricas

def report(confusion_matrix_logreg, classification_report_logreg):
    
    print(f"Confusion matrix Logistic Regression: {confusion_matrix_logreg}")
    print(f"classification report RL: {classification_report_logreg}")

In [45]:
#Entrenamiento modelo

def train_logreg (max_iter = 200):
    
    import mlflow.sklearn
    import mlflow    
    from sklearn.linear_model import LogisticRegression
    
    x, y = load_data()
    
    X_train_smote, X_test_smote, y_train_smote, y_test_smote = make_train_test_split(x, y)
    
    print('Tracking directory:', mlflow.get_tracking_uri())
    
    with mlflow.start_run(run_name = "LogReg_Model") as run:
    
        logreg = LogisticRegression(max_iter=max_iter)
        logreg.fit(X_train_smote, y_train_smote)

        y_pred_logreg_smote = logreg.predict(X_test_smote)
        
        
        confusion_matrix_logreg, classification_report_logreg = eval_metrics(y_test_smote, y_pred_logreg_smote)
        
        report(confusion_matrix_logreg, classification_report_logreg)
        
        #
        # Tracking de parámetros
        #
        mlflow.log_param("max_iter", max_iter)

        #
        # Tracking de metricas
        #
        mlflow.log_metric("accuracy_logreg", logreg.score(X_test_smote, y_test_smote))
        #mlflow.log_metric("classification_report_logreg", classification_report_logreg)

        #
        # Tracking del modelo
        #
        #mlflow.sklearn.log_model(logreg, "model")
        
        return (run.info.experiment_id, run.info.run_id)

In [46]:
#Corridas
train_logreg()

Tracking directory: file:///workspace/src/mlruns
Confusion matrix Logistic Regression: [[170558  12121]
 [  9170 173190]]
classification report RL:               precision    recall  f1-score   support

         0.0   0.948978  0.933649  0.941251    182679
         1.0   0.934591  0.949715  0.942092    182360

    accuracy                       0.941675    365039
   macro avg   0.941785  0.941682  0.941672    365039
weighted avg   0.941791  0.941675  0.941671    365039



('0', 'd45ec45a18554895aec385db3e639489')

In [47]:
train_logreg(300)

Tracking directory: file:///workspace/src/mlruns
Confusion matrix Logistic Regression: [[170558  12121]
 [  9170 173190]]
classification report RL:               precision    recall  f1-score   support

         0.0   0.948978  0.933649  0.941251    182679
         1.0   0.934591  0.949715  0.942092    182360

    accuracy                       0.941675    365039
   macro avg   0.941785  0.941682  0.941672    365039
weighted avg   0.941791  0.941675  0.941671    365039



('0', '751ddc4fcf574d339193df6ae51e8c7e')

In [48]:
train_logreg(500)

Tracking directory: file:///workspace/src/mlruns
Confusion matrix Logistic Regression: [[170558  12121]
 [  9170 173190]]
classification report RL:               precision    recall  f1-score   support

         0.0   0.948978  0.933649  0.941251    182679
         1.0   0.934591  0.949715  0.942092    182360

    accuracy                       0.941675    365039
   macro avg   0.941785  0.941682  0.941672    365039
weighted avg   0.941791  0.941675  0.941671    365039



('0', '08316c1316ed4284887e5097e74d9229')

In [49]:
train_logreg(5000)

Tracking directory: file:///workspace/src/mlruns
Confusion matrix Logistic Regression: [[170558  12121]
 [  9170 173190]]
classification report RL:               precision    recall  f1-score   support

         0.0   0.948978  0.933649  0.941251    182679
         1.0   0.934591  0.949715  0.942092    182360

    accuracy                       0.941675    365039
   macro avg   0.941785  0.941682  0.941672    365039
weighted avg   0.941791  0.941675  0.941671    365039



('0', '5c85417c2539483686b9b77dd303d797')