In [52]:
import os
from urllib.parse import urlparse

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, classification_report)
import mlflow
import mlflow.sklearn

In [53]:
# MLflow tracking server address: http://16.171.23.137:5000

In [54]:
# Point MLflow at the tracking server. A non-empty MLFLOW_TRACKING_URI
# environment variable takes precedence, so the notebook can target another
# server without editing this cell; otherwise the original address is used.
MLFLOW_TRACKING_URI = os.environ.get("MLFLOW_TRACKING_URI") or "http://16.171.23.137:5000"
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [55]:
# Read back the tracking URI MLflow is currently configured with.
tracking_uri = mlflow.get_tracking_uri()

In [56]:
# Display the value read back from MLflow so the active tracking target can be checked.
tracking_uri

'file:///C:/Users/eiGroup%20-%20Nadir/Desktop/neptune/mlruns'

In [57]:
# Load the training CSV, keep the label column as its own one-column frame,
# and build the feature frame without the identifier and label columns.
raw_data = pd.read_csv("data/aug_train.csv")
targets = raw_data[["target"]]
data = raw_data.drop(columns=["enrollee_id", "target"])

In [58]:
# Partition the feature columns by dtype: columns stored as float64/int64 are
# treated as numerical, every other column as categorical. Column order is kept.
numeric_dtype_names = ["float64", "int64"]

numerical_features = [
    name for name in data.columns
    if str(data[name].dtype) in numeric_dtype_names
]
categorical_features = [
    name for name in data.columns
    if str(data[name].dtype) not in numeric_dtype_names
]

In [59]:
# Replace missing values in every categorical column with the sentinel 'missing'.
# The filled column is assigned back to the frame: calling
# fillna(inplace=True) on data[col] acts on an intermediate object, which
# pandas warns about and which leaves `data` unchanged under Copy-on-Write.
for categorical_feature in categorical_features:
    data[categorical_feature] = data[categorical_feature].fillna('missing')

In [60]:
# Integer-encode each categorical column with its own LabelEncoder.
# The fitted encoders are kept in `label_encoders` (keyed by column name) so
# the mapping can later be inverted or applied to new data instead of being
# lost when the loop variable is overwritten.
label_encoders = {}
for categorical_feature in categorical_features:
    le = LabelEncoder()
    data[categorical_feature] = le.fit_transform(data[categorical_feature])
    label_encoders[categorical_feature] = le

In [61]:
# Hold out 30% of the rows for evaluation, stratified on the target values
# and using a fixed seed so the split is repeatable.
feature_matrix = data.values
label_vector = targets.values.ravel()

x_train, x_test, y_train, y_test = train_test_split(
    feature_matrix,
    label_vector,
    test_size=0.3,
    random_state=2021,
    stratify=targets.values,
)

In [62]:
# Report the shapes of the train and test feature matrices.
print(f"{x_train.shape} {x_test.shape}")

(13410, 12) (5748, 12)


In [63]:
# Report the shapes of the train and test label vectors.
print(f"{y_train.shape} {y_test.shape}")

(13410,) (5748,)


In [64]:
# Train a class-weighted logistic regression inside an MLflow run and record
# its hyper-parameters, hold-out metrics, and the fitted model.
with mlflow.start_run():
    class_weight = "balanced"
    max_iter = 1000

    logistic_regression = LogisticRegression(class_weight=class_weight, max_iter=max_iter)
    logistic_regression.fit(x_train, y_train)

    # Hard class labels feed the threshold-based metrics below.
    y_pred = logistic_regression.predict(x_test)
    # ROC AUC is a ranking metric: it must be computed from continuous scores.
    # Feeding it hard 0/1 predictions collapses the curve to a single
    # threshold, so use the predicted probability of the positive class
    # (second column of predict_proba, i.e. the greater class label).
    y_score = logistic_regression.predict_proba(x_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_score)

    mlflow.log_param("class_weight", class_weight)
    mlflow.log_param("max_iter", max_iter)

    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1", f1)
    mlflow.log_metric("auc", auc)

    # NOTE(review): the captured output of this cell shows the artifact upload
    # to S3 failing with InvalidAccessKeyId; valid AWS credentials appear to
    # be required in the environment for this call to succeed.
    mlflow.sklearn.log_model(logistic_regression, "model")

S3UploadFailedError: Failed to upload C:\Users\EIGROU~1\AppData\Local\Temp\tmpz4x3f630\model\conda.yaml to awsbucketformlflow/b1f546b0690c420b8f3860fbcdcde016/artifacts/model/conda.yaml: An error occurred (InvalidAccessKeyId) when calling the PutObject operation: The AWS Access Key Id you provided does not exist in our records.