In [83]:
from urllib.parse import urlparse

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, classification_report)
import mlflow
import mlflow.sklearn

In [84]:
# MLflow tracking server: http://16.171.23.137:5000

In [85]:
# Remote MLflow tracking server, and the experiment that runs are recorded under.
MLFLOW_TRACKING_URI = "http://16.171.23.137:5000"
MLFLOW_EXPERIMENT_NAME = 'taras'

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)
# Local alternative, kept for reference: track to a SQLite file instead.
# mlflow.set_tracking_uri("sqlite:///mlflow.db")
# mlflow.set_experiment("taras_sqlite")

<Experiment: artifact_location='s3://awsbucketformlflow/709931596279444773', creation_time=1693905531961, experiment_id='709931596279444773', last_update_time=1693905531961, lifecycle_stage='active', name='taras', tags={}>

In [86]:
# Read back the tracking URI that MLflow is currently configured with.
tracking_uri = mlflow.get_tracking_uri()

In [87]:
# Display the configured tracking URI as the cell output.
tracking_uri

'http://16.171.23.137:5000'

In [88]:
# Load the training table, set the label column aside as a one-column frame,
# then remove the identifier and the label from the feature frame.
data = pd.read_csv("data/aug_train.csv")
targets = data[["target"]]
data = data.drop(columns=["enrollee_id", "target"])

In [89]:
# Partition the columns by dtype: float64/int64 columns are treated as
# numerical, every other dtype as categorical. Column order is preserved.
numeric_dtype_names = ("float64", "int64")

numerical_features = [
    name for name in data.columns
    if str(data[name].dtype) in numeric_dtype_names
]
categorical_features = [
    name for name in data.columns
    if str(data[name].dtype) not in numeric_dtype_names
]

In [90]:
# Replace missing values in every categorical column with the literal string
# 'missing'.
# The filled column is assigned back to the frame instead of calling
# `fillna(..., inplace=True)` on `data[column]`: that form is chained assignment
# on an intermediate Series, which pandas warns about and which leaves `data`
# unchanged when Copy-on-Write is enabled.
for categorical_feature in categorical_features:
    data[categorical_feature] = data[categorical_feature].fillna('missing')

In [91]:
# Integer-encode each categorical column, overwriting the original values.
# A fresh LabelEncoder is fitted per column; the fitted encoders are not kept.
for column_name in categorical_features:
    encoder = LabelEncoder()
    encoded_values = encoder.fit_transform(data[column_name])
    data[column_name] = encoded_values

In [92]:
# Hold out 30% of the rows for evaluation, stratified on the target and with a
# fixed seed so the split is reproducible.
feature_matrix = data.values
labels = targets.values.ravel()

x_train, x_test, y_train, y_test = train_test_split(
    feature_matrix,
    labels,
    test_size=0.3,
    random_state=2021,
    stratify=targets.values,
)

In [93]:
# Sanity check: shapes of the train and test feature matrices.
print(f"{x_train.shape} {x_test.shape}")

(13410, 12) (5748, 12)


In [94]:
# Sanity check: shapes of the train and test label vectors.
print(f"{y_train.shape} {y_test.shape}")

(13410,) (5748,)


In [95]:
# Train a class-weighted logistic regression, evaluate it on the held-out split,
# and record its hyperparameters, metrics and fitted model in a single MLflow run.
with mlflow.start_run(run_name='taras_test'):
    # Hyperparameters; the same values are passed to the estimator and logged.
    class_weight = "balanced"
    max_iter = 1500
    # Other scikit-learn solvers: 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'.
    solver = 'lbfgs'

    logistic_regression = LogisticRegression(class_weight=class_weight, max_iter=max_iter, solver=solver)
    logistic_regression.fit(x_train, y_train)

    # Hard class labels for the threshold-dependent metrics.
    y_pred = logistic_regression.predict(x_test)
    # ROC AUC is a ranking metric and must be computed from continuous scores.
    # Scoring the hard 0/1 predictions collapses the ROC curve to a single
    # threshold and misreports the AUC. Column 1 of predict_proba corresponds to
    # `classes_[1]`, i.e. the larger (positive) label of the binary target.
    y_score = logistic_regression.predict_proba(x_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_score)

    # Batch calls log everything in one request each instead of one per value.
    mlflow.log_params({
        "class_weight": class_weight,
        "max_iter": max_iter,
        "solver": solver,
    })
    mlflow.log_metrics({
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "auc": auc,
    })

    # NOTE(review): the experiment's artifact location is an S3 bucket, so
    # uploading the model needs AWS credentials visible to this kernel (for
    # example AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY or a shared credentials
    # file). Without them this call raises NoCredentialsError after the params
    # and metrics above have already been logged.
    mlflow.sklearn.log_model(logistic_regression, "taras_test")



NoCredentialsError: Unable to locate credentials