In [1]:
import os
import mlflow
import requests
import numpy as np
import pandas as pd
import mysql.connector
import joblib

from sklearn.metrics import f1_score, accuracy_score, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline

In [4]:
# Open a connection to the MySQL instance that stores the training table.
# NOTE(review): host and credentials are hardcoded; consider supplying them
# through environment variables or a secrets store before sharing this notebook.
conn = mysql.connector.connect(
    host="mysql",
    user="airflow",
    password="airflow",
    database="airflow"
)

select_query = """
SELECT
*
FROM
    data_table
"""

# Read the whole table into a DataFrame. Closing in `finally` guarantees the
# connection is released even when the query (or the DataFrame build) raises.
try:
    df = pd.read_sql(select_query, con=conn)
finally:
    conn.close()

  df = pd.read_sql(select_query, con=conn)


In [5]:
# Cast every column to float except the two text-valued ones.
non_numeric_columns = ['Wilderness_Area', 'Soil_Type']
columns_to_convert = df.columns.difference(non_numeric_columns)
df = df.astype({column: float for column in columns_to_convert})
df

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type,Cover_Type
0,2971.0,84.0,2.0,124.0,2.0,2520.0,222.0,235.0,149.0,1600.0,Rawah,C7745,0.0
1,2930.0,247.0,2.0,175.0,23.0,5050.0,215.0,240.0,162.0,1731.0,Rawah,C7202,4.0
2,3119.0,331.0,16.0,212.0,26.0,3020.0,181.0,219.0,176.0,3308.0,Rawah,C7746,0.0
3,2780.0,81.0,15.0,175.0,16.0,1621.0,239.0,212.0,99.0,1994.0,Rawah,C7745,1.0
4,2925.0,125.0,13.0,85.0,16.0,3546.0,242.0,230.0,114.0,2270.0,Rawah,C4744,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
52285,2900.0,147.0,18.0,60.0,5.0,3200.0,241.0,235.0,116.0,1532.0,Rawah,C7745,0.0
52286,3116.0,11.0,12.0,210.0,23.0,2962.0,207.0,214.0,144.0,3360.0,Rawah,C7746,0.0
52287,2851.0,159.0,4.0,30.0,0.0,2845.0,224.0,240.0,151.0,2591.0,Rawah,C7202,1.0
52288,2930.0,7.0,18.0,170.0,26.0,4310.0,195.0,202.0,141.0,395.0,Rawah,C7745,0.0


In [6]:
# Percentage of missing values per column (count of NaNs * 100 / row count).
df.isna().sum().mul(100).div(len(df))

Elevation                             0.0
Aspect                                0.0
Slope                                 0.0
Horizontal_Distance_To_Hydrology      0.0
Vertical_Distance_To_Hydrology        0.0
Horizontal_Distance_To_Roadways       0.0
Hillshade_9am                         0.0
Hillshade_Noon                        0.0
Hillshade_3pm                         0.0
Horizontal_Distance_To_Fire_Points    0.0
Wilderness_Area                       0.0
Soil_Type                             0.0
Cover_Type                            0.0
dtype: float64

In [7]:
# Keep only the first occurrence of each fully identical row.
df_no_duplicates = df[~df.duplicated(keep="first")]

In [8]:
# Display the frame that remains after duplicate rows were dropped.
df_no_duplicates

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type,Cover_Type
0,2971.0,84.0,2.0,124.0,2.0,2520.0,222.0,235.0,149.0,1600.0,Rawah,C7745,0.0
1,2930.0,247.0,2.0,175.0,23.0,5050.0,215.0,240.0,162.0,1731.0,Rawah,C7202,4.0
2,3119.0,331.0,16.0,212.0,26.0,3020.0,181.0,219.0,176.0,3308.0,Rawah,C7746,0.0
3,2780.0,81.0,15.0,175.0,16.0,1621.0,239.0,212.0,99.0,1994.0,Rawah,C7745,1.0
4,2925.0,125.0,13.0,85.0,16.0,3546.0,242.0,230.0,114.0,2270.0,Rawah,C4744,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
52270,3089.0,305.0,13.0,362.0,49.0,4987.0,184.0,233.0,189.0,540.0,Rawah,C7745,1.0
52274,3063.0,7.0,6.0,210.0,18.0,3310.0,213.0,229.0,153.0,1425.0,Rawah,C7202,0.0
52278,3113.0,45.0,7.0,30.0,4.0,3901.0,222.0,225.0,138.0,1650.0,Rawah,C7202,1.0
52283,2922.0,106.0,15.0,306.0,40.0,3499.0,244.0,221.0,101.0,2969.0,Rawah,C7202,1.0


In [9]:
# Feature columns are every column except the target. Selecting by name rather
# than by position keeps this correct even if `Cover_Type` is not the last
# column returned by the `SELECT *` query.
features = [column for column in df_no_duplicates.columns if column != 'Cover_Type']

In [10]:
# Set the target values
y = df_no_duplicates['Cover_Type']

# Set the input values
X = df_no_duplicates[features]

# Hold out a test set using scikit-learn's default proportions. The fixed
# random_state makes the split identical on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [11]:
# One-hot encode the two categorical columns (handle_unknown='ignore' on the
# encoder); all remaining columns pass through the transformer unchanged.
categorical_columns = ["Wilderness_Area", "Soil_Type"]
one_hot = OneHotEncoder(handle_unknown='ignore')
column_trans = make_column_transformer(
    (one_hot, categorical_columns),
    remainder='passthrough',
)

column_trans

In [12]:
# Full modelling pipeline: encode categoricals -> scale -> random forest.
# Step names are referenced by the hyper-parameter grid
# ('RandomForestClassifier__...'), so they must not be renamed.
# NOTE(review): with_mean=False is presumably used because the column
# transformer may emit a sparse matrix, which cannot be centered - confirm.
pipe = Pipeline(
    steps=[
        ("column_trans", column_trans),
        ("scaler", StandardScaler(with_mean=False)),
        # Fixed seed so repeated runs build the same forest and report comparable scores.
        ("RandomForestClassifier", RandomForestClassifier(random_state=42)),
    ]
)

pipe

In [13]:
# Hyper-parameter grid; keys use the '<step name>__<parameter>' form so they
# address the forest inside the pipeline.
param_grid = {
    'RandomForestClassifier__max_depth': [1, 2, 3, 10],
    'RandomForestClassifier__n_estimators': [10, 11],
}

# Exhaustive search over the grid, running two jobs in parallel.
search = GridSearchCV(estimator=pipe, param_grid=param_grid, n_jobs=2)
search

In [14]:
# Artifact-store settings are exported through the process environment.
# NOTE(review): the access key and secret are hardcoded in the notebook;
# consider injecting them from outside before this is shared or committed.
os.environ.update({
    'MLFLOW_S3_ENDPOINT_URL': "http://minio:9000",
    'AWS_ACCESS_KEY_ID': 'admin',
    'AWS_SECRET_ACCESS_KEY': 'supersecret',
})

# Point the client at the tracking server and select the experiment.
mlflow.set_tracking_uri("http://10.43.101.151:8087")
mlflow.set_experiment("mlflow_tracking_examples")

# Enable scikit-learn autologging, with signatures and input examples, and
# register the resulting model under the given name.
mlflow.sklearn.autolog(
    log_model_signatures=True,
    log_input_examples=True,
    registered_model_name="modelo_base",
)

# Run the grid search inside a named MLflow run.
with mlflow.start_run(run_name="autolog_pipe_model_reg") as run:
    search.fit(X_train, y_train)