In [15]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from urllib.parse import urlparse
from mlflow.tracking import MlflowClient
import mlflow.sklearn
from mlflow.store.artifact.runs_artifact_repo import RunsArtifactRepository

In [16]:
# Load the raw hepsiburada CSV straight from GitHub and preview its first rows.
csv_url = "https://raw.githubusercontent.com/KuserOguzHan/mlops_1/main/hepsiburada.csv.csv"
df_origin = pd.read_csv(csv_url)
df_origin.head()

Unnamed: 0,manufacturer,memory,ram,screen_size,power,front_camera,rc1,rc3,rc5,rc7,price
0,Samsung,64.0,4.0,6.5,5000.0,8.0,48.0,2.0,2.0,2.0,2.999
1,Samsung,128.0,4.0,6.5,5000.0,8.0,48.0,2.0,2.0,2.0,3.249
2,Oppo,64.0,4.0,6.52,4230.0,8.0,13.0,2.0,2.0,0.0,2.749
3,Oppo,128.0,8.0,6.4,4310.0,32.0,48.0,8.0,2.0,2.0,4.655
4,Oppo,128.0,4.0,6.43,5000.0,16.0,48.0,2.0,0.0,0.0,3.984


In [17]:
# Drop the string-valued manufacturer column; the original frame is left untouched.
df = df_origin.drop(columns=["manufacturer"])

In [18]:
# Preview the first rows of the frame with the manufacturer column removed.
df.head()

Unnamed: 0,memory,ram,screen_size,power,front_camera,rc1,rc3,rc5,rc7,price
0,64.0,4.0,6.5,5000.0,8.0,48.0,2.0,2.0,2.0,2.999
1,128.0,4.0,6.5,5000.0,8.0,48.0,2.0,2.0,2.0,3.249
2,64.0,4.0,6.52,4230.0,8.0,13.0,2.0,2.0,0.0,2.749
3,128.0,8.0,6.4,4310.0,32.0,48.0,8.0,2.0,2.0,4.655
4,128.0,4.0,6.43,5000.0,16.0,48.0,2.0,0.0,0.0,3.984


In [19]:
# Summarize column dtypes and non-null counts before building X and y.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1122 entries, 0 to 1121
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   memory        1122 non-null   float64
 1   ram           1122 non-null   float64
 2   screen_size   1122 non-null   float64
 3   power         1122 non-null   float64
 4   front_camera  1122 non-null   float64
 5   rc1           1122 non-null   float64
 6   rc3           1122 non-null   float64
 7   rc5           1122 non-null   float64
 8   rc7           1122 non-null   float64
 9   price         1122 non-null   float64
dtypes: float64(10)
memory usage: 87.8 KB


In [20]:
# Feature matrix: every column except the last one (price), as a NumPy array.
X = df.iloc[:, :-1].to_numpy()
print(X.shape)
print(X[:3])

(1122, 9)
[[6.40e+01 4.00e+00 6.50e+00 5.00e+03 8.00e+00 4.80e+01 2.00e+00 2.00e+00
  2.00e+00]
 [1.28e+02 4.00e+00 6.50e+00 5.00e+03 8.00e+00 4.80e+01 2.00e+00 2.00e+00
  2.00e+00]
 [6.40e+01 4.00e+00 6.52e+00 4.23e+03 8.00e+00 1.30e+01 2.00e+00 2.00e+00
  0.00e+00]]


In [21]:
# Target vector: the last column of the frame (price), kept as a pandas Series.
y = df.iloc[:, -1]
print(y.shape)
print(y.head(6))

(1122,)
0    2.999
1    3.249
2    2.749
3    4.655
4    3.984
5    5.349
Name: price, dtype: float64


In [22]:
# split test train
from sklearn.model_selection import train_test_split

In [23]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

MLFLOW

In [24]:
# Point the MLflow client at the local tracking server (port 5000) and at the
# local endpoint used for S3 artifact storage (port 9000).
os.environ.update(
    {
        "MLFLOW_TRACKING_URI": "http://localhost:5000/",
        "MLFLOW_S3_ENDPOINT_URL": "http://localhost:9000/",
    }
)

In [25]:
def eval_metrics(actual, pred):
    """Compute regression quality metrics for a set of predictions.

    Parameters
    ----------
    actual : array-like
        Ground-truth target values.
    pred : array-like
        Predicted target values, aligned with ``actual``.

    Returns
    -------
    tuple
        ``(rmse, mae, r2)``: root mean squared error, mean absolute error
        and the R^2 score, in that order.
    """
    # RMSE is the square root of scikit-learn's mean squared error.
    return (
        np.sqrt(mean_squared_error(actual, pred)),
        mean_absolute_error(actual, pred),
        r2_score(actual, pred),
    )

In [26]:
# Experiment under which the runs of this notebook are recorded.
# If set_experiment() fails with an error about the experiment name,
# change `experiment_name` to a different value and re-run this cell.
experiment_name = "FastAPI with MLflow_1"
mlflow.set_experiment(experiment_name=experiment_name)

# Name under which the trained model is registered in the MLflow Model Registry.
registered_model_name = "hepsiburadaRFModel"

In [27]:
# Number of trees in the random forest (passed as n_estimators and logged to MLflow).
number_of_trees = 200

In [28]:
# Seed for the random forest. Without it every re-run of this cell trains a
# different forest, so the logged metrics and the registered model version are
# not reproducible. 42 matches the seed already used for the train/test split.
rf_random_state = 42

# Train, evaluate and log one random-forest run; every log_* call below is
# recorded against the run opened by this `with` block.
with mlflow.start_run(run_name="with-reg-rf-sklearn") as run:
    estimator = RandomForestRegressor(
        n_estimators=number_of_trees,
        random_state=rf_random_state,
    )
    estimator.fit(X_train, y_train)

    # Score the model on the held-out split only.
    y_pred = estimator.predict(X_test)
    (rmse, mae, r2) = eval_metrics(y_test, y_pred)

    print(f"Random Forest model number of trees: {number_of_trees}")
    print(f"  RMSE: {rmse}")
    print(f"  MAE: {mae}")
    print(f"  R2: {r2}")

    # Log the hyper-parameters (including the seed) and metrics of this run.
    mlflow.log_param("n_estimators", number_of_trees)
    mlflow.log_param("random_state", rf_random_state)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)

    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

    # Model registry does not work with file store
    if tracking_url_type_store != "file":
        # Register the model
        # There are other ways to use the Model Registry, which depends on the use case,
        # please refer to the doc for more information:
        # https://mlflow.org/docs/latest/model-registry.html#api-workflow
        mlflow.sklearn.log_model(estimator, "model", registered_model_name=registered_model_name)
    else:
        # File store: log the model artifact without registering it.
        mlflow.sklearn.log_model(estimator, "model")

Random Forest model number of trees: 200
  RMSE: 1.4476128566874127
  MAE: 0.5707388547447533
  R2: 0.9287894569397817


Registered model 'hepsiburadaRFModel' already exists. Creating a new version of this model...
2023/03/09 14:41:33 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: hepsiburadaRFModel, version 6
Created version '6' of model 'hepsiburadaRFModel'.
