In [13]:
!pip install mlflow

Collecting mlflow
  Obtaining dependency information for mlflow from https://files.pythonhosted.org/packages/48/6c/fbbdcfae990a13006da012f2af459f4c76dc9d07160782a108a524825e88/mlflow-2.7.1-py3-none-any.whl.metadata
[0m  Downloading mlflow-2.7.1-py3-none-any.whl.metadata (12 kB)
Collecting cloudpickle<3 (from mlflow)
  Downloading cloudpickle-2.2.1-py3-none-any.whl (25 kB)
Collecting databricks-cli<1,>=0.8.7 (from mlflow)
  Downloading databricks-cli-0.17.7.tar.gz (83 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.5/83.5 kB[0m [31m838.0 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting entrypoints<1 (from mlflow)
  Downloading entrypoints-0.4-py3-none-any.whl (5.3 kB)
Collecting sqlparse<1,>=0.4.0 (from mlflow)
  Downloading sqlparse-0.4.4-py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.2/41.2 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollect

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor

In [17]:
import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature

from urllib.parse import urlparse

In [2]:
# Relative path to the input CSV that is read with pd.read_csv in the next cell.
path = "../data/MPS/agg_filtered_boroughs_weekends.csv"

In [3]:
# Read the CSV and convert TimePeriod from text to datetime in one chained step,
# so the .dt accessor can be used when date features are derived.
df = pd.read_csv(path).assign(
    TimePeriod=lambda frame: pd.to_datetime(frame["TimePeriod"])
)

In [4]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,BoroughName,CrimeType,CrimeSubtype,TimePeriod,Value,WorkingDays,Weekends
0,Barking and Dagenham,Arson and Criminal Damage,Arson,2010-04-01,6,20,10
1,Barking and Dagenham,Arson and Criminal Damage,Criminal Damage,2010-04-01,208,20,10
2,Barking and Dagenham,Burglary,Burglary Business and Community,2010-04-01,49,20,10
3,Barking and Dagenham,Burglary,Domestic Burglary,2010-04-01,118,20,10
4,Barking and Dagenham,Drug Offences,Drug Trafficking,2010-04-01,9,20,10


In [5]:
# Print column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 266921 entries, 0 to 266920
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   BoroughName   266921 non-null  object        
 1   CrimeType     266921 non-null  object        
 2   CrimeSubtype  266921 non-null  object        
 3   TimePeriod    266921 non-null  datetime64[ns]
 4   Value         266921 non-null  int64         
 5   WorkingDays   266921 non-null  int64         
 6   Weekends      266921 non-null  int64         
dtypes: datetime64[ns](1), int64(3), object(3)
memory usage: 14.3+ MB


# Transform dataframe

### One-hot encode BoroughName and CrimeSubtype with DictVectorizer and add date features

In [6]:
def makeX(df):
    """Build the model feature matrix from the crime-count frame.

    The result keeps ``WorkingDays`` and ``Weekends``, adds ``year`` and
    ``month`` taken from ``TimePeriod``, and appends one indicator column per
    distinct ``BoroughName`` / ``CrimeSubtype`` value, named
    ``"<column>=<value>"`` by ``DictVectorizer``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``WorkingDays``, ``Weekends``, ``BoroughName``,
        ``CrimeSubtype`` and a datetime-typed ``TimePeriod`` column.

    Returns
    -------
    pandas.DataFrame
        One row per input row, carrying the same index as ``df``.

    Notes
    -----
    A new vectorizer is fitted on every call, so the set of indicator columns
    depends on the categories present in ``df``.
    """
    df_new = df[['WorkingDays', 'Weekends']].copy()
    df_new["year"] = df.TimePeriod.dt.year
    df_new["month"] = df.TimePeriod.dt.month

    vec = DictVectorizer()

    # Convert the selected columns to a list of per-row dicts and vectorize them.
    df_dict = df[['BoroughName', 'CrimeSubtype']].to_dict('records')
    vectorized_data = vec.fit_transform(df_dict).toarray()

    # Reuse df's index: concat(axis=1) aligns on index labels, so a fresh
    # RangeIndex here would misalign rows (and introduce NaNs) whenever df
    # is a filtered or re-indexed frame.
    df_vectorized = pd.DataFrame(
        vectorized_data, columns=vec.feature_names_, index=df.index
    )

    return pd.concat([df_new, df_vectorized], axis=1)

### Prepare X and y

In [7]:
# Feature matrix from makeX and the crime count to predict.
X, y = makeX(df), df['Value']

# shuffle=False keeps the row order, so the test split is the final 20% of rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

In [9]:
# Preview the first rows of the assembled feature matrix.
X.head()

Unnamed: 0,WorkingDays,Weekends,year,month,BoroughName=Barking and Dagenham,BoroughName=Barnet,BoroughName=Bexley,BoroughName=Brent,BoroughName=Bromley,BoroughName=Camden,...,CrimeSubtype=Shoplifting,CrimeSubtype=Soliciting for Prostitution,CrimeSubtype=Theft from Person,CrimeSubtype=Theft from a Motor Vehicle,CrimeSubtype=Theft or Taking of a Motor Vehicle,CrimeSubtype=Threat or Possession With Intent to Commit Crimina,CrimeSubtype=Violence with Injury,CrimeSubtype=Violence without Injury,CrimeSubtype=Violent Disorder,CrimeSubtype=Wildlife Crime
0,20,10,2010,4,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,20,10,2010,4,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,20,10,2010,4,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,20,10,2010,4,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,20,10,2010,4,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Train the model

In [10]:
def eval_metrics(actual, pred):
    """Score predictions against the observed values.

    Parameters
    ----------
    actual : array-like
        Observed target values.
    pred : array-like
        Predicted values, in the same order as ``actual``.

    Returns
    -------
    tuple
        ``(rmse, mae, r2)``: root of the mean squared error, mean absolute
        error, and R^2 score, in that order.
    """
    return (
        np.sqrt(mean_squared_error(actual, pred)),
        mean_absolute_error(actual, pred),
        r2_score(actual, pred),
    )

In [11]:
# Values passed to ElasticNet(alpha=..., l1_ratio=...) for the baseline fit.
alpha, l1_ratio = 0.5, 0.5

In [12]:
%%time
lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
lr.fit(X_train, y_train)

predicted_qualities = lr.predict(X_test)

(rmse, mae, r2) = eval_metrics(y_test, predicted_qualities)

print(f"Elasticnet model (alpha={alpha:f}, l1_ratio={l1_ratio:f}):")
print(f"  RMSE: {rmse}")
print(f"  MAE: {mae}")
print(f"  R2: {r2}")

Elasticnet model (alpha=0.500000, l1_ratio=0.500000):
  RMSE: 97.61267123552098
  MAE: 52.71150117353197
  R2: 0.06709294659355103
CPU times: user 496 ms, sys: 1.24 s, total: 1.73 s
Wall time: 460 ms


### Second model: random forest

In [60]:
%%time

forest = RandomForestRegressor(10, random_state=42)
forest.fit(X_train, y_train)

predicted_qualities2 = forest.predict(X_test)

(rmse, mae, r2) = eval_metrics(y_test, predicted_qualities2)

print(f"Random forest:")
print(f"  RMSE: {rmse}")
print(f"  MAE: {mae}")
print(f"  R2: {r2}")

Random forest:
  RMSE: 46.733928774680045
  MAE: 11.982345228060316
  R2: 0.7861592022463404
CPU times: user 31.3 s, sys: 57.8 ms, total: 31.3 s
Wall time: 31.3 s


In [63]:
# Let's look at our predictions: actual counts next to both models' outputs,
# restricted to test rows whose actual value exceeds 20.
# The mask is deliberately not called `filter`, which would shadow the Python
# builtin for every cell executed afterwards.
above_threshold = y_test > 20
pd.DataFrame({
    "fact": y_test[above_threshold],
    "predict lr": predicted_qualities[above_threshold].round(2),
    "predict_rf": predicted_qualities2[above_threshold].round(2),
}).head(20)

Unnamed: 0,fact,predict lr,predict_rf
213541,21,39.57,14.1
213558,98,41.86,110.2
213559,22,40.58,25.1
213562,101,42.75,120.5
213563,42,41.1,42.6
213565,22,41.22,18.9
213566,231,58.95,231.9
213567,72,45.13,92.8
213568,69,45.0,53.2
213570,67,41.14,80.4


# Working with mlflow

In [19]:
# Hyperparameters for the ElasticNet run that is tracked with MLflow.
alpha, l1_ratio = 0.2, 0.8

with mlflow.start_run():
    # Fit on the training split and score on the held-out rows.
    lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
    lr.fit(X_train, y_train)
    predicted_qualities = lr.predict(X_test)
    rmse, mae, r2 = eval_metrics(y_test, predicted_qualities)

    print(
        f"Elasticnet model (alpha={alpha:f}, l1_ratio={l1_ratio:f}):",
        f"  RMSE: {rmse}",
        f"  MAE: {mae}",
        f"  R2: {r2}",
        sep="\n",
    )

    # Record the hyperparameters and evaluation metrics on the active run.
    for param_name, param_value in (("alpha", alpha), ("l1_ratio", l1_ratio)):
        mlflow.log_param(param_name, param_value)
    for metric_name, metric_value in (("rmse", rmse), ("r2", r2), ("mae", mae)):
        mlflow.log_metric(metric_name, metric_value)

    # Infer the model signature from the training inputs and the model's outputs on them.
    predictions = lr.predict(X_train)
    signature = infer_signature(X_train, predictions)

    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

    # The Model Registry does not work with a file store, so the model is only
    # registered by name for other tracking backends. Other ways of using the
    # registry are described at
    # https://mlflow.org/docs/latest/model-registry.html#api-workflow
    log_model_kwargs = {"signature": signature}
    if tracking_url_type_store != "file":
        log_model_kwargs["registered_model_name"] = "ElasticnetWineModel"
    mlflow.sklearn.log_model(lr, "model", **log_model_kwargs)

Elasticnet model (alpha=0.200000, l1_ratio=0.800000):
  RMSE: 85.25666860300342
  MAE: 42.12914178115747
  R2: 0.28832338017957837


  inputs = _infer_schema(model_input) if model_input is not None else None
