In [29]:
import os
import mlflow
from mlflow import log_metric, log_param, log_artifacts
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
import pickle
import numpy as np
import pandas as pd

In [30]:
# Point MLflow at the tracking server (the Heroku application by default).
# setdefault keeps an APP_URI that is already exported in the environment, so the
# notebook can be redirected to another tracking server without editing this cell.

os.environ.setdefault("APP_URI", "https://gamlflow-073392a95095.herokuapp.com/")
mlflow.set_tracking_uri(os.environ["APP_URI"])


In [31]:
# Read the pricing dataset from the CSV file located next to the notebook.
data = pd.read_csv(filepath_or_buffer="./get_around_pricing_project.csv")

In [32]:
# Remove the "Unnamed: 0" column (presumably the row index saved with the CSV).
# errors="ignore" keeps this cell re-runnable: once the column is gone, a plain
# drop would raise KeyError on a second execution.
data = data.drop(columns=["Unnamed: 0"], errors="ignore")
data.head()

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106
1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264
2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101
3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158
4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183


In [33]:
# Structural overview: (rows, columns) first, then per-column dtypes and non-null counts.
n_rows, n_cols = data.shape
print((n_rows, n_cols))
data.info()

(4843, 14)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4843 entries, 0 to 4842
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   model_key                  4843 non-null   object
 1   mileage                    4843 non-null   int64 
 2   engine_power               4843 non-null   int64 
 3   fuel                       4843 non-null   object
 4   paint_color                4843 non-null   object
 5   car_type                   4843 non-null   object
 6   private_parking_available  4843 non-null   bool  
 7   has_gps                    4843 non-null   bool  
 8   has_air_conditioning       4843 non-null   bool  
 9   automatic_car              4843 non-null   bool  
 10  has_getaround_connect      4843 non-null   bool  
 11  has_speed_regulator        4843 non-null   bool  
 12  winter_tires               4843 non-null   bool  
 13  rental_price_per_day       4843 non-null   int64 
dtypes: bool(7), int64(3), object(4)

In [34]:
# Preview the first five rows of the frame.
data.head(n=5)

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106
1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264
2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101
3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158
4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183


In [35]:
# Count how many cars do / do not have a GPS.
data.loc[:, "has_gps"].value_counts()

has_gps
True     3839
False    1004
Name: count, dtype: int64

In [36]:
# Brands in this list are collapsed into a single "Other" bucket
# (presumably because they have too few rows to be modelled on their own).
scarce_models = ["Suzuki", "Porsche","Ford", "KIA Motors", "Alfa Romeo", "Fiat", "Lexus", "Lamborghini", "Mazda", "Honda", "Mini", "Yamaha"]
# Vectorised replacement instead of a per-row lambda: wherever the brand is in
# the list it becomes "Other", every other value is left untouched.
data["model_key"] = data["model_key"].mask(data["model_key"].isin(scarce_models), "Other")

In [37]:
# Features are every column except the daily rental price, which is the target.
X = data.drop(columns=["rental_price_per_day"])
y = data.loc[:, "rental_price_per_day"]

In [38]:
# Split the feature columns by dtype name: any dtype whose name contains
# "float" or "int" is treated as numeric, everything else (strings, booleans)
# as categorical.
numeric_features = [
    column
    for column, dtype in X.dtypes.items()
    if any(tag in str(dtype) for tag in ("float", "int"))
]
categorical_features = [
    column
    for column, dtype in X.dtypes.items()
    if not any(tag in str(dtype) for tag in ("float", "int"))
]

print('Found numeric features ', numeric_features)
print('Found categorical features ', categorical_features)

Found numeric features  ['mileage', 'engine_power']
Found categorical features  ['model_key', 'fuel', 'paint_color', 'car_type', 'private_parking_available', 'has_gps', 'has_air_conditioning', 'automatic_car', 'has_getaround_connect', 'has_speed_regulator', 'winter_tires']


In [39]:
# Hold out 30% of the rows for testing. A fixed random_state makes the split,
# and therefore every score computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42)

# Numeric columns are standardised.
numeric_transformer = StandardScaler()

# Categorical/boolean columns are one-hot encoded, dropping the first level of
# each feature. handle_unknown='ignore' encodes a category that was absent from
# the training split as all zeros instead of raising during transform (the
# combination with drop= requires scikit-learn >= 1.0, which this notebook
# already needs for get_feature_names_out).
categorical_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')

preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
])

# Fit on the training rows only, then apply the same transformation to the test
# rows. From here on X_train / X_test hold the transformed matrices, not the
# raw DataFrames.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [40]:
# Enable scikit-learn autologging for the estimators fitted below.
mlflow.sklearn.autolog()

# Select the experiment runs are recorded under, then fetch its handle so the
# experiment id can be passed to start_run.
EXPERIMENT_NAME = "batch-experiment"
mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)
experiment = mlflow.get_experiment_by_name(name=EXPERIMENT_NAME)

In [41]:
# Baseline model: ordinary least squares on the preprocessed features,
# trained and scored inside a dedicated MLflow run.
regressor = LinearRegression()
with mlflow.start_run(run_name="LinearReg", experiment_id=experiment.experiment_id):
    regressor.fit(X_train, y_train)

    # Predictions on both splits, used for the explicit R2 metrics below.
    y_train_pred = regressor.predict(X_train)
    y_test_pred = regressor.predict(X_test)

    train_r2 = r2_score(y_train, y_train_pred)
    mlflow.log_metric("Train R2 score", train_r2)

    test_r2 = r2_score(y_test, y_test_pred)
    mlflow.log_metric("Test R2 score", test_r2)

# The with-block above already closes the run on exit; this trailing call is
# kept from the original and should be a no-op when no run is active.
mlflow.end_run()



In [42]:
import plotly.express as px

# The preprocessor's output column names carry a "<transformer>__" prefix
# ("num__", "cat__"). Strip it by splitting on the separator instead of slicing
# a fixed 5 characters, so the labels stay correct if a transformer is renamed
# or the prefix is absent.
names = [name.split("__", 1)[-1] for name in preprocessor.get_feature_names_out().tolist()]

# One row per encoded feature, sorted from most negative to most positive coefficient.
coeffs = pd.DataFrame(index=names, data=regressor.coef_.transpose(), columns=['coefficients'])
feature_importance = coeffs.sort_values(by='coefficients')

fig = px.bar(feature_importance)
fig.update_layout(
    title="Linear regression coefficients",
    xaxis_title="feature",
    yaxis_title="coefficient",
    showlegend=False,
    margin={'l': 120},
    autosize=False,
    height=600,
    width=1200,
)
fig.show()

In [43]:
# 5-fold cross-validation on the training split to gauge score stability,
# followed by the R2 of the already-fitted regressor on each split.
scores = cross_val_score(estimator=regressor, X=X_train, y=y_train, cv=5)
print(
    "Linear regression:",
    f"R2 score on training set: {regressor.score(X_train,y_train)}",
    f"R2 score on test set: {regressor.score(X_test,y_test)}",
    sep="\n",
)
print('Cross Validation standard deviation is : ', scores.std())

Linear regression:
R2 score on training set: 0.7126313771835564
R2 score on test set: 0.6895248080041396
Cross Validation standard deviation is :  0.043765173082865295


In [44]:
from sklearn.pipeline import Pipeline

# Bundle the preprocessor and the regressor (both fitted in earlier cells) into
# one pipeline object, so raw feature rows can be passed straight to predict.
pipe = Pipeline(steps=[("prep", preprocessor), ("reg", regressor)])

# Serialise the pipeline to disk.
filename = 'model.pkl'
with open(filename, mode='wb') as model_file:
    pickle.dump(pipe, model_file)