In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [13]:
# Load the raw pricing dataset from its CSV export.
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
csv_path = "/Users/antoinebertin/Documents/jedha/full_stack/projects_full_stack/deploy_ml/get_around_pricing_project.csv"
df = pd.read_csv(csv_path)

In [14]:
# Remove the "Unnamed: 0" column read from the CSV.
# Rebinding (instead of inplace=True) together with errors="ignore" makes this
# cell safe to re-run: a second execution no longer raises KeyError.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [15]:
# Duplicate every row whose 'model_key' occurs exactly once, so that each key
# ends up with at least two rows (presumably so the later split stratified on
# 'model_key' has enough members per class -- TODO confirm intent).
# NOTE(review): the added rows are exact copies, so a copy can land in both the
# train and the test set.
counts = df['model_key'].value_counts()
single_keys = counts[counts == 1].index

# Rows belonging to those singleton keys
single_rows = df[df['model_key'].isin(single_keys)]

# Append the copies. ignore_index=True rebuilds a unique 0..n-1 index instead of
# repeating the original row labels, so label-based lookups stay unambiguous.
df = pd.concat([df, single_rows], ignore_index=True)

In [16]:
# Number of rows per 'model_key', most frequent first.
rows_per_model_key = df.groupby('model_key')['model_key'].count()
rows_per_model_key.sort_values(ascending=False)

model_key
Citroën        969
Renault        916
BMW            827
Peugeot        642
Audi           526
Nissan         275
Mitsubishi     231
Mercedes        97
Volkswagen      65
Toyota          53
SEAT            46
Subaru          44
Opel            33
Ferrari         33
PGO             33
Maserati        18
Suzuki           8
Porsche          6
Ford             5
KIA Motors       3
Alfa Romeo       3
Mini             2
Mazda            2
Lexus            2
Lamborghini      2
Honda            2
Fiat             2
Yamaha           2
Name: model_key, dtype: int64

In [17]:
# Cast every boolean column to integers (True -> 1, False -> 0).
bool_columns = df.select_dtypes(include=['bool']).columns
df = df.astype(dict.fromkeys(bool_columns, int))

In [18]:
# Split the frame into the feature matrix X and the target y
# (the daily rental price column).
X = df.drop(columns='rental_price_per_day')
y = df['rental_price_per_day']

In [19]:
# Number of rows per 'car_type' (ordered by category label).
df.groupby('car_type')['car_type'].count()

car_type
convertible      47
coupe           104
estate         1606
hatchback       699
sedan          1171
subcompact      117
suv            1058
van              45
Name: car_type, dtype: int64

In [20]:
# Display the feature column names (every column of df except the target).
X.columns

Index(['model_key', 'mileage', 'engine_power', 'fuel', 'paint_color',
       'car_type', 'private_parking_available', 'has_gps',
       'has_air_conditioning', 'automatic_car', 'has_getaround_connect',
       'has_speed_regulator', 'winter_tires'],
      dtype='object')

In [39]:
# Hold out 20% of the rows for testing. The split is seeded for repeatability
# and stratified on 'model_key' so both parts keep the same key proportions.
split_options = dict(
    test_size=0.2,
    random_state=0,
    shuffle=True,
    stratify=X['model_key'],
)
X_train, X_test, Y_train, Y_test = train_test_split(X, y, **split_options)

In [40]:
# Per-column-type preprocessing steps:
#   numeric columns     -> standardised with StandardScaler
#   categorical columns -> one-hot encoded, dropping the first level of each feature
numeric_transformer, categorical_transformer = StandardScaler(), OneHotEncoder(drop='first')

In [41]:
# 'number' matches every numeric dtype, not only int64/float64. The 0/1 flag
# columns produced earlier with astype(int) are int32 on some platforms and
# would otherwise appear in neither list (a ColumnTransformer drops unlisted
# columns by default).
numeric_features = X.select_dtypes(include='number').columns
# Object-dtype (string-valued) columns are treated as categorical.
categorical_features = X.select_dtypes(include=['object']).columns

In [42]:
# Route each group of columns to its transformer:
# (step name, transformer, columns it applies to).
column_steps = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [37]:
# Fit the preprocessor on the training rows only, then apply the fitted
# transformation to both splits.
# NOTE(review): X_train / X_test are overwritten with their transformed
# versions, so this cell cannot be re-run without re-running the split first.
preprocessor.fit(X_train)
X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)

In [114]:
# The target (price per day) is continuous, so a regressor is required here.
# random_state is fixed (same seed as the train/test split) so the fitted
# forest, and therefore the reported score, is reproducible across runs.
regressor = RandomForestRegressor(random_state=0)
regressor.fit(X_train, Y_train)

In [115]:
# Score of the fitted regressor on the held-out test split.
test_score = regressor.score(X=X_test, y=Y_test)
test_score

0.788461710211017

In [10]:
from joblib import dump, load

In [118]:
# Persist the fitted model ('regressor') and the preprocessor with joblib (dump calls currently commented out; uncomment to re-export)
#dump(regressor, 'model.joblib')
#dump(preprocessor, 'preprocessor.joblib')

['preprocessor.joblib']

# Test the saved preprocessor and model on a sample user input

In [50]:
# Reload the serialized preprocessor and model from disk.
# NOTE(review): this rebinds `preprocessor`, replacing the object built earlier
# in the notebook. joblib.load unpickles the file, so only load trusted artifacts.
preprocessor_path = '/Users/antoinebertin/Documents/jedha/full_stack/projects_full_stack/deploy_ml/preprocessor.joblib'
model_path = '/Users/antoinebertin/Documents/jedha/full_stack/projects_full_stack/deploy_ml/model.joblib'
preprocessor = load(preprocessor_path)
model = load(model_path)

In [86]:
# Index(['model_key', 'mileage', 'engine_power', 'fuel', 'paint_color',
#        'car_type', 'private_parking_available', 'has_gps',
#        'has_air_conditioning', 'automatic_car', 'has_getaround_connect',
#        'has_speed_regulator', 'winter_tires'],
#       dtype='object')

# array(['Renault', 186382, 120, 'diesel', 'silver', 'estate', 1, 1, 0, 0,
#        0, 0, 1], dtype=object)


# Column names in the order used for the feature matrix X.
features = [
    'model_key', 'mileage', 'engine_power', 'fuel', 'paint_color', 'car_type',
    'private_parking_available', 'has_gps', 'has_air_conditioning',
    'automatic_car', 'has_getaround_connect', 'has_speed_regulator',
    'winter_tires',
]
# One hand-written sample, with values listed in the same order as `features`.
user_test = [
    'Renault', 186382, 120, 'diesel', 'silver', 'estate',
    1, 1, 0, 0, 0, 0, 1,
]
# Sanity check: both lists must have the same length.
(len(user_test), len(features))

(13, 13)

In [90]:
# Wrap the single sample in a one-row DataFrame with named columns, which is
# the input form the preprocessor was fitted on.
sample_rows = [user_test]
data = pd.DataFrame(sample_rows, columns=features)

In [95]:
# Apply the loaded preprocessing to the sample, then predict with the loaded model.
transformed_data = preprocessor.transform(X=data)
prediction = model.predict(X=transformed_data)

In [96]:
# Display the model's prediction for the one-row sample (an array with one value).
prediction

array([106.57])