In [1]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score

In [2]:
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error

In [3]:
# Read the training table and discard the 'Unnamed: 0' column
# (presumably a leftover index column from an earlier to_csv -- confirm).
df = pd.read_csv('./Data/ModelData/train_data.csv').drop(columns='Unnamed: 0')

# Features are every column except 'price'; 'price' is the regression target.
X, y = df.drop(columns='price'), df['price']

# Column dtypes and non-null counts of the feature frame.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 8 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   body_type      1000000 non-null  object 
 1   has_accidents  1000000 non-null  bool   
 2   make_name      1000000 non-null  object 
 3   mileage        898649 non-null   float64
 4   model_name     1000000 non-null  object 
 5   owner_count    466610 non-null   float64
 6   transmission   1000000 non-null  object 
 7   year           1000000 non-null  int64  
dtypes: bool(1), float64(2), int64(1), object(4)
memory usage: 54.4+ MB


In [4]:
# Same dtype / non-null summary for the full frame, which still contains
# the 'price' target column that was left out of X.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 9 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   body_type      1000000 non-null  object 
 1   has_accidents  1000000 non-null  bool   
 2   make_name      1000000 non-null  object 
 3   mileage        898649 non-null   float64
 4   model_name     1000000 non-null  object 
 5   owner_count    466610 non-null   float64
 6   price          1000000 non-null  float64
 7   transmission   1000000 non-null  object 
 8   year           1000000 non-null  int64  
dtypes: bool(1), float64(3), int64(1), object(4)
memory usage: 62.0+ MB


In [5]:
# Hold out 20% of the rows for validation.
# The split is seeded (same seed value as the RandomForestRegressor in this
# notebook) so that a fresh "Restart & Run All" reproduces the same partition
# and therefore the same validation score.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=1
)

In [6]:
# Boolean mask indexed by column name: True where the column dtype is object.
s = X.dtypes == 'object'
# Names of the object-typed feature columns, in their original column order.
object_cols = [column for column, is_object in s.items() if is_object]

In [7]:
from sklearn.metrics import r2_score

# --- Feature selection -------------------------------------------------------
# Numeric features: columns stored as int64 or float64.
# NOTE(review): the X.info() output earlier in this notebook lists
# 'has_accidents' as bool, so it matches neither this filter nor the object
# filter below and never reaches the model -- confirm this is intended.
numerical_cols = [
    cname
    for cname, col_dtype in X_train.dtypes.items()
    if col_dtype in ['int64', 'float64']
]

# Categorical features: object columns with fewer than 10 distinct values in
# the training split. Object columns with 10 or more distinct values are
# left out of the model.
categorical_cols = [
    cname
    for cname in X_train.columns
    if X_train[cname].dtype == "object" and X_train[cname].nunique() < 10
]

my_cols = [*categorical_cols, *numerical_cols]

# Copies restricted to the selected columns.
# NOTE(review): these two frames are not used again in this cell; the pipeline
# below is fit on X_train / X_valid directly and picks its columns by name.
X_train_edit = X_train.loc[:, my_cols].copy()
X_valid_edit = X_valid.loc[:, my_cols].copy()

# --- Preprocessing -----------------------------------------------------------
# Constant-fill imputation for numeric columns. No fill_value is given, so
# scikit-learn's documented default applies (0 for numeric data).
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at predict time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its transformer. No `remainder` is passed, so
# columns in neither list fall under ColumnTransformer's default handling
# (documented as 'drop').
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# --- Model -------------------------------------------------------------------
model = RandomForestRegressor(n_estimators=150, random_state=1)

# Preprocessing and model bundled together, so the same transformations are
# applied at fit and at predict time.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])

pipeline.fit(X_train, y_train)

preds = pipeline.predict(X_valid)

# R^2 on the validation split (displayed as the cell output).
r2_score(y_valid, preds)

0.9186381574952138

In [18]:
import joblib

# Persist the fitted pipeline (preprocessing + model) with joblib at
# compression level 1. The call's return value is the cell output.
# NOTE(review): the extension is '.pk1' (digit one), possibly a typo of
# '.pkl' -- left unchanged because other code may load this exact file name.
filename = 'SUV_model.pk1'
joblib.dump(value=pipeline, filename=filename, compress=1)

['SUV_model.pk1']