In [105]:
import pandas as pd
import numpy as np
import warnings


import sklearn
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.preprocessing import (
    OneHotEncoder,
    OrdinalEncoder,
    MinMaxScaler,
    StandardScaler,
    PowerTransformer,
    FunctionTransformer
)

from feature_engine.datetime import DatetimeFeatures
from feature_engine.outliers import Winsorizer
from feature_engine.selection import SelectBySingleFeaturePerformance
from feature_engine.encoding import (
    MeanEncoder,
    RareLabelEncoder,
    CountFrequencyEncoder
)

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import (
    train_test_split,
    KFold,
    cross_val_score,
    GridSearchCV,
    RandomizedSearchCV
)
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor
import pickle
import joblib


### Display Options

In [20]:
# Never truncate columns when a DataFrame is rendered.
pd.options.display.max_columns = None

In [21]:
# Ask scikit-learn transformers to return pandas DataFrames from `transform`.
sklearn.set_config(transform_output="pandas")

In [22]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

### Read the Data

In [23]:
# Load the already-preprocessed training table (feature columns plus the `price` target).
train = pd.read_csv("../data/train_preprocessed_data.csv")

In [24]:
train.dtypes

airline__airline_Air India                           float64
airline__airline_Indigo                              float64
airline__airline_Jet Airways                         float64
airline__airline_Multiple Carriers                   float64
airline__airline_other                               float64
doj__date_of_journey_month                           float64
doj__date_of_journey_week                            float64
doj__date_of_journey_day_of_week                     float64
doj__date_of_journey_day_of_year                     float64
location__source                                     float64
location__destination                                float64
location__source_is_north                              int64
location__destination_is_north                         int64
time__dep_time_hour                                  float64
time__dep_time_minute                                float64
time__arrival_time_hour                              float64
time__arrival_time_minut

In [25]:
# Separate the `price` target from the feature matrix.
y = train["price"]
X = train.drop(columns="price")

In [26]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [27]:
# Print the shape of each split, one per line, in train/test order.
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

(5356, 30)
(1339, 30)
(5356,)
(1339,)


### XGB

In [106]:
# Baseline XGBoost regressor with 500 boosting rounds, fitted on the training split.
xgb = XGBRegressor(n_estimators=500)
xgb.fit(X_train, y_train)

# Score the predictions on the hold-out split.
y_pred = xgb.predict(X_test)

mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print(r2_score(y_true=y_test, y_pred=y_pred))

Mean Squared Error: 2759650.4683586718
Mean Absolute Error: 750.7976141681058
0.8596326872617226


### Random Forest

In [107]:
# Baseline random forest with 500 trees.
# The seed is fixed so that re-running the cell reproduces the same forest,
# the same metrics and the same model file saved further down.
rf = RandomForestRegressor(n_estimators=500, random_state=42)

rf.fit(X_train, y_train)

# Score the predictions on the hold-out split.
y_pred = rf.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print(r2_score(y_test, y_pred))

Mean Squared Error: 3167834.2561542597
Mean Absolute Error: 729.3150694228102
0.8388707603245495


### extra trees regressor

In [135]:
# Baseline extra-trees ensemble with 500 trees.
# The seed is fixed so that re-running the cell reproduces the same ensemble,
# the same metrics and the same model file saved further down.
extra_trees = ExtraTreesRegressor(n_estimators=500, random_state=42)

# Fit on the training split.
extra_trees.fit(X_train, y_train)

# Predict on the hold-out split.
y_pred = extra_trees.predict(X_test)

# Evaluate the predictions.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("r2", r2_score(y_test, y_pred))


Mean Squared Error: 2509652.7004508553
Mean Absolute Error: 664.7408670649738
r2 0.8723486146134422


### saving the random forest and extra trees models

In [140]:
# Persist the fitted random forest; the context manager closes the handle afterwards.
with open("../randomforest-model.joblib", mode="wb") as model_file:
    joblib.dump(rf, model_file)

In [141]:
# Persist the fitted extra-trees model; the context manager closes the handle afterwards.
with open("../extratrees-model.joblib", mode="wb") as model_file:
    joblib.dump(extra_trees, model_file)

### preprocessing code

In [40]:
# airline: impute the mode, group rare airlines under "other", then one-hot encode.
# The grouping step is named "groups" (was the typo "groupes") so that it matches
# the equally-purposed step in the other pipelines of this cell.
air_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("groups", RareLabelEncoder(tol=0.1, replace_with="other", n_categories=2)),
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])

# date_of_journey: extract calendar features and scale each one with MinMaxScaler.
features_to_extract = ["month", "week", "day_of_week", "day_of_year"]

doj_transformer = Pipeline(steps=[
    ("dt", DatetimeFeatures(features_to_extract=features_to_extract, yearfirst=True)),
    ("scaler", MinMaxScaler()),
])

# source & destination: group rare cities, mean-encode them, then power-transform.
location_pipe1 = Pipeline(steps=[
    ("groups", RareLabelEncoder(tol=0.1, replace_with="other", n_categories=2)),
    ("encoder", MeanEncoder()),
    ("scaler", PowerTransformer()),
])

def is_north(X):
    """Replace every column of ``X`` with a 0/1 "is north" indicator.

    For each input column ``c`` the result holds ``c_is_north``, which is 1
    when the value is one of the cities listed in ``north_cities`` and 0
    otherwise. The original columns are dropped.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose columns contain city names.

    Returns
    -------
    pandas.DataFrame
        One ``<column>_is_north`` integer column per input column.
    """
    # NOTE(review): this list is the notebook's own definition of "north".
    north_cities = ["Delhi","Kolkata","Mumbai","New Delhi"]
    original_cols = X.columns.to_list()

    flags = {}
    for name in original_cols:
        flags[f"{name}_is_north"] = X[name].isin(north_cities).astype(int)

    return X.assign(**flags).drop(columns=original_cols)

# Combine the encoded city columns with the "is north" indicator columns.
location_transformer = FeatureUnion(
    transformer_list=[
        ("pipe1", location_pipe1),
        ("pipe2", FunctionTransformer(func=is_north)),
    ]
)

# dep_time & arrival_time: extract hour and minute, then scale with MinMaxScaler.
time_pipe1 = Pipeline(
    steps=[
        ("dt", DatetimeFeatures(features_to_extract=["hour", "minute"])),
        ("scaler", MinMaxScaler()),
    ]
)

def part_of_day(X,morning=4, noon=12, evening=16, night=20):
    """Bucket every time column of ``X`` into a part-of-day label.

    Each column is parsed with ``pd.to_datetime`` and its hour is mapped to:

    * ``"morning"``   for ``morning <= hour < noon``
    * ``"afternoon"`` for ``noon <= hour < evening``
    * ``"evening"``   for ``evening <= hour < night``
    * ``"night"``     for every other hour

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose columns hold values parseable by ``pd.to_datetime``.
    morning, noon, evening, night : int
        Hour boundaries (left-inclusive, right-exclusive) of the buckets.

    Returns
    -------
    pandas.DataFrame
        One ``<column>_part_of_day`` column per input column; the original
        columns are dropped.
    """
    source_cols = X.columns.to_list()
    bucket_names = ["morning","afternoon","evening"]

    labelled = {}
    for name in source_cols:
        hours = pd.to_datetime(X.loc[:, name]).dt.hour
        conditions = [
            hours.between(morning, noon, inclusive="left"),
            hours.between(noon, evening, inclusive="left"),
            hours.between(evening, night, inclusive="left"),
        ]
        # Hours that match none of the three windows fall back to "night".
        labelled[f"{name}_part_of_day"] = np.select(
            conditions, bucket_names, default="night"
        )

    return X.assign(**labelled).drop(columns=source_cols)

# Part-of-day label -> count/frequency encoding -> MinMaxScaler.
time_pipe2 = Pipeline(
    steps=[
        ("part_of_day", FunctionTransformer(func=part_of_day)),
        ("encoder", CountFrequencyEncoder()),
        ("scaler", MinMaxScaler()),
    ]
)

# Hour/minute features side by side with the part-of-day feature.
time_transformer = FeatureUnion(
    transformer_list=[
        ("pipe1", time_pipe1),
        ("pipe2", time_pipe2),
    ]
)

# duration
class RBFPercentileSimilarity(BaseEstimator,TransformerMixin):
    """RBF-kernel similarity of each variable to its own percentiles.

    ``fit`` records, for every selected column, the values found at the
    requested ``percentiles``. ``transform`` then emits one column per
    (variable, percentile) pair holding ``rbf_kernel(value, percentile_value)``.

    Parameters
    ----------
    variables : list of str or None
        Columns to process. When falsy, every numeric column seen in ``fit``
        is used.
    percentiles : list of float
        Quantiles (fractions in [0, 1]) used as reference points.
    gamma : float
        ``gamma`` passed to ``rbf_kernel``.

    Attributes
    ----------
    variables_ : list of str
        Columns actually used, resolved during ``fit``.
    reference_values_ : dict
        Maps each column to a ``(n_percentiles, 1)`` array of reference values.
    """

    def __init__(self,variables=None, percentiles=[0.25,0.5,0.75], gamma=0.1):
        # Parameters are stored untouched; the default list is only ever read.
        self.variables = variables
        self.percentiles = percentiles
        self.gamma = gamma

    def fit(self,X,y=None):
        """Learn the percentile reference values of each variable in ``X``."""
        # Resolve the columns into a fitted attribute instead of overwriting
        # the constructor parameter, so a cloned/re-fitted estimator detects
        # its columns afresh.
        if self.variables:
            self.variables_ = self.variables
        else:
            self.variables_ = X.select_dtypes(include="number").columns.to_list()

        self.reference_values_ = {
            col: (
                X
                .loc[:,col]
                .quantile(self.percentiles)
                .values
                .reshape(-1,1)
            )
            for col in self.variables_
        }
        return self

    def transform(self,X):
        """Return the RBF similarities as a DataFrame aligned with ``X``."""
        objects = []
        for col in self.variables_:
            # The "rbg" spelling is kept on purpose: the preprocessed data in
            # this notebook already uses column names such as "duration_rbg_25".
            columns = [f"{col}_rbg_{int(percentile*100)}" for percentile in self.percentiles]
            obj = pd.DataFrame(
                data = rbf_kernel(X.loc[:,[col]],Y=self.reference_values_[col],gamma=self.gamma),
                columns = columns,
                # Keep the input index: without it the frame gets a fresh
                # RangeIndex and misaligns when concatenated with other
                # transformer outputs on a non-default index.
                index = X.index
            )
            objects.append(obj)

        if not objects:
            # No variable selected: return an empty frame instead of letting
            # pd.concat raise on an empty list.
            return pd.DataFrame(index=X.index)
        return pd.concat(objects,axis=1)

def duration_category(X, short=180, medium=400):
    """Replace the ``duration`` column by a three-level ``duration_cat`` label.

    * ``"short"``  when ``duration < short``
    * ``"medium"`` when ``short <= duration < medium``
    * ``"long"``   otherwise

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing a numeric ``duration`` column.
    short, medium : int
        Upper bounds (exclusive) of the short and medium buckets.

    Returns
    -------
    pandas.DataFrame
        ``X`` with ``duration_cat`` added and ``duration`` removed.
    """
    minutes = X["duration"]
    is_short = minutes < short
    is_medium = minutes.between(short, medium, inclusive="left")

    # Anything that is neither short nor medium is labelled "long".
    labels = np.select([is_short, is_medium], ["short","medium"], default="long")

    return X.assign(duration_cat=labels).drop(columns="duration")

# Percentile RBF similarities, power-transformed.
duration_pipe1 = Pipeline(
    steps=[
        ("rbf", RBFPercentileSimilarity()),
        ("scaler", PowerTransformer()),
    ]
)

# Short / medium / long bucket, encoded as an ordered integer.
duration_pipe2 = Pipeline(
    steps=[
        ("category", FunctionTransformer(func=duration_category)),
        ("encoder", OrdinalEncoder(categories=[["short", "medium", "long"]])),
    ]
)

# All three views of duration side by side: RBF features, bucket, standardized value.
duration_union = FeatureUnion(
    transformer_list=[
        ("pipe1", duration_pipe1),
        ("pipe2", duration_pipe2),
        ("pipe3", StandardScaler()),
    ]
)

# Cap outliers (IQR rule, fold 1.5) and impute the median before building the features.
duration_transformer = Pipeline(
    steps=[
        ("outliers", Winsorizer(capping_method="iqr", fold=1.5)),
        ("imputer", SimpleImputer(strategy="median")),
        ("union", duration_union),
    ]
)

# total_stops
def is_direct(X):
    """Add an ``is_direct_flight`` flag (1 when ``total_stops`` equals 0, else 0).

    The ``total_stops`` column itself is kept unchanged.
    """
    direct_flag = (X["total_stops"] == 0).astype(int)
    return X.assign(is_direct_flight=direct_flag)

# total_stops: impute the mode, then append the direct-flight flag.
# The second step used to be named "" (empty string); it now has a real name so
# it can be addressed through the usual `<step>__<param>` syntax and shows up
# meaningfully in the pipeline repr. Output column names are unaffected.
total_stops_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("direct", FunctionTransformer(func=is_direct))
])

# additional info
# Group rare additional_info values under "Other", then one-hot encode.
info_pipe1 = Pipeline(
    steps=[
        ("groups", RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

def have_info(X):
    """Overwrite ``additional_info`` with 1 where it differs from "No Info", else 0.

    Any value other than the exact string "No Info" counts as 1.
    """
    info_present = (X["additional_info"] != "No Info").astype(int)
    return X.assign(additional_info=info_present)

# One-hot columns side by side with the 0/1 "has info" flag.
info_union = FeatureUnion(
    transformer_list=[
        ("pipe1", info_pipe1),
        ("pipe2", FunctionTransformer(func=have_info)),
    ]
)

# Missing values become the literal "unknown" before the union is applied.
info_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="unknown")),
        ("union", info_union),
    ]
)


# column transformer: route each raw column (group) to its dedicated transformer;
# columns not listed here are passed through unchanged.
column_transformer = ColumnTransformer(
    transformers=[
        ("airline", air_transformer, ["airline"]),
        ("doj", doj_transformer, ["date_of_journey"]),
        ("location", location_transformer, ["source", "destination"]),
        ("time", time_transformer, ["dep_time", "arrival_time"]),
        ("duration", duration_transformer, ["duration"]),
        ("stops", total_stops_transformer, ["total_stops"]),
        ("info", info_transformer, ["additional_info"]),
    ],
    remainder="passthrough",
)

# Single-step pipeline wrapping the column transformer.
preprocessor = Pipeline(
    steps=[
        ("ct", column_transformer),
    ]
)

### testing on val data

In [41]:
# Load the raw (not yet preprocessed) validation split and display it.
val = pd.read_csv("../data/val.csv")
val

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Indigo,2019-06-24,Delhi,Cochin,20:25:00,01:30:00,305,1.0,No Info,5054
1,Multiple Carriers,2019-06-12,Delhi,Cochin,09:45:00,22:30:00,765,1.0,No Info,9646
2,Jet Airways,2019-03-12,Banglore,New Delhi,22:55:00,15:15:00,980,1.0,In-flight meal not included,11087
3,Multiple Carriers,2019-06-06,Delhi,Cochin,13:00:00,21:00:00,480,1.0,No Info,13587
4,Jet Airways,2019-05-18,Delhi,Cochin,23:05:00,04:25:00,1760,2.0,No Info,16704
...,...,...,...,...,...,...,...,...,...,...
1669,Spicejet,2019-05-01,Chennai,Kolkata,09:45:00,12:00:00,135,0.0,No Info,3597
1670,Indigo,2019-05-01,Kolkata,Banglore,08:10:00,13:00:00,290,1.0,No Info,5069
1671,Jet Airways,2019-05-27,Delhi,Cochin,05:30:00,12:35:00,425,2.0,In-flight meal not included,15544
1672,Jet Airways,2019-06-12,Mumbai,Hyderabad,19:35:00,21:05:00,90,0.0,In-flight meal not included,3210


In [42]:
# Fit every preprocessing step on the validation rows (features, with price as the target).
# NOTE(review): the models above were trained on "train_preprocessed_data.csv". Fitting the
# preprocessor here means the encoders/scalers learn from the very rows (and prices) being
# evaluated. Presumably it should be fitted on the raw training data, with only `transform`
# applied to this split. TODO confirm.
preprocessor.fit(
    val.drop(columns="price"),
    val.price.copy()
)

In [43]:
# Apply the fitted preprocessor to the validation features (price excluded).
val_preprocessed = preprocessor.transform(val.drop(columns="price"))

In [44]:
val_preprocessed

Unnamed: 0,airline__airline_Air India,airline__airline_Indigo,airline__airline_Jet Airways,airline__airline_Multiple Carriers,airline__airline_other,doj__date_of_journey_month,doj__date_of_journey_week,doj__date_of_journey_day_of_week,doj__date_of_journey_day_of_year,location__source,location__destination,location__source_is_north,location__destination_is_north,time__dep_time_hour,time__dep_time_minute,time__arrival_time_hour,time__arrival_time_minute,time__dep_time_part_of_day,time__arrival_time_part_of_day,duration__duration_rbg_25,duration__duration_rbg_50,duration__duration_rbg_75,duration__duration_cat,duration__duration,stops__total_stops,stops__is_direct_flight,info__additional_info_In-flight meal not included,info__additional_info_No Info,info__additional_info_Other,info__additional_info
0,0.0,1.0,0.0,0.0,0.0,1.000000,1.000000,0.000000,0.974576,0.962638,0.962648,1,0,0.869565,0.454545,0.043478,0.545455,0.080214,0.960784,-0.348814,-0.096097,-0.07865,1.0,-0.654286,1.0,0,0.0,1.0,0.0,0
1,0.0,0.0,0.0,1.0,0.0,1.000000,0.882353,0.333333,0.872881,0.962638,0.962648,1,0,0.391304,0.818182,0.956522,0.545455,1.000000,0.960784,-0.348814,-0.096097,-0.07865,2.0,0.302200,1.0,0,0.0,1.0,0.0,0
2,0.0,0.0,1.0,0.0,0.0,0.000000,0.117647,0.166667,0.093220,-1.122166,-1.102594,0,1,0.956522,1.000000,0.652174,0.272727,0.080214,0.000000,-0.348814,-0.096097,-0.07865,2.0,0.749252,1.0,0,1.0,0.0,0.0,1
3,0.0,0.0,0.0,1.0,0.0,1.000000,0.823529,0.500000,0.822034,0.962638,0.962648,1,0,0.565217,0.000000,0.913043,0.000000,0.000000,0.960784,-0.348814,-0.096097,-0.07865,2.0,-0.290406,1.0,0,0.0,1.0,0.0,0
4,0.0,0.0,1.0,0.0,0.0,0.666667,0.647059,0.833333,0.661017,0.962638,0.962648,1,0,1.000000,0.090909,0.173913,0.454545,0.080214,1.000000,-0.348814,-0.096097,-0.07865,2.0,2.371119,2.0,0,0.0,1.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1669,0.0,0.0,0.0,0.0,1.0,0.666667,0.529412,0.333333,0.516949,-1.820892,-1.102594,0,1,0.391304,0.818182,0.521739,0.000000,1.000000,0.000000,-0.348814,-0.096097,-0.07865,0.0,-1.007770,0.0,1,0.0,1.0,0.0,0
1670,0.0,1.0,0.0,0.0,0.0,0.666667,0.529412,0.333333,0.516949,-0.000378,-0.003875,1,0,0.347826,0.181818,0.565217,0.000000,1.000000,0.000000,-0.348814,-0.096097,-0.07865,1.0,-0.685476,1.0,0,0.0,1.0,0.0,0
1671,0.0,0.0,1.0,0.0,0.0,0.666667,0.764706,0.000000,0.737288,0.962638,0.962648,1,0,0.217391,0.545455,0.521739,0.636364,1.000000,0.000000,-0.348814,-0.096097,-0.07865,2.0,-0.404768,2.0,0,1.0,0.0,0.0,1
1672,0.0,0.0,1.0,0.0,0.0,1.000000,0.882353,0.333333,0.872881,-1.820892,-1.102594,1,0,0.826087,0.636364,0.913043,0.090909,0.171123,0.960784,-0.348814,-0.096097,-0.07865,0.0,-1.101339,0.0,1,1.0,0.0,0.0,1


xgb regressor

In [109]:
# Score the baseline XGBoost model on the preprocessed validation features.
y_pred = xgb.predict(val_preprocessed)

mae = mean_absolute_error(y_true=val["price"], y_pred=y_pred)
mse = mean_squared_error(y_true=val["price"], y_pred=y_pred)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print(r2_score(y_true=val["price"], y_pred=y_pred))

Mean Squared Error: 4571920.869901686
Mean Absolute Error: 1287.9204945399058
0.7946425849616279


random forest regressor

In [110]:
# Score the baseline random forest on the preprocessed validation features.
y_pred = rf.predict(val_preprocessed)

mae = mean_absolute_error(y_true=val["price"], y_pred=y_pred)
mse = mean_squared_error(y_true=val["price"], y_pred=y_pred)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print(r2_score(y_true=val["price"], y_pred=y_pred))

Mean Squared Error: 3295450.6278607757
Mean Absolute Error: 839.1812890429727
0.8519779231571385


extra trees

In [136]:
# Score the baseline extra-trees model on the preprocessed validation features.
y_pred = extra_trees.predict(val_preprocessed)

mae = mean_absolute_error(y_true=val["price"], y_pred=y_pred)
mse = mean_squared_error(y_true=val["price"], y_pred=y_pred)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print(r2_score(y_true=val["price"], y_pred=y_pred))

Mean Squared Error: 3626548.355768488
Mean Absolute Error: 846.9140101553166
0.8371059742623476


### hyperparameter tuning on val data

#### Random Forest hyperparameter tuning

In [65]:
# Search space for the random forest (keys target the "regressor" pipeline step).
# `max_features='auto'` was deprecated and later removed from scikit-learn; for
# regressors it meant "use all features", which is spelled 1.0 and works on both
# old and new versions. Without this, every 'auto' candidate fails to fit on
# recent releases.
param_grid = {
    'regressor__n_estimators': [100, 200, 300, 400, 500],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__max_samples':[0.1, 0.25, 0.5, 1.0],
    'regressor__max_features': [1.0, 'sqrt']
}

In [66]:
# Single-step pipeline so the grid keys can use the "regressor__" prefix.
pipeline = Pipeline(steps=[("regressor", RandomForestRegressor())])

In [67]:
# 10-fold CV with shuffling; the fixed seed keeps the fold assignment reproducible.
kfold = KFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

In [68]:
# Exhaustive search over `param_grid`, scored by R^2, using all available cores.
grid_searchcv = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="r2",
    cv=kfold,
    n_jobs=-1,
    verbose=4,
)

In [69]:
# Run the search on the preprocessed validation features against the validation prices.
grid_searchcv.fit(X=val_preprocessed, y=val["price"])

Fitting 10 folds for each of 160 candidates, totalling 1600 fits


In [70]:
grid_searchcv.best_params_

{'regressor__max_depth': 20,
 'regressor__max_features': 'sqrt',
 'regressor__max_samples': 1.0,
 'regressor__n_estimators': 300}

In [71]:
grid_searchcv.best_score_

0.7972454449025872

#### XGBoost hyperparameter tuning

In [275]:
# Search space for XGBoost (keys target the "regressor" pipeline step).
params_grid = {
    "regressor__max_depth": range(2, 10),            # depths 2..9
    "regressor__n_estimators": range(60, 220, 40),   # 60, 100, 140, 180
    "regressor__learning_rate": [0.1, 0.01, 0.05],
}

In [278]:
# Single-step pipeline so the grid keys can use the "regressor__" prefix.
pipeline = Pipeline(steps=[("regressor", XGBRegressor())])

In [279]:
# 10-fold CV with shuffling; the fixed seed keeps the fold assignment reproducible.
kfold = KFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

In [280]:
# Exhaustive search over `params_grid`, scored by R^2, using all available cores.
grid_searchcv = GridSearchCV(
    estimator=pipeline,
    param_grid=params_grid,
    scoring="r2",
    cv=kfold,
    n_jobs=-1,
    verbose=4,
)

In [281]:
# Run the search on the preprocessed validation features against the validation prices.
grid_searchcv.fit(X=val_preprocessed, y=val["price"])

Fitting 10 folds for each of 96 candidates, totalling 960 fits


In [282]:
grid_searchcv.best_params_

{'regressor__learning_rate': 0.05,
 'regressor__max_depth': 8,
 'regressor__n_estimators': 60}

In [283]:
grid_searchcv.best_score_

0.776387691547505

#### extra trees regressor tuning

In [122]:
# Search space for the extra-trees regressor (keys target the "regressor" step).
# `max_features='auto'` was deprecated and later removed from scikit-learn; for
# regressors it meant "use all features", which is spelled 1.0 and works on both
# old and new versions. Without this, every 'auto' candidate fails to fit on
# recent releases.
param_grid = {
    'regressor__n_estimators': [50, 100, 200],
    'regressor__max_features': [1.0, 'sqrt', 'log2'],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4],
    'regressor__bootstrap': [True, False]
}

In [123]:
# Single-step pipeline so the grid keys can use the "regressor__" prefix.
pipeline = Pipeline(steps=[("regressor", ExtraTreesRegressor())])

In [124]:
# 10-fold CV with shuffling; the fixed seed keeps the fold assignment reproducible.
kfold = KFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

In [125]:
# Exhaustive search over `param_grid`, scored by R^2, using all available cores.
grid_searchcv = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="r2",
    cv=kfold,
    n_jobs=-1,
    verbose=4,
)

In [126]:
# Run the search on the preprocessed validation features against the validation prices.
grid_searchcv.fit(X=val_preprocessed, y=val["price"])

Fitting 10 folds for each of 648 candidates, totalling 6480 fits


In [127]:
grid_searchcv.best_params_

{'regressor__bootstrap': False,
 'regressor__max_depth': 20,
 'regressor__max_features': 'sqrt',
 'regressor__min_samples_leaf': 2,
 'regressor__min_samples_split': 2,
 'regressor__n_estimators': 200}

In [128]:
grid_searchcv.best_score_

0.788421762596632

### testing on test data

In [49]:
# Load the raw (not yet preprocessed) test split and display it.
test = pd.read_csv("../data/test.csv")
test

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-03-06,Banglore,New Delhi,08:00:00,08:15:00,1455,1.0,No Info,17996
1,Spicejet,2019-06-06,Kolkata,Banglore,22:20:00,00:40:00,140,0.0,No Info,3873
2,Indigo,2019-03-18,Kolkata,Banglore,05:30:00,08:20:00,170,0.0,No Info,4462
3,Indigo,2019-06-27,Chennai,Kolkata,19:35:00,21:55:00,140,0.0,No Info,3597
4,Indigo,2019-05-06,Kolkata,Banglore,15:15:00,17:45:00,150,0.0,No Info,4804
...,...,...,...,...,...,...,...,...,...,...
2088,Jet Airways,2019-05-27,Delhi,Cochin,19:15:00,12:35:00,1040,1.0,In-flight meal not included,12898
2089,Multiple Carriers,2019-06-27,Delhi,Cochin,11:25:00,19:15:00,470,1.0,No Info,7155
2090,Jet Airways,2019-06-03,Delhi,Cochin,02:15:00,04:25:00,1570,1.0,In-flight meal not included,11627
2091,Multiple Carriers,2019-06-06,Delhi,Cochin,15:15:00,01:30:00,615,1.0,No Info,6795


In [50]:
# Re-fit every preprocessing step on the test rows (features, with price as the target).
# NOTE(review): the models above were trained on "train_preprocessed_data.csv". Fitting the
# preprocessor here means the encoders/scalers learn from the very rows (and prices) being
# evaluated. Presumably it should be fitted on the raw training data, with only `transform`
# applied to this split. TODO confirm.
preprocessor.fit(
    test.drop(columns="price"),
    test.price.copy()
)

In [51]:
# Apply the fitted preprocessor to the test features (price excluded).
test_preprocessed = preprocessor.transform(test.drop(columns="price"))

In [52]:
test_preprocessed

Unnamed: 0,airline__airline_Air India,airline__airline_Indigo,airline__airline_Jet Airways,airline__airline_Multiple Carriers,airline__airline_other,doj__date_of_journey_month,doj__date_of_journey_week,doj__date_of_journey_day_of_week,doj__date_of_journey_day_of_year,location__source,location__destination,location__source_is_north,location__destination_is_north,time__dep_time_hour,time__dep_time_minute,time__arrival_time_hour,time__arrival_time_minute,time__dep_time_part_of_day,time__arrival_time_part_of_day,duration__duration_rbg_25,duration__duration_rbg_50,duration__duration_rbg_75,duration__duration_cat,duration__duration,stops__total_stops,stops__is_direct_flight,info__additional_info_In-flight meal not included,info__additional_info_No Info,info__additional_info_Other,info__additional_info
0,0.0,0.0,1.0,0.0,0.0,0.000000,0.058824,0.333333,0.042373,-0.932878,-0.904354,0,1,0.347826,0.000000,0.347826,0.272727,1.000000,0.656977,-0.344376,-0.119053,-0.091886,2.0,1.671080,1.0,0,0.0,1.0,0.0,0
1,0.0,0.0,0.0,0.0,1.0,1.000000,0.823529,0.500000,0.822034,-0.246375,-0.240710,1,0,0.956522,0.363636,0.000000,0.727273,0.113997,1.000000,-0.344376,-0.119053,-0.091886,0.0,-0.970097,0.0,1,0.0,1.0,0.0,0
2,0.0,1.0,0.0,0.0,0.0,0.000000,0.176471,0.000000,0.144068,-0.246375,-0.240710,1,0,0.217391,0.545455,0.347826,0.363636,1.000000,0.656977,3.223386,-0.119053,-0.091886,0.0,-0.909842,0.0,1,0.0,1.0,0.0,0
3,0.0,1.0,0.0,0.0,0.0,1.000000,1.000000,0.500000,1.000000,-1.875194,-0.904354,0,1,0.826087,0.636364,0.913043,1.000000,0.226551,1.000000,-0.344376,-0.119053,-0.091886,0.0,-0.970097,0.0,1,0.0,1.0,0.0,0
4,0.0,1.0,0.0,0.0,0.0,0.666667,0.588235,0.000000,0.559322,-0.246375,-0.240710,1,0,0.652174,0.272727,0.739130,0.818182,0.000000,0.590116,-0.344376,-0.119053,-0.091886,0.0,-0.950012,0.0,1,0.0,1.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2088,0.0,0.0,1.0,0.0,0.0,0.666667,0.764706,0.000000,0.737288,1.033057,1.032427,1,0,0.826087,0.272727,0.521739,0.636364,0.226551,0.000000,-0.344376,-0.119053,-0.091886,2.0,0.837553,1.0,0,1.0,0.0,0.0,1
2089,0.0,0.0,0.0,1.0,0.0,1.000000,1.000000,0.500000,1.000000,1.033057,1.032427,1,0,0.478261,0.454545,0.826087,0.272727,1.000000,0.590116,-0.344376,-0.119053,-0.091886,2.0,-0.307292,1.0,0,0.0,1.0,0.0,0
2090,0.0,0.0,1.0,0.0,0.0,1.000000,0.823529,0.000000,0.796610,1.033057,1.032427,1,0,0.086957,0.272727,0.173913,0.454545,0.113997,0.656977,-0.344376,-0.119053,-0.091886,2.0,1.902057,1.0,0,1.0,0.0,0.0,1
2091,0.0,0.0,0.0,1.0,0.0,1.000000,0.823529,0.500000,0.822034,1.033057,1.032427,1,0,0.652174,0.272727,0.043478,0.545455,0.000000,1.000000,-0.344376,-0.119053,-0.091886,2.0,-0.016059,1.0,0,0.0,1.0,0.0,0


xgb regressor

In [112]:
# Score the baseline XGBoost model on the preprocessed test features.
y_pred = xgb.predict(test_preprocessed)

mae = mean_absolute_error(y_true=test["price"], y_pred=y_pred)
mse = mean_squared_error(y_true=test["price"], y_pred=y_pred)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print(r2_score(y_true=test["price"], y_pred=y_pred))

Mean Squared Error: 3839523.6245732205
Mean Absolute Error: 1125.707566831309
0.8098372234213442


random forest regressor

In [113]:
# Score the baseline random forest on the preprocessed test features.
y_pred = rf.predict(test_preprocessed)

mae = mean_absolute_error(y_true=test["price"], y_pred=y_pred)
mse = mean_squared_error(y_true=test["price"], y_pred=y_pred)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print(r2_score(y_true=test["price"], y_pred=y_pred))

Mean Squared Error: 3112030.8202294013
Mean Absolute Error: 826.9807857650976
0.8458682692338025


extra trees

In [137]:
# Score the baseline extra-trees model on the preprocessed test features.
y_pred = extra_trees.predict(test_preprocessed)

mae = mean_absolute_error(y_true=test["price"], y_pred=y_pred)
mse = mean_squared_error(y_true=test["price"], y_pred=y_pred)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print(r2_score(y_true=test["price"], y_pred=y_pred))

Mean Squared Error: 3333280.5003667795
Mean Absolute Error: 774.9137912087912
0.8349102813149913
