## Training Model

In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import xgboost as xgb
from sklearn.ensemble import GradientBoostingRegressor

In [2]:
import pandas as pd
from sklearn import metrics

In [3]:
import pickle

import warnings

import numpy as np

import pandas as pd

import xgboost as xgb

import sklearn
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
	OneHotEncoder,
	OrdinalEncoder,
	StandardScaler,
	MinMaxScaler,
	PowerTransformer,
	FunctionTransformer
)

from feature_engine.outliers import Winsorizer
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import SelectBySingleFeaturePerformance
from feature_engine.encoding import (
	RareLabelEncoder,
	MeanEncoder,
	CountFrequencyEncoder
)

In [4]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option("display.max_columns", None)

In [5]:
# Have scikit-learn transformers return pandas DataFrames; the function-based
# steps defined below read columns by name (e.g. X.columns, X.duration).
sklearn.set_config(transform_output="pandas")

In [6]:
# Suppress all warnings for the rest of the session.
warnings.filterwarnings("ignore")

In [7]:
# airline: impute the most frequent value, group infrequent categories as
# "Other", then one-hot encode
air_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("grouper", RareLabelEncoder(n_categories=2, tol=0.1, replace_with="Other")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# date_of_journey: extract the calendar parts listed below, then min-max scale
feature_to_extract = ["month", "week", "day_of_week", "day_of_year"]

doj_transformer = Pipeline([
    (
        "dt",
        DatetimeFeatures(
            features_to_extract=feature_to_extract,
            format="mixed",
            yearfirst=True,
        ),
    ),
    ("scaler", MinMaxScaler()),
])

# source & destination: group infrequent categories, mean-encode, power-transform
location_pipe1 = Pipeline([
    ("grouper", RareLabelEncoder(n_categories=2, tol=0.1, replace_with="Other")),
    ("encoder", MeanEncoder()),
    ("scaler", PowerTransformer()),
])

def is_north(X):
    """Replace every column of ``X`` with a 0/1 ``<col>_is_north`` indicator.

    A cell becomes 1 when its value is one of "Delhi", "Kolkata", "Mumbai" or
    "New Delhi", and 0 otherwise. The original columns are dropped, so the
    result contains only the indicator columns.
    """
    north_cities = ["Delhi", "Kolkata", "Mumbai", "New Delhi"]
    source_cols = X.columns.to_list()

    indicators = {}
    for name in source_cols:
        indicators[f"{name}_is_north"] = X.loc[:, name].isin(north_cities).astype(int)

    return X.assign(**indicators).drop(columns=source_cols)

# stack the encoded locations next to the north-city indicator columns
location_transformer = FeatureUnion([
    ("part1", location_pipe1),
    ("part2", FunctionTransformer(func=is_north)),
])

# dep_time & arrival_time: extract hour and minute, then min-max scale
time_pipe1 = Pipeline([
    ("dt", DatetimeFeatures(features_to_extract=["hour", "minute"])),
    ("scaler", MinMaxScaler()),
])

def part_of_day(X, morning=4, noon=12, eve=16, night=20):
    """Replace every datetime-like column with a ``<col>_part_of_day`` label.

    Each column is parsed with ``pd.to_datetime`` and reduced to its hour, which
    is then bucketed using left-closed intervals:

    * ``[morning, noon)`` -> "morning"
    * ``[noon, eve)``     -> "afternoon"
    * ``[eve, night)``    -> "evening"
    * anything else       -> "night"

    The original columns are dropped from the result.
    """
    names = X.columns.to_list()

    # Swap each timestamp column for its hour of day.
    hours = X.assign(**{name: pd.to_datetime(X.loc[:, name]).dt.hour for name in names})

    labels = ["morning", "afternoon", "evening"]
    bounds = [(morning, noon), (noon, eve), (eve, night)]

    labelled = {}
    for name in names:
        hour = hours.loc[:, name]
        conditions = [hour.between(low, high, inclusive="left") for low, high in bounds]
        labelled[f"{name}_part_of_day"] = np.select(conditions, labels, default="night")

    return hours.assign(**labelled).drop(columns=names)

# label each time with its part of day, encode the labels with
# CountFrequencyEncoder, then min-max scale
time_pipe2 = Pipeline([
    ("part", FunctionTransformer(func=part_of_day)),
    ("encoder", CountFrequencyEncoder()),
    ("scaler", MinMaxScaler()),
])

# hour/minute features side by side with the part-of-day features
time_transformer = FeatureUnion([
    ("part1", time_pipe1),
    ("part2", time_pipe2),
])

# duration
class RBFPercentileSimilarity(BaseEstimator, TransformerMixin):
    """Encode columns by their RBF similarity to percentiles learned in ``fit``.

    For every selected column, ``fit`` stores the values found at
    ``percentiles``; ``transform`` then emits one ``<col>_rbf_<pct>`` column per
    percentile containing ``rbf_kernel(value, reference, gamma=gamma)``.

    Parameters
    ----------
    variables : list of str or None, default=None
        Columns to encode. When empty or None, every numeric column of the frame
        passed to ``fit`` is used.
    percentiles : list of float, default=[0.25, 0.5, 0.75]
        Quantiles (as fractions) whose fitted values become the reference points.
        The default list is only read, never mutated.
    gamma : float, default=0.1
        Forwarded to ``rbf_kernel`` as its ``gamma`` argument.

    Attributes
    ----------
    variables_ : list of str
        Columns actually used, resolved during ``fit``.
    reference_values_ : dict
        Maps each column to an array of shape ``(len(percentiles), 1)`` holding
        its reference values.
    """

    def __init__(self, variables=None, percentiles=[0.25, 0.5, 0.75], gamma=0.1):
        self.variables = variables
        self.percentiles = percentiles
        self.gamma = gamma


    def fit(self, X, y=None):
        """Learn the reference value of every selected column at each percentile."""
        # Resolve the columns into a fitted attribute rather than overwriting the
        # constructor parameter: re-fitting on another frame then re-detects the
        # numeric columns, and get_params() keeps reporting what the user passed.
        if self.variables:
            self.variables_ = list(self.variables)
        else:
            self.variables_ = X.select_dtypes(include="number").columns.to_list()

        self.reference_values_ = {
            col: (
                X
                .loc[:, col]
                .quantile(self.percentiles)
                .values
                .reshape(-1, 1)
            )
            for col in self.variables_
        }

        return self


    def transform(self, X):
        """Return the RBF similarity columns for ``X``, indexed like ``X``."""
        objects = []
        for col in self.variables_:
            # int() truncates, so e.g. 0.25 -> "25"
            columns = [f"{col}_rbf_{int(percentile * 100)}" for percentile in self.percentiles]
            obj = pd.DataFrame(
                data=rbf_kernel(X.loc[:, [col]], Y=self.reference_values_[col], gamma=self.gamma),
                columns=columns,
                # keep the caller's index so the rows stay aligned when this output
                # is concatenated with other transformers' pandas output
                index=X.index
            )
            objects.append(obj)

        if not objects:
            # nothing selected: pd.concat would raise on an empty list
            return pd.DataFrame(index=X.index)
        return pd.concat(objects, axis=1)


def duration_category(X, short=180, med=400):
    """Replace ``duration`` with a ``duration_cat`` label column.

    Values below ``short`` are "short", values in ``[short, med)`` are "medium",
    and everything else is "long". The ``duration`` column is dropped.
    """
    dur = X.duration
    is_short = dur.lt(short)
    is_medium = dur.between(short, med, inclusive="left")

    labels = np.select([is_short, is_medium], ["short", "medium"], default="long")
    return X.assign(duration_cat=labels).drop(columns="duration")

def is_over(X, value=1000):
    """Replace ``duration`` with a 0/1 ``duration_over_<value>`` flag.

    The flag is 1 where ``duration >= value`` and 0 otherwise; the ``duration``
    column itself is dropped.
    """
    flag_name = f"duration_over_{value}"
    flag = X.duration.ge(value).astype(int)
    return X.assign(**{flag_name: flag}).drop(columns="duration")

# RBF similarity to the fitted percentiles, then power-transformed
duration_pipe1 = Pipeline([
    ("rbf", RBFPercentileSimilarity()),
    ("scaler", PowerTransformer()),
])

# short / medium / long label, ordinal-encoded in that order
duration_pipe2 = Pipeline([
    ("cat", FunctionTransformer(func=duration_category)),
    ("encoder", OrdinalEncoder(categories=[["short", "medium", "long"]])),
])

# all duration-derived features side by side, plus the standardised raw value
duration_union = FeatureUnion([
    ("part1", duration_pipe1),
    ("part2", duration_pipe2),
    ("part3", FunctionTransformer(func=is_over)),
    ("part4", StandardScaler()),
])

# cap outliers (IQR rule, fold 1.5) and impute the median before the union
duration_transformer = Pipeline([
    ("outliers", Winsorizer(fold=1.5, capping_method="iqr")),
    ("imputer", SimpleImputer(strategy="median")),
    ("union", duration_union),
])

# total_stops
def is_direct(X):
    """Return a copy of ``X`` with an added 0/1 ``is_direct_flight`` column.

    The flag is 1 where ``total_stops`` equals 0; all existing columns are kept.
    """
    no_stops = X["total_stops"] == 0
    return X.assign(is_direct_flight=no_stops.astype(int))


# total_stops: impute the most frequent value, then append the
# is_direct_flight indicator produced by is_direct
total_stops_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    # this step was previously named with an empty string; give it a descriptive
    # name like every other pipeline step in this notebook
    ("direct", FunctionTransformer(func=is_direct))
])

# additional_info: group infrequent categories as "Other", then one-hot encode
info_pipe1 = Pipeline([
    ("group", RareLabelEncoder(n_categories=2, tol=0.1, replace_with="Other")),
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])

def have_info(X):
    """Overwrite ``additional_info`` with a 0/1 flag.

    The flag is 0 where the value is exactly "No Info" and 1 for any other value.
    """
    has_details = X["additional_info"] != "No Info"
    return X.assign(additional_info=has_details.astype(int))

# one-hot encoded categories next to the has-info flag
info_union = FeatureUnion([
    ("part1", info_pipe1),
    ("part2", FunctionTransformer(func=have_info)),
])

# missing values become the literal "unknown" before the union runs
info_transformer = Pipeline([
    ("imputer", SimpleImputer(fill_value="unknown", strategy="constant")),
    ("union", info_union),
])

# column transformer: route each raw column to its dedicated transformer;
# columns not listed here are passed through unchanged
column_transformer = ColumnTransformer(
    [
        ("air", air_transformer, ["airline"]),
        ("doj", doj_transformer, ["date_of_journey"]),
        ("location", location_transformer, ["source", "destination"]),
        ("time", time_transformer, ["dep_time", "arrival_time"]),
        ("dur", duration_transformer, ["duration"]),
        ("stops", total_stops_transformer, ["total_stops"]),
        ("info", info_transformer, ["additional_info"]),
    ],
    remainder="passthrough",
)

# feature selector: score every feature on its own with a small random forest
# (r2 scoring) using a threshold of 0.1
estimator = RandomForestRegressor(max_depth=3, n_estimators=10, random_state=42)

selector = SelectBySingleFeaturePerformance(
    threshold=0.1,
    scoring="r2",
    estimator=estimator,
)

# preprocessor: column-wise feature engineering followed by feature selection
preprocessor = Pipeline([
    ("ct", column_transformer),
    ("selector", selector),
])

In [44]:
class train_model():
    """Train a fixed set of regressors on the preprocessed data and tabulate metrics.

    ``train`` fits ``self.preprocessor`` and every model in ``self.models`` on the
    training CSV; ``testing`` scores the fitted models on the test CSV. Both
    record MAE, MSE, RMSE and R2 per model, and ``show_metric`` displays the
    collected values as a table. The target column of both CSVs is ``price``.
    """

    # Default dataset locations, used when train() / testing() receive no path.
    TRAIN_PATH = r"U:\nlp_project\flight-prices-prediction\data_b\train.csv"
    TEST_PATH = r"U:\nlp_project\flight-prices-prediction\data_b\test.csv"

    # Column layout shared by the training and the testing result tables.
    RESULT_COLUMNS = ('Model Name', 'Mean_Absolute_Error', 'Mean_Squared_Error_MSE',
                      'Root_Mean_Squared_Error_RMSE', 'R2_score')

    def __init__(self):
        self.modelmlg = LinearRegression()
        self.modelRE = Ridge()
        self.modelLO = Lasso(alpha=0.1)

        # NOTE: the tree, forest and gradient-boosting models below are created
        # without a fixed random_state.
        self.modeldcr = DecisionTreeRegressor()
        self.modelrfr = RandomForestRegressor()
        self.modelSVR = SVR()
        self.modelXGR = xgb.XGBRegressor(n_jobs=5,learning_rate=0.1,max_depth=10,random_state=1)
        self.modelGBR = GradientBoostingRegressor(learning_rate=0.1, n_estimators=100, subsample=1.0,
                                     criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1,
                                     min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0,
                                     init=None, random_state=None, max_features=None,
                                     alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False,
                                     validation_fraction=0.1, n_iter_no_change=None, tol=0.00)

        # Order here is the row order of the result tables.
        self.models = [self.modelmlg, self.modelRE, self.modelLO, self.modeldcr,
                       self.modelrfr, self.modelSVR, self.modelXGR, self.modelGBR]

        # The pipeline wraps the module-level `column_transformer` and `selector`
        # objects themselves (no copy is made here).
        self.preprocessor = Pipeline(steps=[("ct", column_transformer),("selector", selector)])

        self.result_training = self._empty_results()
        self.result_testing = self._empty_results()


    @classmethod
    def _empty_results(cls):
        """Return a fresh ``{column: []}`` dict with one list per result column."""
        return {column: [] for column in cls.RESULT_COLUMNS}


    def show_metric(self,during):
        """Display the collected metrics as a DataFrame.

        ``during == "training"`` shows the training table; any other value shows
        the testing table. Relies on the notebook's ``display``.
        """
        results = self.result_training if during == "training" else self.result_testing
        display(pd.DataFrame(results))


    def print_metrics(self,model,true_value,pred_value,during):
        """Print MAE / MSE / RMSE / R2 for one model and append them to a table.

        Parameters
        ----------
        model : object
            Only its class name is used, as the row label.
        true_value, pred_value : array-like
            Actual and predicted targets, passed to ``sklearn.metrics``.
        during : str
            ``"training"`` appends to ``result_training``; any other value
            appends to ``result_testing``.
        """
        name = type(model).__name__

        # Compute each metric once and reuse it for both printing and recording.
        mse = metrics.mean_squared_error(true_value, pred_value)
        scores = {
            'Mean_Absolute_Error': round(metrics.mean_absolute_error(true_value, pred_value),3),
            'Mean_Squared_Error_MSE': round(mse,3),
            'Root_Mean_Squared_Error_RMSE': round(np.sqrt(mse),3),
            'R2_score': round(metrics.r2_score(true_value, pred_value),6),
        }

        print(name)
        print("-----------------------------------------------------------------------------------------------------------")
        print('Mean Absolute Error (MAE):', scores['Mean_Absolute_Error'])
        print('Mean Squared Error (MSE):', scores['Mean_Squared_Error_MSE'])
        print('Root Mean Squared Error (RMSE):', scores['Root_Mean_Squared_Error_RMSE'])
        print('R2_score:', scores['R2_score'])

        table = self.result_training if during == "training" else self.result_testing
        table['Model Name'].append(name)
        for column, value in scores.items():
            table[column].append(value)

        print("\n")


    def train(self, train_path=None):
        """Fit the preprocessor and every model, recording training-set metrics.

        Parameters
        ----------
        train_path : str or path-like, optional
            CSV containing the features and a ``price`` column. Defaults to
            ``TRAIN_PATH``.
        """
        train_df = pd.read_csv(self.TRAIN_PATH if train_path is None else train_path)
        features = train_df.drop(columns="price")
        y_tr = train_df.price.copy()

        # Fit and transform with the SAME pipeline object owned by this instance.
        self.preprocessor.fit(features, y_tr)
        x_tr = self.preprocessor.transform(features)

        # Start from an empty table so calling train() again does not duplicate rows.
        self.result_training = self._empty_results()
        for model in self.models:
            model.fit(x_tr,y_tr)
            y_pred = model.predict(x_tr)
            self.print_metrics(model,y_tr,y_pred,"training")


    def testing(self, test_path=None):
        """Score every (already fitted) model on the test set and record the metrics.

        Must be called after ``train``.

        Parameters
        ----------
        test_path : str or path-like, optional
            CSV containing the features and a ``price`` column. Defaults to
            ``TEST_PATH``.
        """
        test_df = pd.read_csv(self.TEST_PATH if test_path is None else test_path)
        x_tst = self.preprocessor.transform(test_df.drop(columns="price"))
        y_tst = test_df.price.copy()

        # Start from an empty table so calling testing() again does not duplicate rows.
        self.result_testing = self._empty_results()
        for model in self.models:
            y_pred = model.predict(x_tst)
            self.print_metrics(model,y_tst,y_pred,"testing")


    def save_models(self):
        """Placeholder: model persistence is not implemented yet (no-op)."""
        pass

In [45]:
# Build the trainer, then fit the preprocessor and every model on the training CSV.
tr = train_model()
tr.train()

LinearRegression
-----------------------------------------------------------------------------------------------------------
Mean Absolute Error (MAE): 2059.127
Mean Squared Error (MSE): 10019563.319
Root Mean Squared Error (RMSE): 3165.369
R2_score: 0.535046


Ridge
-----------------------------------------------------------------------------------------------------------
Mean Absolute Error (MAE): 2058.487
Mean Squared Error (MSE): 10037637.788
Root Mean Squared Error (RMSE): 3168.223
R2_score: 0.534207


Lasso
-----------------------------------------------------------------------------------------------------------
Mean Absolute Error (MAE): 2058.369
Mean Squared Error (MSE): 10020196.128
Root Mean Squared Error (RMSE): 3165.469
R2_score: 0.535017


DecisionTreeRegressor
-----------------------------------------------------------------------------------------------------------
Mean Absolute Error (MAE): 534.578
Mean Squared Error (MSE): 1336546.675
Root Mean Squared Error (RMSE): 1

In [47]:
# Score each fitted model on the test CSV and record the metrics.
tr.testing()

LinearRegression
-----------------------------------------------------------------------------------------------------------
Mean Absolute Error (MAE): 2083.641
Mean Squared Error (MSE): 9164668.446
Root Mean Squared Error (RMSE): 3027.32
R2_score: 0.556979


Ridge
-----------------------------------------------------------------------------------------------------------
Mean Absolute Error (MAE): 2082.913
Mean Squared Error (MSE): 9185167.858
Root Mean Squared Error (RMSE): 3030.704
R2_score: 0.555988


Lasso
-----------------------------------------------------------------------------------------------------------
Mean Absolute Error (MAE): 2082.921
Mean Squared Error (MSE): 9165470.849
Root Mean Squared Error (RMSE): 3027.453
R2_score: 0.55694


DecisionTreeRegressor
-----------------------------------------------------------------------------------------------------------
Mean Absolute Error (MAE): 1610.618
Mean Squared Error (MSE): 7216100.937
Root Mean Squared Error (RMSE): 2686.

In [97]:
# Display the training metrics collected by tr.train() as a table.
tr.show_metric("training")

Unnamed: 0,Model Name,Mean_Absolute_Error,Mean_Squared_Error_MSE,Root_Mean_Squared_Error_RMSE,R2_score
0,LinearRegression,2059.127,10019560.0,3165.369,0.535046
1,Ridge,2058.487,10037640.0,3168.223,0.534207
2,Lasso,2058.369,10020200.0,3165.469,0.535017
3,DecisionTreeRegressor,534.578,1336547.0,1156.091,0.937978
4,RandomForestRegressor,774.746,1778279.0,1333.521,0.91748
5,SVR,3105.131,18388590.0,4288.192,0.146684
6,XGBRegressor,1020.076,2354148.0,1534.323,0.890757
7,GradientBoostingRegressor,1505.802,5219355.0,2284.591,0.757798


In [98]:
# Same call as the previous cell: displays the training metrics table again.
tr.show_metric("training")

Unnamed: 0,Model Name,Mean_Absolute_Error,Mean_Squared_Error_MSE,Root_Mean_Squared_Error_RMSE,R2_score
0,LinearRegression,2059.127,10019560.0,3165.369,0.535046
1,Ridge,2058.487,10037640.0,3168.223,0.534207
2,Lasso,2058.369,10020200.0,3165.469,0.535017
3,DecisionTreeRegressor,534.578,1336547.0,1156.091,0.937978
4,RandomForestRegressor,774.746,1778279.0,1333.521,0.91748
5,SVR,3105.131,18388590.0,4288.192,0.146684
6,XGBRegressor,1020.076,2354148.0,1534.323,0.890757
7,GradientBoostingRegressor,1505.802,5219355.0,2284.591,0.757798


NoneType

In [49]:
# Display the testing metrics collected by tr.testing() as a table.
tr.show_metric("testing")

Unnamed: 0,Model Name,Mean_Absolute_Error,Mean_Squared_Error_MSE,Root_Mean_Squared_Error_RMSE,R2_score
0,LinearRegression,2083.641,9164668.0,3027.32,0.556979
1,Ridge,2082.913,9185168.0,3030.704,0.555988
2,Lasso,2082.921,9165471.0,3027.453,0.55694
3,DecisionTreeRegressor,1610.618,7216101.0,2686.28,0.651173
4,RandomForestRegressor,1425.83,5025815.0,2241.833,0.757052
5,SVR,3107.682,17246110.0,4152.844,0.166322
6,XGBRegressor,1367.18,4310465.0,2076.166,0.791632
7,GradientBoostingRegressor,1540.202,4806352.0,2192.339,0.767661


In [51]:
# Predict on the test set with the fitted GradientBoostingRegressor.
# Transform with `tr.preprocessor`, the pipeline that tr.train() fitted, rather
# than the module-level `preprocessor`, which was never fitted directly.
x_test = pd.read_csv(r"U:\nlp_project\flight-prices-prediction\data_b\test.csv")
x_tst = tr.preprocessor.transform(x_test.drop(columns="price"))
y_tst = x_test.price.copy()
y_pred = tr.modelGBR.predict(x_tst)

In [53]:
# Pair the actual test targets with the predictions, one column each.
d = dict(true_value=y_tst, pred_value=y_pred)

In [54]:
# Show the true and predicted values side by side.
pd.DataFrame(d)

Unnamed: 0,true_value,pred_value
0,17996,15473.126376
1,3873,4368.704738
2,4462,5201.708356
3,2228,4961.260433
4,4991,4495.880996
...,...,...
2088,12898,11935.795275
2089,12898,11935.795275
2090,11627,11735.491384
2091,6795,10096.735732


In [55]:
# Predict on the test set with the fitted XGBRegressor.
# Transform with `tr.preprocessor`, the pipeline that tr.train() fitted, rather
# than the module-level `preprocessor`, which was never fitted directly.
x_test = pd.read_csv(r"U:\nlp_project\flight-prices-prediction\data_b\test.csv")
x_tst = tr.preprocessor.transform(x_test.drop(columns="price"))
y_tst = x_test.price.copy()
y_pred = tr.modelXGR.predict(x_tst)

In [56]:
# Pair the actual test targets with the predictions, one column each.
d = dict(true_value=y_tst, pred_value=y_pred)

In [57]:
# Show the true and predicted values side by side.
pd.DataFrame(d)

Unnamed: 0,true_value,pred_value
0,17996,15996.664062
1,3873,4064.483643
2,4462,4096.822754
3,2228,4759.148438
4,4991,4640.753906
...,...,...
2088,12898,13201.742188
2089,12898,14154.578125
2090,11627,11528.740234
2091,6795,10405.914062


In [60]:
# with open(r"U:\nlp_project\flight-prices-prediction\models\GradientBoostingRegressor\modelGBR.pkl", 'wb') as file:
#     pickle.dump(tr.modelGBR, file)

In [61]:
# with open(r"U:\nlp_project\flight-prices-prediction\models\XGBRegressor\modelXGR.pkl", 'wb') as file:
#     pickle.dump(tr.modelXGR, file)

In [83]:
# model = xgb.XGBRegressor(n_jobs=5,learning_rate=0.1,max_depth=10,random_state=1)

In [84]:
# x_train = pd.read_csv(r"U:\nlp_project\flight-prices-prediction\data_b\train.csv")
# x_tr = preprocessor.transform(x_train.drop(columns="price"))
# y_tr = x_train.price.copy()

In [85]:
# model.fit(x_tr,y_tr)

In [86]:
# pred_value=model.predict(x_tr)

In [95]:
# metrics.r2_score(y_tr, pred_value)

In [88]:
# x_test = pd.read_csv(r"U:\nlp_project\flight-prices-prediction\data_b\test.csv")
# x_tst = preprocessor.transform(x_test.drop(columns="price"))
# y_tst = x_test.price.copy()

In [89]:
# pred = model.predict(x_tst)

In [90]:
# pred.shape

In [91]:
# y_tst.shape

In [93]:
# metrics.r2_score(y_tst, pred)