## Importing libararies 

In [2]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl (297.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.1/297.1 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.0.3


In [3]:
!pip install feature-engine

Collecting feature-engine
  Downloading feature_engine-1.8.0-py2.py3-none-any.whl.metadata (9.8 kB)
Downloading feature_engine-1.8.0-py2.py3-none-any.whl (357 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m357.1/357.1 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hInstalling collected packages: feature-engine
Successfully installed feature-engine-1.8.0


In [6]:
import os

import boto3

import pickle

import warnings

import numpy as np

import pandas as pd

import xgboost as xgb

import sklearn
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
	OneHotEncoder,
	OrdinalEncoder,
	StandardScaler,
	MinMaxScaler,
	PowerTransformer,
	FunctionTransformer
)

from feature_engine.outliers import Winsorizer
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import SelectBySingleFeaturePerformance
from feature_engine.encoding import (
	RareLabelEncoder,
	MeanEncoder,
	CountFrequencyEncoder
)

import sagemaker
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import (
    IntegerParameter,
    ContinuousParameter,
    HyperparameterTuner
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [7]:
pd.set_option("display.max_columns", None)

In [8]:
sklearn.set_config(transform_output="pandas")

In [9]:
warnings.filterwarnings("ignore")

## 3. Read Datasets

In [10]:
train = pd.read_csv("train.csv")
train

Unnamed: 0,airline,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,date_of_Journey,price
0,Jet Airways,Delhi,Cochin,14:35:00,12:35:00,1320,2.0,In-flight meal not included,2019-06-15,10919
1,Spicejet,Banglore,New Delhi,10:20:00,18:15:00,475,1.0,No Info,2019-03-09,7294
2,Indigo,Banglore,New Delhi,20:00:00,22:55:00,175,0.0,No Info,2019-03-03,6860
3,Multiple Carriers,Delhi,Cochin,04:55:00,19:15:00,860,1.0,No Info,2019-05-21,7575
4,Vistara,Banglore,Delhi,19:30:00,22:15:00,165,0.0,No Info,2019-04-06,5613
...,...,...,...,...,...,...,...,...,...,...
635,Multiple Carriers,Delhi,Cochin,14:00:00,01:30:00,690,1.0,No Info,2019-05-15,13727
636,Indigo,Mumbai,Hyderabad,12:00:00,13:30:00,90,0.0,No Info,2019-05-09,4392
637,Indigo,Banglore,Delhi,23:30:00,02:20:00,170,0.0,No Info,2019-04-27,3943
638,Jet Airways,Kolkata,Banglore,21:10:00,08:15:00,665,1.0,No Info,2019-05-01,14781


In [11]:
val = pd.read_csv("val.csv")
val

Unnamed: 0,airline,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,date_of_Journey,price
0,Spicejet,Kolkata,Banglore,15:05:00,20:20:00,315,1.0,No Info,2019-06-06,4649
1,Air India,Delhi,Cochin,12:00:00,19:15:00,435,2.0,No Info,2019-06-01,10336
2,Indigo,Kolkata,Banglore,17:15:00,19:55:00,160,0.0,No Info,2019-03-06,5618
3,Jet Airways,Delhi,Cochin,19:10:00,19:45:00,1475,2.0,In-flight meal not included,2019-03-27,8834
4,Air India,Delhi,Cochin,05:00:00,19:15:00,2295,2.0,No Info,2019-06-12,9443
...,...,...,...,...,...,...,...,...,...,...
155,Jet Airways,Delhi,Cochin,14:35:00,12:35:00,1320,2.0,In-flight meal not included,2019-06-15,10919
156,Air Asia,Banglore,Delhi,23:55:00,02:45:00,170,0.0,No Info,2019-05-06,3383
157,Air India,Delhi,Cochin,10:00:00,19:15:00,555,1.0,No Info,2019-05-15,8372
158,Spicejet,Banglore,Delhi,21:10:00,00:05:00,175,0.0,No check-in baggage included,2019-06-06,3257


In [12]:
test = pd.read_csv("test.csv")
test

Unnamed: 0,airline,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,date_of_Journey,price
0,Air India,Delhi,Cochin,13:20:00,19:15:00,1795,2.0,No Info,2019-03-21,8708
1,Jet Airways,Banglore,Delhi,07:10:00,10:10:00,180,0.0,In-flight meal not included,2019-06-18,5769
2,Indigo,Kolkata,Banglore,15:15:00,20:05:00,290,1.0,No Info,2019-05-12,5170
3,Multiple Carriers,Delhi,Cochin,08:00:00,21:00:00,780,1.0,No Info,2019-06-09,7408
4,Spicejet,Kolkata,Banglore,09:00:00,11:30:00,150,0.0,No Info,2019-03-09,5515
...,...,...,...,...,...,...,...,...,...,...
195,Indigo,Kolkata,Banglore,04:40:00,07:15:00,155,0.0,No Info,2019-04-27,4804
196,Multiple Carriers,Delhi,Cochin,15:15:00,01:30:00,615,1.0,In-flight meal not included,2019-06-24,5797
197,Air India,Delhi,Cochin,17:15:00,19:15:00,1560,2.0,No Info,2019-06-01,12970
198,Jet Airways,Kolkata,Banglore,21:10:00,10:05:00,775,1.0,No Info,2019-05-12,13941


## 4. Preprocessing Operations

In [17]:
#Airline
air_transformer = Pipeline(steps=[
	("imputer", SimpleImputer(strategy="most_frequent")),
	("grouper", RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
	("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
])

#dateofjourney
feature_to_extract = ["month", "week", "day_of_week", "day_of_year"]

doj_transformer = Pipeline(steps=[
	("dt", DatetimeFeatures(features_to_extract=feature_to_extract, yearfirst=True, format="mixed")),
	("scaler", MinMaxScaler())
])

# source and destination 
location_pipe1 = Pipeline(steps=[
	("grouper", RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
	("encoder", MeanEncoder()),
	("scaler", PowerTransformer())
])


def is_north(X):
	columns = X.columns.to_list()
	north_cities = ["Delhi", "Kolkata", "Mumbai", "New Delhi"]
	return (
		X
		.assign(**{
			f"{col}_is_north": X.loc[:, col].isin(north_cities).astype(int)
			for col in columns
		})
		.drop(columns=columns)
	)


location_transformer = FeatureUnion(transformer_list=[
	("part1", location_pipe1),
	("part2", FunctionTransformer(func=is_north))
])


# departure and arival times 
time_pipe1 = Pipeline(steps=[
	("dt", DatetimeFeatures(features_to_extract=["hour", "minute"])),
	("scaler", MinMaxScaler())
])


def part_of_day(X, morning=4, noon=12, eve=16, night=20):
	columns = X.columns.to_list()
	X_temp = X.assign(**{
		col: pd.to_datetime(X.loc[:, col]).dt.hour
		for col in columns
	})

	return (
		X_temp
		.assign(**{
			f"{col}_part_of_day": np.select(
				[X_temp.loc[:, col].between(morning, noon, inclusive="left"),
				 X_temp.loc[:, col].between(noon, eve, inclusive="left"),
				 X_temp.loc[:, col].between(eve, night, inclusive="left")],
				["morning", "afternoon", "evening"],
				default="night"
			)
			for col in columns
		})
		.drop(columns=columns)
	)


time_pipe2 = Pipeline(steps=[
	("part", FunctionTransformer(func=part_of_day)),
	("encoder", CountFrequencyEncoder()),
	("scaler", MinMaxScaler())
])

time_transformer = FeatureUnion(transformer_list=[
	("part1", time_pipe1),
	("part2", time_pipe2)
])

# duration 
class RBFPercentileSimilarity(BaseEstimator, TransformerMixin):
	def __init__(self, variables=None, percentiles=[0.25, 0.5, 0.75], gamma=0.1):
		self.variables = variables
		self.percentiles = percentiles
		self.gamma = gamma


	def fit(self, X, y=None):
		if not self.variables:
			self.variables = X.select_dtypes(include="number").columns.to_list()

		self.reference_values_ = {
			col: (
				X
				.loc[:, col]
				.quantile(self.percentiles)
				.values
				.reshape(-1, 1)
			)
			for col in self.variables
		}

		return self


	def transform(self, X):
		objects = []
		for col in self.variables:
			columns = [f"{col}_rbf_{int(percentile * 100)}" for percentile in self.percentiles]
			obj = pd.DataFrame(
				data=rbf_kernel(X.loc[:, [col]], Y=self.reference_values_[col], gamma=self.gamma),
				columns=columns
			)
			objects.append(obj)
		return pd.concat(objects, axis=1)
    
def duration_category(X, short=180, med=400):
	return (
		X
		.assign(duration_cat=np.select([X.duration.lt(short),
									    X.duration.between(short, med, inclusive="left")],
									   ["short", "medium"],
									   default="long"))
		.drop(columns="duration")
	)

def is_over(X, value=1000):
	return (
		X
		.assign(**{
			f"duration_over_{value}": X.duration.ge(value).astype(int)
		})
		.drop(columns="duration")
	)

duration_pipe1 = Pipeline(steps=[
	("rbf", RBFPercentileSimilarity()),
	("scaler", PowerTransformer())
])

duration_pipe2 = Pipeline(steps=[
	("cat", FunctionTransformer(func=duration_category)),
	("encoder", OrdinalEncoder(categories=[["short", "medium", "long"]]))
])

duration_union = FeatureUnion(transformer_list=[
	("part1", duration_pipe1),
	("part2", duration_pipe2),
	("part3", FunctionTransformer(func=is_over)),
	("part4", StandardScaler())
])

duration_transformer = Pipeline(steps=[
	("outliers", Winsorizer(capping_method="iqr", fold=1.5)),
	("imputer", SimpleImputer(strategy="median")),
	("union", duration_union)
])

#total stops 

def is_direct(X):
	return X.assign(is_direct_flight=X.total_stops.eq(0).astype(int))


total_stops_transformer = Pipeline(steps=[
	("imputer", SimpleImputer(strategy="most_frequent")),
	("", FunctionTransformer(func=is_direct))
])


#additional info 
info_pipe1 = Pipeline(steps=[
	("group", RareLabelEncoder(tol=0.1, n_categories=2, replace_with="Other")),
	("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])


def have_info(X):
	return X.assign(additional_info=X.additional_info.ne("No Info").astype(int))

info_union = FeatureUnion(transformer_list=[
	("part1", info_pipe1),
	("part2", FunctionTransformer(func=have_info))
])

info_transformer = Pipeline(steps=[
	("imputer", SimpleImputer(strategy="constant", fill_value="unknown")),
	("union", info_union)
])

column_transformer = ColumnTransformer(transformers=[
	("air", air_transformer, ["airline"]),
	("doj", doj_transformer, ["date_of_Journey"]),
	("location", location_transformer, ["source", 'destination']),
	("time", time_transformer, ["dep_time", "arrival_time"]),
	("dur", duration_transformer, ["duration"]),
	("stops", total_stops_transformer, ["total_stops"]),
	("info", info_transformer, ["additional_info"])
], remainder="passthrough")

#feature selector 
estimator = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)

selector = SelectBySingleFeaturePerformance(
	estimator=estimator,
	scoring="r2",
	threshold=0.1
)

#preprocessor

preprocessor = Pipeline(steps=[
	("ct", column_transformer),
	("selector", selector)
])

In [18]:
preprocessor.fit(
    train.drop(columns="price"),
    train.price.copy()
)

In [19]:
preprocessor.transform(train.drop(columns="price"))

Unnamed: 0,air__airline_Indigo,air__airline_Jet Airways,air__airline_Other,doj__date_of_Journey_day_of_year,location__source,location__destination,dur__duration_cat,dur__duration_over_1000,dur__duration,stops__total_stops,stops__is_direct_flight
0,0.0,1.0,0.0,0.898305,0.997905,0.998859,2.0,1,1.486415,2.0,0
1,0.0,0.0,1.0,0.067797,-1.245774,-1.084051,2.0,0,-0.304722,1.0,0
2,1.0,0.0,0.0,0.016949,-1.245774,-1.084051,0.0,0,-0.940628,0.0,1
3,0.0,0.0,0.0,0.686441,0.997905,0.998859,2.0,0,0.511358,1.0,0
4,0.0,0.0,1.0,0.305085,-1.245774,-1.799110,0.0,0,-0.961825,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...
635,0.0,0.0,0.0,0.635593,0.997905,0.998859,2.0,0,0.151011,1.0,0
636,1.0,0.0,0.0,0.584746,-1.740846,-1.084051,0.0,0,-1.120802,0.0,1
637,1.0,0.0,0.0,0.483051,-1.245774,-1.799110,0.0,0,-0.951227,0.0,1
638,0.0,1.0,0.0,0.516949,-0.148607,-0.179609,2.0,0,0.098019,1.0,0


## 4. Preprocess Data and Upload to Bucket

In [20]:
BUCKET_NAME = "sagemaker-flight-price-prediction"

DATA_PREFIX = "data"

In [21]:
def get_file_name(name):
    return f"{name}-pre.csv"

In [22]:
def export_data(data, name, pre):
    # split data into X and y subsets
    X = data.drop(columns="price")
    y = data.price.copy()
    
    # transformation
    X_pre = pre.transform(X)
    
    # exporting
    file_name = get_file_name(name)
    (
        y
        .to_frame()
        .join(X_pre)
        .to_csv(file_name, index=False, header=False)
    )

In [23]:
def upload_to_bucket(name):
    file_name = get_file_name(name)
    
    (
        boto3
        .Session()
        .resource("s3")
        .Bucket(BUCKET_NAME)
        .Object(os.path.join(DATA_PREFIX, f"{name}/{name}.csv"))
        .upload_file(file_name)
    )

In [25]:
def export_and_upload_bucket(data, name, pre):
    export_data(data, name, pre)
    upload_to_bucket(name)

In [26]:
export_and_upload_bucket(train, "train", preprocessor)

In [27]:
export_and_upload_bucket(val, "val", preprocessor)

In [28]:
export_and_upload_bucket(test, "test", preprocessor)

In [29]:
pd.read_csv("train-pre.csv")

Unnamed: 0,10919,0.0,1.0,0.0.1,0.8983050847457628,0.9979051743446965,0.998858952295499,2.0,1,1.4864147333697908,2.0.1,0
0,7294,0.0,0.0,1.0,0.067797,-1.245774,-1.084051,2.0,0,-0.304722,1.0,0
1,6860,1.0,0.0,0.0,0.016949,-1.245774,-1.084051,0.0,0,-0.940628,0.0,1
2,7575,0.0,0.0,0.0,0.686441,0.997905,0.998859,2.0,0,0.511358,1.0,0
3,5613,0.0,0.0,1.0,0.305085,-1.245774,-1.799110,0.0,0,-0.961825,0.0,1
4,6216,0.0,0.0,1.0,0.822034,0.997905,0.998859,1.0,0,-0.908833,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
634,13727,0.0,0.0,0.0,0.635593,0.997905,0.998859,2.0,0,0.151011,1.0,0
635,4392,1.0,0.0,0.0,0.584746,-1.740846,-1.084051,0.0,0,-1.120802,0.0,1
636,3943,1.0,0.0,0.0,0.483051,-1.245774,-1.799110,0.0,0,-0.951227,0.0,1
637,14781,0.0,1.0,0.0,0.516949,-0.148607,-0.179609,2.0,0,0.098019,1.0,0


## 5. Model and Hyperparameter Tuning Set-up

In [30]:
session = sagemaker.Session()
region_name = session.boto_region_name

In [31]:
output_path = f"s3://{BUCKET_NAME}/model/output"

In [32]:
model = Estimator(
    image_uri=sagemaker.image_uris.retrieve("xgboost", region_name, "1.2-1"),
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type="ml.m4.xlarge",
    volume_size=5,
    output_path=output_path,
    use_spot_instances=True,
    max_run=300,
    max_wait=600,
    sagemaker_session=session
)

In [33]:
model.set_hyperparameters(
    objective="reg:linear",
    num_round=10,
    eta=0.1,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    alpha=0.1
)

In [34]:
hyperparameter_ranges = {
    "eta": ContinuousParameter(0.05, 0.2),
    "alpha": ContinuousParameter(0, 1),
    "max_depth": IntegerParameter(3, 5)
}

In [35]:
tuner = HyperparameterTuner(
    estimator=model,
    objective_metric_name="validation:rmse",
    hyperparameter_ranges=hyperparameter_ranges,
    strategy="Bayesian",
    objective_type="Minimize"
)

## 6. Data Channels

In [36]:
def get_data_channel(name):
    bucket_path = f"s3://{BUCKET_NAME}/{DATA_PREFIX}/{name}"
    return TrainingInput(bucket_path, content_type="csv")

In [37]:
train_data_channel = get_data_channel("train")
train_data_channel

<sagemaker.inputs.TrainingInput at 0x7f7b57a72bc0>

In [38]:
val_data_channel = get_data_channel("val")

In [39]:
data_channels = {
    "train": train_data_channel,
    "validation": val_data_channel
}

## 7. Train and Tune the Model

In [40]:
tuner.fit(data_channels)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


......................................!


In [47]:
#tuner.best_estimator().deploy()

## 8. Model Evaluation

In [49]:
with open("xgboost-model", "rb") as f:
    best_model = pickle.load(f)
    
best_model

<xgboost.core.Booster at 0x7f7b5f3d7c10>

In [50]:
def evaluate_model(name):
    file_name = get_file_name(name)
    data = pd.read_csv(file_name)
    
    X = xgb.DMatrix(data.iloc[:, 1:])
    y = data.iloc[:, 0].copy()
    
    pred = best_model.predict(X)
    
    return r2_score(y, pred)

In [51]:
evaluate_model("train")

-0.006071361546032605

In [52]:
evaluate_model("val")

0.12274674969840305

In [53]:
evaluate_model("test")

0.08026495257440258