## 1. Import The Libraries

In [57]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.0-py3-none-manylinux2014_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-2.1.0-py3-none-manylinux2014_x86_64.whl (4.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.1.0


In [2]:
!pip install feature-engine

Collecting feature-engine
  Downloading feature_engine-1.8.0-py2.py3-none-any.whl.metadata (9.8 kB)
Downloading feature_engine-1.8.0-py2.py3-none-any.whl (357 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m357.1/357.1 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: feature-engine
Successfully installed feature-engine-1.8.0


In [61]:
import os 

import pickle

import boto3

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sklearn
from sklearn.metrics import r2_score
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.preprocessing import (
    OneHotEncoder, 
    MinMaxScaler, 
    StandardScaler,
    PowerTransformer,
    FunctionTransformer,
    OrdinalEncoder
)

from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import SelectBySingleFeaturePerformance
from feature_engine.outliers import Winsorizer
from feature_engine.encoding import (
    RareLabelEncoder,
    MeanEncoder, 
    CountFrequencyEncoder
)

import warnings

import sagemaker
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import (
    IntegerParameter,
    ContinuousParameter,
    HyperparameterTuner
)

## 2. Display Settings

In [4]:
pd.set_option('display.max_columns', None)

In [5]:
sklearn.set_config(transform_output='pandas')

In [6]:
warnings.filterwarnings("ignore")

## 3. Read The DataSets

In [7]:
train =  pd.read_csv('train.csv')
train

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Indigo,2019-06-06,Delhi,Cochin,14:20:00,22:30,490,1.0,No Info,6938
1,Multiple Carriers,2019-06-15,Delhi,Cochin,11:30:00,19:15,465,1.0,No Info,16108
2,Jet Airways,2019-05-12,Kolkata,Banglore,17:00:00,22:05,1745,1.0,No Info,13067
3,Air India,2019-05-09,Delhi,Cochin,14:25:00,19:15,1730,2.0,No Info,13591
4,Air India,2019-03-06,Mumbai,Hyderabad,12:45:00,19:25,1840,2.0,No Info,13253
...,...,...,...,...,...,...,...,...,...,...
635,Multiple Carriers,2019-05-18,Delhi,Cochin,10:20:00,19:00,520,1.0,No Info,9794
636,Jet Airways,2019-05-21,Delhi,Cochin,08:00:00,04:25,1225,1.0,In-flight meal not included,12898
637,Jet Airways,2019-05-24,Kolkata,Banglore,09:35:00,09:45,1450,1.0,No Info,13067
638,Indigo,2019-06-12,Banglore,Delhi,07:10:00,10:05,175,0.0,No Info,4823


In [9]:
val = pd.read_csv('validation.csv')
val

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-05-09,Delhi,Cochin,18:15:00,12:35,1100,1.0,In-flight meal not included,12373
1,Jet Airways,2019-05-03,Mumbai,Hyderabad,07:05:00,08:30,85,0.0,In-flight meal not included,4995
2,Jet Airways,2019-05-18,Delhi,Cochin,02:15:00,04:25,1570,1.0,In-flight meal not included,12373
3,Indigo,2019-05-15,Chennai,Kolkata,22:05:00,00:25,140,0.0,No Info,5277
4,Multiple Carriers,2019-05-15,Delhi,Cochin,14:00:00,01:30,690,1.0,No Info,13727
...,...,...,...,...,...,...,...,...,...,...
155,Air Asia,2019-05-15,Kolkata,Banglore,06:50:00,10:30,220,1.0,No Info,5162
156,Air India,2019-03-03,Delhi,Cochin,05:10:00,08:00,170,0.0,No Info,7931
157,Goair,2019-03-01,Banglore,New Delhi,20:55:00,23:50,175,0.0,No Info,18558
158,Multiple Carriers,2019-03-21,Delhi,Cochin,11:30:00,21:00,570,1.0,No Info,13062


In [10]:
test = pd.read_csv('test.csv')
test

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-06-09,Kolkata,Banglore,06:30:00,04:40,1330,1.0,In-flight meal not included,9663
1,Jet Airways,2019-04-09,Banglore,Delhi,15:50:00,18:45,175,0.0,No Info,7229
2,Jet Airways,2019-05-27,Delhi,Cochin,05:30:00,12:35,425,2.0,In-flight meal not included,15544
3,Jet Airways,2019-03-03,Delhi,Cochin,16:00:00,18:50,1610,1.0,No Info,17234
4,Indigo,2019-06-27,Delhi,Cochin,21:50:00,03:35,345,1.0,No Info,5775
...,...,...,...,...,...,...,...,...,...,...
195,Jet Airways,2019-05-27,Delhi,Cochin,11:30:00,19:00,450,1.0,No Info,16079
196,Air India,2019-05-24,Kolkata,Banglore,09:10:00,04:40,1170,2.0,No Info,6117
197,Jet Airways,2019-03-06,Banglore,New Delhi,22:50:00,09:30,640,1.0,No Info,17261
198,Jet Airways,2019-05-01,Kolkata,Banglore,06:30:00,18:15,705,1.0,No Info,14781


## 4. preprocessing Operations

In [14]:
# airline

air_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("grouper", RareLabelEncoder(tol=0.1, replace_with='other', n_categories=2)),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
])

# doj

feature_to_extract = ['month','week', 'day_of_week','day_of_year']

doj_transformer = Pipeline(steps=[
    ('dt', DatetimeFeatures(features_to_extract=feature_to_extract, yearfirst=True, format='mixed')),
    ("scaler", MinMaxScaler())
])

# source & destination

location_pipe1 = Pipeline(steps=[
    ('grouper', RareLabelEncoder(tol=0.1, replace_with='other', n_categories=2)),
    ('mean_encoding', MeanEncoder()),
    ("scaler", PowerTransformer())
])

def is_north(x):
    columns = x.columns.to_list()
    north_cities = ['Delhi', 'New Delhi']
    return (
        x.assign(**{
            f"{col}_is_north": x.loc[:,col].isin(north_cities).astype(int)
            for col in columns
        })
        .drop(columns=columns)
    )

location_transform = FeatureUnion(transformer_list=[
    ('part1', location_pipe1),
    ('part2', FunctionTransformer(func=is_north))
])

# departure & arrival time

time_pipe1 = Pipeline(steps=[
    ('dt', DatetimeFeatures(features_to_extract=['hour','minute'])),
    ('Mm', MinMaxScaler())
])

def part_of_day(X, morning=4, noon=12, eve=16, night=20):
    columns = X.columns.to_list()
    X_temp = X.assign(**{
        col : pd.to_datetime(X.loc[:,col]).dt.hour
        for col in columns
    })

    return (
        X_temp
        .assign(**{
            f"{col}_part_of_day": np.select(
                [X_temp.loc[:,col].between(morning, noon, inclusive='left'),
                X_temp.loc[:,col].between(noon, eve, inclusive='left'),
                X_temp.loc[:,col].between(eve, night, inclusive='left')],
                ['morning', 'afternoon', 'evening'],
                default = 'night'
            )
            for col in columns
        })
        .drop(columns=columns)
    )

time_pipe2 = Pipeline(steps=[
    ('part', FunctionTransformer(func=part_of_day)),
    ('encoder', CountFrequencyEncoder()),
    ('scaler', MinMaxScaler())
])

time_transformer = FeatureUnion(transformer_list=[
    ('part1', time_pipe1),
    ('part2', time_pipe2)
])

# duration

class RBFPercentileSimilarity(BaseEstimator, TransformerMixin):
	def __init__(self, variables=None, percentiles=[0.25, 0.5, 0.75], gamma=0.1):
		self.variables = variables
		self.percentiles = percentiles
		self.gamma = gamma


	def fit(self, X, y=None):
		if not self.variables:
			self.variables = X.select_dtypes(include="number").columns.to_list()

		self.reference_values_ = {
			col: (
				X
				.loc[:, col]
				.quantile(self.percentiles)
				.values
				.reshape(-1, 1)
			)
			for col in self.variables
		}

		return self


	def transform(self, X):
		objects = []
		for col in self.variables:
			columns = [f"{col}_rbf_{int(percentile * 100)}" for percentile in self.percentiles]
			obj = pd.DataFrame(
				data=rbf_kernel(X.loc[:, [col]], Y=self.reference_values_[col], gamma=self.gamma),
				columns=columns
			)
			objects.append(obj)
		return pd.concat(objects, axis=1)
    
def duration_category(X, short = 180, med = 400):
    return (
        X
        .assign(
            duration_cat=np.select(
                [X.duration.lt(short),
                 X.duration.between(short, med, inclusive='left')],
                ['short','medium'],
                default = 'long'
            )
        )
        .drop(columns='duration')
    )

def is_over(X, value=1000):
    return (
        X
        .assign(**{
            f"duration_over_{value}": X.duration.ge(value).astype(int)
        })
        .drop(columns='duration')
    )

duration_pipe1 = Pipeline(steps=[
	("rbf", RBFPercentileSimilarity()),
	("scaler", PowerTransformer())
])

duration_pipe2 = Pipeline(steps=[
	("cat", FunctionTransformer(func=duration_category)),
	("encoder", OrdinalEncoder(categories=[["short", "medium", "long"]]))
])

duration_union = FeatureUnion(transformer_list=[
	("part1", duration_pipe1),
	("part2", duration_pipe2),
	("part3", FunctionTransformer(func=is_over)),
	("part4", StandardScaler())
])

duration_transformer = Pipeline(steps=[
	("outliers", Winsorizer(capping_method="iqr", fold=1.5)),
	("imputer", SimpleImputer(strategy="median")),
	("union", duration_union)
])

# total stops

def is_direct(X):
	return (
        X
        .assign(
            is_direct_flight=X.total_stops.eq(0).astype(int)
        )
                
    )


total_stops_transformer = Pipeline(steps=[
	("imputer", SimpleImputer(strategy="most_frequent")),
	("dircet_flight", FunctionTransformer(func=is_direct))
])

# additional info

info_pipe1 = Pipeline(steps=[
    ('grouper', RareLabelEncoder(tol=0.1, n_categories=2, replace_with='others')),
    ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])

def have_info(X):
    return (
        X
        .assign(
            additional_info= X.additional_info.ne('No Info').astype(int)
        )
    )

info_union = FeatureUnion(transformer_list=[
    ('part1', info_pipe1),
    ('part2', FunctionTransformer(func=have_info))
])

info_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('union', info_union)
])

# column transformer

column_transformer = ColumnTransformer(transformers=[
    ('air', air_transformer, ['airline']),
    ('doj', doj_transformer,['date_of_journey']),
    ('location', location_transform,['source','destination']),
    ('time', time_transformer, ['dep_time','arrival_time']),
    ('dur', duration_transformer, ['duration']),
    ('stops', total_stops_transformer, ['total_stops']),
    ('info', info_transformer, ['additional_info'])
    
], remainder='passthrough')

# selector

estimator = RandomForestRegressor(n_estimators=10,max_depth=3, random_state=42)

selector = SelectBySingleFeaturePerformance(
    estimator=estimator, 
    scoring='r2', 
    threshold=0.1
)

# preprocessor 

preprocessor = Pipeline(steps=[
    ('ct', column_transformer),
    ('selector', selector)
])

In [15]:
preprocessor.fit(
    train.drop(columns='price'),
    train.price.copy()
)

In [16]:
preprocessor.transform(train.drop(columns='price'))

Unnamed: 0,air__airline_Indigo,air__airline_Jet Airways,air__airline_other,doj__date_of_journey_week,doj__date_of_journey_day_of_year,location__source,location__destination,dur__duration_rbf_25,dur__duration_cat,dur__duration_over_1000,dur__duration,stops__total_stops,stops__is_direct_flight
0,1.0,0.0,0.0,0.823529,0.822034,1.033319,1.031179,-0.411171,2.0,0,-0.289409,1.0,0
1,0.0,0.0,0.0,0.882353,0.898305,1.033319,1.031179,-0.411171,2.0,0,-0.338552,1.0,0
2,0.0,1.0,0.0,0.588235,0.610169,-0.013882,-0.064799,-0.411171,2.0,1,2.177531,1.0,0
3,0.0,0.0,0.0,0.588235,0.584746,1.033319,1.031179,-0.411171,2.0,1,2.148046,2.0,0
4,0.0,0.0,0.0,0.058824,0.042373,-1.749644,-0.819113,-0.411171,2.0,1,2.364272,2.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
635,0.0,0.0,0.0,0.647059,0.661017,1.033319,1.031179,-0.411171,2.0,0,-0.230439,1.0,0
636,0.0,1.0,0.0,0.705882,0.686441,1.033319,1.031179,-0.411171,2.0,1,1.155373,1.0,0
637,0.0,1.0,0.0,0.705882,0.711864,-0.013882,-0.064799,-0.411171,2.0,1,1.597653,1.0,0
638,1.0,0.0,0.0,0.882353,0.872881,-1.050080,-1.743092,1.978845,0.0,0,-0.908602,0.0,1


## 4. Preprocess data and upload to bucket

In [22]:
BUCKET_NAME = 'flights-price-sagemaker-projects'

DATA_PREFIX = "data"

In [23]:
def get_file_name(name):
    return f"{name}_pre.csv"

In [24]:
def export_data(data, name, pre):
    # split the data into X and y
    X = train.drop(columns='price')
    y = train.price.copy()
    
    # transformation
    X_pre = pre.transform(X)
    
    # exporting the dataset
    file_name = get_file_name(name)
    (
        y
        .to_frame()
        .join(X_pre)
        .to_csv(file_name, index = False, header = False)
    )

>- Sage Maker requires the data in differnt format.
>- Target feture should be frist and it wont have any header and index

In [32]:
def upload_to_bucket(name):
    file_name = get_file_name(name)
    
    (
        boto3
        .Session()
        .resource("s3")
        .Bucket(BUCKET_NAME)
        .Object(os.path.join(DATA_PREFIX, f"{name}/{name}.csv"))
        .upload_file(file_name)
    )

In [33]:
def export_and_upload(data, name, pre):
    export_data(data, name, pre)
    upload_to_bucket(name)

In [34]:
export_and_upload(train, "train", preprocessor)

In [36]:
export_and_upload(val, "val", preprocessor)

In [38]:
export_and_upload(test, "test", preprocessor)

## 5. Model & Hyperparameter Tuning Setup

In [43]:
session  =sagemaker.Session()

region_name = session.boto_region_name

In [44]:
output_path = f"s3://{BUCKET_NAME}/model/output"

In [46]:
model = Estimator(
    image_uri=sagemaker.image_uris.retrieve('xgboost', region_name, "1.2-1"),
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type='ml.m4.xlarge',
    volume_size=5,# 5gb
    output_path=output_path,
    use_spot_instances=True,
    max_run=500,
    max_wait=800,
    sagemaker_session=session
)

In [47]:
model.set_hyperparameters(
    objective="reg:linear", # mse
    num_round=10, # number of estimators
    eta=0.1, # learning rate
    max_depth=5,
    subsample=0.8, # for eeach tree randomly sample 80% of the rows
    colsample_bytree=0.8, # for each tree random columns were used 
    alpha=0.1 # l2 regularization
)

In [48]:
hyperparameter_ranges = {
    "eta": ContinuousParameter(0.05,0.2),
    "alpha": ContinuousParameter(0,1),
    "max_depth": IntegerParameter(3,5)
}

In [49]:
tuner = HyperparameterTuner(
    estimator=model,
    objective_metric_name="validation:mse",
    hyperparameter_ranges=hyperparameter_ranges,
    strategy="Bayesian", # randomly select few parameter and evaultes based on score and assigns a prob --> On next iter it will think and descide which parameters have to use
    objective_type="Minimize"
)

## 6. Data Channel

>- Channel : the data we are goint to train is in the bucket thats what channel is

In [51]:
def get_data_channel(name):
    bucket_path = f"s3://{BUCKET_NAME}/{DATA_PREFIX}/{name}"
    return TrainingInput(bucket_path, content_type="csv")

In [52]:
train_data_channel = get_data_channel("train")
train_data_channel

<sagemaker.inputs.TrainingInput at 0x7f65df94afe0>

In [53]:
val_data_channel = get_data_channel("val")

In [54]:
data_channels = {
    "train": train_data_channel,
    "validation": val_data_channel
}

## 7. Train and Tune the Model

In [55]:
tuner.fit(data_channels)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


.............................................!


In [56]:
# tuner.best_estimator().deploy()


2024-06-28 11:11:43 Starting - Preparing the instances for training
2024-06-28 11:11:43 Downloading - Downloading the training image
2024-06-28 11:11:43 Training - Training image download completed. Training in progress.
2024-06-28 11:11:43 Uploading - Uploading generated training model
2024-06-28 11:11:43 Completed - Training job completed


<sagemaker.estimator.Estimator at 0x7f65e4bb4df0>

## 8. Model Evaluation

In [60]:
with open('xgboost-model', 'rb') as f:
    best_model = pickle.load(f)
    
best_model 

<xgboost.core.Booster at 0x7f65e4790d30>

In [64]:
def evaluate_model(name):
    
    file_name = get_file_name(name)
    data = pd.read_csv(file_name)
    
    X = xgb.DMatrix(data.iloc[:,1:])
    y = data.iloc[:,0].copy()
    
    pred = best_model.predict(X)
    
    return r2_score(y,pred)

In [72]:
evaluate_model('train')

-0.17174791277356172

In [70]:
evaluate_model('test')

-0.17174791277356172

In [68]:
evaluate_model('val')

-0.17174791277356172