### 1. Import Libraries

In [1]:
!pip install feature-engine

Collecting feature-engine
  Downloading feature_engine-1.8.3-py2.py3-none-any.whl.metadata (9.9 kB)
Downloading feature_engine-1.8.3-py2.py3-none-any.whl (378 kB)
Installing collected packages: feature-engine
Successfully installed feature-engine-1.8.3


In [2]:
import numpy as np
import boto3
import pandas as pd

import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
	OneHotEncoder,
	OrdinalEncoder,
	StandardScaler,
	MinMaxScaler,
	PowerTransformer,
	FunctionTransformer
)

from feature_engine.outliers import Winsorizer
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import SelectBySingleFeaturePerformance
from feature_engine.encoding import (
	RareLabelEncoder,
	MeanEncoder,
	CountFrequencyEncoder
)

import matplotlib.pyplot as plt

import warnings

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [3]:
import os
import pickle

In [4]:
print("hellow abhinay bir")

hellow abhinay bir


### 2. Display Settings

In [5]:
pd.set_option("display.max_columns", None)

In [6]:
sklearn.set_config(transform_output="pandas")

In [7]:
warnings.filterwarnings("ignore")

### 3. Read the Data

In [8]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
val= pd.read_csv("val.csv")


In [9]:
train.head()

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Air India,2019-04-09,Delhi,Cochin,05:55:00,19:15:00,2240,2,No info,7711
1,Air India,2019-06-03,Delhi,Cochin,07:10:00,07:40:00,1470,2,No info,12698
2,Air India,2019-05-15,Kolkata,Banglore,16:45:00,21:05:00,1700,1,No info,7452
3,Vistara,2019-06-15,Chennai,Kolkata,07:05:00,09:20:00,135,0,No info,3687
4,Jet Airways,2019-04-09,Delhi,Cochin,23:05:00,19:00:00,1195,2,No info,9483


In [10]:
train.dtypes

airline            object
date_of_journey    object
source             object
destination        object
dep_time           object
arrival_time       object
duration            int64
total_stops         int64
additional_info    object
price               int64
dtype: object

### 4. Preprocessing Operations

In [11]:
# airline
airline_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("grouper", RareLabelEncoder(tol=0.1, n_categories=2, replace_with="Other")),
        ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
    ])

# date_of_journey
feature_to_extract = ["month", "week", "day_of_week", "day_of_year"]
doj_transformer = Pipeline(steps=[
	("dt", DatetimeFeatures(features_to_extract=feature_to_extract, yearfirst=True, format="mixed")),
	("scaler", MinMaxScaler())
])

# source & destination
location_pipe1 = Pipeline(steps = [
  ("grouper", RareLabelEncoder(tol=0.05, n_categories=2, replace_with="Other")),  
  ("encoder", MeanEncoder()),
  ("scaler", PowerTransformer())
])
def is_north(df): # df is a df with 2 cols 'source' and 'destination'
  cols = df.columns.to_list()
  north_cities = ['Delhi']
  return(         
    df           
    .assign(**{  
        f"{col}_is_north": df.loc[:,col].isin(north_cities).astype(int)
        for col in cols   
    }) #assign is used to create or update  a colm
    .drop(columns=['source','destination'])
  )

location_transformer = FeatureUnion(transformer_list=[
    ("location_pipe1", location_pipe1),
    ("is_north", FunctionTransformer(func=is_north))
])

#dep_time & arrival_time
time_pipe1 = Pipeline(steps=[
    ("dt", DatetimeFeatures(features_to_extract=["hour", "minute"], yearfirst=True, format="mixed")),
    ("scaler", MinMaxScaler())
])
def part_of_the_day(df, mor=7, noon=12, eve=16, night =20):
  cols = df.columns.to_list()
  df_temp = df.assign(**{
    col: pd.to_datetime(df.loc[:, col]).dt.hour
    for col in cols
  })
  return (
    df_temp
    .assign(**{
      f"{col}_part_of_day":np.select(
        [df_temp.loc[:, col].between(mor, noon, inclusive="left"),
         df_temp.loc[:, col].between(noon, eve, inclusive="left"),
         df_temp.loc[:, col].between(eve, night, inclusive="left"),
        ],
        ["morning", "afternoon", "evening" ],
        default="night"
      )
      for col in cols
    })
    .drop(columns=cols)
  )
time_pipe2 = Pipeline(steps=[
  ("part", FunctionTransformer(func=part_of_the_day)),
  ("encoder", CountFrequencyEncoder()),
  ("scaler", MinMaxScaler())
])
time_transformer = FeatureUnion(transformer_list=[
    ("time_pipe1", time_pipe1),
    ("time_pipe2", time_pipe2)
])

#duration
class RBFPercentileSimilarity(BaseEstimator, TransformerMixin):
	def __init__(self, variables=None, percentiles=[0.25, 0.5, 0.75], gamma=0.1):
		self.variables = variables
		self.percentiles = percentiles
		self.gamma = gamma


	def fit(self, X, y=None):
		if not self.variables:
			self.variables = X.select_dtypes(include="number").columns.to_list()

		self.reference_values_ = {
			col: (
				X
				.loc[:, col]
				.quantile(self.percentiles)
				.values
				.reshape(-1, 1)
			)
			for col in self.variables
		}

		return self


	def transform(self, X):
		objects = []
		for col in self.variables:
			columns = [f"{col}_rbf_{int(percentile * 100)}" for percentile in self.percentiles]
			obj = pd.DataFrame(
				data=rbf_kernel(X.loc[:, [col]], Y=self.reference_values_[col], gamma=self.gamma),
				columns=columns
			)
			objects.append(obj)
		return pd.concat(objects, axis=1)
def is_over(X, value=1000):
	return (
		X
		.assign(**{
			f"duration_over_{value}": X.duration.ge(value).astype(int)
		})
		.drop(columns="duration")
	)
def duration_category(X, short=180, med=400):
	return (
		X
		.assign(duration_cat=np.select([X.duration.lt(short),
									    X.duration.between(short, med, inclusive="left")],
									   ["short", "medium"],
									   default="long"))
		.drop(columns="duration")
	)
duration_pipe1 = Pipeline(steps=[
	("rbf", RBFPercentileSimilarity()),
	("scaler", PowerTransformer())
])
duration_pipe2 = Pipeline(steps=[
	("cat", FunctionTransformer(func=duration_category)),
	("encoder", OrdinalEncoder(categories=[["short", "medium", "long"]]))
])
duration_union = FeatureUnion(transformer_list=[
	("part1", duration_pipe1),
	("part2", duration_pipe2),
	("part3", FunctionTransformer(func=is_over)),
	("part4", StandardScaler())
])
duration_transformer = Pipeline(steps=[
	("outliers", Winsorizer(capping_method="iqr", fold=1.5)),
	("imputer", SimpleImputer(strategy="median")),
	("union", duration_union)
])

#total_stops
def is_direct(X):
	return X.assign(is_direct_flight=X.total_stops.eq(0).astype(int))


total_stops_transformer = Pipeline(steps=[
	("imputer", SimpleImputer(strategy="most_frequent")),
	("", FunctionTransformer(func=is_direct))
])

#additional_info
info_pipe1 = Pipeline(steps=[
	("group", RareLabelEncoder(tol=0.1, n_categories=2, replace_with="Other")),
	("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])
def have_info(X):
	return X.assign(additional_info=X.additional_info.ne("No Info").astype(int))
info_union = FeatureUnion(transformer_list=[
	("part1", info_pipe1),
	("part2", FunctionTransformer(func=have_info))
])
info_transformer = Pipeline(steps=[
	("imputer", SimpleImputer(strategy="constant", fill_value="unknown")),
	("union", info_union)
])

# Column Transformer
column_transformer = ColumnTransformer(transformers=[
        ("airline", airline_transformer, ["airline"]),
        ("date_of_journey", doj_transformer, ["date_of_journey"]),
        ("location", location_transformer, ["source", "destination"]),
        ("time", time_transformer, ["dep_time", "arrival_time"]),
        ("dur", duration_transformer, ["duration"]),
        ("stops", total_stops_transformer, ["total_stops"]),
	    ("info", info_transformer, ["additional_info"])
        
    ], remainder="passthrough")

# Selector
estimator = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)

selector = SelectBySingleFeaturePerformance(
	estimator=estimator,
	scoring="r2",
	threshold=0.1
) 

# Preprocessor
preprocessor = Pipeline(steps=[
	("ct", column_transformer),
	("selector", selector)
])

In [12]:
preprocessor.fit(
    train.drop(columns='price'), 
    train.price.copy()
)

0,1,2
,steps,"[('ct', ...), ('selector', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('airline', ...), ('date_of_journey', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,tol,0.1
,n_categories,2
,max_n_categories,
,replace_with,'Other'
,variables,
,missing_values,'raise'
,ignore_format,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,variables,
,features_to_extract,"['month', 'week', ...]"
,drop_original,True
,missing_values,'raise'
,dayfirst,False
,yearfirst,True
,utc,
,format,'mixed'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,tol,0.05
,n_categories,2
,max_n_categories,
,replace_with,'Other'
,variables,
,missing_values,'raise'
,ignore_format,False

0,1,2
,variables,
,missing_values,'raise'
,ignore_format,False
,unseen,'ignore'
,smoothing,0.0

0,1,2
,method,'yeo-johnson'
,standardize,True
,copy,True

0,1,2
,func,<function is_...x7f540a8c3130>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,variables,
,features_to_extract,"['hour', 'minute']"
,drop_original,True
,missing_values,'raise'
,dayfirst,False
,yearfirst,True
,utc,
,format,'mixed'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,func,<function par...x7f540a8c31c0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,encoding_method,'count'
,variables,
,missing_values,'raise'
,ignore_format,False
,unseen,'ignore'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,capping_method,'iqr'
,tail,'right'
,fold,1.5
,add_indicators,False
,variables,
,missing_values,'raise'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformer_list,"[('part1', ...), ('part2', ...), ...]"
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True

0,1,2
,variables,
,percentiles,"[0.25, 0.5, ...]"
,gamma,0.1

0,1,2
,method,'yeo-johnson'
,standardize,True
,copy,True

0,1,2
,func,<function dur...x7f540a8c3880>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,categories,"[['short', 'medium', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,func,<function is_...x7f540a8c3250>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function is_...x7f540a8c3910>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'unknown'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformer_list,"[('part1', ...), ('part2', ...)]"
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True

0,1,2
,tol,0.1
,n_categories,2
,max_n_categories,
,replace_with,'Other'
,variables,
,missing_values,'raise'
,ignore_format,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,func,<function hav...x7f540a8c39a0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,estimator,RandomForestR...ndom_state=42)
,scoring,'r2'
,cv,3
,groups,
,threshold,0.1
,variables,
,confirm_variables,False

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,3
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [13]:
print(type(preprocessor))

<class 'sklearn.pipeline.Pipeline'>


In [14]:
preprocessor.transform(train.drop(columns='price')).columns

Index(['airline__airline_Indigo', 'airline__airline_Jet Airways',
       'date_of_journey__date_of_journey_week',
       'date_of_journey__date_of_journey_day_of_year', 'dur__duration_rbf_25',
       'dur__duration_cat', 'dur__duration', 'stops__total_stops',
       'stops__is_direct_flight'],
      dtype='object')

In [15]:
len(preprocessor.transform(train.drop(columns='price')).columns)

9

## 4. Preprocess Data and Upload to Bucket

In [16]:
BUCKET_NAME = "first-bucket-flights"
DATA_PREFIX = "data"

In [17]:
def get_file_name(name):
    return f"pre--{name}.csv"

In [18]:
get_file_name("abhinay")

'pre--abhinay.csv'

In [19]:
def export_data(data, name, pre):
    
    # split data into train and test
    X = data.drop(columns='price')
    y = data.price.copy()

    # transformation
    X_pre = pre.transform(X)

    # exporting
    file_name = get_file_name(name)
    (
        y
        .to_frame()
        .join(X_pre)
        .to_csv(file_name, index=False, header=False)# sagemaker needs data in this format
    )
    


In [20]:
def upload_to_bucket(name):
  file_name = get_file_name(name)
  (   
     boto3
      .Session()
      .resource("s3")
      .Bucket(BUCKET_NAME)
      .Object(os.path.join(DATA_PREFIX, f"{name}/{name}.csv"))
      .upload_file(file_name)
      
  )

In [21]:
def export_and_upload_bucket(data, name, pre):
    export_data(data, name, pre)
    upload_to_bucket(name)

In [23]:
export_and_upload_bucket(train, "train", preprocessor)   

In [27]:
pd.read_csv('pre--train.csv')

Unnamed: 0,7711,0.0,0.0.1,0.3529411764705882,0.3305084745762712,-0.376491486586609,2.0,2.8570438782330654,2,0
0,12698,0.0,0.0,0.823529,0.796610,-0.376491,2.0,1.742285,2,0
1,7452,0.0,0.0,0.647059,0.635593,-0.376491,2.0,2.216541,1,0
2,3687,0.0,0.0,0.882353,0.898305,-0.376491,0.0,-1.010462,0,1
3,9483,0.0,1.0,0.352941,0.330508,-0.376491,2.0,1.175239,2,0
4,3687,0.0,0.0,1.000000,1.000000,-0.376491,0.0,-1.010462,0,1
...,...,...,...,...,...,...,...,...,...,...
634,15077,0.0,0.0,0.058824,0.042373,-0.376491,2.0,-0.330008,1,0
635,6258,1.0,0.0,0.411765,0.432203,-0.376491,1.0,-0.577446,1,0
636,5224,1.0,0.0,0.823529,0.847458,-0.376491,0.0,-0.969222,0,1
637,7640,1.0,0.0,0.647059,0.661017,-0.376491,2.0,-0.278458,1,0


In [28]:
export_and_upload_bucket(test, "test", preprocessor)   

In [29]:
export_and_upload_bucket(val, "val", preprocessor)   

## 5. Model and Hyper-parameter tuning setup

In [30]:
import sagemaker
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import (
    IntegerParameter,
    ContinuousParameter,
    HyperparameterTuner
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [31]:
session = sagemaker.Session()
region_name = session.boto_region_name

In [32]:
output_path = f"s3://{BUCKET_NAME}/model/output"


In [45]:
model = Estimator(
    image_uri=sagemaker.image_uris.retrieve("xgboost", region_name, "1.2-1"),
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type="ml.t3.medium",
    volume_size=5,
    output_path=output_path,
    use_spot_instances=True,
    max_run=180,
    max_wait=600,
    sagemaker_session=session
)

In [46]:
model.set_hyperparameters(
    objective="reg:linear",
    num_round=10,
    eta=0.1,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    alpha=0.1
)

In [47]:
hyperparameter_ranges = {
    "eta": ContinuousParameter(0.05, 0.2),
    "alpha": ContinuousParameter(0, 1),
    "max_depth": IntegerParameter(3, 5)
}

In [48]:
tuner = HyperparameterTuner(
    estimator=model,
    objective_metric_name="validation:rmse",
    hyperparameter_ranges=hyperparameter_ranges,
    strategy="Bayesian",
    objective_type="Minimize"
)

## 6. Data Channels

In [37]:
def get_data_channel(name):
    bucket_path = f"s3://{BUCKET_NAME}/{DATA_PREFIX}/{name}"
    return TrainingInput(bucket_path, content_type="csv")

In [38]:
train_data_channel = get_data_channel("train")
train_data_channel

<sagemaker.inputs.TrainingInput at 0x7f53f93953f0>

In [39]:
val_data_channel = get_data_channel("val")


In [40]:
val_data_channel

<sagemaker.inputs.TrainingInput at 0x7f53f93954b0>

In [41]:
data_channels = {
    "train": train_data_channel,
    "validation": val_data_channel
}

## 7. Train and Tune the Model

In [49]:
tuner.fit(data_channels)


No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


In [50]:
region_name

'eu-north-1'