# 1. Import Libraries

In [67]:
import numpy as np
import pandas as pd

import warnings

import sklearn
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import(
    OneHotEncoder,
    MinMaxScaler,
    PowerTransformer,
    FunctionTransformer
)

from sklearn.impute import SimpleImputer
from feature_engine.encoding import RareLabelEncoder
from feature_engine.datetime import DatetimeFeatures
from feature_engine.encoding import MeanEncoder

# from sklearn.compose import ColumnTransformer


# 2. Display Settings

In [2]:
pd.set_option("display.max_columns", None)   # display all columns

In [3]:
# Ask scikit-learn transformers to return pandas DataFrames (with column names)
# rather than NumPy arrays; this is a global setting for the whole session.
sklearn.set_config(transform_output = "pandas")

In [5]:
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation and convergence warnings from pandas / scikit-learn.
# Consider narrowing it to a specific category or module.
warnings.filterwarnings("ignore")

# 3. Read the Data

In [8]:
# Location of the training CSV.
# NOTE(review): absolute, machine-specific Windows path; the notebook will not
# run on another machine as-is. Consider a path relative to the project root.
path = r"D:\Flights-sagemaker-project\data\train.csv"

In [9]:
train = pd.read_csv(path)

In [10]:
train.head()

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Multiple Carriers,2019-03-01,Delhi,Cochin,00:20:00,15:30:00,910,1,No Info,23170
1,Air Asia,2019-03-24,Banglore,New Delhi,05:50:00,08:40:00,170,0,No Info,3383
2,Jet Airways,2019-05-27,Delhi,Cochin,10:00:00,12:35:00,1595,1,In-flight meal not included,12898
3,Indigo,2019-03-06,Chennai,Kolkata,13:20:00,15:35:00,135,0,No Info,7295
4,Jet Airways,2019-05-09,Banglore,Delhi,18:55:00,22:00:00,185,0,In-flight meal not included,3702


In [11]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7322 entries, 0 to 7321
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   airline          7322 non-null   object
 1   date_of_journey  7322 non-null   object
 2   source           7322 non-null   object
 3   destination      7322 non-null   object
 4   dep_time         7322 non-null   object
 5   arrival_time     7322 non-null   object
 6   duration         7322 non-null   int64 
 7   total_stops      7322 non-null   int64 
 8   additional_info  7322 non-null   object
 9   price            7322 non-null   int64 
dtypes: int64(3), object(7)
memory usage: 572.2+ KB


In [12]:
# Separate the target (price) from the predictors.
y_train = train["price"].copy()
X_train = train.drop(columns=["price"])

# 4. Transformation Operations

### 4.1 airline

In [26]:
X_train.airline

0       Multiple Carriers
1                Air Asia
2             Jet Airways
3                  Indigo
4             Jet Airways
              ...        
7317            Air India
7318          Jet Airways
7319          Jet Airways
7320               Indigo
7321    Multiple Carriers
Name: airline, Length: 7322, dtype: object

In [27]:
# Airline pipeline:
#   1. fill missing values with the most frequent airline,
#   2. group infrequent airlines (tol=0.1) under the label "other",
#   3. one-hot encode; categories unseen at fit time are ignored at transform time.
airline_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("grouper", RareLabelEncoder(tol=0.1, replace_with="other", n_categories=2)),
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
]
air_transformer = Pipeline(steps=airline_steps)

air_transformer.fit_transform(X_train[["airline"]])

Unnamed: 0,airline_Air India,airline_Indigo,airline_Jet Airways,airline_Multiple Carriers,airline_other
0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,1.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...
7317,1.0,0.0,0.0,0.0,0.0
7318,0.0,0.0,1.0,0.0,0.0
7319,0.0,0.0,1.0,0.0,0.0
7320,0.0,1.0,0.0,0.0,0.0


### 4.2 date_of_journey

In [37]:
X_train.date_of_journey

0       2019-03-01
1       2019-03-24
2       2019-05-27
3       2019-03-06
4       2019-05-09
           ...    
7317    2019-03-03
7318    2019-03-27
7319    2019-04-01
7320    2019-06-12
7321    2019-06-24
Name: date_of_journey, Length: 7322, dtype: object

In [46]:
# Calendar attributes to derive from date_of_journey (passed to DatetimeFeatures).
feature_to_extract = ["month","week","day_of_week","day_of_year"]

In [47]:
# Date-of-journey pipeline: derive the calendar features listed in
# feature_to_extract from the date strings, then min-max scale each one.
doj_steps = [
    (
        "dt",
        DatetimeFeatures(
            features_to_extract=feature_to_extract,
            yearfirst=True,
            format="mixed",
        ),
    ),
    ("scaler", MinMaxScaler()),
]
doj_transformer = Pipeline(steps=doj_steps)
doj_transformer.fit_transform(X_train[["date_of_journey"]])

Unnamed: 0,date_of_journey_month,date_of_journey_week,date_of_journey_day_of_week,date_of_journey_day_of_year
0,0.000000,0.000000,0.666667,0.000000
1,0.000000,0.176471,1.000000,0.194915
2,0.666667,0.764706,0.000000,0.737288
3,0.000000,0.058824,0.333333,0.042373
4,0.666667,0.588235,0.500000,0.584746
...,...,...,...,...
7317,0.000000,0.000000,1.000000,0.016949
7318,0.000000,0.235294,0.333333,0.220339
7319,0.333333,0.294118,0.000000,0.262712
7320,1.000000,0.882353,0.333333,0.872881


### 4.3 source & destination

In [48]:
X_train.source

0          Delhi
1       Banglore
2          Delhi
3        Chennai
4       Banglore
          ...   
7317       Delhi
7318       Delhi
7319     Kolkata
7320       Delhi
7321       Delhi
Name: source, Length: 7322, dtype: object

In [50]:
# Frequency of each destination city. The stray "" argument was removed: it was
# bound to `normalize` and only yielded raw counts because "" is falsy.
X_train.destination.value_counts()

destination
Cochin       3005
Banglore     1998
Delhi         898
New Delhi     669
Hyderabad     489
Kolkata       263
Name: count, dtype: int64

In [52]:
# Work on the two location columns together; they share the same transformations.
location_columns = ["source", "destination"]
location_subset = X_train[location_columns]
location_subset

Unnamed: 0,source,destination
0,Delhi,Cochin
1,Banglore,New Delhi
2,Delhi,Cochin
3,Chennai,Kolkata
4,Banglore,Delhi
...,...,...
7317,Delhi,Cochin
7318,Delhi,Cochin
7319,Kolkata,Banglore
7320,Delhi,Cochin


In [58]:
# Location pipeline (needs the target for the mean encoding):
#   1. group infrequent cities (tol=0.1) under the label "Other",
#   2. replace each city with a target-based mean encoding,
#   3. apply a power transform to the encoded values.
location_steps = [
    ("grouper", RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
    ("encoder", MeanEncoder()),
    ("scaler", PowerTransformer()),
]
location_pipe1 = Pipeline(steps=location_steps)

location_pipe1.fit_transform(location_subset, y_train)

Unnamed: 0,source,destination
0,1.059379,1.056464
1,-0.859973,-0.761119
2,1.059379,1.056464
3,-1.897255,-0.761119
4,-0.859973,-1.842528
...,...,...
7317,1.059379,1.056464
7318,1.059379,1.056464
7319,-0.204766,-0.219487
7320,1.059379,1.056464


In [59]:
# Sorted, de-duplicated list of every city that appears as a source or a destination.
np.union1d(X_train["source"].unique(), X_train["destination"].unique())

array(['Banglore', 'Chennai', 'Cochin', 'Delhi', 'Hyderabad', 'Kolkata',
       'Mumbai', 'New Delhi'], dtype=object)

In [66]:
def is_north(X):
    """Replace every column of ``X`` with a 0/1 "is north" indicator.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose columns hold city names (e.g. ``source``, ``destination``).

    Returns
    -------
    pandas.DataFrame
        One ``<column>_isnorth`` integer column per input column, set to 1
        where the city is in ``north_cities`` and 0 otherwise. The original
        columns are dropped; ``X`` itself is not modified.
    """
    # NOTE(review): Kolkata and Mumbai are not usually considered northern
    # cities; confirm this list matches the intended definition.
    north_cities = ["Delhi","Kolkata","Mumbai","New Delhi"]
    source_columns = X.columns.to_list()

    flags = {}
    for name in source_columns:
        flags[f"{name}_isnorth"] = X.loc[:, name].isin(north_cities).astype(int)

    with_flags = X.assign(**flags)
    return with_flags.drop(columns = source_columns)

# Try is_north on the location columns; FunctionTransformer wraps the plain
# function so it exposes the fit/transform interface used by pipelines.
FunctionTransformer(func = is_north).fit_transform(location_subset)

Unnamed: 0,source_isnorth,destination_isnorth
0,1,0
1,0,1
2,1,0
3,0,1
4,0,1
...,...,...
7317,1,0
7318,1,0
7319,1,0
7320,1,0


In [70]:
# To apply several independent transformations to the same set of columns and
# place their outputs side by side, use FeatureUnion.
location_parts = [
    ("part1", location_pipe1),
    ("part2", FunctionTransformer(func=is_north)),
]
location_transformer = FeatureUnion(transformer_list=location_parts)

location_transformer.fit_transform(location_subset, y_train)

Unnamed: 0,source,destination,source_isnorth,destination_isnorth
0,1.059379,1.056464,1,0
1,-0.859973,-0.761119,0,1
2,1.059379,1.056464,1,0
3,-1.897255,-0.761119,0,1
4,-0.859973,-1.842528,0,1
...,...,...,...,...
7317,1.059379,1.056464,1,0
7318,1.059379,1.056464,1,0
7319,-0.204766,-0.219487,1,0
7320,1.059379,1.056464,1,0
