### The Data

In [1]:
import pandas as pd

In [2]:
# https://www.kaggle.com/nikhilmittal/flight-fare-prediction-mh
df = pd.read_excel('../data/flight_fare_prediction_train.xlsx')

In [3]:
df.sample(5)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
7564,IndiGo,6/05/2019,Mumbai,Hyderabad,BOM → HYD,21:20,22:50,1h 30m,non-stop,No info,2227
10435,Air India,12/05/2019,Kolkata,Banglore,CCU → BBI → BLR,09:10,13:15,4h 5m,1 stop,No info,8996
7876,Air India,6/06/2019,Delhi,Cochin,DEL → RPR → NAG → BOM → COK,05:15,19:15 07 Jun,38h,3 stops,No info,10703
189,Air Asia,12/06/2019,Banglore,Delhi,BLR → DEL,11:10,13:55,2h 45m,non-stop,No info,3383
8281,Air India,27/03/2019,Delhi,Cochin,DEL → BOM → COK,21:00,19:15 28 Mar,22h 15m,1 stop,No info,6692


In [4]:
df.shape

(10683, 11)

### EDA

In [5]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Price,10683.0,9087.064121,4611.359167,1759.0,5277.0,8372.0,12373.0,79512.0


In [6]:
f = ['Airline', 'Source', 'Destination', 'Route', 'Additional_Info']

for fi in f:
    print(f'{fi}:\n')
    print(df[fi].value_counts())
    print('\n')

Airline:

Jet Airways                          3849
IndiGo                               2053
Air India                            1752
Multiple carriers                    1196
SpiceJet                              818
Vistara                               479
Air Asia                              319
GoAir                                 194
Multiple carriers Premium economy      13
Jet Airways Business                    6
Vistara Premium economy                 3
Trujet                                  1
Name: Airline, dtype: int64


Source:

Delhi       4537
Kolkata     2871
Banglore    2197
Mumbai       697
Chennai      381
Name: Source, dtype: int64


Destination:

Cochin       4537
Banglore     2871
Delhi        1265
New Delhi     932
Hyderabad     697
Kolkata       381
Name: Destination, dtype: int64


Route:

DEL → BOM → COK          2376
BLR → DEL                1552
CCU → BOM → BLR           979
CCU → BLR                 724
BOM → HYD                 621
                   

### The Cleaning

In [7]:
df.columns = [c.lower() for c in df.columns]
df['date_of_journey'] = df['date_of_journey'].apply(pd.to_datetime)
df['price'] = df['price'].apply(lambda x: round(x * 0.014)) # to USD
df['total_stops'] = df['total_stops'].apply(
    lambda x: pd.to_numeric(str(x).split(' ')[0], errors='coerce')
)
df['total_stops'] = df['total_stops'].fillna(0)

In [8]:
df[['price']].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
price,10683.0,127.208462,64.56394,25.0,74.0,117.0,173.0,1113.0


### Select + Split

In [9]:
y = df['price']
X = df[['date_of_journey', 'source', 'destination', 'total_stops']]

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

### Dealing with Dates

In [12]:
X_train['date_of_journey'].head()

752    2019-05-27
7431   2019-06-05
5478   2019-12-06
3445   2019-06-06
9380   2019-06-18
Name: date_of_journey, dtype: datetime64[ns]

In [13]:
X_train['date_of_journey'].dt.month.head()

752      5
7431     6
5478    12
3445     6
9380     6
Name: date_of_journey, dtype: int64

In [14]:
X_train['date_of_journey'].dt.dayofweek.head()

752     0
7431    2
5478    4
3445    3
9380    1
Name: date_of_journey, dtype: int64

In [15]:
from sklearn.base import TransformerMixin

class DateEncoder(TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        month = X.dt.month
        day_of_week = X.dt.dayofweek
        return pd.concat([month, day_of_week], axis=1)

In [16]:
DateEncoder().fit_transform(X_train['date_of_journey']).head()

Unnamed: 0,date_of_journey,date_of_journey.1
752,5,0
7431,6,2
5478,12,4
3445,6,3
9380,6,1


In [17]:
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelBinarizer

In [18]:
X_train.head(5)

Unnamed: 0,date_of_journey,source,destination,total_stops
752,2019-05-27,Delhi,Cochin,1.0
7431,2019-06-05,Chennai,Kolkata,0.0
5478,2019-12-06,Kolkata,Banglore,1.0
3445,2019-06-06,Delhi,Cochin,1.0
9380,2019-06-18,Banglore,Delhi,0.0


In [19]:
mapper = DataFrameMapper([
    ('date_of_journey', DateEncoder(), {'input_df': True}),
    ('source', LabelBinarizer()), 
    ('destination', LabelBinarizer()),
    ('total_stops', None)
], df_out=True)

In [20]:
Z_train = mapper.fit_transform(X_train)

In [21]:
Z_test = mapper.transform(X_test)

### The Model

In [22]:
from sklearn.linear_model import LinearRegression

In [23]:
model = LinearRegression()

In [24]:
model.fit(Z_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [25]:
model.score(Z_train, y_train)

0.4086963860270111

In [26]:
model.score(Z_test, y_test)

0.419190048886199

In [27]:
from sklearn.metrics import mean_squared_error

In [28]:
mean_squared_error(y_test, model.predict(Z_test))**(1/2)

48.22128370211856

In [29]:
from sklearn.dummy import DummyRegressor

dummy = DummyRegressor()
dummy.fit(Z_train, y_train)
print(dummy.score(Z_train, y_train))
print(dummy.score(Z_test, y_test))

mean_squared_error(y_test, dummy.predict(Z_test))**(1/2)

0.0
-3.1184373658454945e-05


63.27446980630515

### Pipeline

In [30]:
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(mapper, model)

In [31]:
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.419190048886199

#### Pickle 

In [32]:
import pickle

In [33]:
with open('pipe.pkl', 'wb') as f:
    pickle.dump(pipe, f)

del pipe

In [34]:
with open('pipe.pkl', 'rb') as f:
    pipe = pickle.load(f)

In [35]:
X_train.sample().to_dict(orient='list')

{'date_of_journey': [Timestamp('2019-05-15 00:00:00')],
 'source': ['Delhi'],
 'destination': ['Cochin'],
 'total_stops': [1.0]}

In [36]:
new = pd.DataFrame({
    'date_of_journey': [pd.Timestamp('2019-06-15 00:00:00')],
    'source': ['Kolkata'],
    'destination': ['Banglore'],
    'total_stops': [0.0]
})

In [37]:
pipe.predict(new)[0]

82.4286701577503