### The Data

In [1]:
import pandas as pd

In [2]:
# https://www.kaggle.com/nikhilmittal/flight-fare-prediction-mh
df = pd.read_excel('data/train.xlsx')

In [3]:
df.sample(5)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
4223,Jet Airways,1/04/2019,Kolkata,Banglore,CCU → DEL → BLR,20:25,22:05 02 Apr,25h 40m,1 stop,No info,12121
6843,Air India,15/03/2019,Delhi,Cochin,DEL → COK,14:10,17:10,3h,non-stop,No info,5674
10093,IndiGo,15/04/2019,Banglore,Delhi,BLR → DEL,04:00,06:50,2h 50m,non-stop,No info,4423
10190,Jet Airways,1/04/2019,Banglore,Delhi,BLR → DEL,19:50,22:50,3h,non-stop,In-flight meal not included,4544
2250,Jet Airways,24/03/2019,Kolkata,Banglore,CCU → BOM → BLR,19:45,19:40 25 Mar,23h 55m,1 stop,No info,13759


In [4]:
df.shape

(10683, 11)

### EDA

In [5]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Price,10683.0,9087.064121,4611.359167,1759.0,5277.0,8372.0,12373.0,79512.0


In [6]:
f = ['Airline', 'Source', 'Destination', 'Route', 'Additional_Info']

for fi in f:
    print(f'{fi}:\n')
    print(df[fi].value_counts())
    print('\n')

Airline:

Jet Airways                          3849
IndiGo                               2053
Air India                            1752
Multiple carriers                    1196
SpiceJet                              818
Vistara                               479
Air Asia                              319
GoAir                                 194
Multiple carriers Premium economy      13
Jet Airways Business                    6
Vistara Premium economy                 3
Trujet                                  1
Name: Airline, dtype: int64


Source:

Delhi       4537
Kolkata     2871
Banglore    2197
Mumbai       697
Chennai      381
Name: Source, dtype: int64


Destination:

Cochin       4537
Banglore     2871
Delhi        1265
New Delhi     932
Hyderabad     697
Kolkata       381
Name: Destination, dtype: int64


Route:

DEL → BOM → COK                2376
BLR → DEL                      1552
CCU → BOM → BLR                 979
CCU → BLR                       724
BOM → HYD                

### The Cleaning

In [7]:
# lower case everything
df.columns = [c.lower() for c in df.columns]
# date to date
df['date_of_journey'] = df['date_of_journey'].apply(pd.to_datetime)
# price to USD
df['price'] = df['price'].apply(lambda x: round(x * 0.014))
# stops to number
df['total_stops'] = df['total_stops'].apply(
    lambda x: pd.to_numeric(str(x).split(' ')[0], errors='coerce')
)
df['total_stops'] = df['total_stops'].fillna(0)
# rename columns 
df = df.rename(columns={
    'date_of_journey': 'date', 
    'total_stops': 'stops',
    'source': 'origin'
})

In [8]:
df[['price']].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
price,10683.0,127.208462,64.56394,25.0,74.0,117.0,173.0,1113.0


### Select + Split

In [9]:
y = df['price']
X = df[['date', 'origin', 'destination', 'stops']]

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

### Dealing with Dates

In [12]:
X_train['date'].head()

752    2019-05-27
7431   2019-06-05
5478   2019-12-06
3445   2019-06-06
9380   2019-06-18
Name: date, dtype: datetime64[ns]

In [13]:
X_train['date'].dt.month.head()

752      5
7431     6
5478    12
3445     6
9380     6
Name: date, dtype: int64

In [14]:
X_train['date'].dt.dayofweek.head()

752     0
7431    2
5478    4
3445    3
9380    1
Name: date, dtype: int64

In [15]:
from sklearn.base import TransformerMixin

class DateEncoder(TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        month = X.dt.month
        day_of_week = X.dt.dayofweek
        return pd.concat([month, day_of_week], axis=1)

In [16]:
DateEncoder().fit_transform(X_train['date']).head()

Unnamed: 0,date,date.1
752,5,0
7431,6,2
5478,12,4
3445,6,3
9380,6,1


In [17]:
from sklearn.preprocessing import LabelBinarizer
from sklearn_pandas import DataFrameMapper

In [18]:
X_train.head(5)

Unnamed: 0,date,origin,destination,stops
752,2019-05-27,Delhi,Cochin,1.0
7431,2019-06-05,Chennai,Kolkata,0.0
5478,2019-12-06,Kolkata,Banglore,1.0
3445,2019-06-06,Delhi,Cochin,1.0
9380,2019-06-18,Banglore,Delhi,0.0


In [19]:
mapper = DataFrameMapper([
    ('date', DateEncoder(), {'input_df': True}),
    ('origin', LabelBinarizer()), 
    ('destination', LabelBinarizer()),
    ('stops', None)
], df_out=True)

In [20]:
Z_train = mapper.fit_transform(X_train)

In [21]:
Z_test = mapper.transform(X_test)

### The Model

In [22]:
from sklearn.linear_model import LinearRegression

In [23]:
model = LinearRegression()

In [24]:
model.fit(Z_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [25]:
model.score(Z_train, y_train)

0.4086963860270111

In [26]:
model.score(Z_test, y_test)

0.419190048886199

In [27]:
from sklearn.metrics import mean_squared_error

In [28]:
mean_squared_error(y_test, model.predict(Z_test))**(1/2)

48.22128370211856

In [29]:
from sklearn.dummy import DummyRegressor

dummy = DummyRegressor()
dummy.fit(Z_train, y_train)
print(dummy.score(Z_train, y_train))
print(dummy.score(Z_test, y_test))

mean_squared_error(y_test, dummy.predict(Z_test))**(1/2)

0.0
-3.1184373658454945e-05


63.27446980630515

### Pipeline

In [30]:
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(mapper, model)

In [31]:
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.419190048886199

In [32]:
X_train.sample().to_dict(orient='list')

{'date': [Timestamp('2019-06-03 00:00:00')],
 'origin': ['Chennai'],
 'destination': ['Kolkata'],
 'stops': [0.0]}

In [33]:
new = pd.DataFrame({
    'date': [pd.Timestamp('2019-06-15 00:00:00')],
    'origin': ['Kolkata'],
    'destination': ['Banglore'],
    'stops': [0.0]
})

In [34]:
pipe.predict(new)

array([82.42867016])

#### Pickle 

In [35]:
import pickle

In [36]:
with open('pipe.pkl', 'wb') as f:
    pickle.dump(pipe, f)

del pipe

In [37]:
with open('pipe.pkl', 'rb') as f:
    pipe = pickle.load(f)

In [38]:
pipe.predict(new)[0]

82.4286701577503