In [1]:
import datetime, warnings, scipy 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.patches import ConnectionPatch
from collections import OrderedDict
from matplotlib.gridspec import GridSpec
from sklearn import metrics, linear_model
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from scipy.optimize import curve_fit
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "last_expr"
pd.options.display.max_columns = 50
%matplotlib inline
warnings.filterwarnings("ignore")

In [2]:
airline = pd.read_csv('1995.csv')
airline.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,1995,1,6,5,657.0,645,952.0,937,UA,482,N7298U,115.0,112.0,83.0,15.0,12.0,ORD,PHL,678.0,7,25,0,,0,,,,,
1,1995,1,7,6,648.0,645,938.0,937,UA,482,N7449U,110.0,112.0,88.0,1.0,3.0,ORD,PHL,678.0,5,17,0,,0,,,,,
2,1995,1,8,7,649.0,645,932.0,937,UA,482,N7453U,103.0,112.0,83.0,-5.0,4.0,ORD,PHL,678.0,3,17,0,,0,,,,,
3,1995,1,9,1,645.0,645,928.0,937,UA,482,N7288U,103.0,112.0,84.0,-9.0,0.0,ORD,PHL,678.0,3,16,0,,0,,,,,
4,1995,1,10,2,645.0,645,931.0,937,UA,482,N7275U,106.0,112.0,82.0,-6.0,0.0,ORD,PHL,678.0,6,18,0,,0,,,,,


In [3]:
airline['Day']=airline['DayofMonth']

In [4]:
airline['Date'] = pd.to_datetime(airline[['Year','Month', 'Day']])

In [5]:
# Function that convert the 'HHMM' string to datetime.time
def format_hour(time):
    if pd.isnull(time):
        return np.nan
    else:
        if time == 2400: time = 0
        time = "{0:04d}".format(int(time))
        hour = datetime.time(int(time[0:2]), int(time[2:4]))
        return hour
    
# Function that combines a date and time to produce a datetime.datetime
def combine_date_hour(x):
    if pd.isnull(x[0]) or pd.isnull(x[1]):
        return np.nan
    else:
        return datetime.datetime.combine(x[0],x[1])


In [6]:
airline['CRSDepTime'] = airline['CRSDepTime'].apply(format_hour)
airline['DepTime'] = airline['DepTime'].apply(format_hour)
airline['CRSArrTime'] = airline['CRSArrTime'].apply(format_hour)
airline['ArrTime'] = airline['ArrTime'].apply(format_hour)

In [7]:
airline.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,Day,Date
0,1995,1,6,5,06:57:00,06:45:00,09:52:00,09:37:00,UA,482,N7298U,115.0,112.0,83.0,15.0,12.0,ORD,PHL,678.0,7,25,0,,0,,,,,,6,1995-01-06
1,1995,1,7,6,06:48:00,06:45:00,09:38:00,09:37:00,UA,482,N7449U,110.0,112.0,88.0,1.0,3.0,ORD,PHL,678.0,5,17,0,,0,,,,,,7,1995-01-07
2,1995,1,8,7,06:49:00,06:45:00,09:32:00,09:37:00,UA,482,N7453U,103.0,112.0,83.0,-5.0,4.0,ORD,PHL,678.0,3,17,0,,0,,,,,,8,1995-01-08
3,1995,1,9,1,06:45:00,06:45:00,09:28:00,09:37:00,UA,482,N7288U,103.0,112.0,84.0,-9.0,0.0,ORD,PHL,678.0,3,16,0,,0,,,,,,9,1995-01-09
4,1995,1,10,2,06:45:00,06:45:00,09:31:00,09:37:00,UA,482,N7275U,106.0,112.0,82.0,-6.0,0.0,ORD,PHL,678.0,6,18,0,,0,,,,,,10,1995-01-10


In [8]:
airline.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5327435 entries, 0 to 5327434
Data columns (total 31 columns):
Year                 int64
Month                int64
DayofMonth           int64
DayOfWeek            int64
DepTime              object
CRSDepTime           object
ArrTime              object
CRSArrTime           object
UniqueCarrier        object
FlightNum            int64
TailNum              object
ActualElapsedTime    float64
CRSElapsedTime       float64
AirTime              float64
ArrDelay             float64
DepDelay             float64
Origin               object
Dest                 object
Distance             float64
TaxiIn               int64
TaxiOut              int64
Cancelled            int64
CancellationCode     float64
Diverted             int64
CarrierDelay         float64
WeatherDelay         float64
NASDelay             float64
SecurityDelay        float64
LateAircraftDelay    float64
Day                  int64
Date                 datetime64[ns]
dtypes: d

In [9]:
airline = airline[['UniqueCarrier','Origin','DayOfWeek','Dest','Distance', 'CRSDepTime','DepTime','DepDelay',
                  'CRSArrTime','ArrTime','ArrDelay','CRSElapsedTime','ActualElapsedTime']]

In [10]:
airline.head()

Unnamed: 0,UniqueCarrier,Origin,DayOfWeek,Dest,Distance,CRSDepTime,DepTime,DepDelay,CRSArrTime,ArrTime,ArrDelay,CRSElapsedTime,ActualElapsedTime
0,UA,ORD,5,PHL,678.0,06:45:00,06:57:00,12.0,09:37:00,09:52:00,15.0,112.0,115.0
1,UA,ORD,6,PHL,678.0,06:45:00,06:48:00,3.0,09:37:00,09:38:00,1.0,112.0,110.0
2,UA,ORD,7,PHL,678.0,06:45:00,06:49:00,4.0,09:37:00,09:32:00,-5.0,112.0,103.0
3,UA,ORD,1,PHL,678.0,06:45:00,06:45:00,0.0,09:37:00,09:28:00,-9.0,112.0,103.0
4,UA,ORD,2,PHL,678.0,06:45:00,06:45:00,0.0,09:37:00,09:31:00,-6.0,112.0,106.0


In [11]:
missing = airline.isnull().sum(axis=0).reset_index()
missing.columns = ['variable', 'missing values']
missing['filling factor (%)']=(airline.shape[0]-missing['missing values'])/airline.shape[0]*100
missing.sort_values('filling factor (%)').reset_index(drop = True)

Unnamed: 0,variable,missing values,filling factor (%)
0,ArrTime,102397,98.077931
1,ArrDelay,102397,98.077931
2,ActualElapsedTime,102397,98.077931
3,DepTime,91905,98.274873
4,DepDelay,91905,98.274873
5,Distance,5987,99.887619
6,CRSElapsedTime,3249,99.939014
7,UniqueCarrier,0,100.0
8,Origin,0,100.0
9,DayOfWeek,0,100.0


In [12]:
airline.dropna(inplace = True)

In [13]:
def create_df(df, carrier):
    airline2 = airline[airline['UniqueCarrier'] == carrier][['CRSDepTime','CRSArrTime',
                                    'Origin','Dest','DepDelay','DayOfWeek']]
    airline2.dropna(how = 'any', inplace = True)
    #airline2['weekday'] = airline2['CRSDepTime'].apply(lambda x:x.weekday())
    
    # delete delays > 1h
    #airline2['DepDelay'] = airline2['DepDelay'].apply(lambda x:x if x < 60 else np.nan)
    #airline2.dropna(how = 'any', inplace = True)
    
    # formating times
    fct = lambda x:x.hour*3600+x.minute*60+x.second
    airline2['time_depart'] = airline2['CRSDepTime'].apply(fct)
    airline2['time_arrive'] = airline2['CRSArrTime'].apply(fct)
    airline3 = airline2.groupby(['time_depart', 'time_arrive', 'Origin'],
                      as_index = False).mean()
    return airline3

In [15]:
carrier = 'AA'
airline3 = create_df(airline, carrier)    
airline3.head()

Unnamed: 0,time_depart,time_arrive,Origin,DepDelay,DayOfWeek
0,0,0,ABQ,6.754072,3.972313
1,0,0,ACK,-1.0,3.0
2,0,0,ALB,7.712707,3.928177
3,0,0,AMA,7.9375,3.994792
4,0,0,ATL,10.729124,3.902919


In [16]:
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(airline3['Origin'])

zipped = zip(integer_encoded, airline3['Origin'])
label_airports = list(set(list(zipped)))
label_airports.sort(key = lambda x:x[0])

onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)

b = np.array(airline3[['time_depart', 'time_arrive']])
X = np.hstack((onehot_encoded, b))
Y = np.array(airline3['DepDelay'])
Y = Y.reshape(len(Y), 1)

In [19]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)
from sklearn.linear_model import Ridge

In [20]:
score_min = 10000
for pol_order in range(1, 3):
    for alpha in range(0, 20, 2):
        ridgereg = Ridge(alpha = alpha/10, normalize=True)
        poly = PolynomialFeatures(degree = pol_order)
        regr = linear_model.LinearRegression()
        X_ = poly.fit_transform(X_train)
        ridgereg.fit(X_, Y_train)
        
        X_ = poly.fit_transform(X_test)
        result = ridgereg.predict(X_)
        score = metrics.mean_squared_error(result, Y_test)
        
        if score < score_min:
            score_min = score
            parameters = [alpha, pol_order]

        print("n={} alpha={} , MSE = {:<0.5}".format(pol_order, alpha/10, score))

n=1 alpha=0.0 , MSE = 2.5023e+19
n=1 alpha=0.2 , MSE = 92.219
n=1 alpha=0.4 , MSE = 92.76
n=1 alpha=0.6 , MSE = 93.377
n=1 alpha=0.8 , MSE = 94.012
n=1 alpha=1.0 , MSE = 94.635
n=1 alpha=1.2 , MSE = 95.233
n=1 alpha=1.4 , MSE = 95.801
n=1 alpha=1.6 , MSE = 96.337
n=1 alpha=1.8 , MSE = 96.839
n=2 alpha=0.0 , MSE = 3.2947e+25
n=2 alpha=0.2 , MSE = 91.096
n=2 alpha=0.4 , MSE = 91.223
n=2 alpha=0.6 , MSE = 91.382
n=2 alpha=0.8 , MSE = 91.559
n=2 alpha=1.0 , MSE = 91.749
n=2 alpha=1.2 , MSE = 91.948
n=2 alpha=1.4 , MSE = 92.156
n=2 alpha=1.6 , MSE = 92.368
n=2 alpha=1.8 , MSE = 92.585


In [21]:
ridgereg = Ridge(alpha = parameters[0], normalize=True)
poly = PolynomialFeatures(degree = parameters[1])
X_ = poly.fit_transform(X)
ridgereg.fit(X_, Y)
result = ridgereg.predict(X_)
score = metrics.mean_squared_error(result, Y)        
print(score)

71.92299767867281
