# This project aims to predict the estimated duration of the delay of each flight.
Accurate prediction of flight delays will help all players in the air travel ecosystem
to set up effective action plans to reduce the impact of delays and avoid loss of time, capital and resources.


# Loading the necessary dependencies/libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from datetime import datetime, timedelta
from sklearn.preprocessing import LabelEncoder,LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA,KernelPCA
from sklearn.preprocessing import Normalizer
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.feature_selection import RFECV
import math
from sklearn.ensemble import RandomForestRegressor

# Loading the datasets

In [2]:
# Read the training and testing CSV exports into DataFrames.
# NOTE: both paths are absolute and specific to the author's machine.
training_data = pd.read_csv(filepath_or_buffer='D:/ML/Project-Flight Delay/Train.csv')
testing_data = pd.read_csv(filepath_or_buffer='D:/ML/Project-Flight Delay/Test.csv')

In [3]:
# view the first 5 training observations
training_data.head()

Unnamed: 0,ID,DATOP,FLTID,DEPSTN,ARRSTN,STD,STA,STATUS,AC,target
0,train_id_0,2016-01-03,TU 0712,CMN,TUN,2016-01-03 10:30:00,2016-01-03 12.55.00,ATA,TU 32AIMN,260.0
1,train_id_1,2016-01-13,TU 0757,MXP,TUN,2016-01-13 15:05:00,2016-01-13 16.55.00,ATA,TU 31BIMO,20.0
2,train_id_2,2016-01-16,TU 0214,TUN,IST,2016-01-16 04:10:00,2016-01-16 06.45.00,ATA,TU 32AIMN,0.0
3,train_id_3,2016-01-17,TU 0480,DJE,NTE,2016-01-17 14:10:00,2016-01-17 17.00.00,ATA,TU 736IOK,0.0
4,train_id_4,2016-01-17,TU 0338,TUN,ALG,2016-01-17 14:30:00,2016-01-17 15.50.00,ATA,TU 320IMU,22.0


In [4]:
training_data['DATOP'].dtype

dtype('O')

In [5]:
# view the first 5 testing observations
testing_data.head()

Unnamed: 0,ID,DATOP,FLTID,DEPSTN,ARRSTN,STD,STA,STATUS,AC
0,test_id_0,2016-05-04,TU 0700,DJE,TUN,2016-05-04 06:40:00,2016-05-04 07.30.00,ATA,TU 32AIMF
1,test_id_1,2016-05-05,TU 0395,TUN,BKO,2016-05-05 15:20:00,2016-05-05 20.05.00,ATA,TU 320IMW
2,test_id_2,2016-05-06,TU 0745,FRA,TUN,2016-05-06 10:00:00,2016-05-06 12.25.00,ATA,TU 32AIMC
3,test_id_3,2016-05-11,TU 0848,BEY,TUN,2016-05-11 09:40:00,2016-05-11 13.10.00,ATA,TU 31BIMO
4,test_id_4,2016-05-11,TU 0635,ORY,MIR,2016-05-11 09:50:00,2016-05-11 12.35.00,ATA,TU 736IOQ


# Feature engineering / data wrangling

In [6]:
# Checking for missing values
training_data.isnull().sum()

ID        0
DATOP     0
FLTID     0
DEPSTN    0
ARRSTN    0
STD       0
STA       0
STATUS    0
AC        0
target    0
dtype: int64

In [7]:
testing_data.isnull().sum()

ID        0
DATOP     0
FLTID     0
DEPSTN    0
ARRSTN    0
STD       0
STA       0
STATUS    0
AC        0
dtype: int64

In [8]:
# Changing the date and time format of the STA
# # training_data['STD'] = pd.to_datetime(training_data['STD']).dt.time
# training_data['STA'] = pd.to_datetime(training_data['STA'], format = '%Y-%m-%d %H.%M.%S')
# # testing_data['STD'] = pd.to_datetime(testing_data['STD']).dt.time
# testing_data['STA'] = pd.to_datetime(testing_data['STA'], format = '%Y-%m-%d %H.%M.%S')

Set ID as the index 

In [9]:
# Promote the ID column to the row index of both frames (modified in place).
for frame in (training_data, testing_data):
    frame.set_index('ID', inplace=True)

In [10]:
training_data.head(3)

Unnamed: 0_level_0,DATOP,FLTID,DEPSTN,ARRSTN,STD,STA,STATUS,AC,target
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
train_id_0,2016-01-03,TU 0712,CMN,TUN,2016-01-03 10:30:00,2016-01-03 12.55.00,ATA,TU 32AIMN,260.0
train_id_1,2016-01-13,TU 0757,MXP,TUN,2016-01-13 15:05:00,2016-01-13 16.55.00,ATA,TU 31BIMO,20.0
train_id_2,2016-01-16,TU 0214,TUN,IST,2016-01-16 04:10:00,2016-01-16 06.45.00,ATA,TU 32AIMN,0.0


In [11]:
testing_data.head(3)

Unnamed: 0_level_0,DATOP,FLTID,DEPSTN,ARRSTN,STD,STA,STATUS,AC
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
test_id_0,2016-05-04,TU 0700,DJE,TUN,2016-05-04 06:40:00,2016-05-04 07.30.00,ATA,TU 32AIMF
test_id_1,2016-05-05,TU 0395,TUN,BKO,2016-05-05 15:20:00,2016-05-05 20.05.00,ATA,TU 320IMW
test_id_2,2016-05-06,TU 0745,FRA,TUN,2016-05-06 10:00:00,2016-05-06 12.25.00,ATA,TU 32AIMC


In [12]:
# # Generating Actual time of Departure and Arrival features
# training_data['ATD'] = pd.to_datetime(training_data['STD']) + pd.to_timedelta(training_data['target'], unit = 'm')
# training_data['ATA'] = pd.to_datetime(training_data['STA']) + pd.to_timedelta(training_data['target'], unit = 'm')
# training_data.head(3)

Generating new features (month, day, time) from the scheduled departure time (STD)

In [13]:
# Derive calendar features from the scheduled departure (STD) of the training set.
# STD is parsed once and the parsed series is reused for all three columns.
departure_ts = pd.to_datetime(training_data['STD'])
training_data['DM'] = departure_ts.dt.month  # departure month
training_data['DD'] = departure_ts.dt.day    # departure day of month
training_data['DT'] = departure_ts.dt.time   # departure time of day
training_data.head(3)

Unnamed: 0_level_0,DATOP,FLTID,DEPSTN,ARRSTN,STD,STA,STATUS,AC,target,DM,DD,DT
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
train_id_0,2016-01-03,TU 0712,CMN,TUN,2016-01-03 10:30:00,2016-01-03 12.55.00,ATA,TU 32AIMN,260.0,1,3,10:30:00
train_id_1,2016-01-13,TU 0757,MXP,TUN,2016-01-13 15:05:00,2016-01-13 16.55.00,ATA,TU 31BIMO,20.0,1,13,15:05:00
train_id_2,2016-01-16,TU 0214,TUN,IST,2016-01-16 04:10:00,2016-01-16 06.45.00,ATA,TU 32AIMN,0.0,1,16,04:10:00


In [14]:
# Derive calendar features from the scheduled departure (STD) of the testing set.
# STD is parsed once and the parsed series is reused for all three columns.
departure_ts = pd.to_datetime(testing_data['STD'])
testing_data['DM'] = departure_ts.dt.month  # departure month
testing_data['DD'] = departure_ts.dt.day    # departure day of month
testing_data['DT'] = departure_ts.dt.time   # departure time of day
testing_data.head(3)

Unnamed: 0_level_0,DATOP,FLTID,DEPSTN,ARRSTN,STD,STA,STATUS,AC,DM,DD,DT
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
test_id_0,2016-05-04,TU 0700,DJE,TUN,2016-05-04 06:40:00,2016-05-04 07.30.00,ATA,TU 32AIMF,5,4,06:40:00
test_id_1,2016-05-05,TU 0395,TUN,BKO,2016-05-05 15:20:00,2016-05-05 20.05.00,ATA,TU 320IMW,5,5,15:20:00
test_id_2,2016-05-06,TU 0745,FRA,TUN,2016-05-06 10:00:00,2016-05-06 12.25.00,ATA,TU 32AIMC,5,6,10:00:00


Generating new features (month, day, time) from the scheduled arrival time (STA)

In [15]:
# STA uses dots as the time separator (e.g. "12.55.00"), so the layout is
# passed explicitly when converting the column to datetimes.
sta_layout = '%Y-%m-%d %H.%M.%S'
training_data['STA'] = pd.to_datetime(training_data['STA'], format=sta_layout)
testing_data['STA'] = pd.to_datetime(testing_data['STA'], format=sta_layout)

In [16]:
# Derive calendar features from the scheduled arrival (STA) of the training set.
# STA is converted once and the result is reused for all three columns.
arrival_ts = pd.to_datetime(training_data['STA'])
training_data['AM'] = arrival_ts.dt.month  # arrival month
training_data['AD'] = arrival_ts.dt.day    # arrival day of month
training_data['AT'] = arrival_ts.dt.time   # arrival time of day
training_data.head(3)

Unnamed: 0_level_0,DATOP,FLTID,DEPSTN,ARRSTN,STD,STA,STATUS,AC,target,DM,DD,DT,AM,AD,AT
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
train_id_0,2016-01-03,TU 0712,CMN,TUN,2016-01-03 10:30:00,2016-01-03 12:55:00,ATA,TU 32AIMN,260.0,1,3,10:30:00,1,3,12:55:00
train_id_1,2016-01-13,TU 0757,MXP,TUN,2016-01-13 15:05:00,2016-01-13 16:55:00,ATA,TU 31BIMO,20.0,1,13,15:05:00,1,13,16:55:00
train_id_2,2016-01-16,TU 0214,TUN,IST,2016-01-16 04:10:00,2016-01-16 06:45:00,ATA,TU 32AIMN,0.0,1,16,04:10:00,1,16,06:45:00


In [17]:
# Derive calendar features from the scheduled arrival (STA) of the testing set.
# STA is converted once and the result is reused for all three columns.
arrival_ts = pd.to_datetime(testing_data['STA'])
testing_data['AM'] = arrival_ts.dt.month  # arrival month
testing_data['AD'] = arrival_ts.dt.day    # arrival day of month
testing_data['AT'] = arrival_ts.dt.time   # arrival time of day
testing_data.head(3)

Unnamed: 0_level_0,DATOP,FLTID,DEPSTN,ARRSTN,STD,STA,STATUS,AC,DM,DD,DT,AM,AD,AT
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
test_id_0,2016-05-04,TU 0700,DJE,TUN,2016-05-04 06:40:00,2016-05-04 07:30:00,ATA,TU 32AIMF,5,4,06:40:00,5,4,07:30:00
test_id_1,2016-05-05,TU 0395,TUN,BKO,2016-05-05 15:20:00,2016-05-05 20:05:00,ATA,TU 320IMW,5,5,15:20:00,5,5,20:05:00
test_id_2,2016-05-06,TU 0745,FRA,TUN,2016-05-06 10:00:00,2016-05-06 12:25:00,ATA,TU 32AIMC,5,6,10:00:00,5,6,12:25:00


Generating a TRIP feature from the departure and arrival stations

In [18]:
# Build a route key by concatenating the departure and arrival station codes
# (e.g. "CMN" + "TUN" -> "CMNTUN") in both frames.
for frame in (training_data, testing_data):
    frame['TRIP'] = frame['DEPSTN'] + frame['ARRSTN']

training_data.head(3)

Unnamed: 0_level_0,DATOP,FLTID,DEPSTN,ARRSTN,STD,STA,STATUS,AC,target,DM,DD,DT,AM,AD,AT,TRIP
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
train_id_0,2016-01-03,TU 0712,CMN,TUN,2016-01-03 10:30:00,2016-01-03 12:55:00,ATA,TU 32AIMN,260.0,1,3,10:30:00,1,3,12:55:00,CMNTUN
train_id_1,2016-01-13,TU 0757,MXP,TUN,2016-01-13 15:05:00,2016-01-13 16:55:00,ATA,TU 31BIMO,20.0,1,13,15:05:00,1,13,16:55:00,MXPTUN
train_id_2,2016-01-16,TU 0214,TUN,IST,2016-01-16 04:10:00,2016-01-16 06:45:00,ATA,TU 32AIMN,0.0,1,16,04:10:00,1,16,06:45:00,TUNIST


In [19]:
testing_data.head(3)

Unnamed: 0_level_0,DATOP,FLTID,DEPSTN,ARRSTN,STD,STA,STATUS,AC,DM,DD,DT,AM,AD,AT,TRIP
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
test_id_0,2016-05-04,TU 0700,DJE,TUN,2016-05-04 06:40:00,2016-05-04 07:30:00,ATA,TU 32AIMF,5,4,06:40:00,5,4,07:30:00,DJETUN
test_id_1,2016-05-05,TU 0395,TUN,BKO,2016-05-05 15:20:00,2016-05-05 20:05:00,ATA,TU 320IMW,5,5,15:20:00,5,5,20:05:00,TUNBKO
test_id_2,2016-05-06,TU 0745,FRA,TUN,2016-05-06 10:00:00,2016-05-06 12:25:00,ATA,TU 32AIMC,5,6,10:00:00,5,6,12:25:00,FRATUN


Generating a travel-time (TT) feature from the scheduled departure and arrival times (currently disabled)

In [20]:
# training_data['TT'] = pd.to_datetime(training_data['STA']) - pd.to_datetime(training_data['STD'])
# testing_data['TT'] = pd.to_datetime(testing_data['STA']) - pd.to_datetime(testing_data['STD'])

In [21]:
# training_data.head(3)

In [22]:
testing_data.head(3)

Unnamed: 0_level_0,DATOP,FLTID,DEPSTN,ARRSTN,STD,STA,STATUS,AC,DM,DD,DT,AM,AD,AT,TRIP
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
test_id_0,2016-05-04,TU 0700,DJE,TUN,2016-05-04 06:40:00,2016-05-04 07:30:00,ATA,TU 32AIMF,5,4,06:40:00,5,4,07:30:00,DJETUN
test_id_1,2016-05-05,TU 0395,TUN,BKO,2016-05-05 15:20:00,2016-05-05 20:05:00,ATA,TU 320IMW,5,5,15:20:00,5,5,20:05:00,TUNBKO
test_id_2,2016-05-06,TU 0745,FRA,TUN,2016-05-06 10:00:00,2016-05-06 12:25:00,ATA,TU 32AIMC,5,6,10:00:00,5,6,12:25:00,FRATUN


Dropping date features from the dataset

In [23]:
# Remove the raw date/time columns now that the month/day/time features have
# been derived from them. The columns are named explicitly through `columns=`
# instead of passing a DataFrame slice as the labels argument, which only
# worked because iterating a DataFrame yields its column names.
date_columns = ['DATOP', 'STD', 'STA']
training_data = training_data.drop(columns=date_columns)
testing_data = testing_data.drop(columns=date_columns)
training_data.head(3)

Unnamed: 0_level_0,FLTID,DEPSTN,ARRSTN,STATUS,AC,target,DM,DD,DT,AM,AD,AT,TRIP
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
train_id_0,TU 0712,CMN,TUN,ATA,TU 32AIMN,260.0,1,3,10:30:00,1,3,12:55:00,CMNTUN
train_id_1,TU 0757,MXP,TUN,ATA,TU 31BIMO,20.0,1,13,15:05:00,1,13,16:55:00,MXPTUN
train_id_2,TU 0214,TUN,IST,ATA,TU 32AIMN,0.0,1,16,04:10:00,1,16,06:45:00,TUNIST


Handling Categorical data

In [24]:
# Flag every object-dtype column of the training frame, then collect the
# names of the flagged columns as a plain list.
categorical_feature_mask = training_data.dtypes == object
categorical_cols = [
    column_name
    for column_name, is_object in zip(training_data.columns, categorical_feature_mask)
    if is_object
]
categorical_cols

['FLTID', 'DEPSTN', 'ARRSTN', 'STATUS', 'AC', 'DT', 'AT', 'TRIP']

In [25]:
# Integer-encode every categorical column.
# The encoder is fitted on the union of the training and testing values of a
# column before either frame is transformed. Fitting it separately on each
# frame (as was done before) assigns codes from each frame's own sorted set of
# values, so the same category could receive different integers in train and
# test, and a model trained on one mapping would read the other incorrectly.
label_encoder = LabelEncoder()
for column_name in categorical_cols:
    # fit() discards any previously learned classes, so one encoder object
    # can be reused column by column.
    label_encoder.fit(pd.concat([training_data[column_name], testing_data[column_name]]))
    training_data[column_name] = label_encoder.transform(training_data[column_name])
    testing_data[column_name] = label_encoder.transform(testing_data[column_name])
training_data.head(3)

Unnamed: 0_level_0,FLTID,DEPSTN,ARRSTN,STATUS,AC,target,DM,DD,DT,AM,AD,AT,TRIP
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
train_id_0,230,31,119,0,46,260.0,1,3,241,1,3,304,119
train_id_1,257,86,119,0,28,20.0,1,13,346,1,13,398,416
train_id_2,86,123,58,0,46,0.0,1,16,77,1,16,149,658


In [26]:
testing_data.shape

(9333, 12)


# Visualization

In [27]:
# sb.catplot(x = 'DM', y = 'target',data = training_data)

In [28]:
# sb.boxplot(x = 'DM', y = 'target',data = training_data)

In [29]:
# sb.barplot(x = 'DM', y = 'target',data = training_data)

In [30]:
# sb.distplot(training_data['target'][:50])

In [31]:
# sb.pairplot(training_data[:50])

# Model Selection

Splitting Training data into Features and the target

In [32]:
# features_train = training_data.drop(training_data[['target']],axis = 1)
# target_train = training_data['target']
# features_train.head()
# Reorder the training frame so the predictors come first and the target last,
# then separate the predictor columns from the target values.
feature_order = ['FLTID','DEPSTN','ARRSTN','STATUS','AC','TRIP','DM','DD','DT','AM','AD','AT']
training_data = training_data[feature_order + ['target']]
features_train = training_data[feature_order]
target_train = training_data['target'].values
features_train.head()

Unnamed: 0_level_0,FLTID,DEPSTN,ARRSTN,STATUS,AC,TRIP,DM,DD,DT,AM,AD,AT
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
train_id_0,230,31,119,0,46,119,1,3,241,1,3,304
train_id_1,257,86,119,0,28,416,1,13,346,1,13,398
train_id_2,86,123,58,0,46,658,1,16,77,1,16,149
train_id_3,164,37,91,0,50,163,1,17,328,1,17,399
train_id_4,136,123,4,0,34,613,1,17,335,1,17,371


In [33]:
# target_train = target_train.transpose()
# Turn the target into a single-column 2-D array (one row per observation).
target_train = np.reshape(target_train, (-1, 1))
target_train

array([[260.],
       [ 20.],
       [  0.],
       ...,
       [  0.],
       [  0.],
       [  0.]])

Feature Scaling using StandardScaler

In [34]:
# Standardise the features and the target.
#
# The training frame was reordered before the split (TRIP was moved next to
# AC), while testing_data still has TRIP as its last column. StandardScaler
# applies its per-column mean/scale by position, so the test frame is first
# put into the same column order as the training features.
feature_columns = training_data.columns.drop('target')

# The features get their own scaler so its statistics are not lost when the
# target is scaled.
feature_scaler = StandardScaler()
features_train = feature_scaler.fit_transform(features_train)
features_test = feature_scaler.transform(testing_data[feature_columns])

# `scaler` ends up fitted on the target, exactly as it did before, so it can be
# used to map scaled predictions back to the original target units.
scaler = StandardScaler()
target_train = scaler.fit_transform(target_train)
# features_test = scaler.fit_transform(testing_data)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  This is separate from the ipykernel package so we can avoid doing imports until


In [35]:
# scaler = Normalizer()
# features_train = scaler.fit_transform(features_train)
# # target_train = scaler.fit_transform(target_train)
# # features_test = scaler.transform(testing_data)

In [36]:
features_train

array([[-0.49064802, -1.51453831,  0.91527015, ..., -1.72577113,
        -1.45803275, -0.06224268],
       [-0.44469282, -0.0568305 ,  0.91527015, ..., -1.72577113,
        -0.33257286,  0.67288389],
       [-0.73574241,  0.92380929, -0.76911262, ..., -1.72577113,
         0.00506511, -1.27441948],
       ...,
       [-0.75446489,  0.92380929,  0.91527015, ...,  1.31174223,
        -1.0078488 , -0.07788367],
       [ 2.09986346,  0.92380929, -1.3489821 , ..., -1.72577113,
         0.79288703,  0.99352421],
       [ 2.08965119,  0.92380929, -1.3489821 , ...,  1.31174223,
        -0.33257286, -1.21185552]])

In [37]:
target_train.shape

(107833, 1)

Dimensionality reduction using feature extraction

In [38]:
# Create a PCA that keeps enough components to retain 99% of the variance
# (whitening of the projected components is also requested).
pca = PCA(n_components= 0.99,whiten=True)
# Fit the PCA on the scaled training features and project them.
features_pca = pca.fit_transform(features_train)
# NOTE(review): this projects features_train a second time, so features_test is
# overwritten with the training projection and the scaled test matrix is
# discarded. The later predict/metric cells compare the predictions against
# target_train and therefore depend on this; switching to the real test matrix
# has to be done together with those cells.
features_test = pca.transform(features_train)
# Show the shape after and before the projection.
print(features_pca.shape)
print(features_train.shape)

(107833, 9)
(107833, 12)


In [39]:
# # Apply kernel PCA with a radial basis function (RBF) kernel
# kpca = KernelPCA(kernel="rbf", gamma=15, n_components=1)
# features_pca = kpca.fit_transform(features_train)
# features_test = kpca.transform(features_train)
# print(features_pca.shape)
# print(features_train.shape)

In [40]:
# pca.explained_variance_ratio_

Selecting Best Models Using Exhaustive Search

1. Using scikit-learn’s GridSearchCV

In [41]:
# # Fitting a Nonlinear Relationship
# # Create degree-2 polynomial features (squares and pairwise interaction terms)
# polynomial = PolynomialFeatures(degree=2, include_bias=False)
# features_pca = polynomial.fit_transform(features_pca)

In [42]:
# # Reducing Variance with Regularization
# # Create ridge regression with an alpha value
# regression = Ridge(alpha=0.5)
# # Fit the linear regression
# model = regression.fit(features_pca, target_train)
# Create a random forest regressor and fit it on the PCA-projected training
# features.
# target_train is a single-column 2-D array at this point; scikit-learn
# expects a 1-D target for a single-output regression and emits a
# DataConversionWarning for a column vector, so a flattened view is passed.
model = RandomForestRegressor()
model.fit(features_pca, target_train.ravel())


  


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [43]:
# # # # Create linear regression
# # # model = LinearRegression()
# # # parameters = [{'fit_intercept':[True,False], 'copy_X':[True,False],'normalize': [True,False]}]
# # # Create grid search
# # # gridsearch = GridSearchCV(estimator = model, param_grid = parameters,scoring = 'r2', n_jobs = -1 )
# gridsearch = GridSearchCV(estimator = model, param_grid = {'max_depth':range(3,7),'n_estimators':(10,50)},cv = 10,scoring = 'neg_mean_squared_error',verbose = 0, n_jobs = -1)

# # Fit grid search
# best_model = gridsearch.fit(features_pca, target_train)
# # best_model = gridsearch.fit(features_high_variance, target_train)
# best_model.best_params_

# Model Evaluation

Cross-Validating Models

In [44]:
# # Create cross-validation
# # model = LinearRegression()
# Shuffled 10-fold splitter with a fixed seed so the folds are repeatable.
kf = KFold(n_splits=10, shuffle=True, random_state=1)
# Cross-validate the model; the scorer is the negated mean absolute error,
# so values closer to zero are better.
cv_results = cross_val_score(
    estimator=model,
    X=features_pca,                      # feature matrix
    y=target_train,                      # target vector
    scoring="neg_mean_absolute_error",   # scoring function
    cv=kf,                               # fold generator
    n_jobs=-1,                           # use all CPU cores
)

In [45]:
# model = LinearRegression()
# model.fit(features_pca,target_train)

In [46]:
# Predict with the fitted model.
# NOTE(review): features_test was last assigned from pca.transform(features_train),
# so these are predictions for the training rows, not for the test set.
y_pred = model.predict(features_test)
y_pred

array([ 1.31529399, -0.2000513 , -0.27688571, ..., -0.41604136,
       -0.41604136, -0.41604136])

In [47]:
cv_results

array([-0.46718428, -0.47699657, -0.46026536, -0.48134491, -0.47496082,
       -0.47065078, -0.46580389, -0.48623248, -0.47503843, -0.47373734])

In [48]:
cv_results.std()

0.0072253582070497795

In [49]:
# # Create cross-validation
# # model = LinearRegression()
# Shuffled 10-fold splitter with a fixed seed so the folds are repeatable.
kf = KFold(n_splits=10, shuffle=True, random_state=1)
# Cross-validate the model again, this time scoring each fold with R^2.
# This replaces the previous contents of cv_results.
cv_results = cross_val_score(
    estimator=model,
    X=features_pca,    # feature matrix
    y=target_train,    # target vector
    scoring="r2",      # scoring function
    cv=kf,             # fold generator
    n_jobs=-1,         # use all CPU cores
)

In [50]:
# cv_results.mean()

In [51]:
# cv_results.std()

In [52]:
# cv_results

In [53]:
# Mean absolute error between the standardised training target and y_pred.
# NOTE(review): y_pred was predicted from the training projection by the model
# that was fitted on that same data, so this is an in-sample error, not a
# held-out estimate. The same applies to the MSE, R^2 and RMSE cells below.
metrics.mean_absolute_error(target_train,y_pred)

0.18438639866630951

In [54]:
metrics.mean_squared_error(target_train,y_pred)

0.1865153984244752

In [55]:
metrics.r2_score(target_train,y_pred)

0.8134846015755248

In [56]:
rmse = math.sqrt(metrics.mean_squared_error(target_train,y_pred))

In [57]:
rmse

0.4318742854401906

In [58]:
target_train

array([[ 1.80361935],
       [-0.24529823],
       [-0.41604136],
       ...,
       [-0.41604136],
       [-0.41604136],
       [-0.41604136]])

In [59]:
y_pred

array([ 1.31529399, -0.2000513 , -0.27688571, ..., -0.41604136,
       -0.41604136, -0.41604136])

In [60]:
model.predict([[1,2,-1,0.1,2,1,3,1,2]])

array([-0.13773006])

In [61]:
# NOTE(review): y_pred is model.predict(features_test), so this scores the
# model against its own predictions and yields 1.0 by construction; it carries
# no information about predictive quality.
model.score(features_test,y_pred)

1.0