In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv(r"C:\Users\nstow\Desktop\Python\1992.csv")

In [3]:
## Look at the first few rows
df.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,1992,1,2,4,748.0,750,851.0,846,US,53,...,,,0,,0,,,,,
1,1992,1,3,5,750.0,750,843.0,846,US,53,...,,,0,,0,,,,,
2,1992,1,4,6,747.0,750,843.0,846,US,53,...,,,0,,0,,,,,
3,1992,1,5,7,750.0,750,850.0,846,US,53,...,,,0,,0,,,,,
4,1992,1,6,1,752.0,750,838.0,846,US,53,...,,,0,,0,,,,,


In [4]:
## examine the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5092157 entries, 0 to 5092156
Data columns (total 29 columns):
Year                 int64
Month                int64
DayofMonth           int64
DayOfWeek            int64
DepTime              float64
CRSDepTime           int64
ArrTime              float64
CRSArrTime           int64
UniqueCarrier        object
FlightNum            int64
TailNum              float64
ActualElapsedTime    float64
CRSElapsedTime       int64
AirTime              float64
ArrDelay             float64
DepDelay             float64
Origin               object
Dest                 object
Distance             float64
TaxiIn               float64
TaxiOut              float64
Cancelled            int64
CancellationCode     float64
Diverted             int64
CarrierDelay         float64
WeatherDelay         float64
NASDelay             float64
SecurityDelay        float64
LateAircraftDelay    float64
dtypes: float64(16), int64(10), object(3)
memory usage: 1.1+ GB


In [5]:
## check for empty columns
df.count()

Year                 5092157
Month                5092157
DayofMonth           5092157
DayOfWeek            5092157
DepTime              5039321
CRSDepTime           5092157
ArrTime              5027937
CRSArrTime           5092157
UniqueCarrier        5092157
FlightNum            5092157
TailNum                    0
ActualElapsedTime    5027937
CRSElapsedTime       5092157
AirTime                    0
ArrDelay             5027937
DepDelay             5039321
Origin               5092157
Dest                 5092157
Distance             5084770
TaxiIn                     0
TaxiOut                    0
Cancelled            5092157
CancellationCode           0
Diverted             5092157
CarrierDelay               0
WeatherDelay               0
NASDelay                   0
SecurityDelay              0
LateAircraftDelay          0
dtype: int64

In [6]:
## Remove empty columns and double check that all were removed
df = df.drop(columns = ['TailNum', 'AirTime', 'TaxiIn', 'TaxiOut', 'CancellationCode', 'CarrierDelay',\
                       'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay'])
df.count()

Year                 5092157
Month                5092157
DayofMonth           5092157
DayOfWeek            5092157
DepTime              5039321
CRSDepTime           5092157
ArrTime              5027937
CRSArrTime           5092157
UniqueCarrier        5092157
FlightNum            5092157
ActualElapsedTime    5027937
CRSElapsedTime       5092157
ArrDelay             5027937
DepDelay             5039321
Origin               5092157
Dest                 5092157
Distance             5084770
Cancelled            5092157
Diverted             5092157
dtype: int64

In [7]:
## Drop rows containing non-number and verifty that all columns have the same number of rows
df = df.dropna()
df.count()

Year                 5020651
Month                5020651
DayofMonth           5020651
DayOfWeek            5020651
DepTime              5020651
CRSDepTime           5020651
ArrTime              5020651
CRSArrTime           5020651
UniqueCarrier        5020651
FlightNum            5020651
ActualElapsedTime    5020651
CRSElapsedTime       5020651
ArrDelay             5020651
DepDelay             5020651
Origin               5020651
Dest                 5020651
Distance             5020651
Cancelled            5020651
Diverted             5020651
dtype: int64

In [8]:
## Create new encoded column for UniqueCarrier that has only numeric values for use with regression
y = df['UniqueCarrier']
le = preprocessing.LabelEncoder()
le.fit(y)

list(le.classes_)

df['UniqueCarrierEncoded'] = le.transform(y)

In [9]:
## Create new encoded column for Origin that has only numeric values for use with regression
y = df['Origin']
le = preprocessing.LabelEncoder()
le.fit(y)

list(le.classes_)

df['OriginEncoded'] = le.transform(y)

In [10]:
## Create new encoded column for Dest that has only numeric values for use with regression
y = df['Dest']
le = preprocessing.LabelEncoder()
le.fit(y)

list(le.classes_)

df['DestEncoded'] = le.transform(y)

In [11]:
## Create new column with only the values of the late flights or 0 for non-late flights
df['Late'] = np.where(df['ArrDelay'] > 29, df['ArrDelay'], 0)

In [12]:
## ensure new columns were created and data types are correct
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5020651 entries, 0 to 5092156
Data columns (total 23 columns):
Year                    int64
Month                   int64
DayofMonth              int64
DayOfWeek               int64
DepTime                 float64
CRSDepTime              int64
ArrTime                 float64
CRSArrTime              int64
UniqueCarrier           object
FlightNum               int64
ActualElapsedTime       float64
CRSElapsedTime          int64
ArrDelay                float64
DepDelay                float64
Origin                  object
Dest                    object
Distance                float64
Cancelled               int64
Diverted                int64
UniqueCarrierEncoded    int32
OriginEncoded           int32
DestEncoded             int32
Late                    float64
dtypes: float64(7), int32(3), int64(10), object(3)
memory usage: 861.9+ MB


In [13]:
## Ensure all columns have the same number of data points
df.count()

Year                    5020651
Month                   5020651
DayofMonth              5020651
DayOfWeek               5020651
DepTime                 5020651
CRSDepTime              5020651
ArrTime                 5020651
CRSArrTime              5020651
UniqueCarrier           5020651
FlightNum               5020651
ActualElapsedTime       5020651
CRSElapsedTime          5020651
ArrDelay                5020651
DepDelay                5020651
Origin                  5020651
Dest                    5020651
Distance                5020651
Cancelled               5020651
Diverted                5020651
UniqueCarrierEncoded    5020651
OriginEncoded           5020651
DestEncoded             5020651
Late                    5020651
dtype: int64

In [14]:
## Create Random Forest model
rfr = RandomForestClassifier()
X = df.drop(columns = ['Late', 'UniqueCarrier', 'Origin', 'Dest'])
y = df['Late']

## Create Training and Test sets for cross-validation using only 10% of the data since the dataset is over 500K rows
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, train_size=0.1, random_state=42)

## Fit the model.
fit = rfr.fit(X_train, y_train)
print('done')



done


In [16]:
## Print R-squared values for training and test sets to check for overfitting
print('Test set R-squared value: ', rfr.score(X_train, y_train))
print('Train set R=squared value: ', rfr.score(X_test, y_test))

Test set R-squared value:  0.9998506169519883
Train set R=squared value:  0.9604295052841658


In [17]:
import warnings
warnings.filterwarnings("ignore")

## Check cross validation scores on training and test sets to ensure that prediction errors will be mininal for rest of data
print('Train set cross-validation scores: ', cross_val_score(rfr, X_train, y_train, cv=3))
print('Test set cross-validation scores: ', cross_val_score(rfr, X_test, y_test, cv=3))

Train set cross-validation scores:  [0.95942186 0.95929868 0.96054898]
Test set cross-validation scores:  [0.95607295 0.95791178 0.95812752]
