# Matt's Code Playground

In [1]:
# Standard library imports
import os
import sys

# Data manipulation and analysis
import numpy as np
import pandas as pd

# Data visualization
import matplotlib.pyplot as plt

# Custom helper classes
from modelhelper import ModelHelper
from helperclasses import DataFetcherKAGGLE, DataFrameHelper
# Scikit-learn preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# Scikit-learn models
from sklearn.ensemble import (
    RandomForestClassifier,
    RandomForestRegressor
)
from sklearn.linear_model import (
    LogisticRegression,
    LinearRegression
)
from sklearn.svm import SVC, SVR

# Scikit-learn metrics
from sklearn.metrics import (
    accuracy_score,
    mean_squared_error,
    r2_score
)

# Build one shared instance of each project helper; the cells below reuse them.
data_fetcher, data_frame_helper, model_helper = (
    DataFetcherKAGGLE(),   # used below via fetch_flight_dataset()
    DataFrameHelper(),     # used below via drop_columns()
    ModelHelper(),         # used below via prepare_data() / train_model()
)

In [2]:
# Fetch the flight dataset through the project helper; the following cells build on this frame.
# NOTE(review): the saved output of this cell shows pandas SettingWithCopyWarning messages
# pointing at `df[col] = ...` assignments, which presumably live inside helperclasses.
# If so, the fix (e.g. taking an explicit `.copy()` before assigning) belongs there,
# not in this notebook. TODO confirm.
flight_data = data_fetcher.fetch_flight_dataset()

Fetching flight dataset from Kaggle...
Preparing flight dataset...
Optimizing flight dataset data types...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = pd.to_numeric(pd.to_datetime(df[col]).astype(np.int64))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = pd.to_numeric(df[col], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = pd.to_numeric(df[col], errors='coerce')
A value is trying to be set on a copy o

In [3]:
# Columns handed to the drop_columns helper, one per line in their original order.
columns = [
    'AIRLINE_DOT',
    'AIRLINE_CODE',
    'DOT_CODE',
    'FL_NUMBER',
    'CRS_DEP_TIME',
    'TAXI_OUT',
    'WHEELS_OFF',
    'WHEELS_ON',
    'TAXI_IN',
    'CRS_ARR_TIME',
    'ARR_TIME',
    'ARR_DELAY',
    'CRS_ELAPSED_TIME',
    'AIR_TIME',
    'DISTANCE',
    'DEST',
    'DEST_CITY',
    'ELAPSED_TIME',
    'DEP_TIME',
    'ORIGIN_CITY',
    'CANCELLED',
    'CANCELLATION_CODE',
    'DIVERTED',
]
# Rebind flight_data to whatever the helper returns for this column list.
flight_data = data_frame_helper.drop_columns(flight_data, columns)

In [4]:
# Inspect the trimmed frame: schema and non-null counts first, then a preview of the rows.
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# display() only rendered a stray "None" beneath the summary; call it directly.
flight_data.info()
display(flight_data.head())

<class 'pandas.core.frame.DataFrame'>
Index: 1525580 entries, 1 to 2999997
Data columns (total 9 columns):
 #   Column                   Non-Null Count    Dtype   
---  ------                   --------------    -----   
 0   FL_DATE                  1525580 non-null  int64   
 1   AIRLINE                  1525580 non-null  category
 2   ORIGIN                   1525580 non-null  category
 3   DEP_DELAY                1493229 non-null  float64 
 4   DELAY_DUE_CARRIER        318933 non-null   float64 
 5   DELAY_DUE_WEATHER        318933 non-null   float64 
 6   DELAY_DUE_NAS            318933 non-null   float64 
 7   DELAY_DUE_SECURITY       318933 non-null   float64 
 8   DELAY_DUE_LATE_AIRCRAFT  318933 non-null   float64 
dtypes: category(2), float64(6), int64(1)
memory usage: 97.5 MB


None

Unnamed: 0,FL_DATE,AIRLINE,ORIGIN,DEP_DELAY,DELAY_DUE_CARRIER,DELAY_DUE_WEATHER,DELAY_DUE_NAS,DELAY_DUE_SECURITY,DELAY_DUE_LATE_AIRCRAFT
1,1668816000000000000,Delta Air Lines Inc.,MSP,-6.0,,,,,
2,1658448000000000000,United Air Lines Inc.,DEN,6.0,,,,,
3,1678060800000000000,Delta Air Lines Inc.,MSP,-1.0,0.0,0.0,24.0,0.0,0.0
6,1686441600000000000,American Airlines Inc.,DCA,-9.0,,,,,
8,1676160000000000000,Spirit Air Lines,IAH,-3.0,,,,,


In [None]:
# Ask the model helper for train/test features and targets, with DEP_DELAY as the target column.
X_train, X_test, y_train, y_test = model_helper.prepare_data(flight_data, 'DEP_DELAY')

# DataFrame.info() prints its own report and returns None; calling it bare (instead of
# display(X_train.info())) avoids the stray "None" in the cell output.
X_train.info()
display(X_train.head())
display(y_train.head())

INFO:modelhelper:Removed 433207 duplicate rows
INFO:modelhelper:Missing values found:
DEP_DELAY                    48736
DELAY_DUE_CARRIER          2033013
DELAY_DUE_WEATHER          2033013
DELAY_DUE_NAS              2033013
DELAY_DUE_SECURITY         2033013
DELAY_DUE_LATE_AIRCRAFT    2033013
dtype: int64


<class 'pandas.core.frame.DataFrame'>
Index: 2053434 entries, 2578300 to 2549370
Data columns (total 8 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   FL_DATE                  float64
 1   AIRLINE                  float64
 2   ORIGIN                   float64
 3   DELAY_DUE_CARRIER        float64
 4   DELAY_DUE_WEATHER        float64
 5   DELAY_DUE_NAS            float64
 6   DELAY_DUE_SECURITY       float64
 7   DELAY_DUE_LATE_AIRCRAFT  float64
dtypes: float64(8)
memory usage: 141.0 MB


None

Unnamed: 0,FL_DATE,AIRLINE,ORIGIN,DELAY_DUE_CARRIER,DELAY_DUE_WEATHER,DELAY_DUE_NAS,DELAY_DUE_SECURITY,DELAY_DUE_LATE_AIRCRAFT
2578300,-0.339792,-1.127314,1.123019,-0.127005,-0.055529,-0.170304,-0.018208,-0.192772
190698,0.608001,0.760018,-0.901095,-0.127005,-0.055529,-0.170304,-0.018208,-0.192772
2658551,1.418181,1.10317,-0.575262,-0.127005,-0.055529,-0.170304,-0.018208,-0.192772
2703560,-1.597702,0.760018,0.570091,-0.243867,6.953949,0.077828,-0.018208,0.099214
2076756,-0.049058,-0.955738,-1.641624,-0.127005,-0.055529,-0.170304,-0.018208,-0.192772


2578300      0.0
190698     -21.0
2658551      3.0
2703560    112.0
2076756     11.0
Name: DEP_DELAY, dtype: float64

MemoryError: could not allocate 24779948032 bytes

In [None]:
# DEP_DELAY is a continuous float64 target (see the y_train preview in the previous cell),
# so the task is regression. Training it as 'classification' would treat every distinct
# delay value as its own class.
# NOTE(review): the saved outputs include a MemoryError (~24.8 GB allocation); a classifier
# with one class per distinct delay value is a plausible cause, but confirm.
# NOTE(review): assumes ModelHelper.train_model accepts 'regression' as the task type —
# verify against modelhelper before merging.
model = model_helper.train_model(X_train, y_train, 'random_forest', 'regression')