In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os

# Flight
The dataset consists of flight data from 2009 through 2018 obtained from the Bureau of Transportation Statistics
(https://www.bts.gov/topics/airlines-and-airports-0) 

## Original Dataset
The following flight data columns are within the CSV:

* ID = primary key
* Ncode = airline code
* Year = Years used range from 2009 through 2018
* Month
* DOM_Flights = Domestic Flight
* INT_Flights = International Flight
* TOT_Flights = Total Flights
* DOM_Passengers = Domestic Passengers
* INT_Passengers = Internation Passengers
* TOT_Passengers = Total Passengers
* arr_flights = arriving flights
* arr_del15 = arriving delays (column O)
    * Negative times represent early departures/arrivals in minutes
    * total of carrier_ct, weather_ct, nas_ct, security_ct, late_aircraft_ct 
    * ie, sum of columns P through T = O
* carrier_ct = carrier count
* weather_ct = weather count
* nas_ct = National Airspace Security count
* security_ct = Security count
* late_aircraft_ct = late aircraft count
* arr_cancelled = arrivals cancelled
* arr_diverted = arrivals diverted
* arr_delay = arrival delay (column W) 
    * total of carrier_delay, weather_delay, nas_delay, security_delay and late_aircraft_delay 
    * ie, sum of X through AB
* carrier_delay = carrier delayed
* weather_delay = weather delay 
* nas_delay = National Airspace Security delay
* security_delay = Security delay
* late_aircraft_delay = Late aircraft delay

## Finalized Train / Test Dataset 
The following flight data columns were kept and used for our final CSV columns:
* Ncode
    * AA = American Airlines = 1
    * DL = Delta Airlines = 2
    * B6 = Jetblue Airways = 3
    * WN = Southwest Airlines = 4
    * UA = United Air Lines = 5
* Year
* Month = Months were converted to integers 1 - 12  
* Difference = Total_Flights from same month next year - Total_Flights from previous same month year 
* carrier_delay 
* weather_delay
* nas_delay 
* security_delay 
* late_aircraft_delay 
* inc-dec = string representation of whether the # of passengers increased / decreased from previous year same month


In [2]:
flight = pd.read_csv(os.path.join('Resources', 'Regression_Final_Table.csv'))
flight.head()

Unnamed: 0,NCode,Year,Month,difference,Carrier_delay,Weather_delay,NAS_delay,Security_delay,Late_aircraft_delay,inc_dec
0,1,2010,1,1632,2074,314,1354,30,1569,Increase
1,1,2011,1,10816,1982,375,1269,1,1587,Increase
2,1,2012,1,63801,1663,205,1023,5,991,Increase
3,1,2013,1,207535,1792,281,1153,14,1963,Increase
4,1,2014,1,126907,2138,608,1098,1,2485,Increase


In [3]:
flight = flight.dropna()

In [4]:
# Assign X (data) and y (target)
X = flight.drop("inc_dec", axis=1)
y = flight["inc_dec"]
print(X.shape, y.shape)

(540, 9) (540,)


Split our data into training and testing

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
classifier = LogisticRegression()
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

Fit (train) or model using the training data

In [7]:
scalar = StandardScaler().fit(X_train)
X_train_scaled = scalar.transform(X_train)
X_test_scaled = scalar.transform(X_test)

In [8]:
classifier.fit(X_train_scaled,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

Validate the model using the test data

In [9]:
print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")

Training Data Score: 0.9580246913580247
Testing Data Score: 0.9481481481481482


Make predictions

In [10]:
predictions = classifier.predict(X_test)
print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions:   ['Increase' 'Increase' 'Increase' 'Increase' 'Increase' 'Increase'
 'Increase' 'Increase' 'Increase' 'Increase']
First 10 Actual labels: ['Increase', 'Increase', 'Increase', 'Increase', 'Increase', 'Increase', 'Increase', 'Increase', 'Increase', 'Increase']


In [11]:
# pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

In [12]:
final = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

In [13]:
# Total rows for the final data
len(final)

135

In [14]:
final.to_csv('final.csv')

In [15]:
predictions_train = classifier.predict(X_train)
print(f"First 10 Predictions:   {predictions_train[:100]}")
print(f"First 10 Actual labels: {y_train[:100].tolist()}")

First 10 Predictions:   ['Increase' 'Increase' 'Increase' 'Increase' 'Increase' 'Decrease'
 'Increase' 'Increase' 'Increase' 'Increase' 'Increase' 'Decrease'
 'Increase' 'Increase' 'Increase' 'Increase' 'Increase' 'Increase'
 'Increase' 'Increase' 'Increase' 'Increase' 'Increase' 'Decrease'
 'Increase' 'Decrease' 'Decrease' 'Increase' 'Increase' 'Increase'
 'Increase' 'Increase' 'Increase' 'Increase' 'Increase' 'Increase'
 'Increase' 'Increase' 'Increase' 'Increase' 'Increase' 'Increase'
 'Decrease' 'Increase' 'Increase' 'Increase' 'Decrease' 'Increase'
 'Increase' 'Increase' 'Increase' 'Increase' 'Increase' 'Decrease'
 'Increase' 'Increase' 'Increase' 'Decrease' 'Increase' 'Decrease'
 'Increase' 'Increase' 'Increase' 'Increase' 'Increase' 'Increase'
 'Increase' 'Increase' 'Increase' 'Increase' 'Increase' 'Increase'
 'Increase' 'Increase' 'Decrease' 'Increase' 'Increase' 'Increase'
 'Increase' 'Increase' 'Increase' 'Decrease' 'Increase' 'Increase'
 'Increase' 'Increase' 'Increase' 'Inc

In [16]:
# Testing our pedictions_train 
future_data = pd.read_csv(os.path.join('Resources', 'Regression_Test.csv'))
future_data.head()                                   

Unnamed: 0,NCode,Year,Month,difference,Carrier_delay,Weather_delay,NAS_delay,Security_delay,Late_aircraft_delay,inc_dec
0,1,2018,1,213038,2656,388,1576,12,2409,Increase
1,1,2018,2,543299,2319,379,2051,8,2583,Increase
2,1,2018,3,331307,2564,185,1290,15,2376,Increase
3,1,2018,4,181855,2471,296,1620,8,2472,Increase
4,1,2018,5,223494,3437,615,2628,8,4364,Increase


In [17]:
# Assign X (data) and y (target)
X = future_data.drop("inc_dec", axis=1)
print(X.shape)

(60, 9)


In [18]:
X_future = scalar.transform(X)

In [24]:
future_data_predictions = classifier.predict(X_future)
print(f"First 8 Predictions: {future_data_predictions[:10]}")
print(f"First 10 Predictions: {predictions_train[:10]}")


First 8 Predictions: ['Increase' 'Increase' 'Increase' 'Increase' 'Increase' 'Increase'
 'Increase' 'Increase' 'Increase' 'Increase']
First 10 Predictions: ['Increase' 'Increase' 'Increase' 'Increase' 'Increase' 'Decrease'
 'Increase' 'Increase' 'Increase' 'Increase']
