In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os

# Flight
The dataset consists of flight data from 2009 through 2018 obtained from the Bureau of Transportation Statistics
(https://www.bts.gov/topics/airlines-and-airports-0) 

## Original Dataset
The following flight data columns are within the CSV:

* code
* Year = Years used range from 2009 through 2018
* Month
* DOM_Flights = Domestic Flight
* INT_Flights = International Flight
* TOT_Flights = Total Flights
* DOM_Passengers = Domestic Passengers
* INT_Passengers = Internation Passengers
* TOT_Passengers = Total Passengers
* arr_flights = arriving flights
* arr_del15 = arriving delays (column O)
    * Negative times represent early departures/arrivals in minutes
    * total of carrier_ct, weather_ct, nas_ct, security_ct, late_aircraft_ct 
    * ie, sum of columns P through T = O
* carrier_ct = carrier count
* weather_ct = weather count
* nas_ct = National Airspace Security count
* security_ct = Security count
* late_aircraft_ct = late aircraft count
* arr_cancelled = arrivals cancelled
* arr_diverted = arrivals diverted
* arr_delay = arrival delay (column W) 
    * total of carrier_delay, weather_delay, nas_delay, security_delay and late_aircraft_delay 
    * ie, sum of X through AB
* carrier_delay = carrier delayed
* weather_delay = weather delay 
* nas_delay = National Airspace Security delay
* security_delay = Security delay
* late_aircraft_delay = Late aircraft delay

## Finalized Train / Test Dataset 
The following flight data columns were kept and used for our final CSV columns:
* code
    * AA = American Airlines = 1
    * DL = Delta Airlines = 2
    * B6 = Jetblue Airways = 3
    * WN = Southwest Airlines = 4
    * UA = United Air Lines = 5
* Year
* Month = Months were converted to integers 1 - 12  
* Difference = Total_Flights from same month next year - Total_Flights from previous same month year
* carrier_delay 
* weather_delay
* nas_delay 
* security_delay 
* late_aircraft_delay 
* Inc-Dec = string representation of whether the # of passengers increased / decreased from previous year same month


In [2]:
flight = pd.read_csv(os.path.join('Resources', 'combined2_TotalsRemoved.csv'))
flight.head()

Unnamed: 0,code,Year,Month,DOM_Flights,INT_Flights,DOM_Passengers,INT_Passengers,Difference,arr_flights,arr_del15,...,security_ct,late_aircraft_ct,arr_cancelled,arr_diverted,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,Inc-Dec
0,1,2009,1,46183,12066.0,5046794,1646798.0,0,612.04,127.27,...,0.09,34.31,21.38,1.5,2257.21,798.92,1947.49,3.12,2292.0,NotApplicable
1,1,2010,1,45545,11842.0,5071320,1623904.0,1632,589.51,103.08,...,0.28,26.69,14.63,1.65,2073.8,313.73,1354.34,30.0,1568.9,Increase
2,1,2011,1,44782,12562.0,4984095,1721945.0,10816,591.68,98.9,...,0.01,26.75,19.1,1.26,1981.55,374.72,1269.32,0.65,1586.72,Increase
3,1,2012,1,43139,12567.0,4997279,1772562.0,63801,533.3,77.4,...,0.13,17.65,8.76,1.13,1662.66,204.76,1023.33,4.85,990.61,Increase
4,1,2013,1,44107,12785.0,5195596,1781780.0,207535,546.93,101.11,...,0.19,33.53,10.52,1.15,1792.35,281.07,1152.56,13.63,1962.73,Increase


In [3]:
flight = flight.dropna()

In [4]:
# Assign X (data) and y (target)
X = flight.drop("Inc-Dec", axis=1)
y = flight["Inc-Dec"]
print(X.shape, y.shape)

(546, 22) (546,)


Split our data into training and testing

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
classifier = LogisticRegression()
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

Fit (train) or model using the training data

In [7]:
scalar = StandardScaler().fit(X_train)
X_train_scaled = scalar.transform(X_train)
X_test_scaled = scalar.transform(X_test)

In [8]:
classifier.fit(X_train_scaled,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

Validate the model using the test data

In [9]:
print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")

Training Data Score: 0.863080684596577
Testing Data Score: 0.8029197080291971


Make predictions

In [10]:
predictions = classifier.predict(X_test)
print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions:   ['Increase' 'Increase' 'Increase' 'Increase' 'Increase' 'Increase'
 'Increase' 'Increase' 'Increase' 'Increase']
First 10 Actual labels: ['Increase', 'Increase', 'Increase', 'Increase', 'Increase', 'Increase', 'Increase', 'Decrease', 'Decrease', 'Increase']


In [11]:
# pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

In [12]:
final = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

In [13]:
# Total rows for the final data
len(final)

137

In [14]:
final.to_csv('final.csv')

In [15]:
predictions_train = classifier.predict(X_train)
print(f"First 10 Predictions:   {predictions_train[:100]}")
print(f"First 10 Actual labels: {y_train[:100].tolist()}")

First 10 Predictions:   ['Increase' 'Increase' 'Increase' 'Increase' 'Increase' 'Increase'
 'Increase' 'Increase' 'Increase' 'Increase' 'Increase' 'Increase'
 'Increase' 'Increase' 'Increase' 'Increase' 'Increase' 'Increase'
 'Increase' 'Increase' 'Increase' 'Increase' 'Increase' 'Increase'
 'Increase' 'Increase' 'Increase' 'Increase' 'Increase' 'Increase'
 'Increase' 'Increase' 'Increase' 'Increase' 'Increase' 'Increase'
 'Increase' 'Increase' 'Increase' 'Increase' 'Increase' 'Increase'
 'Increase' 'Increase' 'Increase' 'Increase' 'Increase' 'Increase'
 'Increase' 'Increase' 'Increase' 'Increase' 'Increase' 'Increase'
 'Increase' 'Increase' 'Increase' 'Increase' 'Increase' 'Increase'
 'Increase' 'Increase' 'Increase' 'Increase' 'Increase' 'Increase'
 'Increase' 'Increase' 'Increase' 'Increase' 'Increase' 'Increase'
 'Increase' 'Increase' 'Increase' 'Increase' 'Increase' 'Increase'
 'Increase' 'Increase' 'Increase' 'Increase' 'Increase' 'Increase'
 'Increase' 'Increase' 'Increase' 'Inc