In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import os


# Flight

The dataset consists of flight data from 2009 through 2018 obtained from the Bureau of Transportation Statistics (https://www.bts.gov/topics/airlines-and-airports-0).

# Original Dataset

The following flight data columns are within the CSV:
- code
- Year = Years used range from 2009 through 2018
- Month
- DOM_Flights = Domestic Flight
- INT_Flights = International Flight
- TOT_Flights = Total Flights
- DOM_Passengers = Domestic Passengers
- INT_Passengers = International Passengers
- TOT_Passengers = Total Passengers
- arr_flights = arriving flights
- arr_del15 = arriving delays (column O)
- Negative times represent early departures/arrivals in minutes
- total of carrier_ct, weather_ct, nas_ct, security_ct, late_aircraft_ct 
- ie, sum of columns P through T = O
- carrier_ct = carrier count
- weather_ct = weather count
- nas_ct = National Aviation System (NAS) count
- security_ct = Security count
- late_aircraft_ct = late aircraft count
- arr_cancelled = arrivals cancelled
- arr_diverted = arrivals diverted
- arr_delay = arrival delay (column W) 
- total of carrier_delay, weather_delay, nas_delay, security_delay and late_aircraft_delay ie, sum of X through AB
- carrier_delay = carrier delayed
- weather_delay = weather delay 
- nas_delay = National Aviation System (NAS) delay
- security_delay = Security delay
- late_aircraft_delay = Late aircraft delay

### SQL was used to load the data in a Postgres database and create the final data set: 
- Table Schema:  https://github.com/mdonatiello/Project3-Machine_Learning/blob/master/Resources/Create_Table_Query.sql
- SQL that processed data:  https://github.com/mdonatiello/Project3-Machine_Learning/blob/master/Resources/Data_CSV_File.sql
- Table design: https://github.com/mdonatiello/Project3-Machine_Learning/blob/master/Resources/QuickDBD-export.pdf


# Finalized Train / Test Dataset
#### Final Table output as csv file: Regression_Final_Table.csv which was used for the model

The following flight data columns were kept and used for our final CSV columns:

- code 
    - AA = American Airlines = 1
    - DL = Delta Airlines = 2
    - B6 = JetBlue Airways = 3
    - WN = Southwest Airlines = 4
    - UA = United Air Lines = 5
    
- Year
- Month = Months were converted to integers 1 - 12 
- Difference = Total_Flights from same month next year - Total_Flights from previous same month year
- carrier_delay 
- weather_delay
- nas_delay 
- security_delay 
- late_aircraft_delay 
- Inc-Dec = string representation of whether the # of passengers increased / decreased from previous year same month


In [2]:
# Read the cleaned final dataset into a DataFrame named "flight" and preview it.
csv_path = os.path.join('Resources', 'Regression_Final_Table.csv')
flight = pd.read_csv(csv_path)
flight.head()

Unnamed: 0,NCode,Year,Month,difference,Carrier_delay,Weather_delay,NAS_delay,Security_delay,Late_aircraft_delay,inc_dec
0,1,2010,1,1632,2074,314,1354,30,1569,Increase
1,1,2011,1,10816,1982,375,1269,1,1587,Increase
2,1,2012,1,63801,1663,205,1023,5,991,Increase
3,1,2013,1,207535,1792,281,1153,14,1963,Increase
4,1,2014,1,126907,2138,608,1098,1,2485,Increase


In [3]:
# Discard every row that contains at least one missing value
# (axis=0 / how='any' are the pandas defaults, spelled out for clarity).
flight = flight.dropna(axis=0, how='any')

# Build the Model

In [4]:
# Target (y) is the "inc_dec" label; features (X) are all remaining columns.
# NOTE(review): confirm that `difference` is not derived from the same quantity
# as the `inc_dec` label (possible target leakage).
y = flight["inc_dec"]
X = flight.drop(columns=["inc_dec"])
print(X.shape, y.shape)

(540, 9) (540,)


# Split our data into training and testing
#### Use shuffling and set test size of 15%

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows for testing; the fixed random_state makes the
# shuffled split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

# Create a Logistic Regression Model 

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PowerTransformer
# Unfitted logistic-regression classifier using the library's default
# hyperparameters; PowerTransformer is imported here for the scaling cell below.
classifier = LogisticRegression()
# Bare expression so the notebook displays the estimator's configuration.
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

# Apply PowerTransformer Scaler
#### PowerTransformer applies a power transformation to each feature to make the data more Gaussian-like. The power transform finds the optimal scaling factor to stabilize variance and minimize skewness through maximum likelihood estimation. By default, PowerTransformer also applies zero-mean, unit variance normalization to the transformed output. 


In [7]:
# Fit the Yeo-Johnson power transform on the training split only, then apply
# the same fitted transform to both splits so the test data never influences
# the transform parameters.
scalar = PowerTransformer(method='yeo-johnson')
scalar.fit(X_train)
X_train_scaled, X_test_scaled = scalar.transform(X_train), scalar.transform(X_test)

# Fit (train) our model using the scaled training data

In [8]:
# Train on the power-transformed training features; fit() returns the
# estimator, which the notebook displays as the cell output.
classifier.fit(X=X_train_scaled, y=y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

# Validate the model using the test data

In [9]:
# Mean accuracy on each split, evaluated on the scaled features the model
# was trained with.
train_score = classifier.score(X_train_scaled, y_train)
test_score = classifier.score(X_test_scaled, y_test)
print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

Training Data Score: 0.9738562091503268
Testing Data Score: 0.9506172839506173


# Make predictions

In [10]:
# Predict on the *scaled* test features: the classifier was fitted on the
# PowerTransformer output, so passing raw X_test would feed it values on a
# different scale than it was trained on and yield unreliable predictions.
predictions = classifier.predict(X_test_scaled)
# The labels state the number of rows actually shown (both slices take 20).
print(f"First 20 Predictions:   {predictions[:20]}")
print(f"First 20 Actual labels: {y_test[:20].tolist()}")

First 10 Predictions:   ['Increase' 'Increase' 'Increase' 'Increase' 'Increase' 'Increase'
 'Increase' 'Increase' 'Decrease' 'Increase' 'Increase' 'Increase'
 'Increase' 'Increase' 'Increase' 'Increase' 'Increase' 'Increase'
 'Increase' 'Increase']
First 10 Actual labels: ['Increase', 'Increase', 'Increase', 'Increase', 'Increase', 'Increase', 'Increase', 'Increase', 'Decrease', 'Increase', 'Increase', 'Increase', 'Increase', 'Increase', 'Increase', 'Increase', 'Increase', 'Increase', 'Increase', 'Increase']


In [11]:
# Pair each prediction with its true label, then renumber the rows from 0
# (discarding the row labels carried over from y_test) and preview the first 10.
results = {"Prediction": predictions, "Actual": y_test}
df = pd.DataFrame(results)
df = df.reset_index(drop=True)
df.head(10)

Unnamed: 0,Prediction,Actual
0,Increase,Increase
1,Increase,Increase
2,Increase,Increase
3,Increase,Increase
4,Increase,Increase
5,Increase,Increase
6,Increase,Increase
7,Increase,Increase
8,Decrease,Decrease
9,Increase,Increase


In [12]:
# Write the prediction/actual comparison table to results.csv in the current
# working directory (the row index is written as the first column by default).
df.to_csv(path_or_buf='results.csv')