In [None]:
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

#Configure Visualization Defaults
#%matplotlib inline = show plots in Jupyter Notebook browser
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12,8

In [None]:
import pandas as pd

# Load the raw dataset; the path is relative to the notebook's working directory.
data_raw = pd.read_csv('fa23_datachallenge.csv')

# Work on an independent deep copy so data_raw stays untouched for reference.
data1 = data_raw.copy(deep=True)

# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly: wrapping it in print() emits a stray "None" line.
data_raw.info()

## Data cleaning

In [None]:
# Per-column count of missing values in the working copy, then a separator rule.
null_counts = data1.isnull().sum()
print('Train columns with null values:\n', null_counts)
print("-" * 10)

In [None]:
# Columns excluded from the analysis; they are removed from the working frame
# in a single in-place drop. Note: re-running this cell on the same kernel
# raises KeyError because the columns are already gone.
drop_column = [
    'OP_UNIQUE_CARRIER', 'TAIL_NUM', 'OP_CARRIER_FL_NUM',
    'ORIGIN', 'DEST',
    'CRS_ARR_TIME', 'CANCELLATION_CODE', 'DEP_TIME', 'ARR_TIME', 'CANCELLED',
    'CRS_ELAPSED_TIME', 'ACTUAL_ELAPSED_TIME',
    'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'TSUN',
    'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY',
    'AIRLINE_AIRPORT_FLIGHTS_MONTH', 'FLT_ATTENDANTS_PER_PASS',
]

data1.drop(columns=drop_column, inplace=True)

In [None]:
# Remove the ORIGIN_AIRPORT_NAME column from the working frame as well.
data1.drop(columns=['ORIGIN_AIRPORT_NAME'], inplace=True)

In [None]:
# Missing-value count per remaining column (isna is the pandas alias of isnull).
print(data1.isna().sum())

In [None]:
# Print the column names, non-null counts and dtypes of the working frame.
data1.info()

In [None]:
# Same missing-value report as the earlier cell; nothing has modified data1 in between.
print(data1.isna().sum())

In [None]:
###COMPLETING: complete or delete missing values in train

# Columns whose missing entries are replaced by the column mean.
column_fill = ['PRCP','SNOW','SNWD','TMIN','TMAX','TAVG','AWND','PSUN','AIRPORT_FLIGHTS_MONTH','AIRLINE_FLIGHTS_MONTH',
              'AVG_MONTHLY_PASS_AIRPORT','AVG_MONTHLY_PASS_AIRLINE','GROUND_SERV_PER_PASS','ARR_DELAY_NEW']

# Assign the filled Series back to the frame. The previous form,
# data1[column].fillna(..., inplace=True), mutates an intermediate Series:
# pandas 2.x warns about it and, with copy-on-write enabled, data1 is left
# unchanged so the nulls silently survive.
for column in column_fill:
    data1[column] = data1[column].fillna(data1[column].mean())

### drop rows with empty departure delay information
# This runs after the imputation, so the column means above are computed over
# all rows, including the ones removed here.
data1.dropna(subset=['DEP_DELAY_NEW'], inplace=True)

print(data1.isnull().sum())

In [None]:
###CREATE: Feature Engineering for train and test/validation dataset

# data1['DELAYED'] = 0  #initialize to no/0 on time
# data1['DELAYED'].loc[data1['DEP_DELAY_NEW'] > 1] = 1

In [None]:
# NOTE(review): the original remark here said there is no categorical data;
# confirm against the dtypes of the *_CITY_NAME and *_TIME_BLK columns.

# y variable (target/outcome), kept as a one-element list so that
# data1[Target] selects a single-column DataFrame.
Target = ['DEP_DEL15']

# x variables: the original feature columns considered for selection.
data1_x = [
    'MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK',
    'ORIGIN_AIRPORT_ID', 'ORIGIN_CITY_NAME',
    'DEST_AIRPORT_ID', 'DEST_CITY_NAME',
    'CRS_DEP_TIME', 'DEP_DELAY_NEW', 'DEP_TIME_BLK',
    'ARR_DELAY_NEW', 'ARR_TIME_BLK',
    'DISTANCE', 'DISTANCE_GROUP',
    'PRCP', 'SNOW', 'SNWD', 'TMIN', 'TMAX', 'TAVG', 'AWND', 'PSUN',
    'AIRPORT_FLIGHTS_MONTH', 'AIRLINE_FLIGHTS_MONTH',
    'AVG_MONTHLY_PASS_AIRPORT', 'AVG_MONTHLY_PASS_AIRLINE',
    'GROUND_SERV_PER_PASS',
]

# Target first, then every feature, as one flat list of column names.
data1_xy = [*Target, *data1_x]
print('Original X Y: ', data1_xy, '\n')



In [None]:
# ### split dataset 25:75

# train1_x, test1_x, train1_y, test1_y = model_selection.train_test_split(data1[data1_x], data1[Target], random_state = 0)

# print("Data1 Shape: {}".format(data1.shape))
# print("Train1 Shape: {}".format(train1_x.shape))
# print("Test1 Shape: {}".format(test1_x.shape))

# train1_x.head()

In [None]:
#correlation heatmap of dataset
def correlation_heatmap(df):
    """Draw an annotated Pearson-correlation heatmap of the numeric columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns are correlated pairwise. Non-numeric
        columns are ignored.

    Returns
    -------
    None
        The figure is drawn on a new 14x12 matplotlib figure as a side effect.
    """
    # Restrict to numeric columns before correlating: since pandas 2.0,
    # DataFrame.corr() no longer drops non-numeric columns by default and
    # raises when it meets string data.
    corr_matrix = df.select_dtypes(include='number').corr()

    _ , ax = plt.subplots(figsize =(14, 12))
    colormap = sns.diverging_palette(220, 10, as_cmap = True)

    _ = sns.heatmap(
        corr_matrix,
        cmap = colormap,
        square=True,
        cbar_kws={'shrink':.9 },
        ax=ax,
        annot=True,
        linewidths=0.1,vmax=1.0, linecolor='white',
        annot_kws={'fontsize':12 }
    )

    plt.title('Pearson Correlation of Features', y=1.05, size=15)

correlation_heatmap(data1)

In [None]:
#DEP_DEL15 - dependent variable
# Correlation of every numeric column with DEP_DEL15, largest value first.
# It is computed once and reused for the plot. Only numeric columns are
# correlated because DataFrame.corr() raises on string columns from pandas 2.0 on.
dep_del15_corr = (
    data1.select_dtypes(include='number')
    .corr()[['DEP_DEL15']]
    .sort_values(by='DEP_DEL15', ascending=False)
)

plt.figure(figsize=(8, 12))
heatmap = sns.heatmap(dep_del15_corr, vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap.set_title('Features Correlating with DEP_DEL15', fontdict={'fontsize':18}, pad=16);



In [None]:
# Pairwise Pearson correlation matrix of the numeric columns. Non-numeric
# columns are filtered out first because DataFrame.corr() raises on them
# from pandas 2.0 on.
cor = data1.select_dtypes(include='number').corr()
cor.head()

In [None]:
#picking out correlated columns
# Sort every ordered (row label, column label) pair of the correlation matrix
# `cor` into a strength band. Because the matrix is symmetric, each pair of
# features shows up twice, once as (a, b) and once as (b, a).
#
#   cor1    [0.9, 1)      corneg1  (-inf, -0.9]
#   cor2    [0.6, 0.9)    corneg2  (-0.9, -0.6]
#   cor3    [0.4, 0.6)    corneg3  (-0.6, -0.4]
#   cor4    [0.2, 0.4)    corneg4  (-0.4, -0.2]
#   cor5    [0.1, 0.2)    corneg5  (-0.2, -0.1]
#
# Values in (-0.1, 0.1), values >= 1 and NaN fall into no band.
cor1, cor2, cor3, cor4, cor5 = [], [], [], [], []
corneg1, corneg2, corneg3, corneg4, corneg5 = [], [], [], [], []

for index, row in cor.iterrows():
    for col in cor.columns:
        # Skip a feature's correlation with itself explicitly instead of
        # relying on the diagonal being exactly 1.0 in floating point.
        if index == col:
            continue

        value = row[col]
        pair = (index, col)

        if 0.9 <= value < 1:
            cor1.append(pair)
        elif 0.6 <= value < 0.9:
            cor2.append(pair)
        elif 0.4 <= value < 0.6:
            cor3.append(pair)
        elif 0.2 <= value < 0.4:
            cor4.append(pair)
        elif 0.1 <= value < 0.2:
            cor5.append(pair)
        elif -0.2 < value <= -0.1:
            corneg5.append(pair)
        elif -0.4 < value <= -0.2:
            corneg4.append(pair)
        elif -0.6 < value <= -0.4:
            corneg3.append(pair)
        # Upper bound is -0.6 (it previously read -0.4, overlapping the band above).
        elif -0.9 < value <= -0.6:
            corneg2.append(pair)
        elif value <= -0.9:
            corneg1.append(pair)

In [None]:
# Tabulate the pairs collected in cor1: one row per (index, column) pair.
cor1_df = pd.DataFrame(data=cor1)
cor1_df

In [None]:
# Tabulate the pairs collected in cor3: one row per (index, column) pair.
cor3_df = pd.DataFrame(data=cor3)
cor3_df

In [None]:
### Feature columns chosen after inspecting the correlation heatmap
data1_n = [
    'ARR_DELAY_NEW',
    'CRS_DEP_TIME',
    'PRCP',
    'AWND',
    'SNOW',
    'SNWD',
    'AVG_MONTHLY_PASS_AIRPORT',
    'DISTANCE',
    'GROUND_SERV_PER_PASS',
    'MONTH',
]

In [None]:
### split dataset 25:75
# No test_size is passed, so train_test_split applies its default split;
# random_state=0 makes the shuffle repeatable.
train1_x, test1_x, train1_y, test1_y = model_selection.train_test_split(
    data1[data1_n],
    data1[Target],
    random_state=0,
)

print(f"Data1 Shape: {data1.shape}")
print(f"Train1 Shape: {train1_x.shape}")
print(f"Test1 Shape: {test1_x.shape}")

train1_x.head()

In [None]:
### decision tree classifier
# random_state is fixed so that repeated runs build the same tree; without it
# scikit-learn breaks ties between equally good splits randomly and the
# reported accuracy can change from run to run.
decision_model = tree.DecisionTreeClassifier(random_state=0)
decision_model.fit(train1_x, train1_y)
decision_model_predicted = decision_model.predict(test1_x)

In [None]:
from sklearn.metrics import accuracy_score
# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
# Accuracy is symmetric, so the value is unchanged, but the call now matches
# the documented argument order.
decision_model_accuracy = accuracy_score(test1_y, decision_model_predicted)
print(f'Decision Tree Accuracy: {decision_model_accuracy}')

In [None]:
##standardize dataset
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Fit the scaler on the training features only, then apply the same fitted
# mean and variance to the test features. Calling fit_transform on the test
# set would re-fit the scaler on test data, so train and test would be scaled
# with different statistics.
train1_x_std = scaler.fit_transform(train1_x)
test1_x_std = scaler.transform(test1_x)

# fit_transform/transform return plain arrays; wrap them back into DataFrames
# carrying the feature names (the row index restarts at 0).
train1_x_std = pd.DataFrame(train1_x_std, columns = data1_n)
test1_x_std = pd.DataFrame(test1_x_std, columns = data1_n)


print("Train1 standardized  Shape: {}".format(train1_x_std.shape))
print("Test1 standardized Shape: {}".format(test1_x_std.shape))

train1_x_std.head()

In [None]:
### decision tree classifier - standardized data
# random_state is fixed so that repeated runs build the same tree and the
# score is reproducible.
decision_model_std = tree.DecisionTreeClassifier(random_state=0)
decision_model_std.fit(train1_x_std, train1_y)
decision_model_std_predicted = decision_model_std.predict(test1_x_std)

# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
decision_model_std_accuracy = accuracy_score(test1_y, decision_model_std_predicted)
print(f'Decision Tree Standardized Data Accuracy: {decision_model_std_accuracy}')

### NOTE(review): the original remark here was "accuracy got lower...".
### A tree's threshold splits are not affected by per-feature standardization,
### so a drop relative to the unscaled model suggests that train and test were
### scaled with different statistics -- verify how test1_x_std was produced.

In [None]:
# from sklearn.tree import plot_tree
# a = plot_tree(decision_model, 
#               feature_names=data1_n, 
#               class_names=["DELAYED", "NOT DELAYED"], 
#               filled=True, 
#               rounded=True)

## Regression

In [None]:
#DEP_DELAY_NEW - dependent variable
# Correlation of every numeric column with DEP_DELAY_NEW, largest value first.
# It is computed once and reused for the plot. Only numeric columns are
# correlated because DataFrame.corr() raises on string columns from pandas 2.0 on.
dep_delay_new_corr = (
    data1.select_dtypes(include='number')
    .corr()[['DEP_DELAY_NEW']]
    .sort_values(by='DEP_DELAY_NEW', ascending=False)
)

plt.figure(figsize=(8, 12))
heatmap = sns.heatmap(dep_delay_new_corr, vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap.set_title('Features Correlating with DEP_DELAY_NEW', fontdict={'fontsize':18}, pad=16);


In [None]:
# Feature columns and target column for the regression on DEP_DELAY_NEW.
data1_reg = [
    'ARR_DELAY_NEW',
    'CRS_DEP_TIME',
    'PRCP',
    'AWND',
    'SNOW',
    'SNWD',
    'MONTH',
    'AVG_MONTHLY_PASS_AIRLINE',
    'AIRLINE_FLIGHTS_MONTH',
]
Target_reg = ['DEP_DELAY_NEW']

### split dataset 25:75
# No test_size is passed, so train_test_split applies its default split;
# random_state=0 makes the shuffle repeatable.
train1_reg_x, test1_reg_x, train1_reg_y, test1_reg_y = model_selection.train_test_split(
    data1[data1_reg],
    data1[Target_reg],
    random_state=0,
)

print(f"Train1 Shape: {train1_reg_x.shape}")
print(f"Test1 Shape: {test1_reg_x.shape}")

train1_reg_y.head()

In [None]:
# The target column DEP_DELAY_NEW is modelled as a numeric quantity
# (presumably minutes of delay -- confirm against the data dictionary), so an
# ordinary least-squares regressor is fitted. LogisticRegressionCV is a
# classifier: it would treat every distinct delay value as its own class, and
# accuracy_score would only count exact matches.
reg_model = linear_model.LinearRegression()

# train1_reg_y is a single-column DataFrame; flatten it so the model is fitted
# on a 1-D target and predict() returns a 1-D array.
reg_model.fit(train1_reg_x, train1_reg_y.values.ravel())
reg_model_predicted = reg_model.predict(test1_reg_x)

# Regression metrics: R^2 and mean absolute error (MAE is in the target's units).
reg_model_r2 = metrics.r2_score(test1_reg_y, reg_model_predicted)
reg_model_mae = metrics.mean_absolute_error(test1_reg_y, reg_model_predicted)

# Kept so that any later cell reading reg_model_accuracy still runs; it now
# holds the R^2 score rather than a classification accuracy.
reg_model_accuracy = reg_model_r2

print(f'Regression R^2: {reg_model_r2}')
print(f'Regression MAE: {reg_model_mae}')