<a href="https://colab.research.google.com/github/obeabi/ProjectPortfolio/blob/master/FlightCancellation_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Written by Abiola Obembe
# SDS Challenge #1 - Flight Cancellations
## 2020-10-30



## Problem Statement
I have been hired by the US Department of Transportation (DOT) to analyze data from multiple airline carriers in the United States. The DOT wants to help airline carriers reduce the number of flight cancellations and improve travelers' experiences. My job is to help the DOT predict whether or not a flight will be canceled based on the data provided.

### Step 1: Data Preprocessing Tools

In [1]:

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
!pip install imbalanced-learn
!pip install category_encoders
import category_encoders as ce

%matplotlib inline
plt.rcParams['figure.figsize'] = (5.0, 4.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

print('Libraries installed successfully!')

Libraries installed successfully!


  import pandas.util.testing as tm


In [None]:
# Load the labelled flights data into the training frame and preview the first
# rows. (Shape and missing-value totals are reported in a later cell.)

df_train = pd.read_csv('public_flights.csv')

df_train.head()



In [None]:
# The label is taken to be the last column of the training frame. Its name is
# kept in a one-element list so it can be used directly as a column selector.
target_column = list(df_train.columns[-1:])
print(target_column)

In [None]:
# Count the rows per target value to check how unbalanced the classes are
df_train.loc[:, target_column].value_counts()

In [None]:
# Bar chart of the number of flights in each class of the target.
# Series.value_counts is used instead of the top-level pd.value_counts helper,
# which is deprecated in recent pandas releases.
count_classes = df_train['CANCELLED'].value_counts(sort=True)
count_classes.plot(kind = 'bar', rot = 0)
plt.title("Cancelled Flights Distribution")
plt.xticks(range(2))
plt.xlabel("CANCELLED")
plt.ylabel('Frequency')
plt.show()

In [None]:
# Split the training rows into cancelled (1) and not-cancelled (0) flights
Cancelled = df_train.loc[df_train['CANCELLED'].eq(1)]
notCancelled = df_train.loc[df_train['CANCELLED'].eq(0)]

# Row/column counts of each group show the size of the imbalance
print(Cancelled.shape, notCancelled.shape)

In [None]:
# Report the dimensions of the train-set and its total number of missing cells
print("The shape of the train-set is:", df_train.shape)
print("The number of rows in the train-set is:", len(df_train))
print("The number of columns in the train-set is:", len(df_train.columns))

# Sum the per-column null counts into one grand total
missing_valuestrain = df_train.isna().sum().sum()
print("The number of missing values in the train-set is:", missing_valuestrain)

In [None]:
# Create the dataframe for the test set and preview it.
# NOTE(review): read_csv is called with default header handling here; the next
# cell re-reads the same file with explicit column names.
df_test = pd.read_csv('pred_flights.csv')
df_test.head()

In [None]:
# The test file has no header row, so it is re-read with the column names of
# the training frame minus the target column (assumed to be the last one).
labels = df_train.columns
new_label = labels[:-1]

df_test = pd.read_csv(
    'pred_flights.csv',
    header=None,
    names=new_label,
)

df_test.head()

In [None]:
# Report the dimensions of the test-set and its total number of missing cells
print("The shape of the test-set is:", df_test.shape)
print("The number of rows in the test-set is:", len(df_test))
print("The number of columns in the test-set is:", len(df_test.columns))

# Sum the per-column null counts into one grand total
missing_valuestest = df_test.isna().sum().sum()
print("The number of missing values in the test-set is:", missing_valuestest)

### Step 2: Feature Engineering (Training and Test Set)

In [None]:
# Show the dtype of every column in the training dataframe
df_train.dtypes

In [None]:
# Print the summary produced by DataFrame.info() for the training dataframe
df_train.info()

In [None]:
# Count the missing values in each column of the train-set
df_train.isna().sum()

#### (A) Dealing with Missing Values

In [None]:
# Keep only rows whose target is known, then split label from predictors.
# Each step builds a new frame, so df_train itself is left untouched.
X_full = df_train.dropna(axis=0, subset=['CANCELLED'])
y = X_full.CANCELLED
X_full = X_full.drop(columns=['CANCELLED'])

# Preview the predictors-only frame
X_full.head()

In [None]:
# Show the first 10 entries of the target column
y.iloc[:10]

In [None]:
# Hold out 20% of the labelled rows for validation (fixed seed for repeatability)
from sklearn.model_selection import train_test_split

X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X_full,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

# Work on a copy of the test frame so df_test stays as loaded
X_test_full = df_test.copy()

In [None]:
# Collect every column of X_train_full stored with the object dtype;
# these are treated as the categorical columns.
category_cols = [
    name
    for name, dtype in X_train_full.dtypes.items()
    if dtype == "object"
]
category_cols

In [None]:
# Preview the categorical columns of X_train_full for inspection
X_train_full.loc[:, category_cols].head()

In [None]:
# Report the cardinality (number of distinct values) of each categorical
# column, for the training split and for the validation split.
for cols in category_cols:
    train_unique = X_train_full[cols].nunique()
    valid_unique = X_valid_full[cols].nunique()
    print("Unique values in", cols,  "column in training data:", train_unique)
    print("\nUnique values in" , cols,  "column in validation data:", valid_unique)

##### We observe that the cardinality of every categorical column is greater than 10, so one-hot encoding would create a huge number of new columns and run into the curse of dimensionality. For this baseline we therefore proceed with label encoding. In the future we hope to improve the encoding of the categorical columns with the `category_encoders` library, using its count, target and CatBoost encoders.

In [None]:
# Object-typed columns with fewer than 10 distinct training values
# (the cut-off is convenient but arbitrary)
low_cardinality = [
    cname
    for cname, distinct in X_train_full.nunique().items()
    if distinct < 10 and X_train_full[cname].dtype == "object"
]
low_cardinality

In [None]:
# Select categorical columns with relatively high cardinality (convenient but
# arbitrary cut-off). The bound is inclusive so that a column with exactly 10
# distinct values is not missing from both the low- and high-cardinality lists.
high_cardinality = [cname for cname in X_train_full.columns if
                    X_train_full[cname].nunique() >= 10 and
                    X_train_full[cname].dtype == "object"]
high_cardinality

##### We observe that all columns have high cardinality

In [None]:
# A column can be safely label encoded when every value seen in the validation
# AND test splits was also seen in training: the encoder is fitted on the
# training split only and raises on labels it has never seen. Training values
# that happen to be absent from the other splits are harmless, so a subset
# check is used rather than strict set equality.
good_label_cols = []
for col in category_cols:
    known_values = set(X_train_full[col])
    unseen_values = (set(X_valid_full[col]) | set(X_test_full[col])) - known_values
    if not unseen_values:
        good_label_cols.append(col)

# Problematic columns that will be dropped from the dataset
# (kept in the original column order so the printout is deterministic)
bad_label_cols = [col for col in category_cols if col not in good_label_cols]

print('Categorical columns that will be label encoded:', good_label_cols)
print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols)


In [None]:
# Remove the categorical columns that cannot be encoded from all three splits
label_X_train = X_train_full.drop(columns=bad_label_cols)
label_X_valid = X_valid_full.drop(columns=bad_label_cols)
label_X_test = X_test_full.drop(columns=bad_label_cols)

# Preview the reduced training frame
label_X_train.head()

In [None]:
# Label-encode every retained categorical column. The encoder is re-fitted on
# the training values of each column and then applied to all three splits.
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
for col in good_label_cols:
    label_encoder.fit(X_train_full[col])
    label_X_train[col] = label_encoder.transform(X_train_full[col])
    for encoded, raw in ((label_X_valid, X_valid_full), (label_X_test, X_test_full)):
        encoded[col] = label_encoder.transform(raw[col])

label_X_train.head(10)

In [None]:
# Select numerical columns. The label-encoded categorical columns already hold
# integers at this point, so they are excluded explicitly; otherwise they would
# be listed both as categorical and numerical and `my_cols` would contain
# duplicate column names.
numerical_cols = [cname for cname in label_X_train.columns if
                  cname not in good_label_cols and
                  label_X_train[cname].dtype in ['int64', 'float64']]

# Keep selected columns only
categorical_cols = good_label_cols
my_cols = categorical_cols + numerical_cols  # sometime use category_cols instead of good_label_cols for one-hot encoding
X_train = label_X_train[my_cols].copy()
X_valid = label_X_valid[my_cols].copy()
X_test = label_X_test[my_cols].copy()


In [None]:
# Preprocessing building blocks shared by the model cells below
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Numerical data: fill gaps with the column mean.
# (KNN imputation or imputation followed by scaling are possible alternatives.)
numerical_transformer = SimpleImputer(strategy='mean')

# Categorical data: fill gaps with the most frequent value.
# (A one-hot step could be appended to this pipeline as an alternative.)
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
])



### Model using Decision Tree Classifier

In [None]:
# Step 1: Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Step 2: Define the model
from sklearn.tree import DecisionTreeClassifier
model_1 = DecisionTreeClassifier(random_state=0)

# Step 3: Create and evaluate the pipeline
from sklearn.metrics import accuracy_score, average_precision_score, f1_score

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model_1)])

# Fit on the training split. Under-sampling to counter the class imbalance
# (e.g. imblearn's NearMiss) was considered but is not applied here.
my_pipeline.fit(X_train, y_train)

# Hard class predictions for the validation split
preds = my_pipeline.predict(X_valid)

# Average precision summarises the precision-recall curve, so it needs a
# continuous score per row rather than 0/1 predictions. Column 1 of
# predict_proba is the probability of the second class, assumed to be
# CANCELLED == 1.
valid_scores = my_pipeline.predict_proba(X_valid)[:, 1]

# Evaluate the model
score = accuracy_score(y_valid, preds)
avg_precisionScore = average_precision_score(y_valid, valid_scores)

print('Accuracy Score:', score)
print('Average Precision Score:', avg_precisionScore)
print("The macro averaged f1_score is :", f1_score(y_valid, preds, average='macro'))
print("The micro averaged f1_score is :", f1_score(y_valid, preds, average='micro'))
print("The weighted averaged f1_score is :", f1_score(y_valid, preds, average='weighted'))
# Plain (binary) F1 of the positive class, instead of repeating the macro value
print("The  f1_score is :", f1_score(y_valid, preds))

In [None]:
# 3-fold cross-validation of the whole pipeline on the training split
from sklearn.model_selection import cross_val_score

scores = cross_val_score(estimator=my_pipeline, X=X_train, y=y_train, cv=3)

# Report the mean fold score together with twice the standard deviation
print("Accuracy: %0.2f (+/- %0.5f)" % (scores.mean(), 2 * scores.std()))

In [None]:
# Print scikit-learn's classification report for the validation predictions
from sklearn.metrics import classification_report
print (classification_report(y_valid, preds))

In [None]:
# Sensitivity of the model on the validation split (imbalanced-learn), printed
# for the macro, micro and weighted averages and then per class (average=None)
from imblearn.metrics import sensitivity_score

for avg in ('macro', 'micro', 'weighted', None):
    print(sensitivity_score(y_valid, preds, average=avg))


In [None]:
# Specificity of the model on the validation split (imbalanced-learn), printed
# for the macro, micro and weighted averages and then per class (average=None)
from imblearn.metrics import specificity_score

for avg in ('macro', 'micro', 'weighted', None):
    print(specificity_score(y_valid, preds, average=avg))

In [None]:
# Run the fitted pipeline on the test set to get its predictions
preds_test = my_pipeline.predict(X_test)

# Write one row per test record: its index as Id plus the predicted label
output = pd.DataFrame({'Id': X_test.index}).assign(CANCELLED=preds_test)
output.to_csv('submission_DT.csv', index=False)

### Model using Random Forest Classifier

In [None]:
# Step 1: Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Step 2: Define the model
from sklearn.ensemble import RandomForestClassifier
model_2 = RandomForestClassifier(n_estimators=100, random_state=0)

# Step 3: Create and evaluate the pipeline
# (f1_score is imported here as well so the cell does not depend on the
# import made in another model's cell)
from sklearn.metrics import accuracy_score, average_precision_score, f1_score

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model_2)])

# Fit on the training split
my_pipeline.fit(X_train, y_train)

# Hard class predictions for the validation split
preds = my_pipeline.predict(X_valid)

# Average precision summarises the precision-recall curve, so it needs a
# continuous score per row rather than 0/1 predictions. Column 1 of
# predict_proba is the probability of the second class, assumed to be
# CANCELLED == 1.
valid_scores = my_pipeline.predict_proba(X_valid)[:, 1]

# Evaluate the model
score = accuracy_score(y_valid, preds)
avg_precisionScore = average_precision_score(y_valid, valid_scores)

print('Accuracy Score:', score)
print('Average Precision Score:', avg_precisionScore)
print("The macro averaged f1_score is :", f1_score(y_valid, preds, average='macro'))
print("The micro averaged f1_score is :", f1_score(y_valid, preds, average='micro'))
print("The weighted averaged f1_score is :", f1_score(y_valid, preds, average='weighted'))
# Plain (binary) F1 of the positive class, instead of repeating the macro value
print("The  f1_score is :", f1_score(y_valid, preds))

In [None]:
# Cross validation Score
##scores = cross_val_score(my_pipeline, X_train, y_train, cv=5)

# Print the mean score and 95% confidence interval

#print("Accuracy: %0.2f (+/- %0.5f)" % (scores.mean(), scores.std() * 2))

In [None]:
# Print scikit-learn's classification report for the validation predictions
from sklearn.metrics import classification_report
print (classification_report(y_valid, preds))

In [None]:
# Sensitivity of the model on the validation split (imbalanced-learn), printed
# for the macro, micro and weighted averages and then per class (average=None)
from imblearn.metrics import sensitivity_score

for avg in ('macro', 'micro', 'weighted', None):
    print(sensitivity_score(y_valid, preds, average=avg))

In [None]:
# Specificity of the model on the validation split (imbalanced-learn), printed
# for the macro, micro and weighted averages and then per class (average=None)
from imblearn.metrics import specificity_score

for avg in ('macro', 'micro', 'weighted', None):
    print(specificity_score(y_valid, preds, average=avg))

In [None]:
# Run the fitted pipeline on the test set to get its predictions
preds_test = my_pipeline.predict(X_test)

# Write one row per test record: its index as Id plus the predicted label
output = pd.DataFrame({'Id': X_test.index}).assign(CANCELLED=preds_test)
output.to_csv('submission_RF.csv', index=False)

### Model using SVM Classifier (radial kernel)

### Model using Naive Bayes Classifier

In [None]:
# Step 1: Bundle preprocessing for numerical and categorical data.
# For this model the numerical branch imputes and then standardises.
# NOTE: this rebinds the notebook-wide `numerical_transformer`, so any model
# cell re-run after this one will pick up the scaled variant.
numerical_transformer = Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler())])
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Step 2: Define the model
from sklearn.naive_bayes import GaussianNB
model_5 = GaussianNB()

# Step 3: Create and evaluate the pipeline
# (f1_score is imported here as well so the cell does not depend on the
# import made in another model's cell)
from sklearn.metrics import accuracy_score, average_precision_score, f1_score

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model_5)])

# Fit on the training split
my_pipeline.fit(X_train, y_train)

# Hard class predictions for the validation split
preds = my_pipeline.predict(X_valid)

# Average precision summarises the precision-recall curve, so it needs a
# continuous score per row rather than 0/1 predictions. Column 1 of
# predict_proba is the probability of the second class, assumed to be
# CANCELLED == 1.
valid_scores = my_pipeline.predict_proba(X_valid)[:, 1]

# Evaluate the model
score = accuracy_score(y_valid, preds)
avg_precisionScore = average_precision_score(y_valid, valid_scores)

print('Accuracy Score:', score)
print('Average Precision Score:', avg_precisionScore)
print("The macro averaged f1_score is :", f1_score(y_valid, preds, average='macro'))
print("The micro averaged f1_score is :", f1_score(y_valid, preds, average='micro'))
print("The weighted averaged f1_score is :", f1_score(y_valid, preds, average='weighted'))
# Plain (binary) F1 of the positive class, instead of repeating the macro value
print("The  f1_score is :", f1_score(y_valid, preds))

In [None]:
# 5-fold cross-validation of the whole pipeline on the training split
scores = cross_val_score(estimator=my_pipeline, X=X_train, y=y_train, cv=5)

# Report the mean fold score together with twice the standard deviation
print("Accuracy: %0.2f (+/- %0.5f)" % (scores.mean(), 2 * scores.std()))

In [None]:
# Print scikit-learn's classification report for the validation predictions
from sklearn.metrics import classification_report
print (classification_report(y_valid, preds))

In [None]:
# Sensitivity of the model on the validation split (imbalanced-learn), printed
# for the macro, micro and weighted averages and then per class (average=None)
from imblearn.metrics import sensitivity_score

for avg in ('macro', 'micro', 'weighted', None):
    print(sensitivity_score(y_valid, preds, average=avg))

In [None]:
# Specificity of the model on the validation split (imbalanced-learn), printed
# for the macro, micro and weighted averages and then per class (average=None)
from imblearn.metrics import specificity_score

for avg in ('macro', 'micro', 'weighted', None):
    print(specificity_score(y_valid, preds, average=avg))

In [None]:
# Run the fitted pipeline on the test set to get its predictions
preds_test = my_pipeline.predict(X_test)

# Write one row per test record: its index as Id plus the predicted label
output = pd.DataFrame({'Id': X_test.index}).assign(CANCELLED=preds_test)
output.to_csv('submission_NB.csv', index=False)