In [9]:
import os
from kaggle.api.kaggle_api_extended import KaggleApi
import pandas as pd
import numpy as np
from pyspark.sql.types import StructField, StringType, StructType
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id
import matplotlib.pyplot as plt
import time as tm
import itertools

In [4]:
# --- Run configuration --------------------------------------------------

# Name of the problem being solved.
# NOTE(review): presumably the label column to predict; it is not referenced
# in the visible cells, so confirm against the cells that consume it.
problem_to_solve = 'CANCELLED'

# Backend switch: True runs the PySpark code paths, False the pandas ones.
usePyspark = False
# Inserted between "local[" and "]" when the Spark master URL is built.
worker_nodes = "*"

# Folder used for the raw CSV files and for the generated artefacts.
path = './data'

# Forwarded to get_dataset() as `limit` (-1 would disable the cap).
dataset_limit = 10000
# Forwarded to get_dataset() as `allFrames`.
use_all_dataset_frames = True

# DA SCRIVERE
- perche' usiamo i dataframe invece degli rdd
- aggiungere k fold cross validation
- aggiungere griglia parametri
- aggiungere label stratification
- aggiungere performance modello pyspark
- aggiungere check e info extra su dataset di base (es sbilanciamento)
- AUROC, AUPRC, F1
- confronto con tree classifier

## Data Download

In [None]:
# Kaggle credentials must not be stored in the notebook: anyone who can read
# the file could use the API key. The Kaggle client reads them from the
# KAGGLE_USERNAME / KAGGLE_KEY environment variables or from
# ~/.kaggle/kaggle.json, so configure them outside the notebook.
# NOTE(review): the key that used to be hard-coded here has been exposed and
# should be revoked from the Kaggle account settings.
if not (os.environ.get('KAGGLE_USERNAME') and os.environ.get('KAGGLE_KEY')):
    print("KAGGLE_USERNAME / KAGGLE_KEY are not set; "
          "the Kaggle client will fall back to ~/.kaggle/kaggle.json")

# Kaggle dataset identifier in "owner/dataset-name" form.
dataset = 'yuanyuwendymu/airline-delay-and-cancellation-data-2009-2018'
# Local folder that receives the unzipped files.
path = './data'

In [None]:
def download_dataset():
    '''
    Downloads and unzips the Kaggle dataset `dataset` into the folder `path`.

    The folder (including missing parents) is created when needed. The
    download is skipped when the folder already holds a raw CSV file; files
    that are not raw data (for example the timing log or the cached
    "preprocessed*" output) do not count, otherwise a stray file would
    prevent the download forever.

    The download is best effort: any error raised by the Kaggle client is
    reported on stdout together with its cause and is not re-raised.
    '''
    os.makedirs(path, exist_ok=True)

    has_raw_csv = any(
        entry.endswith('.csv') and not entry.startswith('preprocessed')
        for entry in os.listdir(path)
    )
    if has_raw_csv:
        return

    try:
        api = KaggleApi()
        api.authenticate()
        api.dataset_download_files(dataset, path, unzip=True, quiet=False)
    except Exception as error:
        # Catch Exception (not a bare except) so Ctrl-C still interrupts,
        # and show the cause instead of hiding it.
        print("Error downloading the dataset: " + str(error))

## Data Loading

In [None]:
# Columns kept from every raw CSV file, in selection order.
columns_to_get = [
    'FL_DATE', 'OP_CARRIER', 'ORIGIN', 'DEST',
    'CRS_DEP_TIME', 'CRS_ARR_TIME',
    'CANCELLED', 'DIVERTED',
    'CRS_ELAPSED_TIME', 'DISTANCE',
]

# Spark schema for the same columns: every field is a nullable string.
dataframe_schema = StructType(
    [StructField(column_name, StringType(), True) for column_name in columns_to_get]
)

# The Spark session is only created when the PySpark backend is selected.
if usePyspark:
    spark = (
        SparkSession.builder
        .appName("Airline Departure")
        .master('local[' + worker_nodes + ']')
        .getOrCreate()
    )

In [None]:
def load_dataset(usePyspark: bool):
    '''
    Reads the cached preprocessed dataset from the folder `path`.

    With `usePyspark` the CSV folder "<path>/preprocessed" is read through the
    global Spark session (header row enabled); otherwise the single file
    "<path>/preprocessed.csv" is read with pandas.
    '''
    if not usePyspark:
        data = pd.read_csv(filepath_or_buffer=path + '/' + 'preprocessed.csv')
    else:
        csv_reader = spark.read.format("csv").option("header", True)
        data = csv_reader.load(path + '/preprocessed')

    print('Preprocessed dataset loaded')
    return data

def save_dataset(data, usePyspark: bool):
    '''
    Writes the preprocessed dataset as CSV inside the folder `path`.

    With `usePyspark` the frame is written to the folder "<path>/preprocessed"
    (header row, comma separator, overwriting previous output); otherwise it
    is written with pandas to "<path>/preprocessed.csv" without the index.
    '''
    if usePyspark:
        csv_writer = data.write.format('csv').option('header', True)
        csv_writer = csv_writer.mode('overwrite').option('sep', ',')
        csv_writer.save(path + '/preprocessed')
    else:
        data.to_csv(path_or_buf=path + '/' + 'preprocessed.csv', index=False)
    print('Preprocessed dataset saved')

def check_preprocessed_data_exists(directory: str = './data') -> bool:
    '''
    Tells whether `directory` contains an entry whose name starts with
    "preprocessed" (the cached CSV file or the Spark output folder).

    A missing directory means nothing has been cached yet, so False is
    returned instead of letting os.listdir raise FileNotFoundError on a
    fresh checkout.
    '''
    if not os.path.isdir(directory):
        return False
    return any(entry.startswith('preprocessed') for entry in os.listdir(directory))

def get_dataset(limit: float = -1, allFrames: bool = True, usePyspark: bool = False):
    '''
    Loads the raw CSV files found in the folder `path` into one frame that
    contains only the columns listed in `columns_to_get`.

    limit:      maximum number of rows taken from each file; -1 keeps every
                row. A file with fewer rows than `limit` contributes all of
                its rows.
    allFrames:  when False only the first CSV file (in alphabetical order)
                is loaded.
    usePyspark: selects the PySpark backend instead of pandas.

    Returns a PySpark DataFrame with an extra "index" column when
    `usePyspark` is set, otherwise a pandas DataFrame (empty when the folder
    has no raw CSV file).
    '''
    # Keep raw CSV inputs only: other files (timing log, cached
    # "preprocessed*" output) must not be loaded nor chosen as "first file".
    # Sorting makes the choice deterministic, os.listdir order is arbitrary.
    csv_files = sorted(
        f for f in os.listdir(path)
        if f.endswith('.csv') and not f.startswith('preprocessed')
    )
    if not allFrames:
        csv_files = csv_files[:1]

    if usePyspark:
        big_frame = spark.createDataFrame(
            spark.sparkContext.emptyRDD(), schema=dataframe_schema)
        for f in csv_files:
            frame = spark.read.option("header", True).csv(path + '/' + f)
            frame = frame.select(columns_to_get)
            frame = frame.sample(fraction=1.0, withReplacement=False)
            if limit != -1:
                frame = frame.limit(int(limit))
            big_frame = frame.union(big_frame)

        big_frame = big_frame.withColumn("index", monotonically_increasing_id())
        # NOTE(review): the count is discarded; presumably it is here only to
        # force Spark to evaluate the plan -- confirm it is still wanted.
        big_frame.count()
        return big_frame

    frames = []
    for f in csv_files:
        frame = pd.read_csv(filepath_or_buffer=path + '/' + f,
                            usecols=columns_to_get)
        if limit != -1:
            # sample() raises when n exceeds the number of rows available.
            frame = frame.sample(n=min(int(limit), len(frame)), replace=False)
        frames.append(frame)

    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of growing the frame file by file.
    return pd.concat(frames)


## Preprocessing

In [None]:
# Replacement values for missing entries, keyed by column name.
default_values = {'CANCELLED': 0, 'DIVERTED': 0}

# Column dropped when the problem to solve is the cancellation one.
columns_to_remove_for_canceled = [
    'DIVERTED',  # the flight has been diverted to an unplanned airport
]

# Column dropped when the problem to solve is the diversion one.
columns_to_remove_for_diverted = [
    'CANCELLED',  # the flight has been cancelled
]

# Columns grouped by the kind of conversion they need.
names_columns_to_convert = ['OP_CARRIER', 'ORIGIN', 'DEST']
date_columns_to_convert = ['FL_DATE']
time_columns_to_convert = ['CRS_DEP_TIME', 'CRS_ARR_TIME', 'CRS_ELAPSED_TIME']
numeric_columns_to_convert = ['DISTANCE']
string_columns_to_convert = ['CANCELLED', 'DIVERTED']

# Every column of the cached preprocessed dataset: the groups above, in that
# order, followed by the row index column.
preprocess_columns_to_convert = (
    names_columns_to_convert
    + date_columns_to_convert
    + time_columns_to_convert
    + numeric_columns_to_convert
    + string_columns_to_convert
    + ['index']
)

# NOTE(review): presumably the largest DISTANCE value, used to scale that
# column -- confirm against the distance conversion code.
max_distance = 4970

In [1]:
# The log lives in ./data, which may not exist yet on a fresh checkout:
# create it first so that opening the file cannot fail.
os.makedirs("./data", exist_ok=True)
# Timing log, opened in append mode so earlier runs are kept.
time_file = open("./data/times.txt", "a")

def print_and_save_time(s: str):
    '''
    Appends the line `s` to the timing log and prints it.

    The file is flushed after every line: the handle stays open for the
    whole kernel session, so without a flush the timings would sit in the
    write buffer and be lost if the kernel is killed.
    '''
    time_file.write(s + '\n')
    time_file.flush()
    print(s)


def _timed_step(label: str, step, *args):
    '''
    Runs step(*args), logs "<label> concluded: <elapsed> seconds" and returns
    the result of the step.
    '''
    started = tm.time()
    result = step(*args)
    print_and_save_time(label + " concluded: " +
                        str(tm.time() - started) + " seconds")
    return result


def common_preprocess(data: "ps.DataFrame | pd.DataFrame", usePyspark: bool) -> "ps.DataFrame | pd.DataFrame":
    '''
    Applies the preprocessing shared by every problem and returns the new
    frame; the frame received as argument is not modified.

    Steps, each one timed and logged through print_and_save_time:
      1. missing values: columns listed in `default_values` are filled with
         their default, then every row that still has a missing value is
         dropped;
      2. names, dates, times and distance conversions;
      3. (PySpark only) strings conversion.

    The annotations are strings on purpose: `ps` is not imported in the
    visible cells, so evaluating them at definition time would fail.
    '''
    common_start_time = tm.time()

    # Fill only the columns that have a documented default. Filling every
    # column with 0 would leave nothing for dropna to remove and would turn
    # missing times/distances into fake zero values.
    data = data.fillna(value=default_values)
    if usePyspark:
        data = data.dropna(how='any')
    else:
        data = data.dropna(how='any', axis='index')

    null_removal_finish_time = tm.time() - common_start_time
    print_and_save_time("Null values removal concluded: " +
                        str(null_removal_finish_time) + " seconds")

    data = _timed_step("Names conversion",
                       convert_names_into_numbers, data, usePyspark)
    data = _timed_step("Dates conversion",
                       convert_dates_into_numbers, data, usePyspark)
    data = _timed_step("Times conversion",
                       convert_times_into_numbers, data, usePyspark)
    data = _timed_step("Distance conversion",
                       convert_distance_into_numbers, data, usePyspark)

    if usePyspark:
        data = _timed_step("Strings conversion",
                           convert_strings_into_numbers, data)

    common_finish_time = tm.time() - common_start_time
    print_and_save_time("Common preprocessing concluded: " +
                        str(common_finish_time) + " seconds")
    return data

# Build the preprocessed dataset once and cache it; later runs reload the
# cached copy instead of downloading and converting the raw files again.
if not check_preprocessed_data_exists():
    download_dataset()

    start_time = tm.time()
    data = get_dataset(dataset_limit, use_all_dataset_frames, usePyspark)
    finish_time = tm.time() - start_time
    print_and_save_time("Dataset reading concluded: " +
                        str(finish_time) + " seconds")

    data = common_preprocess(data, usePyspark)
    # save_dataset / load_dataset are defined in this notebook; the former
    # "read." prefix pointed to a module that is not imported here.
    save_dataset(data, usePyspark)
else:
    data = load_dataset(usePyspark)
    if usePyspark:
        # The CSV reader returns every column as a string: cast them back to
        # double with a native column cast (no Python UDF and no extra
        # imports needed).
        for c in preprocess_columns_to_convert:
            data = data.withColumn(c, data[c].cast('double'))

# NOTE(review): remove_extra_columns, split_data, index and split_number are
# not defined in the visible cells -- confirm they exist before this cell.
data = remove_extra_columns(index, data, usePyspark)

start_time = tm.time()
preprocessing_splits = split_data(data, usePyspark, index, split_number)
finish_time = tm.time() - start_time
print_and_save_time("Dataset splitting concluded: " +
                    str(finish_time) + " seconds")

# Preprocess

## Da aggiungere Z score normalization

## Models

### Generic Functions

In [2]:
def sigmoid(x):
    '''
    Returns the logistic function 1 / (1 + e^(-x)) of x, element-wise when x
    is an array.
    '''
    exp_of_negated = np.exp(-x)
    return 1.0 / (exp_of_negated + 1.0)

def binary_cross_entropy(y, y_label, w, l2):
    '''
    Calculates the binary cross entropy loss of the calculated y and the given
    y_label, plus the L2 regularization term of the weights w.

    y:       predicted probabilities
    y_label: true labels (0 or 1)
    w:       model weights, only used for the regularization term
    l2:      regularization strength

    Predictions are clipped away from exactly 0 and 1 before the logarithm:
    a saturated sigmoid would otherwise produce log(0) and turn the loss into
    inf or NaN.
    '''
    eps = 1e-15
    y = np.clip(y, eps, 1.0 - eps)
    loss = -np.mean(y_label * np.log(y) + (1 - y_label)
                    * np.log(1 - y)) + regularize(w, l2)
    return loss

def regularize(W, l2):
    '''
    Returns the L2 penalty of the weights: (l2 / 2) times the sum of the
    squared entries of W.
    '''
    sum_of_squares = np.square(W).sum()
    return (l2 / 2) * sum_of_squares

### Numpy Model

In [None]:
class LogisticRegression():
    '''
    Binary logistic regression trained with mini-batch gradient descent and
    an L2 penalty on the weights.
    '''

    def __init__(self, learning_rate: float, batch_size: int, l2: float):
        '''
        learning_rate: step size of every gradient update
        batch_size:    number of rows in each mini-batch
        l2:            strength of the L2 penalty on the weights
        '''
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.l2 = l2

    def initialize(self, columns_number):
        '''
        Draws the initial weights (one per column) and the bias uniformly
        from [0, 1).
        '''
        self.W = np.random.rand(columns_number)
        self.b = np.random.rand()

    def evaluate(self, X):
        '''
        Returns the predicted probability of every row of X.
        '''
        Z = np.dot(X, self.W) + self.b
        Z = sigmoid(Z)
        return Z

    def gradient(self, X, Y, Y_label):
        '''
        Calculates the gradient w.r.t weights and bias

        X:       batch of samples, one row per sample
        Y:       predictions for the batch
        Y_label: true labels of the batch
        '''

        # Number of training examples.
        m = X.shape[0]

        # Gradient of loss w.r.t weights with regularization
        dw = (1/m)*np.dot(X.T, (Y - Y_label)) + self.l2 * self.W

        # Gradient of loss w.r.t bias (the bias is not regularized)
        db = (1/m)*np.sum((Y - Y_label))

        return dw, db

    def update(self, dW, db):
        '''
        Moves weights and bias one learning-rate step against the gradient.
        '''
        self.W = self.W - self.learning_rate * dW
        self.b = self.b - self.learning_rate * db

    def train(self, X, Y_labels, iterations = 10):
        '''
        Re-initializes the model and trains it for `iterations` passes over
        the data, updating the parameters after every mini-batch.

        Every row is used: the last batch of a pass is simply smaller when
        the number of rows is not a multiple of batch_size (this also covers
        a batch_size larger than the dataset).

        Returns (losses, gradients): for every pass, the mean batch loss and
        the mean of the entries of the batch weight gradients.

        Raises ValueError when X has no rows.
        '''
        n_samples = X.shape[0]
        if n_samples == 0:
            raise ValueError("cannot train on an empty dataset")

        self.initialize(X.shape[1])
        losses = []
        gradients = []

        for _ in range(iterations):
            epoch_losses = []
            epoch_gradients = []
            for start in range(0, n_samples, self.batch_size):
                stop = start + self.batch_size
                b_X = X[start:stop, :]
                b_Y_labels = Y_labels[start:stop]
                Y = self.evaluate(b_X)
                epoch_losses.append(binary_cross_entropy(
                    Y, b_Y_labels, self.W, self.l2))
                (dW, db) = self.gradient(b_X, Y, b_Y_labels)
                epoch_gradients.append(dW)
                self.update(dW, db)
            losses.append(np.mean(epoch_losses))
            gradients.append(np.mean(epoch_gradients))

        return (losses, gradients)

### PySpark Model

## Experiments


In [None]:
def make_roc(labels, results, name):
    '''
    Plots the ROC curve of a binary classifier and saves it as
    "./data/<name>_roc.png".

    labels:  true labels (0 or 1)
    results: predicted scores, higher meaning "more likely positive"
    name:    prefix of the image file name

    The samples are ranked from the highest to the lowest score, so the k-th
    point of the curve is the (FPR, TPR) pair obtained when the k best-scored
    samples are predicted positive. Ranking in ascending order would instead
    draw the curve of the inverted classifier.

    NOTE: the rates are undefined (division by zero) when `labels` contains
    a single class.
    '''
    ranked = sorted(zip(labels, results),
                    key=lambda pair: pair[1], reverse=True)

    labels_by_score = np.array([label for (label, _) in ranked])
    length = labels_by_score.size

    # Positives / negatives found among the k best-scored samples.
    true_positives = labels_by_score.cumsum()
    num_positive = true_positives[-1]
    false_positives = np.arange(1.0, length + 1, 1.) - true_positives

    true_positives_rate = true_positives / num_positive
    false_positives_rate = false_positives / (length - num_positive)

    fig, ax = plt.subplots()
    ax.set_xlim(-.05, 1.05)
    ax.set_ylim(-.05, 1.05)
    ax.set_ylabel('True Positive Rate (Sensitivity)')
    ax.set_xlabel('False Positive Rate (1 - Specificity)')
    ax.plot(false_positives_rate, true_positives_rate,
            color='#8cbfd0', linestyle='-', linewidth=3.)
    # Diagonal: performance of a random classifier.
    ax.plot((0., 1.), (0., 1.), linestyle='--',
            color='#d6ebf2', linewidth=2.)

    fig.savefig('./data/{}_roc.png'.format(name))
    fig.clear()
    plt.close(fig)

def plot_loss_gradient(iterations, train_losses, gradients, name):
    '''
    Draws the loss and gradient curves over the iterations on one chart
    titled `name` and saves it as "./data/<name>.png".
    '''
    fig, ax = plt.subplots()
    ax.set_title(name)
    ax.set_xlabel('Iterations')
    ax.set_ylabel('Loss/Gradient')

    curves = ((train_losses, 'Loss'), (gradients, 'Gradient'))
    for values, legend_label in curves:
        ax.plot(range(iterations), values, label=legend_label)

    ax.grid()
    ax.legend()

    fig.savefig("./data/{}.png".format(name))
    fig.clear()
    plt.close()

## Hyperparameter Tuning

In [12]:
# Search space: number of iterations, learning rate and L2 strength.
grid = {
    'iter': [100, 200, 500],
    'lr': [0.001, 0.01, 0.1],
    'l2': [0, 0.1, 0.001],
}

# Every (iter, lr, l2) combination; the last parameter varies fastest.
params = [
    (iterations_value, lr_value, l2_value)
    for iterations_value in grid['iter']
    for lr_value in grid['lr']
    for l2_value in grid['l2']
]

params

[(100, 0.001, 0),
 (100, 0.001, 0.1),
 (100, 0.001, 0.001),
 (100, 0.01, 0),
 (100, 0.01, 0.1),
 (100, 0.01, 0.001),
 (100, 0.1, 0),
 (100, 0.1, 0.1),
 (100, 0.1, 0.001),
 (200, 0.001, 0),
 (200, 0.001, 0.1),
 (200, 0.001, 0.001),
 (200, 0.01, 0),
 (200, 0.01, 0.1),
 (200, 0.01, 0.001),
 (200, 0.1, 0),
 (200, 0.1, 0.1),
 (200, 0.1, 0.001),
 (500, 0.001, 0),
 (500, 0.001, 0.1),
 (500, 0.001, 0.001),
 (500, 0.01, 0),
 (500, 0.01, 0.1),
 (500, 0.01, 0.001),
 (500, 0.1, 0),
 (500, 0.1, 0.1),
 (500, 0.1, 0.001)]

In [None]:
class Evaluator:
    '''
    Placeholder for the hyper-parameter evaluation logic; nothing is
    implemented yet.
    '''

    def __init__(self, params):
        '''
        Accepts `params` without storing or using it.
        '''
        return None

