In [9]:
import os
from kaggle.api.kaggle_api_extended import KaggleApi
import pandas as pd
import numpy as np
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, FloatType
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id, col, udf, rand
import matplotlib.pyplot as plt
import math
import pyspark.sql as ps
from zlib import crc32
import time as tm
from datetime import datetime as dt
import itertools
from dataclasses import dataclass
from pyspark.sql import functions as F
from pyspark.rdd import RDD
from pyspark.broadcast import Broadcast
import findspark

In [10]:
path = './data'
worker_nodes = "*"
problem_to_solve = 'CANCELLED'

dataset_limit = 10000
use_all_dataset_frames = True
fold_number = 10

# DA SCRIVERE
- perche' usiamo i dataframe invece degli rdd nella prima parte
- aggiungere k fold cross validation
- aggiungere griglia parametri
- aggiungere label stratification
- aggiungere performance modello pyspark
- aggiungere check e info extra su dataset di base (es sbilanciamento)
- auroc, auprc, f1, 
- confronto con tree classifier

## Data Download

In [11]:
os.environ['KAGGLE_USERNAME'] = "davidetricella"
os.environ['KAGGLE_KEY'] = "e1ab3aae4a07f36b37a3a8bace74d9df"


dataset = 'yuanyuwendymu/airline-delay-and-cancellation-data-2009-2018'
path = './data'

In [12]:
def download_dataset():
    if not os.path.isdir(path):
        os.mkdir(path)
    if not os.listdir(path):
        try:
            api = KaggleApi()
            api.authenticate()
            api.dataset_download_files(dataset, path, unzip=True, quiet=False)
        except:
            print("Error downloading the dataset")

## Data Loading

In [13]:
dataframe_schema = StructType([
    StructField('FL_DATE', StringType(), True),
    StructField('OP_CARRIER', StringType(), True),
    StructField('ORIGIN', StringType(), True),
    StructField('DEST', StringType(), True),
    StructField('CRS_DEP_TIME', StringType(), True),
    StructField('CRS_ARR_TIME', StringType(), True),
    StructField('CANCELLED', StringType(), True),
    StructField('DIVERTED', StringType(), True),
    StructField('CRS_ELAPSED_TIME', StringType(), True),
    StructField('DISTANCE', StringType(), True)
])

columns_to_get = [
    'FL_DATE',
    'OP_CARRIER',
    'ORIGIN',
    'DEST',
    'CRS_DEP_TIME',
    'CRS_ARR_TIME',
    'CANCELLED',
    'DIVERTED',
    'CRS_ELAPSED_TIME',
    'DISTANCE'
]

findspark.init()
findspark.find()

spark = SparkSession.builder \
.appName("Airline Departure") \
.master('local[' + worker_nodes + ']') \
.getOrCreate()

context = spark.sparkContext

In [14]:
def load_dataset():
    data = spark.read.format("csv") \
        .option("header", True) \
        .load(path + '/preprocessed')

    print('Preprocessed dataset loaded')
    return data

def save_dataset(data):
    data.write.format('csv').option('header', True).mode('overwrite').option(
        'sep', ',').save(path + '/preprocessed')
    print('Preprocessed dataset saved')

def check_preprocessed_data_exists() -> bool:
    files = os.listdir('./data')
    for f in files:
        if f.startswith('preprocessed'):
            return True
    return False

def get_dataset(limit: float = -1, allFrames: bool = True):
    files = os.listdir(path)
    big_frame = spark.createDataFrame(
        spark.sparkContext.emptyRDD(), schema=dataframe_schema)
    if not allFrames:
        files = [files[0]]

    for f in files:
        if f.endswith('.csv'):
            frame = spark.read.option("header", True).csv(path + '/' + f)
            frame = frame.select(columns_to_get)
            frame = frame.orderBy(rand())

            if limit != -1:
                frame = frame.limit(limit)

            big_frame = frame.union(big_frame)

    big_frame = big_frame.select(
        "*").withColumn("index", monotonically_increasing_id())
    big_frame.count()

    return big_frame


## Preprocessing

In [15]:
columns_to_remove_for_canceled = [
    'DIVERTED',  # the flight has been diverted to an unplanned airport
]

columns_to_remove_for_diverted = [
    'CANCELLED',  # the flight has been cancelled
]

preprocess_columns_to_convert = [
    'OP_CARRIER',
    'ORIGIN',
    'DEST',
    'FL_DATE',
    'CRS_DEP_TIME',
    'CRS_ARR_TIME',
    'CRS_ELAPSED_TIME',
    'DISTANCE',
    'CANCELLED',
    'DIVERTED',
    'index'
]

max_distance = 4970

In [16]:
#CHARTS PLOTTING
def plot_balancing_chart(data: ps.DataFrame, label: str):
  total_positives = data.filter(col(label) == 1).count()
  total_negatives = data.filter(col(label) == 0).count()
  fig, ax = plt.subplots()

  labels = ['REGULAR', label]
  counts = [total_negatives, total_positives]
  bar_colors = ['tab:blue', 'tab:red']

  ax.bar(labels, counts, color=bar_colors)

  ax.set_ylabel('Counts')
  ax.set_title('Regular flights and problematic flights counts')

  plt.show()


In [17]:
#PREPROCESSING EXECUTION
def print_and_save_time(s: str):
  time_file.write(s + '\n')
  print(s)
time_file = open("./data/times.txt", "w")

In [18]:
#DATASET READING
download_dataset()
start_time = tm.time()
data = get_dataset(dataset_limit, use_all_dataset_frames)
print("Dataframe rows: " + str(data.count()))

finish_time = tm.time() - start_time
print_and_save_time("Dataset reading concluded: " +
                    str(finish_time) + " seconds")

[Stage 43:>                                                       (0 + 16) / 16]

Dataframe rows: 110000
Dataset reading concluded: 25.13623833656311 seconds


                                                                                

In [19]:
# Nan ROWS DROPPING
common_start_time = tm.time()

data = data.dropna(how='any')
print("Dataframe rows after NaN dropping: " + str(data.count()))

null_removal_finish_time = tm.time() - common_start_time
print_and_save_time("Null values removal concluded: " +
                    str(null_removal_finish_time) + " seconds")



Dataframe rows after NaN dropping: 110000
Null values removal concluded: 26.10139751434326 seconds


                                                                                

In [20]:
#DATAFRAME BALANCING
start_time = tm.time()
irregular_flights = data.filter(col(problem_to_solve) == 1)

regular_flights = data.filter(col(problem_to_solve) == 0)

data = regular_flights.limit(irregular_flights.count()).\
    union(irregular_flights).\
    orderBy(rand())
print("Balanced dataframe rows: " + str(data.count()))

finish_time = tm.time() - start_time
print_and_save_time("Dataset balancing concluded: " +
                    str(finish_time) + " seconds")


[Stage 134:===>                                                   (1 + 15) / 16]

Balanced dataframe rows: 13164
Dataset balancing concluded: 75.5889539718628 seconds


                                                                                

In [21]:
#COLUMN CONVERSIONS
columns_start_time = tm.time()

@udf(returnType=DoubleType())
def str_to_float(s: str):
  encoding = "utf-8"
  b = s.encode(encoding)
  return float(crc32(b) & 0xffffffff) / 2**32

date_multiplier: float = 1 / 365
@udf(returnType=DoubleType())
def date_to_day_of_year(date_string) -> float:
  date = dt.strptime(date_string, "%Y-%m-%d")
  day = date.timetuple().tm_yday - 1
  return day * date_multiplier

@udf(returnType=DoubleType())
def time_to_interval(time) -> float:
  t = int(float(time))
  h = t // 100
  m = t % 100
  t = h * 60 + m
  return float(t / 1140)

distance_multiplier = float(1) / float(max_distance)

data = data.select(
  (data.CANCELLED.cast('double')).alias("CANCELLED"),
  (data.DIVERTED.cast('double')).alias("DIVERTED"),
  str_to_float(data.OP_CARRIER).alias("OP_CARRIER"),
  str_to_float(data.ORIGIN).alias("ORIGIN"),
  str_to_float(data.DEST).alias("DEST"),
  date_to_day_of_year(data.FL_DATE).alias("FL_DATE"),
  time_to_interval(data.CRS_DEP_TIME).alias("CRS_DEP_TIME"),
  time_to_interval(data.CRS_ARR_TIME).alias("CRS_ARR_TIME"),
  time_to_interval(data.CRS_ELAPSED_TIME).alias("CRS_ELAPSED_TIME"),
  (data.DISTANCE.cast('double') * distance_multiplier).alias("DISTANCE")
)
data.count()

columns_finish_time = tm.time() - columns_start_time
print_and_save_time("Columns conversion concluded: " +
                    str(columns_finish_time) + " seconds")




Columns conversion concluded: 48.093705892562866 seconds


                                                                                

In [22]:
data.printSchema()
data.show()

root
 |-- CANCELLED: double (nullable = true)
 |-- DIVERTED: double (nullable = true)
 |-- OP_CARRIER: double (nullable = true)
 |-- ORIGIN: double (nullable = true)
 |-- DEST: double (nullable = true)
 |-- FL_DATE: double (nullable = true)
 |-- CRS_DEP_TIME: double (nullable = true)
 |-- CRS_ARR_TIME: double (nullable = true)
 |-- CRS_ELAPSED_TIME: double (nullable = true)
 |-- DISTANCE: double (nullable = true)



                                                                                

22/11/26 18:36:31 ERROR Executor: Exception in task 0.0 in stage 314.0 (TID 1744)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/tmp/ipykernel_4634/1496745023.py", line 13, in date_to_day_of_year
  File "/usr/lib/python3.9/_strptime.py", line 568, in _strptime_datetime
    tt, fraction, gmtoff_fraction = _strptime(data_string, format)
  File "/usr/lib/python3.9/_strptime.py", line 349, in _strptime
    raise ValueError("time data %r does not match format %r" %
ValueError: time data '0.5917808219178082' does not match format '%Y-%m-%d'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:559)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:86)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:68)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:512)
	at org.apache.s

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/tmp/ipykernel_4634/1496745023.py", line 13, in date_to_day_of_year
  File "/usr/lib/python3.9/_strptime.py", line 568, in _strptime_datetime
    tt, fraction, gmtoff_fraction = _strptime(data_string, format)
  File "/usr/lib/python3.9/_strptime.py", line 349, in _strptime
    raise ValueError("time data %r does not match format %r" %
ValueError: time data '0.5917808219178082' does not match format '%Y-%m-%d'


In [23]:
save_dataset(data)

Py4JJavaError: An error occurred while calling o423.save.
: ExitCodeException exitCode=1: chmod: changing permissions of '/mnt/c/Users/manue/Home/University/AirlineDeparture/data/preprocessed': Operation not permitted

	at org.apache.hadoop.util.Shell.runCommand(Shell.java:1007)
	at org.apache.hadoop.util.Shell.run(Shell.java:900)
	at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:1212)
	at org.apache.hadoop.util.Shell.execCommand(Shell.java:1306)
	at org.apache.hadoop.util.Shell.execCommand(Shell.java:1288)
	at org.apache.hadoop.fs.RawLocalFileSystem.setPermission(RawLocalFileSystem.java:978)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkOneDirWithMode(RawLocalFileSystem.java:660)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:700)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:672)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:699)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:672)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:699)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:672)
	at org.apache.hadoop.fs.ChecksumFileSystem.mkdirs(ChecksumFileSystem.java:788)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.setupJob(FileOutputCommitter.java:356)
	at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.setupJob(HadoopMapReduceCommitProtocol.scala:188)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:209)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:186)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:111)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:125)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:109)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:169)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:95)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:94)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:584)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:176)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:584)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:560)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:94)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:81)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:79)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:116)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:860)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:390)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:363)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:239)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)


In [None]:
data.printScheme()

In [None]:
plot_balancing_chart(data, problem_to_solve)

### Z Score Normalization

In [None]:
z_start_time = tm.time()
column_list = data.columns
column_mean_dict = dict()
column_stddv_dict = dict()

for c in column_list:
    column_mean_dict[c] = data.agg({c: 'mean'}).head()[0]
    column_stddv_dict[c] = data.agg({c: 'stddev'}).head()[0]

data = data.select(
  problem_to_solve,

  ((data.OP_CARRIER - column_mean_dict["OP_CARRIER"]) / column_stddv_dict["OP_CARRIER"]).alias('OP_CARRIER'),

  ((data.ORIGIN - column_mean_dict["ORIGIN"]) / column_stddv_dict["ORIGIN"]).alias('ORIGIN'),

  ((data.DEST - column_mean_dict["DEST"]) / column_stddv_dict["DEST"]).alias('DEST'),

  ((data.FL_DATE - column_mean_dict["FL_DATE"]) / column_stddv_dict["FL_DATE"]).alias('FL_DATE'),

  ((data.CRS_DEP_TIME - column_mean_dict["CRS_DEP_TIME"]) / column_stddv_dict["CRS_DEP_TIME"]).alias('CRS_DEP_TIME'),

  ((data.CRS_ARR_TIME - column_mean_dict["CRS_ARR_TIME"]) /  column_stddv_dict["CRS_ARR_TIME"]).alias('CRS_ARR_TIME'),

  ((data.CRS_ELAPSED_TIME - column_mean_dict["CRS_ELAPSED_TIME"]) / column_stddv_dict["CRS_ELAPSED_TIME"]).alias('CRS_ELAPSED_TIME'),

  ((data.DISTANCE - column_mean_dict["DISTANCE"]) / column_stddv_dict["DISTANCE"]).alias('DISTANCE'),
)
data.count()

z_finish_time = tm.time() - z_start_time
print_and_save_time("Z score normalization concluded: " +
                    str(z_finish_time) + " seconds")


In [18]:
#DATA SPLITTING
start_time = tm.time()
split_list = []

#data = data.dropDuplicates()
#print("Dataframe rows after duplicates dropping: " + str(data.count()))

k_elements_half_number = math.floor((data.count() / fold_number) / 2)
print("Batch elements: " + str(k_elements_half_number))

i = 0
while i < fold_number:
    k_positive_sample = data.where(
        col(problem_to_solve) == 1).limit(k_elements_half_number)

    k_negative_sample = data.where(
        col(problem_to_solve) == 0).limit(k_elements_half_number)

    k_sample = k_positive_sample.union(k_negative_sample)
    print("Total k sample rows: " + str(k_sample.count()))

    split_list.append(k_sample)
    data = data.subtract(k_sample)

    print("Split " + str(i + 1) + " of " + str(fold_number) + " completed")
    print("Dataframe rows: " + str(data.count()))
    i += 1

finish_time = tm.time() - start_time
print_and_save_time("Dataset splitting concluded: " +
                    str(finish_time) + " seconds")


PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "C:\Users\manue\AppData\Local\Temp\ipykernel_23328\1496745023.py", line 13, in date_to_day_of_year
  File "c:\Users\manue\AppData\Local\Programs\Python\Python310\lib\_strptime.py", line 568, in _strptime_datetime
    tt, fraction, gmtoff_fraction = _strptime(data_string, format)
  File "c:\Users\manue\AppData\Local\Programs\Python\Python310\lib\_strptime.py", line 349, in _strptime
    raise ValueError("time data %r does not match format %r" %
ValueError: time data '0.873972602739726' does not match format '%Y-%m-%d'


Bonus: Pandas

In [None]:
def pandas_load_dataset():
    
    print('Preprocessed dataset loaded')
    return data

def pandas_save_dataset(data):
    data.to_csv(path_or_buf=path + '/' + 'preprocessed.csv', index=False)
    print('Preprocessed dataset saved')

# Data Load

files = os.listdir(path)
data = pd.DataFrame()

for f in files:
    if f.endswith('.csv'):
        frame = pd.read_csv(filepath_or_buffer=path +
                            '/' + f, usecols=columns_to_get)
        data = pd.concat([data, frame])

data = data.dropna(how='any', axis='index')
        
# Problem Selection

irregulars = data.filter([problem_to_solve], like=1)
regulars = data.filter([problem_to_solve], like=0)

data = pd.concat([regulars.sample(len(irregulars)), irregulars]).sample(frac=1)

oppositeIndex = 'DIVERTED' if problem_to_solve == 'CANCELLED' else 'CANCELLED'
data = data.drop(oppositeIndex, axis=1)

# Names Conversion

def str_to_float(s: str):
    encoding = "utf-8"
    b = s.encode(encoding)
    return float(crc32(b) & 0xffffffff) / 2**32

for c in ['OP_CARRIER', 'ORIGIN', 'DEST']:
    data[c] = data[c].apply(str_to_float)

# Dates Conversion

multiplier: float = 1 / 365

def date_to_day_of_year(date_string) -> float:
    date = dt.strptime(date_string, "%Y-%m-%d")
    day = date.timetuple().tm_yday - 1
    return day * multiplier

for i in date_columns_to_convert:
    data[i] = data[i].apply(date_to_day_of_year)

# Time Conversion
    
def time_to_interval(time) -> float:
    t = int(float(time))
    h = t // 100
    m = t % 100
    t = h * 60 + m
    return float(t / 1140)

for c in time_columns_to_convert:
    data[c] = data[c].apply(time_to_interval)

# Distance Conversion
    
multiplier: float = float(1) / float(max_distance)

for c in numeric_columns_to_convert:
    data[c] = data[c].apply(lambda x: x * multiplier)

# Create Folds

folds = []

data.drop_duplicates(inplace=True)

irregulars = data.filter([problem_to_solve], like=1)
regulars = data.filter([problem_to_solve], like=0)

k_elements_half_number = round((len(data) / fold_number) / 2)

for i in range(1, fold_number + 1):
    k_irregulars_sample = irregulars.head(k_elements_half_number)
    k_regulars_sample = regulars.head(k_elements_half_number)
    k_sample = pd.concat([k_irregulars_sample, k_regulars_sample])

    folds.append(k_sample.to_numpy())
    irregulars = irregulars.drop(k_irregulars_sample.index)
    regulars = regulars.drop(k_regulars_sample.index)



## Models

### Generic Functions

In [None]:
def sigmoid(x):
    '''
    Calculates the sigmoid of the given data
    '''
    g = 1.0 / (1.0 + np.exp(-x))
    return g

def binary_cross_entropy(y, y_label, w, l2):
    '''
    Calculates the binary cross entropy loss of the calculated y and the given y_label
    '''
    loss = -np.mean(y_label*(np.log(y)) + (1-y_label)
                    * np.log(1-y)) + regularize(w, l2)
    return loss

def regularize(W, l2):
    '''
    Calculates the regularization term for the loss
    '''
    return (l2 / 2) * np.sum(np.square(W))

### Parallel Model

In [None]:
@dataclass
class ParallelLogisticRegression:
    iterations: int
    learning_rate: float
    batch_size: int
    l2: float
    W: Broadcast
    b: float

def initialize(self: ParallelLogisticRegression, size):
    self.W = context.broadcast(np.random.rand(size))
    self.b = np.random.rand()


def parallel_train(self: ParallelLogisticRegression, data: ps.DataFrame):
    initialize(self, len(data.columns) - 1)

    num_chunks = X.count() // self.batch_size
    chunk_percent = 1/num_chunks

    batches = data.randomSplit([chunk_percent] * num_chunks)

    Y_labels = [b.select(problem_to_solve).rdd for b in batches]
    X = [b.drop(problem_to_solve).rdd for b in batches]

    losses = []
    gradients = []

    for _ in range(self.iterations):
        _losses = []
        _gradients = [] 

        for b_X, b_Y_labels in zip(X, Y_labels):
            Y = self.evaluate(b_X)
            _losses.append(binary_cross_entropy(
                Y, b_Y_labels, self.W, self.l2))
            (dW, db) = self.gradient(b_X, Y, b_Y_labels)
            _gradients.append(dW)
            self.update(dW, db)
        losses.append(np.mean(_losses))
        gradients.append(np.mean(_gradients))

        return (losses, gradients)

def parallel_evaluate(self: ParallelLogisticRegression, X: RDD):
    Z = X.map(lambda x: np.dot(x, self.W.value)).reduce(lambda a, b: a+b + self.b)
    Z = Z.map(lambda x: sigmoid(x))
    return Z
        
def update(self, dW: list[float], db: float):
        self.W = context.broadcast(self.W.value - self.learning_rate * dW)
        self.b = self.b - self.learning_rate * db

### Serial Model

In [None]:
class SerialLogisticRegression():
    def __init__(self, iterations: int, learning_rate: float, batch_size: int, l2: float):
        self.iterations = iterations
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.l2 = l2

    def initialize(self, columns_number):
        self.W = np.random.rand(columns_number)
        self.b = np.random.rand()

    def evaluate(self, X):
        Z = np.dot(X, self.W) + self.b
        Z = sigmoid(Z)
        return Z

    def gradient(self, X, Y, Y_label):
        '''
        Calculates the gradient w.r.t weights and bias
        '''

        # Number of training examples.
        m = X.shape[0]

        # Gradient of loss w.r.t weights with regularization
        dw = (1/m)*np.dot(X.T, (Y - Y_label)) + self.l2 * self.W

        # Gradient of loss w.r.t bias with regularization
        db = (1/m)*np.sum((Y - Y_label))

        return dw, db

    def update(self, dW, db):
        self.W = self.W - self.learning_rate * dW
        self.b = self.b - self.learning_rate * db

    def train(self, X, Y_labels, iterations = 10):
        self.initialize(X.shape[1])
        losses = []
        gradients = []

        for _ in range(iterations):
            _losses = []
            _gradients = []
            for b in range(X.shape[0]//self.batch_size):
                b_X = X[b*self.batch_size:b*self.batch_size+self.batch_size, :]
                b_Y_labels = Y_labels[b*self.batch_size:b *
                                      self.batch_size+self.batch_size]
                Y = self.evaluate(b_X)
                _losses.append(binary_cross_entropy(
                    Y, b_Y_labels, self.W, self.l2))
                (dW, db) = self.gradient(b_X, Y, b_Y_labels)
                _gradients.append(dW)
                self.update(dW, db)
            losses.append(np.mean(_losses))
            gradients.append(np.mean(_gradients))

        return (losses, gradients)

## Experiments


In [None]:
def make_roc(labels, results, name):
    labels_and_results = sorted(
        list(zip(labels, map(lambda x: x, results))), key=lambda x: x[1])

    labels_by_weights = np.array([k for (k, _) in labels_and_results])

    length = labels_by_weights.size

    true_positives = labels_by_weights.cumsum()

    num_positive = true_positives[-1]

    false_positives = np.arange(1.0, length + 1, 1.) - true_positives

    true_positives_rate = true_positives / num_positive
    false_positives_rate = false_positives / (length - num_positive)

    fig, ax = plt.subplots()
    ax.set_xlim(-.05, 1.05), ax.set_ylim(-.05, 1.05)
    ax.set_ylabel('True Positive Rate (Sensitivity)')
    ax.set_xlabel('False Positive Rate (1 - Specificity)')
    plt.plot(false_positives_rate, true_positives_rate,
             color='#8cbfd0', linestyle='-', linewidth=3.)
    plt.plot((0., 1.), (0., 1.), linestyle='--',
             color='#d6ebf2', linewidth=2.)

    plt.savefig('./data/{}_roc.png'.format(name))
    fig.clear()
    plt.close()

def plot_loss_gradient(iterations, train_losses, gradients, name):
    fig, ax = plt.subplots()
    ax.set_xlabel('Iterations')
    ax.set_ylabel('Loss/Gradient')
    ax.set_title(name)
    ax.plot(range(iterations), train_losses, label='Loss')
    ax.plot(range(iterations), gradients, label='Gradient')
    ax.grid()
    ax.legend()

    fig.savefig("./data/{}.png".format(name))
    fig.clear()
    plt.close()

### Hyperparamters Tuning

In [None]:
grid = { 'iter': [100, 200, 500], 'lr': [0.001, 0.01, 0.1], 'l2': [0, 0.1, 0.001], 'batch_size': [0, 20]}

params = list(itertools.product(*grid.values()))

params

#### K-Fold Cross Validation
The following code defines a base class with train and evaluation methods to apply the K-Fold Cross Validation to each model

DIVIDIAMOLA IN 10, 8 + 1 NEL TRAINING E 1 PER EVALUATION FINALE?

In [None]:

class Evaluator:
    def __init__(self, iterations, lr, l2, batch_size):
        self.iterations = iterations
        self.lr = lr
        self.l2 = l2
        self.batch_size = batch_size

    def train(self, data):
        total_train_losses = []
        total_test_losses = []
        for i, fold in enumerate(data):
            train_data = data[:i] + data[i + 1:]
            test_data = fold
            losses,  = self.train_impl(train_data)
            total_train_losses.append(losses) 
            loss = self.test_impl(test_data)
            total_test_losses.append(loss)
        return total_train_losses, total_test_losses

In [None]:
class ParallelModelEvaluator(Evaluator):
    def __init__(self, iterations, lr, l2, batch_size):
        super().__init__(iterations, lr, l2, batch_size)
        self.model = ParallelLogisticRegression(self.iterations, self.lr, self.batch_size)

    def train_impl(self, train_data: ps.DataFrame):
        return parallel_train(self.model, train_data)
    
    def test_impl(self, test_data: ps.DataFrame):
        value: RDD = parallel_evaluate(self.model, test_data)
        return binary_cross_entropy(np.array(value.collect()), y_label, self.model.W.value, self.l2)

In [None]:
class SequentialEvaluator(Evaluator):
    def __init__(self, iterations, lr, l2, batch_size):
        super().__init__(iterations, lr, l2, batch_size)
        self.model = SerialLogisticRegression(self.iterations, self.lr, self.batch_size)

    def train_impl(self, train_data: ps.DataFrame):
        pass

    def test_impl(self, test_data: ps.DataFrame):
        pass

In [None]:
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint

class MLibModelEvaluator(Evaluator):
    def __init__(self, iterations, lr, l2, batch_size):
        super().__init__(iterations, lr, l2, batch_size)
        self.model = None

    def train_impl(self, train_data: ps.DataFrame):
        labels = train_data.rdd.map(lambda x: LabeledPoint(label = x[0], features=x[1:]))        
        self.model = LogisticRegressionWithSGD.train(labels, iterations=self.iterations, regParam=self.lr, convergenceTol=0, validateData=False, intercept=True)
        return [], []

    def test_impl(self, test_data: ps.DataFrame):
        return test_data.rdd.map(lambda x: float(self.model.predict(x))).reduce(lambda x, y: x + y) / test_data.rdd.count()
        