In [None]:
#!sudo apt update
#!apt-get install openjdk-8-jdk-headless -qq > /dev/null
#!wget -q https://dlcdn.apache.org/spark/spark-3.3.0/spark-3.3.0-bin-hadoop2.tgz
#!tar xf spark-3.3.0-bin-hadoop2.tgz
#!pip install -r requirements.txt

import os
from kaggle.api.kaggle_api_extended import KaggleApi
import pandas as pd
import numpy as np
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, FloatType
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id, col, udf, rand
import matplotlib.pyplot as plt
import math
import pyspark.sql as ps
from zlib import crc32
import time as tm
from datetime import datetime as dt
import itertools
from dataclasses import dataclass
from pyspark.sql import functions as F
from pyspark.rdd import RDD
from pyspark.broadcast import Broadcast
import findspark

In [None]:
# --- Notebook configuration --------------------------------------------------
# Directory holding the raw CSV files.
path = './data'
# Inserted into the Spark master URL as local[<worker_nodes>].
worker_nodes = "*"
# Column used as the binary label.
problem_to_solve = 'CANCELLED'

# Row cap applied to each CSV file when reading.
dataset_limit = 100_000
# Passed to the dataset reader as its "all files" switch.
use_all_dataset_frames = True
# Number of folds built for cross validation.
fold_number = 10
# When True the preprocessed dataset is read back from disk.
load_cached = False

# DA SCRIVERE
- perche' usiamo i dataframe invece degli rdd nella prima parte
- aggiungere k fold cross validation
- aggiungere griglia parametri
- aggiungere label stratification
- aggiungere performance modello pyspark
- aggiungere check e info extra su dataset di base (es sbilanciamento)
- auroc, auprc, f1, 
- confronto con tree classifier
- confrontare ogni pezzo con MLLib

## Data Download

In [None]:
# Kaggle credentials must NOT live in the notebook: a key committed to source
# control is exposed to every reader and has to be revoked. Provide
# KAGGLE_USERNAME / KAGGLE_KEY through the environment (or a kaggle.json file)
# before running the download cell.
_missing_kaggle_vars = [name for name in ('KAGGLE_USERNAME', 'KAGGLE_KEY')
                        if not os.environ.get(name)]
if _missing_kaggle_vars:
    print("Kaggle credentials not set in the environment: "
          + ", ".join(_missing_kaggle_vars)
          + ". Export them (or provide ~/.kaggle/kaggle.json) before downloading.")

# Kaggle dataset slug and the local directory it is downloaded into.
dataset = 'yuanyuwendymu/airline-delay-and-cancellation-data-2009-2018'
path = './data'

In [None]:
def download_dataset():
    """Download the Kaggle dataset into `path` unless that directory already has content.

    The download is best-effort: a failure is reported on stdout (with its
    cause) and the function returns normally.
    """
    # makedirs also creates missing parents and tolerates an existing directory.
    os.makedirs(path, exist_ok=True)
    if os.listdir(path):
        return

    try:
        api = KaggleApi()
        api.authenticate()
        api.dataset_download_files(dataset, path, unzip=True, quiet=False)
    except Exception as error:
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate; the cause is printed so a failure can be diagnosed.
        print("Error downloading the dataset: " + repr(error))

## Data Loading

In [None]:
# Columns kept from the raw CSV files, in the order they are selected.
columns_to_get = [
    'FL_DATE', 'OP_CARRIER', 'ORIGIN', 'DEST',
    'CRS_DEP_TIME', 'CRS_ARR_TIME',
    'CANCELLED', 'DIVERTED',
    'CRS_ELAPSED_TIME', 'DISTANCE',
]

# Every kept column is declared as a nullable string; numeric conversion
# happens later in the preprocessing cells.
dataframe_schema = StructType(
    [StructField(column_name, StringType(), True) for column_name in columns_to_get]
)

# Let findspark set up the Spark installation paths before the session is built.
findspark.init()
findspark.find()

# Local-mode session; `worker_nodes` goes inside local[...].
spark = (
    SparkSession.builder
    .appName("Airline Departure")
    .master(f"local[{worker_nodes}]")
    .getOrCreate()
)

context = spark.sparkContext

In [None]:
def load_dataset():
    """Read the preprocessed CSV dataset of the current problem back into a Spark DataFrame.

    No schema is supplied, so the reader decides the column types.
    """
    source_dir = './preprocessed/' + problem_to_solve
    reader = spark.read.format("csv").option("header", True)
    data = reader.load(source_dir)

    print('Preprocessed dataset loaded')
    return data

def save_dataset(data):
    """Write `data` as comma-separated CSV (with header) under ./preprocessed/<problem_to_solve>, overwriting previous output."""
    target_dir = './preprocessed/' + problem_to_solve
    writer = (
        data.write.format('csv')
        .option('header', True)
        .mode('overwrite')
        .option('sep', ',')
    )
    writer.save(target_dir)
    print('Preprocessed dataset saved')

def check_preprocessed_data_exists() -> bool:
    """Return True when ./data contains at least one entry whose name starts with 'preprocessed'."""
    return any(entry.startswith('preprocessed') for entry in os.listdir('./data'))

def get_dataset(limit: float = -1, allFrames: bool = True):
    """Read the raw CSV files in `path` into one shuffled Spark DataFrame with an `id` column.

    Args:
        limit: maximum number of rows taken from each CSV file after shuffling
            it; -1 disables the cap.
        allFrames: when False only one CSV file is read.

    Returns:
        A DataFrame with the columns of `columns_to_get` plus `id`.
    """
    # Keep only CSV files *before* choosing a single one: os.listdir order is
    # arbitrary, so the first entry may not be a data file at all. Sorting makes
    # the choice reproducible.
    csv_files = sorted(name for name in os.listdir(path) if name.endswith('.csv'))
    if not allFrames:
        csv_files = csv_files[:1]

    big_frame = spark.createDataFrame(
        spark.sparkContext.emptyRDD(), schema=dataframe_schema)

    for f in csv_files:
        frame = spark.read.option("header", True).csv(path + '/' + f)
        frame = frame.select(columns_to_get).orderBy(rand())

        if limit != -1:
            # DataFrame.limit needs an integer row count.
            frame = frame.limit(int(limit))

        big_frame = frame.union(big_frame)

    big_frame = big_frame.withColumn(
        "id", monotonically_increasing_id()).orderBy(rand())
    # Runs a Spark job whose result is discarded (kept from the original code;
    # presumably so the read cost is paid inside the caller's timed section).
    big_frame.count()

    return big_frame

def print_and_save_time(s: str):
    """Print a timing message; persisting it to the times file is currently disabled."""
    # time_file.write(s + '\n')
    print(s)
#time_file = open("./output/times.txt", mode= "w")


## Preprocessing

In [None]:
# DIVERTED: the flight has been diverted to an unplanned airport.
columns_to_remove_for_canceled = ['DIVERTED']

# CANCELLED: the flight has been cancelled.
columns_to_remove_for_diverted = ['CANCELLED']

preprocess_columns_to_convert = [
    'OP_CARRIER', 'ORIGIN', 'DEST',
    'FL_DATE', 'CRS_DEP_TIME', 'CRS_ARR_TIME',
    'CRS_ELAPSED_TIME', 'DISTANCE',
    'CANCELLED', 'DIVERTED',
    'index',
]

# Divisor used to scale the DISTANCE column
# (presumably the longest distance in the dataset - TODO confirm).
max_distance = 4970

#### Charts Plotting

In [None]:
def plot_balancing_chart(data: ps.DataFrame, label: str):
    """Show a bar chart comparing how many rows have `label` equal to 0 and to 1.

    Args:
        data: Spark DataFrame containing the column `label`.
        label: name of the binary column; also used as the name of the second bar.
    """
    label_column = col(label)
    total_positives = data.filter(label_column == 1).count()
    total_negatives = data.filter(label_column == 0).count()

    fig, ax = plt.subplots()
    ax.bar(
        ['REGULAR', label],
        [total_negatives, total_positives],
        color=['tab:blue', 'tab:red'],
    )
    ax.set_ylabel('Counts')
    ax.set_title('Regular flights and problematic flights counts')

    plt.show()


#### Dataset Reading

In [None]:
# Fetch the raw files if needed, then time how long building the cached frame takes.
download_dataset()
start_time = tm.time()
data = get_dataset(dataset_limit, use_all_dataset_frames).cache()

finish_time = tm.time() - start_time
print_and_save_time(f"Dataset reading concluded: {finish_time} seconds")

#### Null Rows Dropping

In [27]:
common_start_time = tm.time()

# Discard every row that has a null in any column.
data = data.dropna(how='any')
print(f"Dataframe rows after NaN dropping: {data.count()}")

null_removal_finish_time = tm.time() - common_start_time
print_and_save_time(f"Null values removal concluded: {null_removal_finish_time} seconds")

#### Dataframe Balancing

In [None]:
start_time = tm.time()
irregular_flights = data.filter(col(problem_to_solve) == 1)

# Undersample the regular flights to the number of irregular ones.
regular_flights = data.filter(col(problem_to_solve) == 0).limit(irregular_flights.count())

# Combine the two subsets directly. The previous version collected every id to
# the driver and filtered with a very large isin() list, which selects the same
# rows at a much higher cost.
data = irregular_flights.union(regular_flights).orderBy(rand())
print("Balanced dataframe rows: " + str(data.count()))

finish_time = tm.time() - start_time
print_and_save_time("Dataset balancing concluded: " +
                    str(finish_time) + " seconds")

plot_balancing_chart(data, problem_to_solve)

#### Column Conversions

In [None]:
columns_start_time = tm.time()

@udf(returnType=DoubleType())
def str_to_float(s: str):
    """Map a string to a deterministic double in [0, 1): the CRC-32 of its UTF-8 bytes divided by 2**32."""
    checksum = crc32(s.encode("utf-8")) & 0xffffffff
    return float(checksum) / 2**32

date_multiplier: float = 1 / 365
@udf(returnType=DoubleType())
def date_to_day_of_year(date_string) -> float:
    """Convert a 'YYYY-MM-DD' date into its zero-based day of the year scaled by 1/365."""
    parsed = dt.strptime(date_string, "%Y-%m-%d")
    zero_based_day = parsed.timetuple().tm_yday - 1
    return zero_based_day * date_multiplier

@udf(returnType=DoubleType())
def time_to_interval(time) -> float:
    """Convert an HHMM clock value (e.g. '1345' or 1345.0) into the fraction of the day elapsed.

    The value is split into hours and minutes, converted to minutes since
    midnight and divided by the 1440 minutes of a day (the previous divisor,
    1140, was a typo).
    """
    t = int(float(time))
    hours, minutes = divmod(t, 100)
    return float((hours * 60 + minutes) / 1440)

distance_multiplier = float(1) / float(max_distance)
# CRS_ELAPSED_TIME is a duration in minutes, not an HHMM clock value, so it is
# scaled directly. Passing it through time_to_interval would read e.g. 160
# minutes as "1h60" and break the ordering of the values.
elapsed_time_multiplier = 1.0 / 1440

data = data.select(
  (data.CANCELLED.cast('double')).alias("CANCELLED"),
  (data.DIVERTED.cast('double')).alias("DIVERTED"),
  str_to_float(data.OP_CARRIER).alias("OP_CARRIER"),
  str_to_float(data.ORIGIN).alias("ORIGIN"),
  str_to_float(data.DEST).alias("DEST"),
  date_to_day_of_year(data.FL_DATE).alias("FL_DATE"),
  time_to_interval(data.CRS_DEP_TIME).alias("CRS_DEP_TIME"),
  time_to_interval(data.CRS_ARR_TIME).alias("CRS_ARR_TIME"),
  (data.CRS_ELAPSED_TIME.cast('double') * elapsed_time_multiplier).alias("CRS_ELAPSED_TIME"),
  (data.DISTANCE.cast('double') * distance_multiplier).alias("DISTANCE"),
  data.id
)

# Run a job so the conversion cost is included in the timing below.
data.count()

columns_finish_time = tm.time() - columns_start_time
print_and_save_time("Columns conversion concluded: " +
                    str(columns_finish_time) + " seconds")


#### Z Score Normalization

In [None]:
z_start_time = tm.time()
column_list = data.columns

# Compute every mean and (sample) standard deviation in ONE aggregation
# instead of two Spark jobs per column.
_stats_row = data.agg(
    *[F.mean(c).alias(c + '_mean') for c in column_list],
    *[F.stddev(c).alias(c + '_stddev') for c in column_list]
).head()
column_mean_dict = {c: _stats_row[c + '_mean'] for c in column_list}
column_stddv_dict = {c: _stats_row[c + '_stddev'] for c in column_list}

# Feature columns that are standardized, in output order; the label and the id
# are passed through untouched.
z_score_columns = [
    'OP_CARRIER', 'ORIGIN', 'DEST', 'FL_DATE',
    'CRS_DEP_TIME', 'CRS_ARR_TIME', 'CRS_ELAPSED_TIME', 'DISTANCE',
]

data = data.select(
  problem_to_solve,
  *[((data[c] - column_mean_dict[c]) / column_stddv_dict[c]).alias(c)
    for c in z_score_columns],
  data.id
)

# Run a job so the normalization cost is included in the timing below.
data.count()

z_finish_time = tm.time() - z_start_time
print_and_save_time("Z score normalization concluded: " +
                    str(z_finish_time) + " seconds")


#### Preprocessed dataset Saving/Loading

In [None]:
# Persist the normalized dataset as CSV under ./preprocessed/<problem_to_solve>.
save_dataset(data)

In [None]:
# Swap in the dataset saved on disk only when `load_cached` is enabled.
data = load_dataset().cache() if load_cached else data

#### Data Splitting

In [None]:
start_time = tm.time()
folds = []

# Each fold receives this many positive rows and this many negative rows.
k_elements_half_number = math.floor((data.count() / fold_number) / 2)

for i in range(fold_number):
    k_positives = data.where(col(problem_to_solve) == 1).limit(k_elements_half_number)
    k_negatives = data.where(col(problem_to_solve) == 0).limit(k_elements_half_number)

    # Ids of the rows assigned to this fold.
    k_ids = [row.id for row in k_positives.collect()] + \
        [row.id for row in k_negatives.collect()]

    # The fold itself does not keep the helper id column.
    folds.append(data.filter(data.id.isin(k_ids)).drop('id'))

    # Remove the assigned rows so later folds cannot pick them again.
    data = data.filter(~data.id.isin(k_ids))

    print(f"Split {i + 1} of {fold_number} completed")
    print(f"Dataframe rows: {data.count()}")

finish_time = tm.time() - start_time
print_and_save_time(f"Dataset splitting concluded: {finish_time} seconds")


#### Bonus: Pandas

In [None]:
def pandas_save_dataset(data):
    """Write the pandas frame `data` to <path>/preprocessed.csv without the index column."""
    target_file = path + '/' + 'preprocessed.csv'
    data.to_csv(path_or_buf=target_file, index=False)
    print('Preprocessed dataset saved')

# Data Load

files = os.listdir(path)

# Collect the per-file frames and concatenate once: growing a DataFrame inside
# the loop copies all previous rows at every step.
frames = []
for f in files:
    if f.endswith('.csv'):
        frame = pd.read_csv(filepath_or_buffer=path +
                            '/' + f, usecols=columns_to_get)
        # sample() returns a NEW shuffled frame; the result must be kept,
        # otherwise head() always takes the first rows of the file.
        frame = frame.sample(frac=1).head(dataset_limit)
        frames.append(frame)

# ignore_index gives every row a unique label; each file otherwise brings its
# own 0..n-1 labels, and later label-based operations would hit several rows.
data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

data = data.dropna(how='any', axis='index')
print("Dataset acquisition completed")

# Problem Selection

# Split by label value.
irregulars = data[data[problem_to_solve] == 1]
regulars = data[data[problem_to_solve] == 0]

# Undersample the regular flights to the irregular count, then shuffle everything.
undersampled_regulars = regulars.sample(len(irregulars))
data = pd.concat([undersampled_regulars, irregulars]).sample(frac=1)

# The column describing the other problem is not a feature: drop it.
if problem_to_solve == 'CANCELLED':
    oppositeIndex = 'DIVERTED'
else:
    oppositeIndex = 'CANCELLED'
data = data.drop(columns=oppositeIndex)
print("Dataset balancing completed")

# Names Conversion

def str_to_float(s: str):
    """Map a string to a deterministic float in [0, 1): the CRC-32 of its UTF-8 bytes divided by 2**32."""
    checksum = crc32(s.encode("utf-8")) & 0xffffffff
    return float(checksum) / 2**32

# Replace the carrier and airport codes with their numeric hash.
for c in ('OP_CARRIER', 'ORIGIN', 'DEST'):
    data[c] = data[c].map(str_to_float)

# Dates Conversion

# `multiplier` is kept because the name existed before, but the function below
# no longer reads it: the distance conversion further down rebinds `multiplier`
# to 1 / max_distance, which would silently change the date scaling on any
# later call.
multiplier: float = 1 / 365
date_multiplier: float = 1 / 365

def date_to_day_of_year(date_string) -> float:
    """Convert a 'YYYY-MM-DD' date into its zero-based day of the year scaled by 1/365."""
    date = dt.strptime(date_string, "%Y-%m-%d")
    day = date.timetuple().tm_yday - 1
    return day * date_multiplier

# Turn every flight date into its scaled day of the year.
data["FL_DATE"] = data["FL_DATE"].map(date_to_day_of_year)

# Time Conversion
    
def time_to_interval(time) -> float:
    """Convert an HHMM clock value (e.g. '1345' or 1345.0) into the fraction of the day elapsed.

    The value is split into hours and minutes, converted to minutes since
    midnight and divided by the 1440 minutes of a day (the previous divisor,
    1140, was a typo).
    """
    t = int(float(time))
    hours, minutes = divmod(t, 100)
    return float((hours * 60 + minutes) / 1440)

# Only the scheduled departure/arrival columns are HHMM clock values.
for c in ["CRS_DEP_TIME", "CRS_ARR_TIME"]:
    data[c] = data[c].apply(time_to_interval)

# CRS_ELAPSED_TIME is a duration in minutes: scale it directly. Parsing it as
# HHMM would read e.g. 160 minutes as "1h60" and break the ordering of values.
data["CRS_ELAPSED_TIME"] = data["CRS_ELAPSED_TIME"].astype(float) / 1440

# Distance Conversion
    
# Scale distances by 1 / max_distance.
multiplier: float = 1.0 / float(max_distance)

data["DISTANCE"] = data["DISTANCE"] * multiplier

print("Dataset conversions completed")

#Z-score normalization

def z_score_normalize(x, m, s) -> float:
    """Standardize `x` with mean `m` and standard deviation `s`."""
    centered = x - m
    return centered / s

# Every column except the label is standardized.
column_list = data.columns.tolist()
column_list.remove(problem_to_solve)

for c in column_list:
    column_mean, column_stddv = data[c].mean(), data[c].std()
    # Whole-column arithmetic gives the same values as applying the function per element.
    data[c] = z_score_normalize(data[c], column_mean, column_stddv)

print("Dataset normalization completed")
# Create Folds

folds = []

data = data.drop_duplicates()

irregulars = data.loc[data[problem_to_solve] == 1]
regulars = data.loc[data[problem_to_solve] == 0]

# Size the folds on the smaller class and round DOWN, so each of the
# `fold_number` folds is complete and balanced (round() could ask for more
# rows than exist, and duplicate removal can unbalance the classes).
k_elements_half_number = min(len(irregulars), len(regulars)) // fold_number

for i in range(fold_number):
    start = i * k_elements_half_number
    stop = start + k_elements_half_number
    # Slice by POSITION. The frames concatenated from several CSV files repeat
    # the same index labels, so dropping used rows by label removed unrelated
    # rows as well.
    k_sample = pd.concat([irregulars.iloc[start:stop], regulars.iloc[start:stop]])
    folds.append(k_sample)

# Rows not assigned to any fold, as the previous implementation left them.
irregulars = irregulars.iloc[fold_number * k_elements_half_number:]
regulars = regulars.iloc[fold_number * k_elements_half_number:]



## Models

### Generic Functions

In [None]:
def sigmoid(x):
    '''
    Logistic function 1 / (1 + e^-x), applied element-wise by numpy
    '''
    denominator = 1.0 + np.exp(-x)
    return 1.0 / denominator

def binary_cross_entropy(y, y_label, w, l2):
    '''
    Mean binary cross entropy between the predictions y and the labels y_label,
    plus the regularization term computed from the weights w and the factor l2
    '''
    positive_term = y_label * np.log(y)
    negative_term = (1 - y_label) * np.log(1 - y)
    data_loss = -np.mean(positive_term + negative_term)
    return data_loss + regularize(w, l2)

def regularize(W, l2):
    '''
    L2 penalty added to the loss: (l2 / 2) times the sum of the squared weights
    '''
    squared_norm = np.square(W).sum()
    return (l2 / 2) * squared_norm

### Parallel Model

In [None]:
@dataclass
class ParallelLogisticRegression:
    """State of the Spark-based logistic regression; the parallel_* functions take it as their first argument."""
    # Number of passes over all batches performed by parallel_train.
    iterations: int
    # Step size used by parallel_update.
    learning_rate: float
    # Rows per batch; 0 makes parallel_train use the whole dataset as one batch.
    batch_size: int
    # L2 regularization factor.
    l2: float
    # Broadcast holding the weight vector; set by parallel_initialize.
    W: Broadcast
    # Bias term; set by parallel_initialize.
    b: float

def parallel_initialize(self: ParallelLogisticRegression, feature_number: int):
    """Give the model random initial parameters: `feature_number` broadcast weights and a bias, all uniform in [0, 1)."""
    initial_weights = np.random.rand(feature_number)
    self.W = context.broadcast(initial_weights)
    self.b = np.random.rand()

def parallel_train(self: ParallelLogisticRegression, data: ps.DataFrame):
    """Train the model with (mini-batch) gradient descent on Spark.

    Args:
        self: model state; its weights and bias are updated in place.
        data: Spark DataFrame whose first column is the label, followed by the features.

    Returns:
        (losses, gradients): for each iteration, the mean batch loss and the
        mean of the weight-gradient entries.
    """
    if self.batch_size != 0:
        # At least one chunk: with fewer rows than batch_size the integer
        # division is 0 and 1/num_chunks used to raise ZeroDivisionError.
        num_chunks = max(1, data.count() // self.batch_size)
        chunk_percent = 1/num_chunks
        batches = data.randomSplit([chunk_percent] * num_chunks)
    else:
        batches = [data]

    # Format and cache every batch ONCE. Doing it inside the iteration loop
    # rebuilt and re-cached the same data at every iteration and never
    # released it.
    batches_rdd = [format_rdd(b.rdd).cache() for b in batches]

    losses = []
    gradients = []

    try:
        for _ in range(self.iterations):
            _losses = []
            _gradients = []

            for batch in batches_rdd:
                Y = parallel_evaluate(self, batch.map(lambda x: x[1]))
                _losses.append(parallel_binary_cross_entropy(self, batch.map(lambda x: x[0]).zip(Y)))
                (dW, db) = parallel_gradient(self, batch, Y)
                _gradients.append(dW)
                parallel_update(self, dW, db)
            losses.append(np.mean(_losses))
            gradients.append(np.mean(_gradients))
    finally:
        # Release the cached batches even when training fails.
        for batch in batches_rdd:
            batch.unpersist()

    return (losses, gradients)


def parallel_evaluate(self: "ParallelLogisticRegression", X: "RDD") -> "RDD":
    """Return an RDD with sigmoid(x . W + b) for every feature vector x of `X`.

    The bias is part of the model (it is initialized and updated by the other
    parallel_* functions) but was previously left out of the prediction.
    """
    # Capture plain locals so the mapped function does not need the model object.
    weights = self.W
    bias = self.b
    return X.map(lambda x: sigmoid(np.dot(x, weights.value) + bias))

def parallel_binary_cross_entropy(self: ParallelLogisticRegression, X_Y: RDD)-> float:
    """Mean binary cross entropy over an RDD of (label, prediction) pairs, plus the L2 term of the current weights."""
    def _log_likelihood(pair):
        label, prediction = pair[0], pair[1]
        return label * np.log(prediction) + (1 - label) * np.log(1 - prediction)

    L: RDD = X_Y.map(_log_likelihood)
    total = L.reduce(lambda acc, term: acc + term)
    mean_log_likelihood = total / L.count()
    return -mean_log_likelihood + regularize(self.W.value, self.l2)

def parallel_gradient(self: "ParallelLogisticRegression", X: "RDD", Y: "RDD") -> "tuple[np.ndarray, np.ndarray]":
    """Gradient of the regularized loss with respect to the weights and the bias.

    Args:
        self: model state (reads `W` and `l2`).
        X: RDD of (label, features) pairs.
        Y: RDD of predictions aligned with `X`.

    Returns:
        (dw, db) where dw = mean((y - label) * features) + l2 * W and
        db = mean(y - label).
    """
    def _per_sample(pair):
        (label, features), prediction = pair
        error = prediction - label
        return (error * np.asarray(features, dtype=float), error, 1)

    def _add(a, b):
        # Plain sums only: the function given to reduce() must be associative.
        # Scaling by 1/m and adding the L2 term inside it (as before) applied
        # them once per merge step and produced a wrong gradient.
        return (a[0] + b[0], a[1] + b[1], a[2] + b[2])

    # One pass yields the gradient sum, the error sum and the sample count.
    gradient_sum, error_sum, m = X.zip(Y).map(_per_sample).reduce(_add)

    dw = gradient_sum / m + self.l2 * self.W.value
    db = error_sum / m
    return dw, db

def parallel_update(self: "ParallelLogisticRegression", dW: "list[float]", db: float):
    """Apply one gradient-descent step to the weights and the bias.

    Args:
        self: model state; `W` is replaced by a new broadcast and `b` is updated.
        dW: gradient with respect to the weights (used in array arithmetic).
        db: gradient with respect to the bias.
    """
    previous_weights = self.W
    self.W = context.broadcast(previous_weights.value - self.learning_rate * dW)
    self.b = self.b - self.learning_rate * db
    # A new broadcast is created at every step: release the executor copies of
    # the superseded one instead of letting them pile up during training.
    previous_weights.unpersist()

def format_rdd(rdd: RDD) -> RDD:
    """Turn each row into (label, features): element 0 as a float and elements 1..8 as a list of floats."""
    def _to_label_and_features(row):
        label = float(row[0])
        features = [float(row[position]) for position in range(1, 9)]
        return (label, features)

    return rdd.map(_to_label_and_features)

In [None]:
# Quick manual run of the parallel model on the first fold:
# 100 iterations, learning rate 0.01, batch size 0 (single batch), l2 0.1.
m = ParallelLogisticRegression(100, 0.01, 0, 0.1, None, None)
# parallel_train reads `.rdd`, so the fold must be a Spark DataFrame.
f: ps.DataFrame = folds[0]

# One weight per column except the label.
parallel_initialize(m, len(f.columns) - 1)
parallel_train(m, f)

### Serial Model

In [None]:
class SerialLogisticRegression():
    """Single-machine logistic regression trained with (mini-batch) gradient descent on numpy arrays."""

    def __init__(self, iterations: int, learning_rate: float, batch_size: int, l2: float):
        self.iterations = iterations
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.l2 = l2

    def initialize(self, columns_number):
        """Draw random initial weights (one per feature) and bias, uniform in [0, 1)."""
        self.W = np.random.rand(columns_number)
        self.b = np.random.rand()

    def evaluate(self, X):
        """Return sigmoid(X . W + b) for the samples in the rows of X."""
        Z = np.dot(X, self.W) + self.b
        Z = sigmoid(Z)
        return Z

    def gradient(self, X, Y, Y_label):
        '''
        Calculates the gradient w.r.t weights and bias
        '''

        # Number of training examples.
        m = X.shape[0]

        # Gradient of loss w.r.t weights with regularization
        dw = (1/m)*np.dot(X.T, (Y - Y_label)) + self.l2 * self.W

        # Gradient of loss w.r.t bias (the bias is not regularized)
        db = (1/m)*np.sum((Y - Y_label))

        return dw, db

    def update(self, dW, db):
        """Apply one gradient-descent step."""
        self.W = self.W - self.learning_rate * dW
        self.b = self.b - self.learning_rate * db

    def train(self, X, Y_labels, iterations=None):
        """Train the model from freshly initialized parameters.

        Args:
            X: 2-D array, one sample per row.
            Y_labels: 1-D array of 0/1 labels aligned with X.
            iterations: number of passes over the data; defaults to the value
                given to the constructor (which used to be ignored).

        Returns:
            (losses, gradients): per iteration, the mean batch loss and the
            mean of the weight-gradient entries.
        """
        if iterations is None:
            iterations = self.iterations

        self.initialize(X.shape[1])
        n_samples = X.shape[0]

        # batch_size 0 means "one batch with everything", as in the parallel
        # model; it previously raised ZeroDivisionError. A batch size larger
        # than the dataset is treated the same way.
        if 0 < self.batch_size < n_samples:
            step = self.batch_size
        else:
            step = max(1, n_samples)

        losses = []
        gradients = []

        for _ in range(iterations):
            _losses = []
            _gradients = []
            # Stepping through the rows also visits the final, shorter batch,
            # so no sample is silently left out of training.
            for start in range(0, n_samples, step):
                b_X = X[start:start + step, :]
                b_Y_labels = Y_labels[start:start + step]
                Y = self.evaluate(b_X)
                _losses.append(binary_cross_entropy(
                    Y, b_Y_labels, self.W, self.l2))
                (dW, db) = self.gradient(b_X, Y, b_Y_labels)
                _gradients.append(dW)
                self.update(dW, db)
            losses.append(np.mean(_losses))
            gradients.append(np.mean(_gradients))

        return (losses, gradients)

## Experiments


In [None]:
def make_roc(labels, results, name):
    """Plot the ROC curve of the scores `results` against the 0/1 `labels` and save it.

    Args:
        labels: iterable of true labels (0 or 1).
        results: iterable of predicted scores, aligned with `labels`.
        name: prefix of the output file ./data/<name>_roc.png.
    """
    # Rank from the HIGHEST score down: lowering the decision threshold then
    # adds samples in this order. With an ascending sort the curve of a good
    # classifier came out mirrored below the diagonal.
    labels_and_results = sorted(
        zip(labels, results), key=lambda pair: pair[1], reverse=True)

    labels_by_weights = np.array([k for (k, _) in labels_and_results])

    length = labels_by_weights.size

    # Positives among the top-i ranked samples, for every i.
    true_positives = labels_by_weights.cumsum()

    num_positive = true_positives[-1]

    false_positives = np.arange(1.0, length + 1, 1.) - true_positives

    true_positives_rate = true_positives / num_positive
    false_positives_rate = false_positives / (length - num_positive)

    fig, ax = plt.subplots()
    ax.set_xlim(-.05, 1.05), ax.set_ylim(-.05, 1.05)
    ax.set_ylabel('True Positive Rate (Sensitivity)')
    ax.set_xlabel('False Positive Rate (1 - Specificity)')
    ax.plot(false_positives_rate, true_positives_rate,
            color='#8cbfd0', linestyle='-', linewidth=3.)
    # Diagonal of a random classifier, for reference.
    ax.plot((0., 1.), (0., 1.), linestyle='--',
            color='#d6ebf2', linewidth=2.)

    fig.savefig('./data/{}_roc.png'.format(name))
    fig.clear()
    plt.close(fig)

def plot_loss_gradient(iterations, train_losses, gradients, name):
    """Plot loss and gradient against the iteration number and save the chart as ./data/<name>.png."""
    steps = range(iterations)

    fig, ax = plt.subplots()
    ax.set(xlabel='Iterations', ylabel='Loss/Gradient', title=name)
    for series, legend_label in ((train_losses, 'Loss'), (gradients, 'Gradient')):
        ax.plot(steps, series, label=legend_label)
    ax.grid()
    ax.legend()

    fig.savefig("./data/{}.png".format(name))
    fig.clear()
    plt.close()

### Hyperparameters Tuning

In [None]:
# Candidate values for every hyperparameter.
grid = {
    'iter': [100, 200, 500],
    'lr': [0.001, 0.01, 0.1],
    'l2': [0, 0.1, 0.001],
    'batch_size': [0, 20],
}

# All combinations, as (iter, lr, l2, batch_size) tuples.
params = [combination for combination in itertools.product(*grid.values())]

params

#### K-Fold Cross Validation
The following code defines a base class whose training and evaluation methods apply K-fold cross-validation to each model.

DIVIDIAMOLA IN 10, 8 + 1 NEL TRAINING E 1 PER EVALUATION FINALE?

In [None]:

class Evaluator:
    def __init__(self, iterations, lr, l2, batch_size):
        self.iterations = iterations
        self.lr = lr
        self.l2 = l2
        self.batch_size = batch_size

    def train(self, data: list[ps.DataFrame]):
        total_train_losses = []
        total_test_losses = []
        for i, fold in enumerate(data):
            test_data = fold
            remaining_folds = data[:i] + data[i + 1:]
            train_data: ps.DataFrame = remaining_folds[0]
            for train_fold in remaining_folds[1:]:
                train_data = train_data.union(train_fold)
            losses, gradient = self.train_impl(train_fold)
            total_train_losses.append(losses) 
            loss = self.test_impl(test_data)
            total_test_losses.append(loss)
            print("Test Loss for fold " + i + ": " + loss)
        return total_train_losses, total_test_losses

In [None]:
class ParallelModelEvaluator(Evaluator):
    """Cross-validation evaluator for the Spark-based logistic regression."""

    # format_rdd always produces 8 features per row, so the model needs 8 weights.
    FEATURE_NUMBER = 8

    def __init__(self, iterations, lr, l2, batch_size):
        super().__init__(iterations, lr, l2, batch_size)
        self.model = ParallelLogisticRegression(self.iterations, self.lr, self.batch_size, self.l2, None, None)
        parallel_initialize(self.model, self.FEATURE_NUMBER)

    def train_impl(self, train_data: ps.DataFrame):
        """Train from freshly initialized parameters and return (losses, gradients)."""
        # Start every fold from new random parameters; otherwise the weights
        # learned on the previous folds (which contain the current test fold)
        # leak into this run.
        parallel_initialize(self.model, self.FEATURE_NUMBER)
        return parallel_train(self.model, train_data)

    def test_impl(self, test_data: ps.DataFrame):
        """Return the regularized binary cross entropy of the model on `test_data`."""
        formatted: RDD = format_rdd(test_data.rdd).cache()
        try:
            # parallel_evaluate expects feature vectors only, not the
            # (label, features) pairs produced by format_rdd.
            predictions: RDD = parallel_evaluate(self.model, formatted.map(lambda x: x[1]))
            return parallel_binary_cross_entropy(self.model, formatted.map(lambda x: x[0]).zip(predictions))
        finally:
            formatted.unpersist()

In [None]:
class SequentialEvaluator(Evaluator):
    """Cross-validation evaluator for the single-machine logistic regression."""

    def __init__(self, iterations, lr, l2, batch_size):
        super().__init__(iterations, lr, l2, batch_size)
        # The model constructor takes four arguments; l2 was missing (TypeError).
        self.model = SerialLogisticRegression(self.iterations, self.lr, self.batch_size, self.l2)

    @staticmethod
    def _to_numpy(frame: ps.DataFrame):
        """Collect a Spark fold to the driver and split it into (features, labels) arrays.

        The first column is taken as the label and the remaining ones as the
        features, the same layout format_rdd relies on.
        """
        rows = np.array(frame.collect(), dtype=float)
        return rows[:, 1:], rows[:, 0]

    def train_impl(self, train_data: ps.DataFrame):
        """Train the serial model on `train_data` and return (losses, gradients)."""
        X, Y_labels = self._to_numpy(train_data)
        return self.model.train(X, Y_labels, self.iterations)

    def test_impl(self, test_data: ps.DataFrame):
        """Return the regularized binary cross entropy of the model on `test_data`."""
        X, Y_labels = self._to_numpy(test_data)
        return binary_cross_entropy(
            self.model.evaluate(X), Y_labels, self.model.W, self.model.l2)

In [None]:
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint

class MLibModelEvaluator(Evaluator):
    """Cross-validation evaluator backed by MLlib's LogisticRegressionWithSGD."""

    def __init__(self, iterations, lr, l2, batch_size):
        super().__init__(iterations, lr, l2, batch_size)
        self.model = None

    def train_impl(self, train_data: ps.DataFrame):
        """Fit the MLlib model on `train_data`.

        Returns:
            ([], []): MLlib does not report per-iteration losses or gradients.
        """
        # First column is the label, the remaining ones are the features.
        labels = train_data.rdd.map(lambda x: LabeledPoint(label = x[0], features=x[1:]))
        # The learning rate is MLlib's `step` and the L2 factor its `regParam`;
        # the learning rate was previously passed as regParam, leaving the
        # step at its default.
        self.model = LogisticRegressionWithSGD.train(
            labels,
            iterations=self.iterations,
            step=self.lr,
            regParam=self.l2,
            convergenceTol=0,
            validateData=False,
            intercept=True)
        return [], []

    def test_impl(self, test_data: ps.DataFrame):
        """Not implemented yet: returns None, so no test loss is available for this model."""
        return None
        

In [None]:
# Cross-validate the parallel model: 100 iterations, lr 0.01, l2 0.1, batch size 0.
# NOTE(review): the name `eval` shadows the Python builtin for the rest of the session.
eval = ParallelModelEvaluator(100, 0.01, 0.1, 0)
eval.train(folds)