In [146]:
# General
import pandas as pd
import numpy as np
import os
import glob

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler

# Modelling
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
## Linear regression
from sklearn.linear_model import LinearRegression
## SVR
from sklearn.svm import LinearSVR, SVR
## Neural networks
from tensorflow.keras.layers import Dense, BatchNormalization
from tensorflow.keras import Sequential
from tensorflow.keras import optimizers
## LGBM
from tensorflow.keras.callbacks import EarlyStopping

In [221]:
# Get rid of annoying LGBM messages
import warnings
warnings.filterwarnings("ignore", message="categorical_column in param dict is overridden.")
warnings.filterwarnings("ignore", message='Overriding the parameters from Reference Dataset.')
warnings.filterwarnings("ignore", message='The reported value is ignored because this*')
warnings.filterwarnings("ignore", message='Found `n_estimators` in params. Will use it*')
warnings.filterwarnings("ignore", message='The distribution is specified by*')

# Hide optuna logging too
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Pre-processing

In [5]:
# Get files to read in
gcse_files = glob.glob("../fake_data/synthetic_*_gcse_20[1-2][0, 8-9].csv")
npd_files = glob.glob("../fake_data/synthetic_npd_ks4_student_20[1-2][0, 8-9].csv")

## Exam Data

In [6]:
def process_grades(data = pd.DataFrame, grade_col = str):
    
    # Drop rows with missing grades
    data = data.dropna(subset = grade_col)
    # Convert U grade to 0
    data.loc[data[grade_col] == "U", grade_col] = "0"
    # Convert grades to numeric from string format
    data = data[data[grade_col].isin([str(x) for x in (range(0, 10))])]
    data[grade_col] = data[grade_col].astype(float)
    return data

In [7]:
def process_gcse_data(df = pd.DataFrame):
    
    """
    Takes raw GCSE exam data (2017-2020 files), filters it
    appropriately and processes it. 
    Returns a DataFrame with a reduced number of columns.
    Full steps taken can be seen in code commenting or in
    Methodology section of capstone.
    --------------------------------------------------
    df = DataFrame of raw GCSE data
    """
    
    # Copy to prevent in-place changes
    data = df.copy()
    
    # Make cols lowercase
    data.columns = [x.lower() for x in data.columns]
    
    # Reformat examseries to year col
    data["year"] = data.examseries.apply(lambda x: x.split()[1])
    
    # Remove candidates who were not 16 on 31st August
    data = data.query("yearendage == 16")
    # Remove private candidates
    data = data.query("privatecandidate == False")
    # Commented out below since all True in synthetic data
    # Remove partial absentees
#     data = data.query("partialabsence == False")
    # Remove candidates without prior attainment or that weren't matched in NPD
    data = data.dropna(subset = ["normalisedks2score", "npdmatchround"])
    
    # Remove candidates with 0 prior attainment (errors in data)
    data = data[data.normalisedks2score > 0]
    
    # Remove non-reformed GCSEs
    data = data[data.reformphase.isin(['Ofqual-regulated Phase 1 reformed GCSE FC',
                                       'Ofqual-regulated Phase 2 reformed GCSE FC'])]
    # Recode tier into foundation or not foundation
    data.loc[data.tier != "F", "tier"] = "Not F"
    
    # Process grade column inplace
    data = process_grades(data, grade_col = "grade")
    
    # Standardise the KS2 prior attainment to between 0 and 1
    scaler = MinMaxScaler()
    data.normalisedks2score = scaler.fit_transform(data[['normalisedks2score']])
    
    # Get candidates who took at least 8 GCSEs
    grouped = data.groupby("uidp").count()
    at_least_8 = set(grouped[grouped.examseries >= 8].index.to_list())
    # Get candidates who took English and Maths
    eng_math = set(data[data.jcqtitle.isin(["Mathematics", "English language"])].uidp)
    # Get candidates who took English and Maths and >= 8 GCSEs
    filtered_ids = at_least_8 & eng_math
    # Beware that since this is simulated data, it's wrong
    filtered = data[data.uidp.isin(filtered_ids)]
    
    # Select cols needed for modelling and dropnas
    gcse_cols = ["uidp", "year", "jcqtitle", "tier", "centretypedesc",
                 "normalisedks2score", "grade", "centreassessmentgrade"]
    filtered = filtered[gcse_cols]

    return filtered

In [8]:
# Load and process all the GCSE exam data
gcse_data = pd.DataFrame()
# Iterate through files
for file in gcse_files:
    # Perform filtering/pre-processing
    year_df = process_gcse_data(pd.read_csv(file))
    # Process the CAG column too
    if "2020" in file:
        year_df = process_grades(year_df, "centreassessmentgrade")
    # Create dummy value for other years
    else:
        year_df.centreassessmentgrade = np.NaN
        
    # Merge with other years
    gcse_data = pd.concat([gcse_data, year_df])
    # Delete var to save memory
    del year_df
# Reset index
gcse_data = gcse_data.reset_index(drop = True)

## NPD Data

In [9]:
def process_npd(data = pd.DataFrame):
    
    """
    Takes raw NPD data (2017-2020 files), filters it
    appropriately and processes it. 
    Returns a DataFrame with a reduced number of columns.
    Full steps taken can be seen in code commenting or in
    Methodology section of capstone.
    --------------------------------------------------
    df = DataFrame of raw NPD data
    """    
    
    # Copy to prevent inplace changes
    df = data.copy()
    # Make cols lowercase
    df.columns = [x.lower() for x in df.columns]
    # Select the columns that are common across files
    npd_cols = ["uidp", "ks4_ealgrp_ptq_ee", "ks4_gender"]
    # Get the bases for the columns that change in suffix in each file
    col_bases = ["ethnicgroupmajor", "fsmeligible", "senprovisionmajor"]
    # Get the suffix part that changes
    year_ending = int(file[-6:-4])
    # Dynamically select those cols with changing suffixes
    npd_cols.extend([col_base + f"_spr{year_ending}" for col_base in col_bases])
    # Also add in most recent IDACI score
    npd_cols.append(sorted([x for x in df.columns if "idaciscore" in x])[-1])
    
    # Select the needed columns
    df = df[npd_cols]
    # Add in year col
    df["year"] = f"20{year_ending}"
    # Rename columns
    clean_cols = ["uidp", "eal", "gender", "ethnicity",
              "fsm", "sen", "idaci", "year"]
    df.columns = clean_cols
    
    return df

In [10]:
col_dict = dict()
for file in npd_files:
    col_dict[file[-8:-4]] = pd.read_csv(file).columns

In [11]:
# set(col_dict["2020"]) & set(col_dict["2019"]) & set(col_dict["2018"])

In [12]:
# set(col_dict["2020"]) - set(col_dict["2019"])

In [13]:
# Create df to store each year's data in
npd_data = pd.DataFrame()

# Iterate through files
for file in npd_files:
    # Load data
    df = pd.read_csv(file)
    # Process the NPD data
    df = process_npd(df)
    # Combine into dataframe
    npd_data = pd.concat([npd_data, df])

# Joining

In [14]:
def recode_cols(data = pd.DataFrame):
    """
    Takes processed merged GCSE exam and NPD data (2017-2020 files),
    filters it appropriately and processes it. 
    It recodes several columns into fewer numbers of categories
    to make modelling easier.
    Returns a DataFrame with a reduced number of columns.
    Full steps taken can be seen in code commenting or in
    Methodology section of capstone.
    --------------------------------------------------
    df = DataFrame of merged NPD/GCSE data
    """
    
    # Copy to prevent inplace changes
    df = data.copy()
    # Filter EAL to remove NAs or unclassifieds
    df = df[df.eal.isin([1,2])]
    # Filter ethnicity to remove unclassifieds/NaNs
    df = df[df.ethnicity.isin(["AOEG", "ASIA", "BLAC", "CHIN",
                          "MIXD", "WHIT"])]
    # Filter and recode SEN to remove unclassifieds and make SEN/not SEN
    df = df[df.sen.isin(["1_NON", "2_SNS", "3_SS"])]
    df.loc[df.sen != "1_NON", "sen"] = "SEN"
    df.loc[df.sen == "1_NON", "sen"] = "No SEN"
    
    # Drop remaining NaNs from FSM and IDACI cols
    df = df.dropna(subset = ["fsm", "idaci"])
    
    return df

In [15]:
# Inner join exam data with NPD data
merged = npd_data.merge(gcse_data, on = ["uidp", "year"],
                       how = "inner")

# Recode columns and filter further
df = recode_cols(merged)

# Drop now unnecesary UIDP and year cols
df = df.drop(columns = ["year", "uidp"])

# Convert categorical cols to numerics
categorical_cols = ["eal", "gender", "ethnicity", "fsm",
               "sen", "jcqtitle", "tier", "centretypedesc"]
# Fit encoder on categorical cols
encoder = OrdinalEncoder()
encoder.fit(df[categorical_cols])

# Create a mapping for reference later
mapping = {k:v for k, v in zip(categorical_cols, encoder.categories_)}

# Convert categoricals to numerics
df[categorical_cols] = encoder.transform(df[categorical_cols])

# Split into treatment and control
treatment = df[~df.centreassessmentgrade.isna()]
control = df[df.centreassessmentgrade.isna()]
# Old code, maybe useful for LGBM
# df[categorical_cols] = df[categorical_cols].apply(pd.Categorical)

In [189]:
# Split into labels and features
X = np.array(control.iloc[:, :10], dtype = "float32")
y = np.array(control.grade, dtype = "float32")

# Split into test and train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,
                                                   shuffle = True,
                                                   random_state = 42)

# Quick EDA / Check

In [17]:
from pandas_profiling import ProfileReport

In [18]:
report = ProfileReport(df, title = "eda_check")
report.to_file("eda_check.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

# Modelling

In [19]:
# Create dataframe to store model results in
all_results = pd.DataFrame()

In [67]:
def evaluate_model(X_train, X_test,
                  y_train, y_test,
                  model, model_name):
    
    """
    Function to evaluate a model in terms of
    train and test RMSE.
    Returns a dataframe of model name and RMSEs.
    --------------------------------------------------
    X_train = np.array of X data, used to generate train RMSE
    X_test = np.array of X data, used to generate test RMSE
    y_train = np.array of y data, used to generate train RMSE
    y_test = np.array of y data, used to generate test RMSE
    model = fitted model instance to use with model.predict
    model_name = str, name to save the model under
    """
    # Generate predictions
    train_preds = model.predict(X_train)
    test_preds = model.predict(X_test)
    # Evaluate model
    train_rmse = mean_squared_error(y_train, train_preds, squared = False)
    test_rmse = mean_squared_error(y_test, test_preds, squared = False)

    # Store results
    results = pd.DataFrame({"model": model_name,
                            "train_rmse": train_rmse,
                            "test_rmse": test_rmse,
                 }, index = [0])
    
    return results

## Linear Model

In [68]:
# Create linear model
model = LinearRegression()
model.fit(X_train, y_train)
# Evaluate model, getting test and train RMSE
results = evaluate_model(X_train, X_test,
                         y_train, y_test,
                         model, "linear")
# Store results
all_results = pd.concat([all_results, results])

## Neural Network

In [61]:
def build_mlp(X_data,
              layer_1_units = 64,
              layer_2_units = 64,
              batch_normalization = False,
              loss = "mse",
              optimizer = "adam",
              metrics = ["mse"]):
    """
    Function to create artificial neural network. Dense layer
    units can be specified, as can the use of batch normalization
    in between the dense layers (this provides mild regularisation)
    and may speed up training.
    Returns a compiled Keras model.
    --------------------------------------------------
    X_data = np.array of X data, used to give input shape to model
    layer_1_units = int, number of neurons in 1st hidden layer
    layer_2_units = int, number of neurons in 2nd hidden layer
    batch_normalization = bool, batch normalize between hidden layers 
    if true
    loss = str, name of loss function to use
    optimizer = str or keras.Optimzer object, optimizer to use
    metrics = list of strings, evaluation metrics to use
    """
    # Build model
    model = Sequential(name = "MLP")
    # 1st Dense layer
    model.add(Dense(units = layer_1_units, activation = "relu", input_shape = (X_data.shape[1], ),
                   kernel_initializer = "he_normal"))
    
    # Add batch normalization if desired
    if batch_normalization:
        model.add(BatchNormalization())
    
    # 2nd Dense layer
    model.add(Dense(units = layer_2_units, activation = "relu",
                   kernel_initializer = "he_normal"))
    # Output layer
    model.add(Dense(units = 1, activation = "linear",
                   kernel_initializer = "he_normal"))
    # Compile model
    model.compile(**compile_hp)
    
    return model

In [53]:
# Hyperparams used during modelling
# Compilation hyperparams
compile_hp = dict()
compile_hp["loss"] = "mse"
compile_hp["optimizer"] = optimizers.Adam(learning_rate = 0.001)
compile_hp["metrics"] = ["mse"]

# Fitting hyperparams
fit_hp = dict()
fit_hp["batch_size"] = 32
fit_hp["epochs"] = 200
fit_hp["validation_split"] = 0.2
# Create callback to select the best model
fit_hp["callbacks"] = EarlyStopping(monitor = "val_loss",
                                         mode = "min",
                                         restore_best_weights = True,
                                         patience = 25)

# Eliminate verbose to have a neater notebook 
fit_hp["verbose"] = 2

### NN 1

In [64]:
# Select number of hidden units
layer_1_units = 64
layer_2_units = 64
# Select whether to batch normalize
batch_normalization = True

# Build and compile model
model = build_mlp(X_train,
                  layer_1_units = layer_1_units,
                  layer_2_units = layer_2_units,
                  batch_normalization = batch_normalization,
                  **compile_hp)
# Fit model
history = model.fit(X_train, y_train, **fit_hp)

# Evaluate model, getting test and train RMSE
results = evaluate_model(X_train, X_test,
                         y_train, y_test,
                         model, "neural_network-64_64_init_bn")
# Store results
all_results = pd.concat([all_results, results])

Epoch 1/200


2022-07-23 14:27:19.023567: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


3/3 - 1s - loss: 20.0784 - mse: 20.0784 - val_loss: 16.9620 - val_mse: 16.9620 - 1s/epoch - 353ms/step
Epoch 2/200
3/3 - 0s - loss: 13.0802 - mse: 13.0802 - val_loss: 22.1561 - val_mse: 22.1561 - 32ms/epoch - 11ms/step
Epoch 3/200
3/3 - 0s - loss: 6.9596 - mse: 6.9596 - val_loss: 49.4990 - val_mse: 49.4990 - 34ms/epoch - 11ms/step
Epoch 4/200
3/3 - 0s - loss: 5.0195 - mse: 5.0195 - val_loss: 77.0165 - val_mse: 77.0165 - 33ms/epoch - 11ms/step
Epoch 5/200
3/3 - 0s - loss: 5.5998 - mse: 5.5998 - val_loss: 64.2733 - val_mse: 64.2733 - 32ms/epoch - 11ms/step
Epoch 6/200


2022-07-23 14:27:19.559862: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


3/3 - 0s - loss: 5.4465 - mse: 5.4465 - val_loss: 43.6236 - val_mse: 43.6236 - 31ms/epoch - 10ms/step
Epoch 7/200
3/3 - 0s - loss: 5.6512 - mse: 5.6512 - val_loss: 24.6170 - val_mse: 24.6170 - 31ms/epoch - 10ms/step
Epoch 8/200
3/3 - 0s - loss: 4.1342 - mse: 4.1342 - val_loss: 14.7795 - val_mse: 14.7795 - 37ms/epoch - 12ms/step
Epoch 9/200
3/3 - 0s - loss: 4.3402 - mse: 4.3402 - val_loss: 11.8381 - val_mse: 11.8381 - 33ms/epoch - 11ms/step
Epoch 10/200
3/3 - 0s - loss: 4.0281 - mse: 4.0281 - val_loss: 10.5440 - val_mse: 10.5440 - 32ms/epoch - 11ms/step
Epoch 11/200
3/3 - 0s - loss: 3.8218 - mse: 3.8218 - val_loss: 10.0687 - val_mse: 10.0687 - 34ms/epoch - 11ms/step
Epoch 12/200
3/3 - 0s - loss: 3.6803 - mse: 3.6803 - val_loss: 9.5612 - val_mse: 9.5612 - 35ms/epoch - 12ms/step
Epoch 13/200
3/3 - 0s - loss: 3.4486 - mse: 3.4486 - val_loss: 9.4825 - val_mse: 9.4825 - 35ms/epoch - 12ms/step
Epoch 14/200
3/3 - 0s - loss: 3.1396 - mse: 3.1396 - val_loss: 9.3816 - val_mse: 9.3816 - 34ms/epoch

Epoch 79/200
3/3 - 0s - loss: 1.7972 - mse: 1.7972 - val_loss: 6.4690 - val_mse: 6.4690 - 30ms/epoch - 10ms/step
Epoch 80/200
3/3 - 0s - loss: 1.7347 - mse: 1.7347 - val_loss: 6.4773 - val_mse: 6.4773 - 30ms/epoch - 10ms/step
Epoch 81/200
3/3 - 0s - loss: 1.8892 - mse: 1.8892 - val_loss: 6.5323 - val_mse: 6.5323 - 29ms/epoch - 10ms/step
Epoch 82/200
3/3 - 0s - loss: 2.0465 - mse: 2.0465 - val_loss: 6.6022 - val_mse: 6.6022 - 29ms/epoch - 10ms/step
Epoch 83/200
3/3 - 0s - loss: 1.8938 - mse: 1.8938 - val_loss: 6.4226 - val_mse: 6.4226 - 28ms/epoch - 9ms/step
Epoch 84/200
3/3 - 0s - loss: 1.7882 - mse: 1.7882 - val_loss: 6.3447 - val_mse: 6.3447 - 31ms/epoch - 10ms/step
Epoch 85/200
3/3 - 0s - loss: 1.8548 - mse: 1.8548 - val_loss: 6.4161 - val_mse: 6.4161 - 29ms/epoch - 10ms/step
Epoch 86/200
3/3 - 0s - loss: 1.7737 - mse: 1.7737 - val_loss: 6.4914 - val_mse: 6.4914 - 28ms/epoch - 9ms/step
Epoch 87/200
3/3 - 0s - loss: 1.7536 - mse: 1.7536 - val_loss: 6.5369 - val_mse: 6.5369 - 28ms/epo

3/3 - 0s - loss: 1.5015 - mse: 1.5015 - val_loss: 6.6744 - val_mse: 6.6744 - 30ms/epoch - 10ms/step
Epoch 152/200
3/3 - 0s - loss: 1.7032 - mse: 1.7032 - val_loss: 6.6407 - val_mse: 6.6407 - 30ms/epoch - 10ms/step
Epoch 153/200
3/3 - 0s - loss: 1.2373 - mse: 1.2373 - val_loss: 6.6577 - val_mse: 6.6577 - 28ms/epoch - 9ms/step
Epoch 154/200
3/3 - 0s - loss: 1.9951 - mse: 1.9951 - val_loss: 6.4834 - val_mse: 6.4834 - 29ms/epoch - 10ms/step
Epoch 155/200
3/3 - 0s - loss: 1.2548 - mse: 1.2548 - val_loss: 6.3710 - val_mse: 6.3710 - 30ms/epoch - 10ms/step
Epoch 156/200
3/3 - 0s - loss: 1.4765 - mse: 1.4765 - val_loss: 6.3092 - val_mse: 6.3092 - 29ms/epoch - 10ms/step
Epoch 157/200
3/3 - 0s - loss: 1.3147 - mse: 1.3147 - val_loss: 6.4038 - val_mse: 6.4038 - 31ms/epoch - 10ms/step
Epoch 158/200
3/3 - 0s - loss: 1.5428 - mse: 1.5428 - val_loss: 6.5202 - val_mse: 6.5202 - 29ms/epoch - 10ms/step
Epoch 159/200
3/3 - 0s - loss: 1.3155 - mse: 1.3155 - val_loss: 6.6138 - val_mse: 6.6138 - 30ms/epoch -

2022-07-23 14:27:26.003280: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


### NN 2

In [40]:
# Select number of hidden units
layer_1_units = 64
layer_2_units = 128
# Select whether to batch normalize
batch_normalization = False

# Build and compile model
model = build_mlp(X_train,
                  layer_1_units = layer_1_units,
                  layer_2_units = layer_2_units,
                  batch_normalization = batch_normalization,
                  **compile_hp)
# Fit model
history = model.fit(X_train, y_train, **fit_hp)

# Evaluate model, getting test and train RMSE
results = evaluate_model(X_train, X_test,
                         y_train, y_test,
                         model, "neural_network-64_128")
# Store results
all_results = pd.concat([all_results, results])

Epoch 1/200


2022-07-23 14:17:30.912410: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


3/3 - 0s - loss: 17.8342 - mse: 17.8342 - val_loss: 15.7452 - val_mse: 15.7452 - 473ms/epoch - 158ms/step
Epoch 2/200
3/3 - 0s - loss: 8.5652 - mse: 8.5652 - val_loss: 8.9011 - val_mse: 8.9011 - 26ms/epoch - 9ms/step
Epoch 3/200
3/3 - 0s - loss: 7.1895 - mse: 7.1895 - val_loss: 9.5617 - val_mse: 9.5617 - 32ms/epoch - 11ms/step
Epoch 4/200
3/3 - 0s - loss: 8.4689 - mse: 8.4689 - val_loss: 7.9956 - val_mse: 7.9956 - 29ms/epoch - 10ms/step
Epoch 5/200
3/3 - 0s - loss: 6.6286 - mse: 6.6286 - val_loss: 7.1548 - val_mse: 7.1548 - 31ms/epoch - 10ms/step
Epoch 6/200
3/3 - 0s - loss: 4.9607 - mse: 4.9607 - val_loss: 7.5189 - val_mse: 7.5189 - 27ms/epoch - 9ms/step
Epoch 7/200


2022-07-23 14:17:31.221679: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


3/3 - 0s - loss: 4.7402 - mse: 4.7402 - val_loss: 7.7494 - val_mse: 7.7494 - 25ms/epoch - 8ms/step
Epoch 8/200
3/3 - 0s - loss: 4.5325 - mse: 4.5325 - val_loss: 7.2399 - val_mse: 7.2399 - 27ms/epoch - 9ms/step
Epoch 9/200
3/3 - 0s - loss: 4.2819 - mse: 4.2819 - val_loss: 6.5928 - val_mse: 6.5928 - 28ms/epoch - 9ms/step
Epoch 10/200
3/3 - 0s - loss: 4.2913 - mse: 4.2913 - val_loss: 6.4684 - val_mse: 6.4684 - 27ms/epoch - 9ms/step
Epoch 11/200
3/3 - 0s - loss: 4.0944 - mse: 4.0944 - val_loss: 6.7066 - val_mse: 6.7066 - 25ms/epoch - 8ms/step
Epoch 12/200
3/3 - 0s - loss: 3.9838 - mse: 3.9838 - val_loss: 6.9439 - val_mse: 6.9439 - 27ms/epoch - 9ms/step
Epoch 13/200
3/3 - 0s - loss: 3.9000 - mse: 3.9000 - val_loss: 6.5091 - val_mse: 6.5091 - 26ms/epoch - 9ms/step
Epoch 14/200
3/3 - 0s - loss: 3.8281 - mse: 3.8281 - val_loss: 6.1909 - val_mse: 6.1909 - 27ms/epoch - 9ms/step
Epoch 15/200
3/3 - 0s - loss: 3.7931 - mse: 3.7931 - val_loss: 6.1722 - val_mse: 6.1722 - 29ms/epoch - 10ms/step
Epoch 

2022-07-23 14:17:32.398771: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


### NN 3

In [42]:
# Select number of hidden units
layer_1_units = 128
layer_2_units = 128
# Select whether to batch normalize
batch_normalization = False

# Build and compile model
model = build_mlp(X_train,
                  layer_1_units = layer_1_units,
                  layer_2_units = layer_2_units,
                  batch_normalization = batch_normalization,
                  **compile_hp)
# Fit model
history = model.fit(X_train, y_train, **fit_hp)

# Evaluate model, getting test and train RMSE
results = evaluate_model(X_train, X_test,
                         y_train, y_test,
                         model, "neural_network-128_128")
# Store results
all_results = pd.concat([all_results, results])

Epoch 1/200
3/3 - 0s - loss: 23.8950 - mse: 23.8950 - val_loss: 10.8442 - val_mse: 10.8442 - 304ms/epoch - 101ms/step
Epoch 2/200


2022-07-23 14:18:02.003547: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-23 14:18:02.156757: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


3/3 - 0s - loss: 8.5613 - mse: 8.5613 - val_loss: 12.7788 - val_mse: 12.7788 - 26ms/epoch - 9ms/step
Epoch 3/200
3/3 - 0s - loss: 11.5893 - mse: 11.5893 - val_loss: 8.3671 - val_mse: 8.3671 - 30ms/epoch - 10ms/step
Epoch 4/200
3/3 - 0s - loss: 6.5829 - mse: 6.5829 - val_loss: 8.6595 - val_mse: 8.6595 - 28ms/epoch - 9ms/step
Epoch 5/200
3/3 - 0s - loss: 5.6135 - mse: 5.6135 - val_loss: 9.5364 - val_mse: 9.5364 - 27ms/epoch - 9ms/step
Epoch 6/200
3/3 - 0s - loss: 5.4670 - mse: 5.4670 - val_loss: 8.0188 - val_mse: 8.0188 - 33ms/epoch - 11ms/step
Epoch 7/200
3/3 - 0s - loss: 4.3561 - mse: 4.3561 - val_loss: 6.2787 - val_mse: 6.2787 - 27ms/epoch - 9ms/step
Epoch 8/200
3/3 - 0s - loss: 5.0913 - mse: 5.0913 - val_loss: 6.0576 - val_mse: 6.0576 - 29ms/epoch - 10ms/step
Epoch 9/200
3/3 - 0s - loss: 4.5439 - mse: 4.5439 - val_loss: 6.4141 - val_mse: 6.4141 - 27ms/epoch - 9ms/step
Epoch 10/200
3/3 - 0s - loss: 3.8786 - mse: 3.8786 - val_loss: 7.2833 - val_mse: 7.2833 - 25ms/epoch - 8ms/step
Epoch

2022-07-23 14:18:03.248310: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


### NN 4

In [56]:
# Select number of hidden units
layer_1_units = 32
layer_2_units = 32
# Select whether to batch normalize
batch_normalization = False,

# Build and compile model
model = build_mlp(X_train,
                  layer_1_units = layer_1_units,
                  layer_2_units = layer_2_units,
                  batch_normalization = batch_normalization,
                  **compile_hp)
# Fit model
history = model.fit(X_train, y_train, **fit_hp)

# Evaluate model, getting test and train RMSE
results = evaluate_model(X_train, X_test,
                         y_train, y_test,
                         model, "neural_network-32_32")
# Store results
all_results = pd.concat([all_results, results])

Epoch 1/200
3/3 - 0s - loss: 15.8271 - mse: 15.8271 - val_loss: 14.0402 - val_mse: 14.0402 - 300ms/epoch - 100ms/step
Epoch 2/200


2022-07-23 14:21:10.766739: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-23 14:21:10.909815: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


3/3 - 0s - loss: 9.4405 - mse: 9.4405 - val_loss: 11.3053 - val_mse: 11.3053 - 32ms/epoch - 11ms/step
Epoch 3/200
3/3 - 0s - loss: 9.7488 - mse: 9.7488 - val_loss: 11.2969 - val_mse: 11.2969 - 30ms/epoch - 10ms/step
Epoch 4/200
3/3 - 0s - loss: 9.4060 - mse: 9.4060 - val_loss: 9.5024 - val_mse: 9.5024 - 30ms/epoch - 10ms/step
Epoch 5/200
3/3 - 0s - loss: 7.1661 - mse: 7.1661 - val_loss: 9.2853 - val_mse: 9.2853 - 30ms/epoch - 10ms/step
Epoch 6/200
3/3 - 0s - loss: 6.1443 - mse: 6.1443 - val_loss: 10.1248 - val_mse: 10.1248 - 27ms/epoch - 9ms/step
Epoch 7/200
3/3 - 0s - loss: 6.4616 - mse: 6.4616 - val_loss: 10.2217 - val_mse: 10.2217 - 26ms/epoch - 9ms/step
Epoch 8/200
3/3 - 0s - loss: 6.0023 - mse: 6.0023 - val_loss: 8.3824 - val_mse: 8.3824 - 34ms/epoch - 11ms/step
Epoch 9/200
3/3 - 0s - loss: 5.0441 - mse: 5.0441 - val_loss: 6.8960 - val_mse: 6.8960 - 29ms/epoch - 10ms/step
Epoch 10/200
3/3 - 0s - loss: 5.0697 - mse: 5.0697 - val_loss: 6.5076 - val_mse: 6.5076 - 30ms/epoch - 10ms/st

2022-07-23 14:21:12.995092: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


In [66]:
all_results.sort_values("test_rmse")

Unnamed: 0,model,train_rmse,test_rmse
0,neural_network,2.043704,1.813406
0,neural_network-64_64,2.065874,1.82801
0,neural_network-64_64_init,2.096282,1.831205
0,neural_network-64_128,2.058027,1.868867
0,linear,1.969422,1.888512
0,neural_network-64_64,2.011368,1.888757
0,neural_network-32_64,2.021202,1.914971
0,neural_network-128_128,2.032535,1.917668
0,neural_network-32_32,2.019506,2.041065
0,neural_network-32_32,1.998198,2.048241


## LGBM

In [98]:
import lightgbm as lgb
import optuna
from optuna.integration import LightGBMPruningCallback

In [208]:
# Params to compile model with
fixed_params = {
        'objective': 'regression',
        'metric': "rmse",  
        'verbosity': -1,
}

In [216]:
def objective(trial, X, y):
    """
    Wrapper function to work with Optuna trial objects, 
    enabling Hyperband hyperparameter search.
    """   
    # Suggest hyperparams to test using Optuna trial object.
    param = {**fixed_params,
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 2, 3000, step = 20),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.2, 0.99, step = 0.05),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.2, 0.99, step = 0.05),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        "n_estimators": trial.suggest_int("n_estimators", 200, 5000),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 5, 2000, step=5),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 10),
        "max_bin": trial.suggest_int("max_bin", 200, 300),
    }
    
    # Create cv object
    cv = StratifiedKFold(n_splits = 5, shuffle = True)
    # Make empty array to store cv f1 scores in
    cv_scores = np.empty(5)
    
    # Split into K train and validation sets and iterate through them
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        # Split into training and validation CV sets
        X_train_cv, X_test_cv = X[train_idx], X[test_idx]
        y_train_cv, y_test_cv = y[train_idx], y[test_idx]

        # Convert data to proper LGBM format
        train_data = lgb.Dataset(X_train_cv, label = y_train_cv,
                                 categorical_feature = [0,1,2,3,4,6,7,8])
        val_data = lgb.Dataset(X_test_cv, label = y_test_cv, 
                               categorical_feature = [0,1,2,3,4,6,7,8],
                              reference = train_data)
        
        # Make callbacks to prevent trialling hyperparams that are obviously bad
        callbacks = [
            LightGBMPruningCallback(trial, metric = "rmse"),
                     # Callback to reduce model validation performance messages
                    lgb.log_evaluation(period = 100),
                     # Early stoppping to prevent overfitting training data
                    lgb.early_stopping(100)]

        # Training the model
        model = lgb.train(params = param,  train_set = train_data,
                          valid_sets = val_data,   
                          callbacks = callbacks,
                         )
    
        
        # Get predictions
        preds = model.predict(X_test_cv)
        # Calculate RMSE
        cv_scores[idx] = mean_squared_error(y_test_cv, preds, squared = False)

    return np.mean(cv_scores)

In [224]:
%%capture my_study
# Above line magic hides lengthy output, but stores into first_round if you want to look

# Create Optuna study to do CV hyperparameter search
study = optuna.create_study(direction = "minimize", # minimizing RMSE
                            study_name = "LGBM Classifier",
                           pruner=optuna.pruners.HyperbandPruner())
func = lambda trial: objective(trial, X = X_train, y = y_train)
study.optimize(func, n_trials=1000)

In [225]:
study.best_value

2.0306908927013265

## Support Vector Regression

### RBF SVR

In [86]:
# Create model
svr = SVR()
# Fit
svr.fit(X_train, y_train)
# Evaluate model, getting test and train RMSE
results = evaluate_model(X_train, X_test,
                         y_train, y_test,
                         svr, "svm_rbf")
# Store results
all_results = pd.concat([all_results, results])
all_results.sort_values("test_rmse")

### LinearSVR

In [97]:
# Create model
epsilon = 0.499
svr = LinearSVR(epsilon = epsilon)
# Fit
svr.fit(X_train, y_train)
# Evaluate model, getting test and train RMSE
results = evaluate_model(X_train, X_test,
                         y_train, y_test,
                         svr, f"svm_linear-{epsilon}")
# Store results
all_results = pd.concat([all_results, results])
all_results.sort_values("test_rmse")



Unnamed: 0,model,train_rmse,test_rmse
0,svm_rbf,2.044915,1.732313
0,neural_network,2.043704,1.813406
0,neural_network-64_64,2.065874,1.82801
0,neural_network-64_64_init,2.096282,1.831205
0,neural_network-64_128,2.058027,1.868867
0,linear,1.969422,1.888512
0,svm_rbf,1.969422,1.888512
0,linear,1.969422,1.888512
0,neural_network-64_64,2.011368,1.888757
0,neural_network-32_64,2.021202,1.914971
