In [1]:
from string import ascii_letters
import time
from notebook_helper import MyCorpus, build_model, build_callbacks, build_embedding_matrix

# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import tensorflow as tf
import tensorflow_addons as tfa

# Import necessary modules
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import mean_squared_error
from math import sqrt

# Keras specific

#### CHANGED from import keras:
import tensorflow.keras
from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import Sequential
#####
from keras.layers import Dense, LSTM, Embedding, Flatten, CuDNNLSTM, Bidirectional, Dropout


# from keras.utils import to_categorical

# Gensim
import gensim.models
from gensim import utils

from numpy import array
from numpy import asarray
from numpy import zeros


from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay, balanced_accuracy_score

# from tensorflow.keras.datasets import imdb
# from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.losses import BinaryCrossentropy
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
from platform import python_version

# Record which interpreter version this notebook was executed with.
interpreter_version = python_version()
print(interpreter_version)

3.7.3


In [3]:
# Restrict TensorFlow to the second physical GPU (index 1) when one exists;
# with fewer than two GPUs the default device visibility is kept.
print("Before:\n" ,tf.config.get_visible_devices('GPU'))
gpus = tf.config.list_physical_devices('GPU')
if len(gpus) > 1:
    tf.config.experimental.set_visible_devices(gpus[1], 'GPU')
else:
    # Report the fallback explicitly instead of swallowing an IndexError,
    # so it is visible in the output which device the run ended up on.
    print(f"Only {len(gpus)} GPU(s) detected; visible devices left unchanged.")
print("After:\n" ,tf.config.get_visible_devices('GPU'))

Before:
 [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]
After:
 [PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]


In [None]:
# Read the data
min_val = 50
repo_name = f"174repos_min{min_val}_max1000000_zhenhao"
# repo_name = f"300repos_min{min_val}_max1000000_zhenhao"
# repo_name = f"combination_zhenhao"
df = pd.read_csv('../features/'+ repo_name +'.csv')

# Remove errors (rows whose `type` column equals 'b')
df = df[df.type != 'b']

# Count each label explicitly. Unpacking value_counts() depends on its
# frequency ordering, which would swap the two counts whenever positives
# outnumber negatives and fails outright when only one class is present.
# NOTE(review): assumes contains_logging holds 0/1 or False/True - confirm.
labels = df['contains_logging']
no_log_cnt = int((labels == 0).sum())
log_cnt = int((labels == 1).sum())
par_vec_cnt = no_log_cnt + log_cnt
# Guard the ratio so an empty frame reports nan instead of raising.
log_ratio = log_cnt / par_vec_cnt if par_vec_cnt else float("nan")
print(f"Number of parameter vecs:\t\t{par_vec_cnt}")
print(f"without logging (negatives):\t{no_log_cnt}")
print(f"with logging (positives):\t\t{log_cnt}")
print(f"Log ratio:\t\t\t\t\t\t{log_ratio * 100:.2f}%")
print(df.shape)
df.head()

In [None]:
# Encode the contexts, train Word2Vec on them and prepare the padded model inputs
# Each context string becomes a list of alphabet positions (a-zA-Z), kept as
# strings; the lists are handed to Word2Vec through MyCorpus below.
# E.g. 'cad' -> ['2','0','3']
# The position table is built once instead of scanning ascii_letters for every
# character; a character outside a-zA-Z still aborts the cell (now a KeyError).
letter_position = {letter: str(position) for position, letter in enumerate(ascii_letters)}
X = [[letter_position[char] for char in context] for context in df.context]
# NOTE(review): the letter 'a' encodes to '0' and pad_sequences below also pads
# with 0 - confirm that build_embedding_matrix / build_model account for this.

# Build the Word2Vec Model
sentences = MyCorpus(X)
gensim_model = gensim.models.Word2Vec(sentences=sentences, min_count=1)
actual_vocab_size = len(gensim_model.wv.key_to_index)

y = df.contains_logging

# Default values
default_output_dims = 100
default_max_length = 80
default_vocab_size = actual_vocab_size + 1
default_batch_size = 24
default_trainable = False
default_dropout = 0.2
default_val_split = 0.0
default_callback = ["cp"]
default_callback_monitor = 'val_f1_score'
default_num_nodes = 128
default_num_epochs = 100
default_class_weight = {0: 1.0, 1: 5.0}
default_cmpltn_metrics = [tfa.metrics.F1Score(num_classes=1, threshold=0.5)]

# Cross-validation settings
n_splits = 3

# Build embedding matrix
embedding_matrix = build_embedding_matrix(default_vocab_size, default_output_dims, gensim_model)

# Pad the context (the unpadded copy is kept so it can be re-padded to another max_length)
X_unpadded = np.array(X, dtype=object)
X = pad_sequences(X_unpadded, maxlen=default_max_length, value=0.0)

In [None]:
# Column names of one experiment configuration, in the same order as the
# fields of each tuple in `iterations`.
iteration_features =  "Name, max_length, vocab_size, batch_size, trainable, dropout, val_split, callback, callback_monitor, num_nodes, num_epochs, class_weight, cmpltn_metrics"

# One tuple per experiment configuration to run.
iterations = [
    # B_ is best but Z_ is faster and what Zhenhao used
    # Test model.fit(validation_data=(padded_inputs_test, y_test)) and callback_monitor='val_f1_score'
    (
        f'Z_{repo_name}_cv{n_splits}',  # Name
        80,                             # max_length
        actual_vocab_size + 1,          # vocab_size
        32,                             # batch_size
        True,                           # trainable
        0.2,                            # dropout
        0.0,                            # val_split
        ["es", "cp"],                   # callback
        'val_f1_score',                 # callback_monitor
        128,                            # num_nodes
        20,                             # num_epochs
        {0: 1.0, 1: 5.0},               # class_weight
        default_cmpltn_metrics,         # cmpltn_metrics
    ),
    # TODO: Do cross-validation stratified shuffled fold testing with high batch size to compensate
]

# TODO: Batch size, output dims, load_best_weights?
# TODO: Add callback_patience
# TODO: Transform into dict

all_scores = []
len(iterations)

In [None]:
# Append the column header line to the results log. The context manager
# closes the file even if a write fails.
with open("results.txt", "a") as out:
    out.write(iteration_features + ", settings_hash, execution_time, Final_Bal_Acc, Final_Prec, Final_Recall, Final_F1, Best_Bal_Acc, Best_Prec, Best_Recall, Best_F1")
    out.write("\n")

In [None]:
def _fold_scores(y_true, y_hat):
    """Return (balanced accuracy, precision, recall, F1) for one fold's binary predictions."""
    return (
        balanced_accuracy_score(y_true, y_hat),
        precision_score(y_true, y_hat),
        recall_score(y_true, y_hat),
        f1_score(y_true, y_hat),
    )


def _format_mean_score(fold_scores, digits):
    """Format the mean of the per-fold scores for results.txt.

    Scores below 1 keep the compact form without the leading "0."
    (0.87 -> "87"). A mean of 1.0 is written in full ("1.00") instead of
    being cut down to "00", and an empty score list yields "nan".
    """
    if len(fold_scores) == 0:
        return "nan"
    text = f"{np.mean(fold_scores):.{digits}f}"
    return text[2:] if text.startswith("0.") else text


# Run every configured experiment with stratified k-fold cross-validation and
# append one summary line per experiment to results.txt.
for iteration in iterations:
    (name, max_length, vocab_size, batch_size, trainable, dropout, val_split,
     callback, callback_monitor, num_nodes, num_epochs, class_weight,
     cmpltn_metrics) = iteration
    print(name)
    # Non-negative id for this configuration. abs() keeps the exact integer;
    # squaring and taking a float square root rounded away the low bits.
    # NOTE(review): str hashes are salted per interpreter process unless
    # PYTHONHASHSEED is fixed, so this id differs between kernel sessions.
    settings_hash = abs(hash(str(iteration)))
    start = time.time()

    # Rebuild embedding matrix in case of changed vocab_size (missing changed output_dims)
    if vocab_size != embedding_matrix.shape[0]:
        embedding_matrix = build_embedding_matrix(vocab_size, default_output_dims, gensim_model)

    # Pad the context for different max_length
    if max_length != X.shape[1]:
        X = pad_sequences(X_unpadded, maxlen=max_length, value=0.0)

    final_bal_acc_all, final_precision_all, final_recall_all, final_f1_all = [], [], [], []
    best_bal_acc_all, best_precision_all, best_recall_all, best_f1_all = [], [], [], []
    final_score_lists = (final_bal_acc_all, final_precision_all, final_recall_all, final_f1_all)
    best_score_lists = (best_bal_acc_all, best_precision_all, best_recall_all, best_f1_all)
    skf = StratifiedKFold(n_splits=n_splits, random_state=0, shuffle=True)

    for k_fold, (train_indices, test_indices) in enumerate(skf.split(X=X, y=y)):
        print(f"Starting fold {k_fold + 1} of {n_splits}.")
        X_train, y_train = X[train_indices], y.iloc[train_indices]
        X_test, y_test = X[test_indices], y.iloc[test_indices]

        # Build the model
        model = build_model(name, vocab_size, default_output_dims, embedding_matrix, max_length, trainable, num_nodes, dropout)

        # Compile the model
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=cmpltn_metrics)

        # Build the callbacks
        callbacks, model_cp_filepath = build_callbacks(callback, callback_monitor, repo_name, settings_hash, k_fold)

        # Fit the model
        # NOTE(review): Keras documents that validation_data overrides
        # validation_split, so val_split has no effect here; the val_* metrics
        # are computed on the held-out fold that is also scored below.
        history = model.fit(X_train,
                            y_train,
                            epochs=num_epochs,
                            batch_size=batch_size,
                            validation_data=(X_test, y_test),
                            validation_split=val_split,
                            callbacks=callbacks,
                            class_weight=class_weight)

        # Predict on test data with the weights from the end of training
        pred_test = model.predict(X_test, batch_size=batch_size)
        y_pred = np.round(pred_test)
        for score_list, score in zip(final_score_lists, _fold_scores(y_test, y_pred)):
            score_list.append(score)

        if "cp" in callback:
            # Now load the best weights and predict on test data again
            model.load_weights(model_cp_filepath)
            best_pred_test = model.predict(X_test, batch_size=batch_size)
            best_y_pred = np.round(best_pred_test)
            for score_list, score in zip(best_score_lists, _fold_scores(y_test, best_y_pred)):
                score_list.append(score)

    end = time.time()
    execution_time = int(end - start)

    # Scores: the configuration fields followed by the fold-averaged metrics,
    # in the column order of the results.txt header.
    scores = [
        name,
        max_length,
        vocab_size,
        batch_size,
        trainable,
        dropout,
        val_split,
        callback,
        callback_monitor,
        num_nodes,
        num_epochs,
        class_weight,
        [metric.name if callable(metric) else metric for metric in cmpltn_metrics],
        settings_hash,
        execution_time,
        _format_mean_score(final_bal_acc_all, 2),
        _format_mean_score(final_precision_all, 2),
        _format_mean_score(final_recall_all, 2),
        _format_mean_score(final_f1_all, 3),
        _format_mean_score(best_bal_acc_all, 2),
        _format_mean_score(best_precision_all, 2),
        _format_mean_score(best_recall_all, 2),
        _format_mean_score(best_f1_all, 3),
    ]

    # The context manager closes the log even if the write fails.
    with open("results.txt", "a") as out:
        out.write(str(scores).replace("'", "")[1:-1])
        out.write("\n")

In [None]:
# Inspect the attributes of the object returned by the most recent model.fit
# call (the last fold of the last iteration). Needs the training loop to have
# run at least once, otherwise `history` is undefined.
vars(history)