In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.inspection import permutation_importance
from notebook_helper import MyCorpus, build_hybrid_model, build_callbacks, build_embedding_matrix, iteration_features, show_stats
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

from string import ascii_letters
import time
import os

# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_addons as tfa

# Import necessary modules
from sklearn.model_selection import train_test_split, StratifiedKFold, StratifiedShuffleSplit

# Keras specific

#### CHANGED from import keras:
import tensorflow.keras as keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import Sequential
#####
from keras.layers import Dense, LSTM, Embedding, Flatten, CuDNNLSTM, Bidirectional, Dropout


# from keras.utils import to_categorical

# Gensim
import gensim.models
from gensim import utils

from numpy import array
from numpy import asarray
from numpy import zeros


from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay, balanced_accuracy_score

# from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.losses import BinaryCrossentropy
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Restrict TensorFlow to the second physical GPU (index 1) when one exists.
# On machines with fewer than two GPUs this is a best-effort no-op: the
# default device visibility is kept and a short note is printed, instead of
# silently swallowing the IndexError that gpus[1] would raise.
print("Before:\n" ,tf.config.get_visible_devices('GPU'))
gpus = tf.config.list_physical_devices('GPU')
if len(gpus) > 1:
    tf.config.experimental.set_visible_devices(gpus[1], 'GPU')
    print("After:\n" ,tf.config.get_visible_devices('GPU'))
else:
    print(f"Only {len(gpus)} GPU(s) detected; keeping the default device visibility.")

Before:
 []


In [3]:
# Load the feature table of the selected dataset
min_val = 50
repo_name = f"174_min{min_val}_alt_siblings_zhenhao.csv"
# Alternative dataset: repo_name = f"web2py_zhenhao.csv"
df = pd.read_csv(f"../features/{repo_name}")

# Drop erroneous rows: those whose parent or type is encoded as 'b'
df = df[(df.parent != 'b') & (df.type != 'b')]

# One-hot encode the categorical "type" and "parent" columns
df = pd.get_dummies(df, columns=["type", "parent"])

show_stats(df)
df.head()

Shape: (17895, 29)
Number of parameter vecs:		17895
without logging (negatives):	17572
with logging (positives):		323
Log ratio:						1.80%
   type  count  positives     ratio
9     m    767         69  0.089961
4     h    185          4  0.021622
2     e   6970        135  0.019369
8     l   1723         32  0.018572
5     i    872         12  0.013761
6     j    166          2  0.012048
1     d   4593         53  0.011539
7     k   1233         14  0.011354
3     f   1190          2  0.001681
0     c     20          0  0.000000
10    o    176          0  0.000000


Unnamed: 0,location,length,num_siblings,num_children,depth_from_def,context,contains_logging,type_c,type_d,type_e,...,parent_d,parent_e,parent_f,parent_h,parent_i,parent_j,parent_k,parent_l,parent_m,parent_o
0,15;4-16;50,2,12,2,0,durr,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
1,19;4-60;18,42,12,13,0,deqrrerueruferuerqrrqrqrqrfrerukruu,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
2,26;8-26;57,1,13,1,2,deqrr,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
3,30;8-30;22,1,13,1,2,deqrreru,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
4,34;8-34;22,1,13,1,2,deqrrerueru,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0


In [4]:
# Convert the compacted context from letters into strings of integers.
# Every letter is replaced by its position in string.ascii_letters
# ('a' -> "0", 'b' -> "1", ...); the positions are kept as string tokens.
def _encode_context(value):
    """Return one context value as a list of integer-string tokens.

    A value that is already a list is returned unchanged, so re-running this
    cell neither fails on nor re-encodes an already converted column.
    Any other value is stringified and each character is looked up in
    ascii_letters.

    Raises:
        ValueError: if a character is not an ASCII letter.
    """
    if isinstance(value, list):
        return value
    # NOTE(review): str() turns a missing value (NaN) into the letters "nan",
    # which would be encoded like a real context - confirm the CSV never has
    # an empty context field.
    return [str(ascii_letters.index(ch)) for ch in str(value)]


df.context = [_encode_context(x) for x in df.context]
df.head()

Unnamed: 0,location,length,num_siblings,num_children,depth_from_def,context,contains_logging,type_c,type_d,type_e,...,parent_d,parent_e,parent_f,parent_h,parent_i,parent_j,parent_k,parent_l,parent_m,parent_o
0,15;4-16;50,2,12,2,0,"[3, 20, 17, 17]",0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
1,19;4-60;18,42,12,13,0,"[3, 4, 16, 17, 17, 4, 17, 20, 4, 17, 20, 5, 4,...",0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
2,26;8-26;57,1,13,1,2,"[3, 4, 16, 17, 17]",0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
3,30;8-30;22,1,13,1,2,"[3, 4, 16, 17, 17, 4, 17, 20]",0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
4,34;8-34;22,1,13,1,2,"[3, 4, 16, 17, 17, 4, 17, 20, 4, 17, 20]",0,0,0,1,...,0,1,0,0,0,0,0,0,0,0


In [5]:
# Target: the logging label. Features: everything except the label and "location".
y = df["contains_logging"]
X = df.drop(columns=["location", "contains_logging"])
# Set aside a stratified 10% holdout set; it is only used for the final test
# after cross-validation.
X, X_holdout, y, y_holdout = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
    stratify=y,
)

In [6]:
# Train a Word2Vec model on the encoded context sequences of the whole table.
# min_count=1 keeps every token, however rare.
sentences = MyCorpus(df.context.tolist())
gensim_model = gensim.models.Word2Vec(min_count=1, sentences=sentences)
# Number of distinct tokens the model holds a vector for
actual_vocab_size = len(gensim_model.wv.key_to_index)
actual_vocab_size

26

In [15]:
# --- Resampling -------------------------------------------------------------
# Passed to RandomOverSampler as its sampling_strategy
sampling_strategy = 0.05

# --- Embedding / sequence input ---------------------------------------------
# One more than the Word2Vec vocabulary - presumably to leave room for the
# padding index; confirm against build_embedding_matrix.
vocab_size = actual_vocab_size + 1
output_dims = 100
max_length = 80
trainable = True

# --- Network ----------------------------------------------------------------
num_nodes = 128
dropout = 0.2

# --- Training ---------------------------------------------------------------
num_epochs = 20
batch_size = 64
val_split = 0.0
# Only written to the results file; not passed to model.fit in this notebook
class_weight = "class_weight_unsupported"
cmpltn_metrics = [tfa.metrics.F1Score(threshold=0.5, num_classes=1)]

# --- Callbacks --------------------------------------------------------------
callback = ["cp"]
callback_monitor = 'val_f1_score'

# --- Cross-validation settings ----------------------------------------------
n_splits = 1

In [16]:
# Build embedding matrix
# build_embedding_matrix comes from notebook_helper; presumably it returns a
# (vocab_size x output_dims) array filled from the Word2Vec vectors - confirm
# against the helper. The result is handed to build_hybrid_model when the
# model is created.
embedding_matrix = build_embedding_matrix(vocab_size, output_dims, gensim_model)

In [17]:
# Build the two model inputs for the holdout set:
#   "context" - the token sequences padded/truncated to max_length
#   "other"   - every remaining feature column
padded_inputs_holdout = pad_sequences(
    np.array(X_holdout.context.tolist(), dtype=object),
    maxlen=max_length,
    value=0.0,
)
regular_inputs_holdout = X_holdout.drop(columns=["context"])
X_holdout_dict = dict(context=padded_inputs_holdout, other=regular_inputs_holdout)

In [None]:
# Train and evaluate the hybrid model on stratified train/validation splits,
# score the best fold's checkpoint on the holdout set and append one result
# row to results_zhenhao.txt. Progress is logged to progess_zhenhao.log.
run_name = f"{repo_name}_cv{n_splits}"
run_number = 1
# All relative paths below (model folder, log and result files) assume the
# notebook is executed from a directory whose name ends with "notebooks".
if not os.getcwd().endswith("notebooks"):
    raise Exception("Bad working directory")
# Use the first run<N> folder that does not exist yet for this dataset
while os.path.isdir(f"my_zhenhao_models/{repo_name}/run{run_number}/"):
    run_number += 1
run_folder = f"run{run_number}"

# DEBUG
debug = False
if debug:
    num_epochs = 1
    batch_size = 256
    n_splits = 1
    run_folder = abs(hash(str(time.ctime())))
    run_name = f"DEBUG_{run_name}"
# /DEBUG

start = time.time()
histories = []
test_sets = []

final_bal_acc_all, final_precision_all, final_recall_all, final_f1_all = [], [], [], []
best_bal_acc_all, best_precision_all, best_recall_all, best_f1_all = [], [], [], []
# K-fold cross-validation
if n_splits == 1:
    # A single stratified 75/25 split
    indices = np.arange(y.shape[0])
    strat_train_idx, strat_val_idx = train_test_split(indices, test_size=0.25, stratify=y, random_state=0)
    idx_iter = [(strat_train_idx, strat_val_idx)]
else:
    skf = StratifiedShuffleSplit(n_splits=n_splits, test_size=0.25, random_state=0)
    idx_iter = skf.split(X, y)
for k_fold, (train_index, test_index) in enumerate(idx_iter):
    print(f"Starting fold {k_fold + 1} of {n_splits}.")
    with open("progess_zhenhao.log", "a") as prog_log:
        prog_log.write(f"{time.ctime()} Starting fold {k_fold + 1} of {n_splits}. Run folder: {run_folder}\n")

    # Build and compile a fresh model for every fold. Re-using one model
    # would carry the weights (and optimizer state) learned on earlier folds
    # into later ones, whose validation rows may have been trained on before.
    # The last argument is the number of "other" features: all columns of X
    # except "context".
    model = build_hybrid_model(vocab_size, output_dims, embedding_matrix, max_length,
                               trainable, num_nodes, dropout, X.shape[1] - 1)
    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=cmpltn_metrics)

    # Split the data into train and test sets
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # Oversample the minority class (training part only). The sampler is
    # seeded like the splits above so that a re-run draws the same duplicates.
    sampler = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=0)
    X_train, y_train = sampler.fit_resample(X_train, y_train)
    # Pad the context to create the context input
    padded_inputs = pad_sequences(np.array(list(X_train.context), dtype=object), maxlen=max_length, value=0.0)
    padded_inputs_test = pad_sequences(np.array(list(X_test.context), dtype=object), maxlen=max_length, value=0.0)
    # Prepare the "other" input
    regular_inputs = X_train.drop(["context"], axis=1)
    regular_inputs_test = X_test.drop(["context"], axis=1)
    # Put both inputs into a dict
    X_train_dict = {"context": padded_inputs, "other": regular_inputs}
    X_test_dict = {"context": padded_inputs_test, "other": regular_inputs_test}
    # Append to the list of test sets
    test_sets.append((X_test_dict, y_test))
    # Build the callbacks
    callbacks, model_cp_filepath = build_callbacks(callback, callback_monitor, repo_name, run_folder, k_fold, zhenhao=True)
    # Fit the model. validation_data is given explicitly; per the Keras
    # documentation it takes precedence over validation_split.
    history = model.fit(
        X_train_dict,
        {"logging": y_train},
        epochs=num_epochs,
        batch_size=batch_size,
        validation_data=(X_test_dict, y_test),
        validation_split=val_split,
        callbacks=callbacks,
    )
    histories.append(history)

    # Predict with final weights (probabilities are rounded to 0/1)
    pred_test = model.predict(X_test_dict, batch_size=batch_size)
    y_pred = np.round(pred_test)
    final_bal_acc_all.append(balanced_accuracy_score(y_test, y_pred))
    final_precision_all.append(precision_score(y_test, y_pred))
    final_recall_all.append(recall_score(y_test, y_pred))
    final_f1_all.append(f1_score(y_test, y_pred))
    # Now load the best weights and predict on test data again
    if "cp" in callback:
        model.load_weights(model_cp_filepath)
        best_pred_test = model.predict(X_test_dict, batch_size=batch_size)
        best_y_pred = np.round(best_pred_test)
        best_bal_acc_all.append(balanced_accuracy_score(y_test, best_y_pred))
        best_precision_all.append(precision_score(y_test, best_y_pred))
        best_recall_all.append(recall_score(y_test, best_y_pred))
        best_f1_all.append(f1_score(y_test, best_y_pred))

# Determine best fold and predict on holdout set
# NOTE: this requires "cp" in callback; without checkpoints best_f1_all is
# empty and np.argmax raises a ValueError.
best_fold = np.argmax(best_f1_all)
# NOTE(review): assumes build_callbacks stores the fold checkpoints under
# exactly this path - confirm against notebook_helper.
best_fold_filepath = f'my_zhenhao_models/{repo_name}/{run_folder}/fold{best_fold}'
model.load_weights(best_fold_filepath)
pred_holdout = model.predict(X_holdout_dict, batch_size=batch_size)
y_pred_holdout = np.round(pred_holdout)

end = time.time()
execution_time = int(end - start)

# One result row: the settings of this run followed by the metrics.
# Metrics are formatted as fixed-point text and the leading "0." is cut off
# with [2:] (so 0.87 is written as "87"; a perfect 1.00 would appear as "00").
scores = [
    run_name,
    time.ctime(),
    sampling_strategy,
    max_length,
    vocab_size,
    batch_size,
    trainable,
    dropout,
    val_split,
    callback,
    callback_monitor,
    num_nodes,
    num_epochs,
    class_weight,
    list(map(lambda x: x.name if callable(x) else x, cmpltn_metrics)),
    run_folder,
    execution_time,
    f"{np.mean(final_bal_acc_all, axis=0):.2f}"[2:],
    f"{np.mean(final_precision_all, axis=0):.2f}"[2:],
    f"{np.mean(final_recall_all, axis=0):.2f}"[2:],
    f"{np.mean(final_f1_all, axis=0):.3f}"[2:],
    f"{np.mean(best_bal_acc_all, axis=0):.2f}"[2:],
    f"{np.mean(best_precision_all, axis=0):.2f}"[2:],
    f"{np.mean(best_recall_all, axis=0):.2f}"[2:],
    f"{np.mean(best_f1_all, axis=0):.3f}"[2:],
    best_fold,
    f"{best_f1_all[best_fold]:.3f}"[2:],
    f"{balanced_accuracy_score(y_holdout, y_pred_holdout):.2f}"[2:],
    f"{precision_score(y_holdout, y_pred_holdout):.2f}"[2:],
    f"{recall_score(y_holdout, y_pred_holdout):.2f}"[2:],
    f"{f1_score(y_holdout, y_pred_holdout):.3f}"[2:],
]
with open("results_zhenhao.txt", "a") as out:
    # Column header, kept for reference (written manually when a new file is started):
    # out.write(iteration_features + ", Final_Bal_Acc, Final_Prec, Final_Recall, Final_F1, Best_Bal_Acc, Best_Prec, Best_Recall, Best_F1, Best_Fold, Best_Fold_F1, Best_Fold_Holdout_Bal_Acc, Best_Fold_Holdout_Prec, Best_Fold_Holdout_Recall, Best_Fold_Holdout_F1 \n")
    out.write(str(scores).replace("'", "")[1:-1] + "\n")

with open("progess_zhenhao.log", "a") as prog_log:
    prog_log.write(f"{time.ctime()} Finished fold {n_splits}. Run folder: {run_folder}\n")