<a href="https://colab.research.google.com/github/nabeel-gulzar/CodeCloneDetection/blob/main/DeepSim.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !git clone https://github.com/parasol-aser/deepsim.git

Cloning into 'deepsim'...
remote: Enumerating objects: 75, done.[K
remote: Total 75 (delta 0), reused 0 (delta 0), pack-reused 75[K
Unpacking objects: 100% (75/75), done.


In [1]:
!nvidia-smi

Wed Mar 16 11:15:19 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   72C    P8    33W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import os
import numpy as np
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
import time
from functools import reduce

In [None]:
tf.__version__

'2.8.0'

In [None]:
# !pip install --upgrade tf_slim



In [None]:
import tf_slim as slim

In [None]:
# !chmod 755 deepsim/dcsim/encoding/encoding.jar

In [None]:
# os.path.exists("deepsim/dcsim/encoding/encoding.jar")
# !encoding.jar "hi"
# !deepsim/dcsim/encoding/src/encoding.jar

In [None]:
bin_vec_dim = 88
embedding_dim = 6
dim = 128
keep_prob = 0.75

batch_size = 256
test_size = 256

beta = 0.00003
# beta = 0.00001 # for model with batch normalization
reg_term = None

logdir = '/tmp/tf_logs'

In [None]:
def from_sparse_arr(sparse_arr):
    mat = np.zeros((dim, dim, bin_vec_dim), dtype=np.float32)
    for (i, j, k) in sparse_arr:
        mat[i, j, k] = 1
    return mat

def from_sparse_arrs(sparse_arrs):
    mats = []
    for sparse_arr in sparse_arrs:
        mats.append(from_sparse_arr(sparse_arr))
    mats = np.array(mats, dtype=np.float32)
    return mats

In [None]:
# file_path = "/content/deepsim/dataset/g4_128.npy"
# dataset = np.load(open(file_path, 'rb'), allow_pickle=True)
# X, y = np.array(dataset['X']), np.array(dataset['y'], dtype='int')

  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
def batch_act(h, act, phase, scope):
    with tf.compat.v1.variable_scope(scope):
        return act(h)


def variable_summaries(var):
    """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
    with tf.name_scope('summaries'):
        mean = tf.reduce_mean(var)
        tf.summary.scalar('mean', mean)
        with tf.name_scope('stddev'):
            stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
        tf.summary.scalar('stddev', stddev)
        tf.summary.scalar('max', tf.reduce_max(var))
        tf.summary.scalar('min', tf.reduce_min(var))
        tf.summary.histogram('histogram', var)

In [None]:
def model(X, dropout, phase):
    global reg_term
    num = tf.shape(X)[0]
    with tf.name_scope('emb_layer'):
        wf = init_weights([bin_vec_dim, embedding_dim], 'wf')
        reg_term = tf.nn.l2_loss(wf)
        variable_summaries(wf)
        bf = init_bias([embedding_dim], 'bf')
        variable_summaries(bf)
        X = tf.reshape(X, [num * dim * dim, bin_vec_dim])
        h0 = tf.nn.bias_add(tf.matmul(X, wf), bf)
        h0 = batch_act(h0, phase=phase, act=tf.nn.elu, scope='emb_layer_bn')
        h0 = tf.reshape(h0, [num * dim, dim * embedding_dim])
        h0 = tf.nn.dropout(h0, dropout)
    with tf.name_scope('row_fc_layer1'):
        wr1 = init_weights([embedding_dim * dim, 256], 'wr1')  # 128
        reg_term += tf.nn.l2_loss(wr1)
        br1 = init_bias([256], 'br1')
        h1 = tf.nn.bias_add(tf.matmul(h0, wr1), br1)
        h1 = batch_act(h1, phase=phase, act=tf.nn.elu, scope='row_fc_layer1_bn')
        h1 = tf.nn.dropout(h1, dropout)
    with tf.name_scope('row_fc_layer2'):
        wr2 = init_weights([256, 64], 'wr2')  # 32
        reg_term += tf.nn.l2_loss(wr2)
        br2 = init_bias([64], 'br2')
        h2 = tf.nn.bias_add(tf.matmul(h1, wr2), br2)
        h2 = batch_act(h2, phase=phase, act=tf.nn.elu, scope='row_fc_layer2_bn')
        h2 = tf.reshape(h2, [num, dim, 64])  # 32
    with tf.name_scope('avg_pooling'):
        h3 = tf.reduce_mean(h2, 1)
    return h3

In [None]:
def init_weights(shape, name):
    return tf.compat.v1.get_variable(name=name, shape=shape, dtype=tf.float32,
                           initializer=slim.variance_scaling_initializer(
                               factor=1.0, mode='FAN_AVG', uniform=True))

def init_bias(shape, name):
    if len(shape) > 1:
        raise Exception('Bias should be a vector.')
    return tf.compat.v1.get_variable(name=name, shape=shape, dtype=tf.float32,
                           initializer=tf.constant_initializer(
                               0.01))

In [None]:
def classification(X1, X2, dropout, phase):
    global reg_term
    with tf.compat.v1.variable_scope('encoding') as scope:
        h31 = model(X1, dropout, phase)
        scope.reuse_variables()
        h32 = model(X2, dropout, phase)
    h41 = tf.concat(values=[h31, h32], axis=1)
    with tf.name_scope('fc_layer1_1'):
        w5 = init_weights([128, 32], 'w5')  # 64 16
        reg_term += tf.nn.l2_loss(w5)
        b5 = init_bias([32], 'b5')
        h5_1 = tf.nn.bias_add(tf.matmul(h41, w5), b5)
        h5_1 = batch_act(h5_1, phase=phase, act=tf.nn.elu,
                         scope='fc_layer1_1_bn')
    h42 = tf.concat(values=[h32, h31], axis=1)
    with tf.name_scope('fc_layer1_2'):
        h5_2 = tf.nn.bias_add(tf.matmul(h42, w5), b5)
        h5_2 = batch_act(h5_2, phase=phase, act=tf.nn.elu,
                         scope='fc_layer1_2_bn')
    h5 = (h5_1 + h5_2) / 2.
    with tf.name_scope('sm_layer'):
        w7 = init_weights([32, 2], 'w7')
        reg_term += tf.nn.l2_loss(w7)
        variable_summaries(w7)
        o = tf.matmul(h5, w7)
    return o

In [None]:
def make_pairs_10_fold(X, Y, pos_ratio = 1.0, neg_ratio=1.0, add_all_neg=False):
    indices = np.random.permutation(np.shape(Y)[0])
    X = np.array(X)[indices]
    Y = np.array(Y, dtype=np.int)[indices]
    y_dist = np.bincount(Y)
    positive_count = reduce(lambda n1, n2: n1+n2, map(lambda num: num*num/2,
                                          y_dist.tolist()))
    X_left = []
    X_right = []
    trainY = []
    p = positive_count * neg_ratio * pos_ratio / (len(X) * len(X) / 2)
    for i in range(len(X)):
        for j in range(i + 1, len(X)):
            if Y[i] == Y[j] and np.random.rand(1)[0] <= pos_ratio:
                X_left.append(X[i])
                X_right.append(X[j])
                trainY.append([0, 1])
            elif np.random.rand(1)[0] <= p or add_all_neg:
                X_left.append(X[i])
                X_right.append(X[j])
                trainY.append([1, 0])

    indices = np.random.permutation(np.shape(trainY)[0])
    sample_X_left = np.array(X_left)[indices]
    sample_X_right = np.array(X_right)[indices]
    sample_Y = np.array(trainY, dtype=np.float32)[indices]
    return sample_X_left, sample_X_right, sample_Y

In [None]:
def stat(Y, predicted_Y, fout=None):
    real_positive_count = 0
    predict_positive_count = 0
    recall = 0
    precision = 0
    print(f"True Predictions: {predicted_Y.sum()}")
    for i in range(Y.shape[0]):
        if Y[i] == 1:
            real_positive_count += 1
            if predicted_Y[i] == 1:
                recall += 1
        if predicted_Y[i] == 1:
            predict_positive_count += 1
            if Y[i] == 1:
                precision += 1
    retrieved_positive_count = recall
    recall /= real_positive_count * 1.0
    precision /= max(predict_positive_count * 1.0, 1.0)
    f1_score = 2 * recall * precision / max(
    recall + precision, 0.00001)
    print(f"Clone pairs: {real_positive_count}, non-clone pairs: {Y.shape[0] - real_positive_count}")
    print(f"Recall: {recall}, precision: {precision}, f1 score: {f1_score}")
    print(f"Predicted_positive_count: {predict_positive_count}, recall truly positive: {retrieved_positive_count}")
    print(f"false positive: {predict_positive_count - retrieved_positive_count}")
    print(f"missed true positive: {real_positive_count - retrieved_positive_count}")
    if fout is not None:
        fout.write(f"Clone pairs: {real_positive_count}, non-clone pairs: {Y.shape[0] - real_positive_count}\n")
        fout.write(f"Recall: {recall:.4f}, precision: {precision:.4f}, f1 score: {f1_score:.4f}\n")
        fout.write(f"Predicted_positive_count: {predict_positive_count}, recall truly positive: {retrieved_positive_count}, ")
        fout.write(f"false positive: {predict_positive_count - retrieved_positive_count}, missed true positive: {real_positive_count - retrieved_positive_count}\n")
    return recall, precision, f1_score

In [None]:
tf.compat.v1.disable_eager_execution()

In [None]:
with tf.name_scope('input'):
    X_left = tf.compat.v1.placeholder(tf.float32, [None, dim, dim, bin_vec_dim])
    X_right = tf.compat.v1.placeholder(tf.float32, [None, dim, dim, bin_vec_dim])
    Y = tf.compat.v1.placeholder(tf.float32, [None, 2])
dropout = tf.compat.v1.placeholder(tf.float32)
phase = tf.compat.v1.placeholder(tf.bool, name='phase')
sample_weights = tf.compat.v1.placeholder(tf.float32, [batch_size])

In [None]:
py_x = classification(X_left, X_right, dropout, phase)

In [None]:
py_x

<tf.Tensor 'sm_layer/MatMul:0' shape=(None, 2) dtype=float32>

In [None]:
cost = tf.reduce_mean(
    tf.compat.v1.losses.softmax_cross_entropy(logits=py_x, onehot_labels=Y,
                                    weights=sample_weights))

In [None]:
tf.summary.scalar('cost', cost)
cost = tf.reduce_mean(cost + beta * reg_term)
update_ops = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS)

In [None]:
with tf.control_dependencies(update_ops):
    train_op = tf.compat.v1.train.AdamOptimizer(learning_rate=0.001).minimize(
        cost)
    predict_op = tf.argmax(py_x, 1)

In [None]:
skf = StratifiedKFold(n_splits=10)

In [None]:
file_path = "/content/deepsim/dataset/g4_128.npy"
dataset = np.load(open(file_path, 'rb'), allow_pickle=True)
X, y = np.array(dataset['X']), np.array(dataset['y'], dtype='int')

# shuffle
indices = np.random.permutation(X.shape[0])
X = X[indices]
y = y[indices]
fold_index = 0
avg_accuracy = 0.
avg_recall = 0.
avg_precision = 0.
avg_f1_score = 0.

  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
# !rm -rf 10_fold_balanced/
if os.path.exists('result') is not True:
    os.mkdir("result")
if os.path.exists("10_fold_balanced") is not True:
    os.mkdir("10_fold_balanced")
fout = open('result/10_fold_balanced.txt', 'w')

In [None]:
from math import ceil
from tqdm.notebook import tqdm

# tf.compat.v1.summary.merge_all

In [None]:
fold_index = 0

for train_idx, test_idx in skf.split(X, y):
    print ('*' * 40 + str(fold_index) + '*' * 40)
    fold_path = os.path.join("10_fold_balanced", str(fold_index))
    if os.path.exists(fold_path) is not True:
        os.mkdir(fold_path)
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    train_X_left, train_X_right, train_Y = \
        make_pairs_10_fold(X_train, y_train, neg_ratio=10.0,
                                          pos_ratio=1.0, add_all_neg=True)
    test_X_left, test_X_right, test_Y = \
        make_pairs_10_fold(X_test, y_test, neg_ratio=1.0,
                                          pos_ratio=1.0, add_all_neg=True)

    # compute the class weights
    classes_numbers = np.bincount(np.argmax(train_Y, axis=1))
    classes_weights = np.array([classes_numbers[1] * 2.0 /
                                  (classes_numbers[0] + classes_numbers[1]),
                                  classes_numbers[0] * 1.0 /
                                  (classes_numbers[0] + classes_numbers[1])],
                                dtype=np.float32)
    classes_weights = np.reshape(classes_weights, newshape=[2,1])

    t_beg = time.process_time()
    # tf.reset_default_graph() # reset the model
    with tf.compat.v1.Session() as sess:
        sess.run(tf.compat.v1.global_variables_initializer())
        sess.run(tf.compat.v1.local_variables_initializer())
        merged = tf.compat.v1.summary.merge_all()
        train_writer = tf.compat.v1.summary.FileWriter(
            logdir, sess.graph)
        saver = tf.compat.v1.train.Saver(max_to_keep=3)
        for epoch in range(4):
            # re-shuffle for each epoch
            # print(f"epoch: {epoch}")
            indices = np.random.permutation(train_X_left.shape[0])
            train_X_left = train_X_left[indices]
            train_X_right = train_X_right[indices]
            train_Y = train_Y[indices]
            # for small test
            dense_test_X_left = from_sparse_arrs(test_X_left[0:test_size])
            dense_test_X_right = from_sparse_arrs(test_X_right[0:test_size])
            n_examples = train_X_left.shape[0]
            n_batches = ceil(n_examples/batch_size)
            step = 0
            for start, end in tqdm(zip(
                    range(0, np.shape(train_X_left)[0], batch_size),
                    range(batch_size, np.shape(train_X_left)[0] + 1,
                          batch_size))):
                dense_train_X_left = from_sparse_arrs(
                    train_X_left[start:end])
                dense_train_X_right = from_sparse_arrs(
                    train_X_right[start:end])
                batch_samples_weights = np.matmul(train_Y[start:end],
                                                  classes_weights)
                batch_samples_weights = np.reshape(batch_samples_weights,
                                                    newshape=[batch_size])
                _ = sess.run([train_op],
                                      feed_dict={X_left: dense_train_X_left,
                                                  X_right: dense_train_X_right,
                                                  Y: train_Y[start:end],
                                                  sample_weights:
                                                      batch_samples_weights,
                                                  dropout: keep_prob,
                                                  phase: 1})
                print('epoch %d, iteration %d/%d\n' % (epoch, step, n_batches))
                step += 1
                # if step % 100 == 0 and step != 0:
                #     batch_samples_weights = np.matmul(test_Y[:test_size],
                #                                       classes_weights)
                #     batch_samples_weights = np.reshape(
                #         batch_samples_weights,
                #         newshape=[test_size])
                #     predict_Y, summary = sess.run([predict_op, merged],
                #                           feed_dict={
                #                               X_left: dense_test_X_left,
                #                               X_right: dense_test_X_right,
                #                               Y: test_Y[:test_size],
                #                               sample_weights:batch_samples_weights,
                #                               dropout: 1.0,
                #                               phase: 0})  # no dropout
                #     train_writer.add_summary(summary, step)
                #     print(epoch, np.mean(
                #         np.argmax(test_Y[:test_size], axis=1) == predict_Y))
        saver.save(sess, os.path.join(fold_path, 'mode.ckpt'))
        print("model saved.")
        t_end = time.process_time()
        print('Time cost: %.2f' % (t_end - t_beg))

        # validation
        overall_accuracy = 0.
        overall_predict_Y = []
        iter = 0

        print(f"{'*'*20}Evaluation{'*'*20}")
        for start, end in zip(
                range(0, np.shape(test_X_left)[0], batch_size),
                range(batch_size, np.shape(test_X_left)[0] + 1,
                      batch_size)):
            dense_test_X_left = from_sparse_arrs(test_X_left[start:end])
            dense_test_X_right = from_sparse_arrs(test_X_right[start:end])
            predict_Y = sess.run(predict_op,
                                  feed_dict={X_left: dense_test_X_left,
                                            X_right: dense_test_X_right,
                                            dropout: 1.0,
                                            phase: 0})  # no dropout
            overall_predict_Y.extend(predict_Y.tolist())
            accuracy = np.mean(
                np.argmax(test_Y[start:end], axis=1) == predict_Y)
            iter += 1
            overall_accuracy += accuracy

        print('Overall accuracy: %.5f' % (overall_accuracy / iter))
        t_end = time.process_time()
        print('Time cost: %.2f' % (t_end - t_beg))
        fout.write('*' * 80 + '\n')
        fout.write('Fold %d:\n' % (fold_index))
        fout.write('Overall accuracy: %.5f\n' % (overall_accuracy / iter))
        fout.write('Time cost: %.2f\n' % (t_end - t_beg))
        recall, precision, f1_score = stat(
            np.argmax(test_Y[:len(overall_predict_Y)], axis=1),
            np.array(overall_predict_Y, dtype='int'), fout=fout)
        fout.flush()
        avg_accuracy += overall_accuracy / iter
        avg_recall += recall
        avg_precision += precision
        avg_f1_score += f1_score
    print('*' * 80)
    fold_index += 1

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  after removing the cwd from sys.path.


****************************************0****************************************




0it [00:00, ?it/s]

epoch 0, iteration 0/4404

epoch 0, iteration 1/4404

epoch 0, iteration 2/4404

epoch 0, iteration 3/4404

epoch 0, iteration 4/4404

epoch 0, iteration 5/4404

epoch 0, iteration 6/4404

epoch 0, iteration 7/4404

epoch 0, iteration 8/4404

epoch 0, iteration 9/4404

epoch 0, iteration 10/4404

epoch 0, iteration 11/4404

epoch 0, iteration 12/4404

epoch 0, iteration 13/4404

epoch 0, iteration 14/4404

epoch 0, iteration 15/4404

epoch 0, iteration 16/4404

epoch 0, iteration 17/4404

epoch 0, iteration 18/4404

epoch 0, iteration 19/4404

epoch 0, iteration 20/4404

epoch 0, iteration 21/4404

epoch 0, iteration 22/4404

epoch 0, iteration 23/4404

epoch 0, iteration 24/4404

epoch 0, iteration 25/4404

epoch 0, iteration 26/4404

epoch 0, iteration 27/4404

epoch 0, iteration 28/4404

epoch 0, iteration 29/4404

epoch 0, iteration 30/4404

epoch 0, iteration 31/4404

epoch 0, iteration 32/4404

epoch 0, iteration 33/4404

epoch 0, iteration 34/4404

epoch 0, iteration 35/4404

ep

In [None]:
avg_accuracy /= 10.0
avg_precision /= 10.0
avg_recall /= 10.0
avg_f1_score /= 10.0
print('Avg accuracy: %.4f, avg recall: %.4f, avg precision: %.4f, avg f1 '
      'score: %.4f' % (
      avg_accuracy, avg_recall, avg_precision, avg_f1_score))
fout.write('*' * 80 + '\n')
fout.write(
    'Avg accuracy: %.4f, avg recall: %.4f, avg precision: %.4f, avg f1 '
    'score: %.4f' % (avg_accuracy, avg_recall, avg_precision, avg_f1_score))
fout.close()