In [1]:
import sys
sys.path.append('..')

In [2]:
from pathlib import Path
from itertools import islice
import csv
from typing import List
import json

import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.pipeline import Pipeline
from catboost import CatBoostClassifier

import mmr.ds_loading
import mmr.vectorization
import cubert_wrapper

In [3]:
def vectorizer(x):
    return x.mean(axis=0)

In [86]:
data_root = Path('/home/maxkvant/data/mmr/')

test_classes_vecs_path = data_root / 'mmr_vecs_np'
test_methods_vecs_path = data_root / 'mmr_vecs_wm'
test_ds = mmr.ds_loading.MMRDataset(data_root / 'MoveMethodDataset', test_methods_vecs_path, test_classes_vecs_path, 
                                    vectorizer, vectorizer, True, True, precalculated=True)

train_classes_vecs_path = data_root / 'mmr_tr_dsv'
train_methods_vecs_path = data_root / 'mmr_tr_dsvwm_v2'
train_ds = mmr.ds_loading.MMRDataset(data_root / 'mmr_tr_ds', train_methods_vecs_path, train_classes_vecs_path, 
                                     vectorizer, vectorizer, True, True, precalculated=True)

In [87]:
c_tr = {i[0] for i in train_ds}
c_te = {i[0] for i in test_ds}
tr_exclude_projects = c_tr & c_te
tr_exclude_projects

{'actor-platform',
 'atlas',
 'buck',
 'crate',
 'deeplearning4j',
 'drools',
 'hbase',
 'hive',
 'jenkins',
 'jstorm',
 'pinpoint',
 'pmd'}

In [5]:
# tr_exclude_projects = {'actor-platform', 'atlas', 'bazel', 'buck', 'crate', 'deeplearning4j', 'drools', 'hbase', 'hive', 
#                        'jenkins', 'jstorm', 'pinpoint', 'pmd'}

In [8]:
proj_train, proj_val = train_test_split(list(c_tr - tr_exclude_projects))

In [88]:
def ds_to_xy(ds, val_projects, exclude_projects = {}):
    x_train, x_val, y_train, y_val = [], [], [], []
    mn_train, mn_val = [], []
    for project, mn, _, mv, cv, tgt in ds:
        vec = np.concatenate((mv, cv))
        if project in val_projects:
            x_val.append(vec)
            y_val.append(tgt)
            mn_val.append(mn)
        elif project not in exclude_projects:
            x_train.append(vec)
            y_train.append(tgt)
            mn_train.append(mn)
    return np.array(x_train), np.array(x_val), np.int64(y_train), np.int64(y_val), mn_train, mn_val


x_train, x_val, y_train, y_val, mn_train, mn_val = ds_to_xy(train_ds, proj_val, tr_exclude_projects)
x_test, _, y_test, _, mn_test, _ = ds_to_xy(test_ds, {})

In [19]:
svm_clf = SVC(C=10)
svm_clf.fit(x_train, y_train)
f1_score(y_val, svm_clf.predict(x_val)), roc_auc_score(y_val, svm_clf.predict(x_val))

(0.771722625274169, 0.753852142887309)

In [7]:
gbc = CatBoostClassifier(task_type='CPU', max_depth=10, od_pval=1e-3)
gbc.fit(x_train, y_train, eval_set=(x_val, y_val))
f1_score(y_val, gbc.predict(x_val)), roc_auc_score(y_val, gbc.predict(x_val))

Learning rate set to 0.046785
0:	learn: 0.6555173	test: 0.6762403	best: 0.6762403 (0)	total: 1.55s	remaining: 25m 51s
1:	learn: 0.6267027	test: 0.6600211	best: 0.6600211 (1)	total: 3.04s	remaining: 25m 17s
2:	learn: 0.5991910	test: 0.6421295	best: 0.6421295 (2)	total: 4.47s	remaining: 24m 47s
3:	learn: 0.5728015	test: 0.6326780	best: 0.6326780 (3)	total: 5.91s	remaining: 24m 31s
4:	learn: 0.5489978	test: 0.6169663	best: 0.6169663 (4)	total: 7.39s	remaining: 24m 30s
5:	learn: 0.5296132	test: 0.6078033	best: 0.6078033 (5)	total: 8.82s	remaining: 24m 21s
6:	learn: 0.5107164	test: 0.5923723	best: 0.5923723 (6)	total: 10.2s	remaining: 24m 13s
7:	learn: 0.4968553	test: 0.5831999	best: 0.5831999 (7)	total: 11.7s	remaining: 24m 9s
8:	learn: 0.4801428	test: 0.5762454	best: 0.5762454 (8)	total: 13.2s	remaining: 24m 9s
9:	learn: 0.4641208	test: 0.5683839	best: 0.5683839 (9)	total: 14.6s	remaining: 24m 7s
10:	learn: 0.4509662	test: 0.5581918	best: 0.5581918 (10)	total: 16.1s	remaining: 24m 5s
11:	

(0.7858407079646017, 0.7969087991068313)

In [55]:
import torch
from torch import nn

In [89]:
train_dl = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(torch.tensor(x_train), torch.tensor(y_train)), 
                                       batch_size=2048)
val_dl = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(torch.tensor(x_val), torch.tensor(y_val)), batch_size=2048)
test_dl = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(torch.tensor(x_test), torch.tensor(y_test)), batch_size=2048)

In [95]:
nn_clf = nn.Sequential(
    nn.Linear(2048, 4096), 
    nn.PReLU(),
    nn.Linear(4096, 4096),
    nn.PReLU(),
    nn.Linear(4096, 4096),
    nn.PReLU(),
    nn.Linear(4096, 1),
    nn.Sigmoid()
)

In [96]:
loss_function = nn.BCELoss()
opt = torch.optim.Adam(nn_clf.parameters(), lr=1e-4, weight_decay=1e-4)
lr_scheduler = torch.optim.lr_scheduler.StepLR(opt, 100, .1)
save_path = 'avg_pm_mlp.pth'

In [97]:
n_epochs = 100

for epoch in range(n_epochs):
    train_losses, val_losses, val_f1s = [], [], []
    best_score = -1
    for x, y in train_dl:
        opt.zero_grad()
        pred = nn_clf(x)
        y = y.unsqueeze(1).float()
        loss = loss_function(pred, y)
        loss.backward()
        opt.step()
        train_losses.append(loss.detach())
    lr_scheduler.step()
        
    with torch.no_grad():
        for x, y in val_dl:
            pred = nn_clf(x)
            y = y.unsqueeze(1).float()
            loss = loss_function(pred, y)
            val_losses.append(loss.detach())
            val_f1s.append(f1_score(y, pred > .5))
    train_loss = torch.tensor(train_losses).mean()
    val_loss = torch.tensor(val_losses).mean()
    val_f1 = torch.tensor(val_f1s).mean()
    if val_f1 > best_score:
        best_score = val_f1
        torch.save(nn_clf.state_dict(), save_path)
    print(f'Epoch {epoch} loss: train {train_loss:.4f} val {val_loss:.4f} f1: {val_f1:.4f}')

Epoch 0 loss: train 0.6675 val 0.6140 f1: 0.6021
Epoch 1 loss: train 0.5952 val 0.5746 f1: 0.6961
Epoch 2 loss: train 0.5567 val 0.5467 f1: 0.7613
Epoch 3 loss: train 0.5378 val 0.5328 f1: 0.7523
Epoch 4 loss: train 0.5203 val 0.5194 f1: 0.7633
Epoch 5 loss: train 0.5011 val 0.5059 f1: 0.7631
Epoch 6 loss: train 0.4995 val 0.5404 f1: 0.7855
Epoch 7 loss: train 0.4898 val 0.5007 f1: 0.7922
Epoch 8 loss: train 0.4642 val 0.4801 f1: 0.7885
Epoch 9 loss: train 0.4613 val 0.4717 f1: 0.8054
Epoch 10 loss: train 0.4642 val 0.5206 f1: 0.7925
Epoch 11 loss: train 0.4636 val 0.4815 f1: 0.8022
Epoch 12 loss: train 0.4284 val 0.4581 f1: 0.8146
Epoch 13 loss: train 0.4681 val 0.5306 f1: 0.7850
Epoch 14 loss: train 0.4839 val 0.4804 f1: 0.8038
Epoch 15 loss: train 0.4590 val 0.4789 f1: 0.7650
Epoch 16 loss: train 0.4249 val 0.4663 f1: 0.8118
Epoch 17 loss: train 0.4081 val 0.4480 f1: 0.8191
Epoch 18 loss: train 0.4035 val 0.4591 f1: 0.7868
Epoch 19 loss: train 0.4130 val 0.4410 f1: 0.8166
Epoch 20 l

In [100]:
nn_clf.load_state_dict(torch.load('avg_pm_mlp.pth'))
nn_clf.train(False)
with torch.no_grad():
    pred = nn_clf(torch.tensor(x_test))
    print(roc_auc_score(y_test, pred), f1_score(y_test, pred > .5))

0.8561753620140976 0.7315887315887316


In [48]:
from collections import defaultdict


def per_project_f1(mns, tgts, preds, projs):
    m_true_scores = defaultdict(dict)
    m_false_scores = defaultdict(lambda: defaultdict(list))
    for mn, tgt, pred, proj in zip(mns, tgts, preds, projs):
        if tgt:
            m_true_scores[proj][mn] = pred
        else:
            m_false_scores[proj][mn].append(pred)
    f1s = []
    for proj in m_true_scores.keys():
        tp, n_refs, n_methods = 0, 0, 0
        for mn, mts in m_true_scores[proj].items():
            if not m_false_scores[proj][mn]:
                continue
            if max(mts, *m_false_scores[proj][mn]) > .5:
                n_refs += 1
                if mts > max(m_false_scores[proj][mn]):
                    tp += 1
            n_methods += 1
        precision = tp / n_refs
        recall = tp / n_methods
        f1s.append(2 * precision * recall / (precision + recall))
    return np.mean(f1s)

In [46]:
proj_test = [name for (name, *_) in test_ds]

In [101]:
per_project_f1(mn_test, y_test, pred, proj_test)

0.8496250459159477