In [200]:
# --- Imports and notebook-wide configuration --------------------------------
import pandas as pd
import sys
# NOTE(review): machine-specific absolute path added to the import search path;
# the notebook will not find project modules on another machine unless this is
# adapted — consider a path relative to the repository root.
sys.path.append("/home/RDC/zinovyee.hub/H:/zinovyee.hub/IRTG/MLSC/MLSC_DD/")
import numpy as np
import pickle
from catboost import CatBoostClassifier
# NOTE(review): LGBMClassifier, AutoTokenizer and LogisticRegression are not
# used by any active code in the visible part of this notebook (AutoTokenizer
# appears only in a commented-out cell) — confirm before removing.
from lightgbm import LGBMClassifier
from transformers import AutoTokenizer
import os
from scipy.sparse import hstack

# Set at the top of the notebook, before any model is created.
# Presumably restricts GPU-enabled libraries to the device with index 1 — confirm
# that this matches the hardware of the machine the notebook runs on.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score
from sklearn.linear_model import LogisticRegression

# Name of the results sub-directory from which the pickled prediction frames
# are read (interpolated into the paths of the loading cell).
ANALYSIS_POSTFIX = "mined_no_drift_2024-09-09"

In [201]:
# Directory holding the pickled prediction frames of the selected analysis run.
RESULTS_DIR = f"../ensemble_learning/reports/results/{ANALYSIS_POSTFIX}"


def load_results(file_name: str):
    """Unpickle and return one results file stored in ``RESULTS_DIR``.

    Parameters
    ----------
    file_name : str
        File name (including the ``.pickle`` extension) inside ``RESULTS_DIR``.

    Returns
    -------
    object
        Whatever object was pickled into the file.

    Raises
    ------
    FileNotFoundError
        If the file does not exist.
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load result files that were produced by this project's own pipeline.
    with open(f"{RESULTS_DIR}/{file_name}", "rb") as handle:
        return pickle.load(handle)


cv_predictions = load_results("cv_results.pickle")
test_predictions = load_results("test_results.pickle")
s2_predictions = load_results("s2_model_results.pickle")


In [77]:
# Optional preprocessing step, currently disabled: when uncommented it loads the
# tokenizer named below and replaces every `input_sequence` in the CV and test
# frames by its tokens joined with single spaces.
# model_name = "Salesforce/codet5-base-multi-sum"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# cv_predictions["input_sequence"] = cv_predictions["input_sequence"].apply(lambda x: " ".join(tokenizer.tokenize(x)))
# test_predictions["input_sequence"] = test_predictions["input_sequence"].apply(lambda x: " ".join(tokenizer.tokenize(x)))

In [202]:
def vectorize_input_df(df: pd.DataFrame,
                       vectorizer: TfidfVectorizer,
                       acc_rouge: float=0.25,
                       fit: bool=True) -> dict:
    """Build the feature matrix and binary target for one prediction frame.

    The `input_sequence` column is passed through `vectorizer`, and the
    `cluster` column is appended as one extra feature column. The target is
    1 where `rouge` is at least `acc_rouge` and 0 otherwise.

    Parameters
    ----------
    df : pd.DataFrame
        Frame providing the `input_sequence`, `cluster` and `rouge` columns.
    vectorizer : TfidfVectorizer
        Vectorizer applied to `input_sequence`; it is fitted in place when
        `fit` is true.
    acc_rouge : float
        Threshold on `rouge` used to derive the binary target.
    fit : bool
        If true, call `fit_transform`; otherwise only `transform`.

    Returns
    -------
    dict
        Keys "X" (stacked sparse feature matrix), "y" (integer target) and
        "vectorizer" (the vectorizer that was passed in).
    """
    texts = df.loc[:, "input_sequence"]

    # Pick the vectorizer method once instead of branching around the call.
    encode = vectorizer.fit_transform if fit else vectorizer.transform
    text_features = encode(texts)

    features = hstack([text_features, df[["cluster"]]])
    print(features.shape)

    target = (df.loc[:, "rouge"] >= acc_rouge).astype(int)

    return {"X": features, "y": target, "vectorizer": vectorizer}

def train_classifier(X_train: pd.DataFrame,
                     y_train:pd.DataFrame) -> CatBoostClassifier:
    """Fit a new CatBoostClassifier with 1000 iterations and return it.

    Parameters
    ----------
    X_train
        Training features, forwarded unchanged to `CatBoostClassifier.fit`.
    y_train
        Training labels, forwarded unchanged to `CatBoostClassifier.fit`.

    Returns
    -------
    CatBoostClassifier
        The fitted model.
    """
    model = CatBoostClassifier(iterations=1000)
    model.fit(X=X_train, y=y_train)
    return model

def classifier_inference(classifier: CatBoostClassifier,
                         X: pd.DataFrame,
                         vectorizer: TfidfVectorizer):
    """Return class probabilities of `classifier` for the rows of `X`.

    Parameters
    ----------
    classifier : CatBoostClassifier
        Fitted binary classifier.
    X : pd.DataFrame
        Raw prediction frame; it is vectorised with `vectorize_input_df`
        (without re-fitting), so it must provide the columns that function
        reads.
    vectorizer : TfidfVectorizer
        Already fitted vectorizer matching the one used at training time.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_rows, 2): column 0 is the probability of class 0,
        column 1 the probability of class 1.
    """
    features = vectorize_input_df(df=X, vectorizer=vectorizer, fit=False)["X"]

    # The previous implementation called `predict`, which yields hard class
    # labels, and stacked `1 - label` next to it — producing a one-hot matrix
    # rather than probabilities. `predict_proba` returns the intended
    # two-column probability matrix with the same column layout.
    return classifier.predict_proba(features)

In [203]:
# Model-set identifiers iterated over by the training / evaluation cells;
# each value is compared against the `model_set` column of the prediction frames.
t_models = [
    0,
    1,
    2,
    5,
    10,
    'cluster_[0]',
    'cluster_[3]',
    'cluster_[0, 3]',
]

In [204]:
# Binary label column: 1 where the row's `rouge` score is at least 0.27, else 0.
cv_predictions["acc_rouge"] = cv_predictions["rouge"].ge(0.27).astype(int)

In [205]:
# Probability cut-off per model set (keys mirror the model-set identifiers).
# NOTE(review): presumably tuned in an earlier experiment — confirm the source
# of these values.
th = {
    0: 0.34,
    1: 0.48,
    2: 0.48,
    5: 0.49,
    10: 0.49,
    'cluster_[0]': 0.48,
    'cluster_[3]': 0.49,
    'cluster_[0, 3]': 0.43,
}

In [207]:
# Train a single classifier on all CV predictions and score it on the test
# predictions (ROC-AUC plus accuracy over a grid of probability cut-offs).
vectorizer = TfidfVectorizer(max_features=45000)
classifier = CatBoostClassifier(iterations=10000, early_stopping_rounds=5000)


res = vectorize_input_df(df=cv_predictions, vectorizer=vectorizer, fit=True, acc_rouge=0.27)
X, y, vectorizer = res["X"], res["y"], res["vectorizer"]

# Re-use the vectorizer fitted on the CV frame; do not re-fit on test data.
res = vectorize_input_df(df=test_predictions, vectorizer=vectorizer, fit=False, acc_rouge=0.27)
X_test, y_test, vectorizer = res["X"], res["y"], res["vectorizer"]

# NOTE(review): the test set is also passed as `eval_set`, so early stopping
# (and CatBoost's best-iteration selection) is driven by the very data the
# metrics below are computed on. The reported scores are therefore optimistic;
# a separate validation split should be used for `eval_set`.
classifier.fit(X, y, eval_set=(X_test, y_test))

preds = classifier.predict_proba(X_test)[:,1]
auc_roc = roc_auc_score(y_true=y_test, y_score=preds)

acc = {}

# A comprehension keeps the loop variable local. The previous `for th in ...`
# loop overwrote the global per-model-set threshold dict `th` with a float.
accuracies = [
    accuracy_score(y_true=y_test, y_pred=(preds >= threshold).astype(int))
    for threshold in [0.36, 0.38, 0.4, 0.42, 0.44, 0.46]
]

print({"auc_roc" : auc_roc, "accuracy" : accuracies})

(63000, 7186)
(20000, 7186)
Learning rate set to 0.032385
0:	learn: 0.6914755	test: 0.6915390	best: 0.6915390 (0)	total: 13.4ms	remaining: 2m 13s
1:	learn: 0.6899761	test: 0.6901409	best: 0.6901409 (1)	total: 24.5ms	remaining: 2m 2s
2:	learn: 0.6885748	test: 0.6887689	best: 0.6887689 (2)	total: 35.3ms	remaining: 1m 57s
3:	learn: 0.6872213	test: 0.6874706	best: 0.6874706 (3)	total: 46.3ms	remaining: 1m 55s
4:	learn: 0.6860600	test: 0.6863722	best: 0.6863722 (4)	total: 57.2ms	remaining: 1m 54s
5:	learn: 0.6849754	test: 0.6853444	best: 0.6853444 (5)	total: 67.5ms	remaining: 1m 52s
6:	learn: 0.6838622	test: 0.6842334	best: 0.6842334 (6)	total: 78.3ms	remaining: 1m 51s
7:	learn: 0.6828921	test: 0.6833027	best: 0.6833027 (7)	total: 90.1ms	remaining: 1m 52s
8:	learn: 0.6819449	test: 0.6824506	best: 0.6824506 (8)	total: 101ms	remaining: 1m 52s
9:	learn: 0.6810347	test: 0.6816330	best: 0.6816330 (9)	total: 112ms	remaining: 1m 51s
10:	learn: 0.6801805	test: 0.6808054	best: 0.6808054 (10)	total: 

In [183]:
# Per-fold evaluation: for every fold, train one classifier per model set on
# the remaining folds and score it on the held-out fold.

# False reproduces the previous behaviour of stopping after the first fold;
# set to True to evaluate every fold.
RUN_ALL_FOLDS = False

fold_results = []

for fold in list(cv_predictions.fold.unique()):

    train_fold_preds = cv_predictions.loc[cv_predictions.fold!=fold,:]
    test_fold_preds = cv_predictions.loc[cv_predictions.fold==fold,:]

    # model_set -> metrics of the classifier trained for that model set.
    # (The name `classifiers` is kept because later cells refer to it.)
    classifiers = {}

    for model_set in t_models:

        vectorizer = TfidfVectorizer(max_features=10000)
        res = vectorize_input_df(df=train_fold_preds[train_fold_preds.model_set==model_set], vectorizer=vectorizer, fit=True, acc_rouge=0.27)
        X, y, vectorizer = res["X"], res["y"], res["vectorizer"]

        classifier = train_classifier(X, y)

        # Transform the held-out fold with the vectorizer fitted on the training folds.
        res = vectorize_input_df(df=test_fold_preds[test_fold_preds.model_set==model_set], vectorizer=vectorizer, fit=False, acc_rouge=0.27)
        X_test, y_test, vectorizer = res["X"], res["y"], res["vectorizer"]

        preds = classifier.predict_proba(X_test)[:,1]
        auc_roc = roc_auc_score(y_true=y_test, y_score=preds)
        av_prec = average_precision_score(y_true=y_test, y_score=preds)

        classifiers[model_set] = {"auc_roc" : auc_roc, "av_prec": av_prec}

    # Store the metrics of ALL model sets for this fold. Previously only the
    # last fitted `classifier` object was appended, so the metrics were lost.
    fold_results.append(classifiers)

    if not RUN_ALL_FOLDS:
        break

(4666, 5396)
Learning rate set to 0.164276
0:	learn: 0.6031544	total: 9.16ms	remaining: 907ms
1:	learn: 0.5451914	total: 15.8ms	remaining: 774ms
2:	learn: 0.5067829	total: 22.5ms	remaining: 727ms
3:	learn: 0.4795235	total: 29.2ms	remaining: 700ms
4:	learn: 0.4613664	total: 35.6ms	remaining: 676ms
5:	learn: 0.4491116	total: 42.5ms	remaining: 665ms
6:	learn: 0.4409557	total: 48.9ms	remaining: 650ms
7:	learn: 0.4338194	total: 55.6ms	remaining: 639ms
8:	learn: 0.4294259	total: 62ms	remaining: 627ms
9:	learn: 0.4257444	total: 68.6ms	remaining: 618ms
10:	learn: 0.4229567	total: 75.1ms	remaining: 608ms
11:	learn: 0.4200324	total: 81.5ms	remaining: 598ms
12:	learn: 0.4182257	total: 88.2ms	remaining: 590ms
13:	learn: 0.4158736	total: 94.8ms	remaining: 583ms
14:	learn: 0.4142414	total: 101ms	remaining: 575ms
15:	learn: 0.4134170	total: 108ms	remaining: 566ms
16:	learn: 0.4124647	total: 114ms	remaining: 559ms
17:	learn: 0.4116615	total: 121ms	remaining: 551ms
18:	learn: 0.4105292	total: 128ms	rem

In [185]:
# Train one classifier on all CV predictions, then report the overall ROC-AUC
# and, for each model set, the accuracy on that model set's test rows over a
# grid of probability cut-offs.
vectorizer = TfidfVectorizer(max_features=45000)
res = vectorize_input_df(df=cv_predictions, vectorizer=vectorizer, fit=True, acc_rouge=0.27)
X, y, vectorizer = res["X"], res["y"], res["vectorizer"]


# The stale `classifiers[model_set] = classifier` line was removed: it relied on
# `classifiers` / `model_set` leaking from a previously executed cell.
classifier = train_classifier(X, y)

res = vectorize_input_df(df=test_predictions, vectorizer=vectorizer, fit=False, acc_rouge=0.27)
X_test, y_test, vectorizer = res["X"], res["y"], res["vectorizer"]

preds = classifier.predict_proba(X_test)[:,1]
auc_roc = roc_auc_score(y_true=y_test, y_score=preds)

thresholds = [0.36, 0.38, 0.4, 0.42, 0.44, 0.46]
# Positional array so it can be masked in step with `preds`
# (both follow the row order of `test_predictions`).
y_test_values = y_test.to_numpy()

acc = {}
for model_set in t_models:
    # Restrict to the rows of this model set. Previously the full test set was
    # scored on every iteration, so all entries of `acc` were identical.
    in_model_set = (test_predictions["model_set"] == model_set).to_numpy()
    acc[model_set] = [
        accuracy_score(y_true=y_test_values[in_model_set],
                       y_pred=(preds[in_model_set] >= threshold).astype(int))
        for threshold in thresholds
    ]

print({"auc_roc" : auc_roc, "accuracy" : acc})

(63000, 7186)
Learning rate set to 0.499175
0:	learn: 0.6738302	total: 13.3ms	remaining: 1.32s
1:	learn: 0.6673587	total: 25.2ms	remaining: 1.24s
2:	learn: 0.6651176	total: 36.1ms	remaining: 1.17s
3:	learn: 0.6627401	total: 46.6ms	remaining: 1.12s
4:	learn: 0.6612091	total: 59.2ms	remaining: 1.13s
5:	learn: 0.6600677	total: 69.8ms	remaining: 1.09s
6:	learn: 0.6587466	total: 80.6ms	remaining: 1.07s
7:	learn: 0.6573492	total: 90.7ms	remaining: 1.04s
8:	learn: 0.6564803	total: 101ms	remaining: 1.02s
9:	learn: 0.6553159	total: 112ms	remaining: 1.01s
10:	learn: 0.6543149	total: 124ms	remaining: 1s
11:	learn: 0.6532361	total: 135ms	remaining: 988ms
12:	learn: 0.6522681	total: 146ms	remaining: 974ms
13:	learn: 0.6515139	total: 156ms	remaining: 960ms
14:	learn: 0.6506891	total: 167ms	remaining: 945ms
15:	learn: 0.6499380	total: 177ms	remaining: 931ms
16:	learn: 0.6491651	total: 188ms	remaining: 916ms
17:	learn: 0.6482683	total: 198ms	remaining: 901ms
18:	learn: 0.6472156	total: 208ms	remaining

In [191]:
# Train and evaluate one classifier per model set: fit on that model set's CV
# predictions, score on that model set's test predictions.

# model_set -> {"auc_roc", "accuracy"}; name kept because the next cell displays it.
classifiers = {}
# model_set -> fitted classifier and its vectorizer. Previously the classifier
# was written into `classifiers` and then overwritten by the metrics dict.
fitted_models = {}

for model_set in t_models:
    print(model_set)
    vectorizer = TfidfVectorizer(max_features=15000)
    res = vectorize_input_df(df=cv_predictions[cv_predictions.model_set==model_set], vectorizer=vectorizer, fit=True, acc_rouge=0.27)
    X, y, vectorizer = res["X"], res["y"], res["vectorizer"]

    classifier = train_classifier(X, y)
    fitted_models[model_set] = {"classifier": classifier, "vectorizer": vectorizer}

    res = vectorize_input_df(df=test_predictions[test_predictions.model_set==model_set], vectorizer=vectorizer, fit=False, acc_rouge=0.27)
    X_test, y_test, vectorizer = res["X"], res["y"], res["vectorizer"]

    preds = classifier.predict_proba(X_test)[:,1]
    auc_roc = roc_auc_score(y_true=y_test, y_score=preds)

    # Accuracy over a fixed grid of cut-offs (the per-model-set thresholds in
    # `th` are not applied here). The comprehension keeps its loop variable
    # local, so the global `th` dict is no longer overwritten with a float.
    accuracies = [
        accuracy_score(y_true=y_test, y_pred=(preds >= threshold).astype(int))
        for threshold in [0.32, 0.34, 0.46, 0.5, 0.52, 0.56]
    ]

    classifiers[model_set] = {"auc_roc" : auc_roc, "accuracy" : accuracies}

0
(7000, 7186)
Learning rate set to 0.023648
0:	learn: 0.6794958	total: 10.9ms	remaining: 10.9s
1:	learn: 0.6665586	total: 18.8ms	remaining: 9.36s
2:	learn: 0.6540889	total: 26.4ms	remaining: 8.79s
3:	learn: 0.6423846	total: 33.9ms	remaining: 8.45s
4:	learn: 0.6313938	total: 42ms	remaining: 8.36s
5:	learn: 0.6210195	total: 50.1ms	remaining: 8.29s
6:	learn: 0.6108998	total: 58ms	remaining: 8.23s
7:	learn: 0.6015428	total: 65.7ms	remaining: 8.14s
8:	learn: 0.5927182	total: 73.1ms	remaining: 8.05s
9:	learn: 0.5843756	total: 80.9ms	remaining: 8.01s
10:	learn: 0.5764494	total: 88.7ms	remaining: 7.98s
11:	learn: 0.5689393	total: 96.3ms	remaining: 7.93s
12:	learn: 0.5617883	total: 104ms	remaining: 7.9s
13:	learn: 0.5549077	total: 112ms	remaining: 7.88s
14:	learn: 0.5483397	total: 120ms	remaining: 7.86s
15:	learn: 0.5420814	total: 128ms	remaining: 7.86s
16:	learn: 0.5359814	total: 135ms	remaining: 7.82s
17:	learn: 0.5302309	total: 143ms	remaining: 7.8s
18:	learn: 0.5250557	total: 151ms	remaini

In [192]:
# Display the per-model-set metrics (ROC-AUC and accuracy list) held in `classifiers`.
classifiers

{0: {'auc_roc': 0.6635358146067416,
  'accuracy': [0.8452, 0.85, 0.8548, 0.8552, 0.856, 0.8576]},
 1: {'auc_roc': 0.6198661224466253,
  'accuracy': [0.6352, 0.6516, 0.674, 0.6752, 0.6756, 0.6748]},
 2: {'auc_roc': 0.6221824459029612,
  'accuracy': [0.6176, 0.6344, 0.6596, 0.6528, 0.6536, 0.6552]},
 5: {'auc_roc': 0.6058339348099929,
  'accuracy': [0.5756, 0.6092, 0.6488, 0.6444, 0.644, 0.6428]},
 10: {'auc_roc': 0.6093397593376175,
  'accuracy': [0.5068, 0.5912, 0.6348, 0.636, 0.6384, 0.6356]},
 'cluster_[0]': {'auc_roc': 0.6128080268474763,
  'accuracy': [0.5636, 0.5944, 0.6376, 0.6344, 0.6348, 0.6368]},
 'cluster_[3]': {'auc_roc': 0.6379537355855343,
  'accuracy': [0.74, 0.7532, 0.7716, 0.772, 0.7732, 0.7716]},
 'cluster_[0, 3]': {'auc_roc': 0.6248314950516345,
  'accuracy': [0.6108, 0.638, 0.7028, 0.7052, 0.7052, 0.7036]}}