In [None]:
import sys
sys.path.append('../../')
import os
import itertools
import pickle
import networkx as nx
import math
from typing import List


import numpy as np
import dgl
import torch
from typing import Tuple
import torch.nn.functional as torch_f
import tqdm
import torch.nn as nn
import dgl.function as fn

from src.utils import fit_lr_classifier, infer_lr_classifier, calculate_score, calculate_score_raw
from src.utils import generate_submission
from src.utils import dump_features
from src.models import GraphSAGEBundled

# Seed the torch and NumPy global RNGs so repeated runs draw the same random numbers.
torch.random.manual_seed(0)
np.random.seed(0)

# Prefer GPU index 2, but clamp the index to the highest CUDA device that actually
# exists so the notebook still gets a usable device on hosts with fewer than three GPUs.
# Falls back to the CPU when CUDA is not available at all.
if torch.cuda.is_available():
    device = torch.device(f'cuda:{min(2, torch.cuda.device_count() - 1)}')
else:
    device = torch.device('cpu')


In [None]:
class Voter:
    """Soft-voting ensemble over named estimators, each fed its own input.

    Estimators are registered as ``(name, estimator)`` pairs. ``fit``,
    ``predict_proba`` and ``decide`` all take a dict that maps those names to the
    input for the matching estimator.
    """

    def __init__(self, estimators: list, weight=None):
        """Register the estimators and normalise the optional weights.

        Args:
            estimators: list of ``(name, estimator)`` pairs.
            weight: optional sequence of per-estimator weights, given in the same
                order as ``estimators``. It is rescaled to sum to 1. When ``None``
                every estimator contributes ``1 / len(estimators)``.
        """
        self.estimators_lookup = {name: idx for idx, (name, _) in enumerate(estimators)}
        self.estimators = [item for name, item in estimators]
        if weight is not None:
            # Accept plain lists/tuples as well as arrays.
            weight = np.asarray(weight, dtype=float)
            self.weight = weight / weight.sum()
        else:
            self.weight = None

    def fit(self, x, y, *args, **kwargs):
        """Fit every estimator named in ``x`` on its own data against the shared ``y``.

        Args:
            x: dict mapping estimator name -> training input for that estimator.
            y: targets passed unchanged to each estimator's ``fit``.
            *args, **kwargs: forwarded to each estimator's ``fit``.

        Raises:
            KeyError: if ``x`` contains a name that was not registered.
        """
        # Iterate over (name, data) pairs; iterating over keys alone cannot be unpacked.
        for name, data in x.items():
            self.estimators[self.estimators_lookup[name]].fit(data, y, *args, **kwargs)
            print(f"fitting: {name}")

    def predict_proba(self, x):
        """Return the weighted sum of the estimators' ``predict_proba`` outputs.

        Args:
            x: dict mapping estimator name -> input for that estimator.

        Returns:
            The accumulated weighted prediction (``0`` if ``x`` is empty).

        Raises:
            KeyError: if ``x`` contains a name that was not registered.
        """
        summary = 0
        for name, data in x.items():
            idx = self.estimators_lookup[name]
            partial_result = self.estimators[idx].predict_proba(data)
            if self.weight is not None:
                # Index the weight by the estimator's registered position, not by the
                # iteration order of ``x``, so key order in ``x`` cannot mis-assign weights.
                summary += partial_result * self.weight[idx]
            else:
                summary += 1 / len(self.estimators) * partial_result

        return summary

    def decide(self, x, thresh=0.5):
        """Threshold column 1 of the soft-voted prediction.

        Args:
            x: dict mapping estimator name -> input for that estimator.
            thresh: cut-off applied to column 1 of ``predict_proba(x)``.

        Returns:
            Boolean array, ``True`` where the column-1 value is strictly above ``thresh``.
        """
        return self.predict_proba(x)[:, 1] > thresh

In [None]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards.

    NOTE(review): pickle can execute arbitrary code while loading; only use this on
    files produced by this project.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Pre-computed feature bundles, one pickle per feature family.
features = {
    'author_graph_lr': _load_pickle('./features/author_graph_lr_features.pkl'),
    'graphsage_author': _load_pickle('./features/graphsage_author_features.pkl'),
    'graphsage_essay': _load_pickle('./features/graphsage_essay_features.pkl'),
    'baseline_enhanced': _load_pickle('./features/baseline-enhanced.pkl'),
}
# Label/id container; later cells read its 'train_y' and 'dev_y' entries.
uv_list = _load_pickle('./uv_list.pkl')

In [None]:
_SPLITS = ('X_train', 'X_dev', 'X_test')

# Feature set 0: the 'baseline_enhanced' bundle.
X_train_0, X_dev_0, X_test_0 = (features['baseline_enhanced'][split] for split in _SPLITS)
X_whole_0 = np.concatenate([X_train_0, X_dev_0], axis=0)  # train rows followed by dev rows

# Feature set 1: the 'graphsage_essay' bundle.
# Disabled alternative kept from the original cell: column-wise concatenation of the
# 'graphsage_author' and 'graphsage_essay' bundles.
# X_train_1 = np.concatenate([features['graphsage_author']['X_train'], features['graphsage_essay']['X_train']], axis=1)
# X_dev_1 = np.concatenate([features['graphsage_author']['X_dev'], features['graphsage_essay']['X_dev']], axis=1)
# X_test_1 =  np.concatenate([features['graphsage_author']['X_test'], features['graphsage_essay']['X_test']], axis=1)
X_train_1, X_dev_1, X_test_1 = (features['graphsage_essay'][split] for split in _SPLITS)
X_whole_1 = np.concatenate([X_train_1, X_dev_1], axis=0)

# Feature set 2: the 'author_graph_lr' bundle.
X_train_2, X_dev_2, X_test_2 = (features['author_graph_lr'][split] for split in _SPLITS)
X_whole_2 = np.concatenate([X_train_2, X_dev_2], axis=0)

# Targets, with the same train-then-dev stacking as the X_whole_* matrices.
Y_train, Y_dev = uv_list['train_y'], uv_list['dev_y']
Y_whole = np.concatenate([Y_train, Y_dev], axis=0)


In [None]:
# Fit one classifier per feature set on the training split; the dev split is passed
# as the third and fourth arguments of fit_lr_classifier.
clf0 = fit_lr_classifier(
    X_train_0, Y_train, X_dev_0, Y_dev,
    solver='lbfgs', tol=1e-5, max_iter=1000, n_jobs=12, verbose=1,
)  # original note: weight = 0.9

# Feature sets 1 and 2 share the same settings (original note on clf1: weight = 0.9).
clf1, clf2 = (
    fit_lr_classifier(x_train, Y_train, x_dev, Y_dev, max_iter=400, n_jobs=12)
    for x_train, x_dev in ((X_train_1, X_dev_1), (X_train_2, X_dev_2))
)


In [None]:
# Refit the three classifiers on train+dev (X_whole_* / Y_whole). This rebinds
# clf0, clf1 and clf2, replacing any classifiers of the same name fitted earlier.
# NOTE(review): the dev rows are contained in X_whole_*, so if fit_lr_classifier scores
# on the dev arguments, that score is not a held-out estimate — confirm.
clf0 = fit_lr_classifier(
    X_whole_0, Y_whole, X_dev_0, Y_dev,
    solver='lbfgs', tol=1e-5, max_iter=1000, n_jobs=8, verbose=1,
)  # original note: weight = 0.9

# Feature sets 1 and 2 share the same settings (original note on clf1: weight = 0.9).
clf1, clf2 = (
    fit_lr_classifier(x_whole, Y_whole, x_dev, Y_dev, tol=1e-5, max_iter=600, verbose=1)
    for x_whole, x_dev in ((X_whole_1, X_dev_1), (X_whole_2, X_dev_2))
)


In [None]:
# Combine the three classifiers into a weighted soft-voting ensemble.
voter = Voter(
    estimators=[('lr0', clf0), ('lr1', clf1), ('lr2', clf2)],
    weight=np.array([100, 87, 88]),
)

# Each estimator name maps to the matrix built from its own feature set.
dev_inputs = {
    'lr0': X_dev_0,
    'lr1': X_dev_1,
    'lr2': X_dev_2,
}
calculate_score(voter, dev_inputs, Y_dev)

test_inputs = {
    'lr0': X_test_0,
    'lr1': X_test_1,
    'lr2': X_test_2,
}
scores = infer_lr_classifier(voter, test_inputs)
generate_submission('./outputs', scores, "soft_voting")

In [None]:
# Score the ensemble's thresholded (boolean) dev predictions.
dev_inputs = {
    'lr0': X_dev_0,
    'lr1': X_dev_1,
    'lr2': X_dev_2,
}
pred = voter.decide(dev_inputs)
calculate_score_raw(pred, Y_dev)

In [None]:
# Reload the label/id container; the context manager closes the file handle.
# NOTE(review): pickle can execute arbitrary code on load — only use trusted files.
with open('./uv_list.pkl', 'rb') as uv_file:
    uv_list = pickle.load(uv_file)

In [None]:
# Consistency check: element-wise difference between the two stored arrays, summed.
# NOTE(review): a sum of 0 does not prove equality if differences can cancel out, and
# the key 'train_' looks like it may be a truncated 'train_u' — confirm.
train_u_delta = uv_list['train_'] - features['baseline_enhanced']['train_u']
sum(train_u_delta)

In [None]:
# Feature set 3: feature sets 0 and 2 joined column-wise, for every split.
X_train_3, X_dev_3, X_test_3, X_whole_3 = (
    np.concatenate(pair, axis=1)
    for pair in (
        [X_train_0, X_train_2],
        [X_dev_0, X_dev_2],
        [X_test_0, X_test_2],
        [X_whole_0, X_whole_2],
    )
)

# Single classifier fitted on the joined train+dev matrix.
clf3 = fit_lr_classifier(
    X_whole_3, Y_whole, X_dev_3, Y_dev,
    solver='lbfgs', tol=1e-5, max_iter=1000, n_jobs=8, verbose=1,
)  # original note: weight = 0.9

In [None]:
# Run clf3 on the joined test matrix and hand the result to generate_submission
# with output directory './outputs' and the label "all_gather".
scores = infer_lr_classifier(clf3, X_test_3)
generate_submission('./outputs', scores, "all_gather")

In [3]:
# Reload the label/id container; the context manager closes the file handle.
# NOTE(review): pickle can execute arbitrary code on load — only use trusted files.
with open('./uv_list.pkl', 'rb') as uv_file:
    uv_list = pickle.load(uv_file)

In [9]:
# Consistency check: element-wise difference between the two stored arrays, summed.
# NOTE(review): a sum of 0 does not prove equality if differences can cancel out, and
# the key 'train_' looks like it may be a truncated 'train_u' — confirm.
train_u_delta = uv_list['train_'] - features['baseline_enhanced']['train_u']
sum(train_u_delta)

0

In [40]:
# Feature set 3: feature sets 0 and 2 joined column-wise, for every split.
X_train_3, X_dev_3, X_test_3, X_whole_3 = (
    np.concatenate(pair, axis=1)
    for pair in (
        [X_train_0, X_train_2],
        [X_dev_0, X_dev_2],
        [X_test_0, X_test_2],
        [X_whole_0, X_whole_2],
    )
)

# Single classifier fitted on the joined train+dev matrix.
# NOTE(review): the recorded output below reports the iteration limit being reached
# and suggests raising max_iter or scaling the data.
clf3 = fit_lr_classifier(
    X_whole_3, Y_whole, X_dev_3, Y_dev,
    solver='lbfgs', tol=1e-5, max_iter=1000, n_jobs=8, verbose=1,
)  # original note: weight = 0.9

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =           22     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.51377D+06    |proj g|=  4.39656D+08


 This problem is unconstrained.



At iterate   50    f=  8.46908D+05    |proj g|=  5.99474D+06

At iterate  100    f=  6.99621D+05    |proj g|=  2.00496D+06

At iterate  150    f=  6.78824D+05    |proj g|=  8.03980D+06

At iterate  200    f=  5.11588D+05    |proj g|=  1.68195D+05

At iterate  250    f=  5.07812D+05    |proj g|=  2.27037D+07

At iterate  300    f=  4.96959D+05    |proj g|=  6.22148D+06

At iterate  350    f=  3.90439D+05    |proj g|=  2.16010D+06

At iterate  400    f=  3.84732D+05    |proj g|=  4.56389D+06

At iterate  450    f=  3.73698D+05    |proj g|=  2.59223D+06

At iterate  500    f=  3.48914D+05    |proj g|=  1.15768D+06

At iterate  550    f=  3.48515D+05    |proj g|=  2.14345D+06

At iterate  600    f=  3.46434D+05    |proj g|=  6.39619D+06

At iterate  650    f=  3.15244D+05    |proj g|=  1.69585D+06

At iterate  700    f=  3.15006D+05    |proj g|=  6.81987D+04

At iterate  750    f=  3.14909D+05    |proj g|=  1.41830D+05

At iterate  800    f=  3.14865D+05    |proj g|=  4.05193D+05

At iter

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=12)]: Done   1 out of   1 | elapsed:  7.2min finished


Loss: 0.14133211323101885, Accuracy: 0.9428112880109528, F1-score: 0.9417685990101665


In [42]:
# Run clf3 on the joined test matrix and hand the result to generate_submission
# with output directory './outputs' and the label "all_gather".
scores = infer_lr_classifier(clf3, X_test_3)
generate_submission('./outputs', scores, "all_gather")

107000.0it [00:00, 1236593.04it/s]        
