In [1]:
import sys
sys.path.append('../../')
import os
import itertools
import pickle
import networkx as nx
import math
from typing import List


import numpy as np
import dgl
import torch
from typing import Tuple
import torch.nn.functional as torch_f
import tqdm
import torch.nn as nn
import dgl.function as fn

from src.utils import fit_lr_classifier, infer_lr_classifier, calculate_score, calculate_score_raw
from src.utils import generate_submission
from src.utils import dump_features
from src.models import GraphSAGEBundled

# Seed NumPy's and PyTorch's global RNGs so repeated runs draw the same numbers.
np.random.seed(0)
torch.manual_seed(0)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE(review): the GPU ordinal (2) is hard-coded — confirm the target machine
# exposes at least three CUDA devices.
device = torch.device('cuda:2' if torch.cuda.is_available() else 'cpu')


In [2]:
def _load_pickle(path):
    """Deserialize the pickle file at ``path`` and return the stored object.

    The file is opened inside a ``with`` block so the handle is closed as soon
    as loading finishes, instead of being left open until garbage collection.
    """
    # NOTE(review): unpickling can execute arbitrary code — only load files
    # produced by this project's own feature-dumping step.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Pre-computed feature sets, keyed by the name of the model that produced them.
features = {
    'author_graph_lr': _load_pickle('./features/author_graph_lr_features.pkl'),
    'graphsage_author': _load_pickle('./features/graphsage_author_features.pkl'),
    'graphsage_essay': _load_pickle('./features/graphsage_essay_features.pkl'),
    'baseline_enhanced': _load_pickle('./features/baseline-enhanced.pkl'),
}
# Later cells read the 'train_y' and 'dev_y' entries of this object as labels.
uv_list = _load_pickle('./uv_list.pkl')

In [3]:
# --- Feature set 1: GraphSAGE essay features -------------------------------
# Disabled alternative: concatenate the 'graphsage_author' and
# 'graphsage_essay' features along axis=1 for each of the train/dev/test splits.
essay_feats = features['graphsage_essay']
X_train_1 = essay_feats['X_train']
X_dev_1 = essay_feats['X_dev']
X_test_1 = essay_feats['X_test']
# Train and dev stacked along axis 0.
X_whole_1 = np.concatenate((X_train_1, X_dev_1), axis=0)

# --- Feature set 2: author-graph LR features -------------------------------
author_lr_feats = features['author_graph_lr']
X_train_2 = author_lr_feats['X_train']
X_dev_2 = author_lr_feats['X_dev']
X_test_2 = author_lr_feats['X_test']
X_whole_2 = np.concatenate((X_train_2, X_dev_2), axis=0)

# --- Labels ----------------------------------------------------------------
Y_train = uv_list['train_y']
Y_dev = uv_list['dev_y']
# Concatenated in the same train-then-dev order as the X_whole_* matrices.
Y_whole = np.concatenate((Y_train, Y_dev), axis=0)


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis  import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

In [7]:
# Registry mapping a short classifier tag to the scikit-learn estimator class.
__classifiers__ = {
    "lrc": LogisticRegression,
    "knc": KNeighborsClassifier,
    "rfc": RandomForestClassifier,
    "gbc": GradientBoostingClassifier,
    "adac": AdaBoostClassifier,
    "dtc": DecisionTreeClassifier,
    "gnbc": GaussianNB,
    "ldac": LinearDiscriminantAnalysis,
    "qdac": QuadraticDiscriminantAnalysis,
    "svc": SVC,
    "mnbc": MultinomialNB,
}
# Constructor keyword arguments for each tag above; an empty dict means the
# estimator is built with its library defaults.
__classifier_params__ = {
    "lrc": dict(max_iter=400),
    "knc": dict(n_jobs=8),
    "rfc": dict(n_estimators=200, max_depth=32, n_jobs=8),
    "gbc": dict(n_estimators=200),
    "adac": dict(),
    # A single decision tree is not an ensemble: DecisionTreeClassifier has no
    # `n_estimators` argument, so passing it raised TypeError on construction.
    "dtc": dict(),
    "gnbc": dict(),
    "ldac": dict(),
    "qdac": dict(),
    # probability=True is required for SVC to expose predict_proba.
    "svc": dict(kernel='rbf', probability=True),
    "mnbc": dict(alpha=0.01),
}

# Every registered classifier must have a parameter entry, and vice versa.
assert __classifiers__.keys() == __classifier_params__.keys(), \
    "classifier registry and parameter registry are out of sync"


In [9]:
# Fit each selected classifier on feature set 1 and score it on the dev split.
for name in ("rfc",):
    estimator_cls = __classifiers__[name]
    estimator_kwargs = __classifier_params__[name]

    clf = estimator_cls(**estimator_kwargs)
    clf.fit(X_train_1, Y_train)

    # Keep only column 1 of the probability matrix (presumably the positive
    # class — verify against clf.classes_).
    dev_proba = clf.predict_proba(X_dev_1)
    Y_pred = dev_proba[:, 1]

    # The return value is discarded here; any reporting happens inside the helper.
    calculate_score_raw(Y_pred, Y_dev)

KeyboardInterrupt: 