In [1]:
from tqdm import tqdm_notebook as tqdm

from models import TADW
from datasets import Cora

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from collections import defaultdict

In [2]:
# Benchmark grid. Each entry is a triple:
#   (embedding model class, text vectorizer class, label used as the key in the results dict).
candidates = list(zip(
    (TADW, TADW),
    (CountVectorizer, TfidfVectorizer),
    ('TADW + BOW', 'TADW + TFIDF'),
))

In [3]:
# Instantiate the project-local Cora dataset wrapper. Later cells read `ds.ids` and
# `ds.labels`, call `ds.transform_features(...)`, and pull a dict with the keys
# 'graph', 'features', 'labels' and 'ids' out of `ds.get_data()`.
ds = Cora()

In [4]:
# Benchmark every (model, vectorizer) candidate on several stratified 50/50
# train/test splits and record the micro-F1 of a downstream classifier that is
# trained on the learned node embeddings.
d = 160               # embedding dimensionality requested from the model
seeds = [1, 10, 100]  # one stratified split (and one classifier seed) per entry

res = defaultdict(list)  # candidate name -> list of micro-F1 scores, one per seed
for seed in tqdm(seeds, desc='seeds'):
    # The split is made over node ids, so the resulting arrays hold ids, not row positions.
    train_indx, test_indx = train_test_split(
        ds.ids, stratify=ds.labels, test_size=0.5, random_state=seed
    )
    for constr, transf, name in tqdm(candidates, desc='candidates'):
        transformer = transf()
        ds.transform_features(transformer)
        data = ds.get_data()

        model = constr(data['graph'], data['features'], dim=d)
        model.learn_embeddings()

        # Build the frame with the node ids as its index and select rows by label
        # (`.loc`). The previous positional `.iloc` lookup was only correct when the
        # ids happened to be 0..n-1 in row order.
        # Assumes data['ids'] holds the same id values as ds.ids -- TODO confirm.
        ids = np.asarray(data['ids']).ravel()
        y = np.asarray(data['labels']).ravel()
        embeddings = np.asarray(model.embeddings)
        dev_df = pd.DataFrame(
            embeddings,
            index=pd.Index(ids, name='index'),
            # Column count follows the actual embedding width rather than repeating `d`.
            columns=[str(i) for i in range(embeddings.shape[1])],
        )
        # Labels are attached as their own column so they keep their dtype instead
        # of being cast to float by an hstack with the embeddings.
        dev_df['label'] = y

        train_df, test_df = dev_df.loc[train_indx], dev_df.loc[test_indx]
        train_X, train_y = train_df.drop(columns='label').values, train_df['label'].values
        test_X, test_y = test_df.drop(columns='label').values, test_df['label'].values

        # Seed the classifier as well, so a re-run with the same seed repeats the score.
        clf = OneVsRestClassifier(GradientBoostingClassifier(random_state=seed))
        clf.fit(train_X, train_y)
        pred_y = clf.predict(test_X)
        f1 = f1_score(test_y, pred_y, average='micro')

        res[name].append(f1)

HBox(children=(IntProgress(value=0, description='seeds', max=3, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='candidates', max=2, style=ProgressStyle(description_width='in…

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


HBox(children=(IntProgress(value=0, description='candidates', max=2, style=ProgressStyle(description_width='in…

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


HBox(children=(IntProgress(value=0, description='candidates', max=2, style=ProgressStyle(description_width='in…

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):





In [5]:
# Show the collected micro-F1 scores: one list per candidate name, one entry per seed
# in the order the seeds were iterated.
res

defaultdict(list,
            {'TADW + BOW': [0.8419497784342688,
              0.8478581979320532,
              0.8330871491875923],
             'TADW + TFIDF': [0.8323485967503693,
              0.8382570162481536,
              0.8478581979320532]})

In [6]:
# Train one standalone TADW model (dim=160, lamb=0.2) for closer inspection below.
# NOTE(review): `data` is not defined in this cell -- it is whatever the final
# iteration of the benchmark loop left behind (i.e. features produced by the last
# candidate's vectorizer). This is hidden state; consider rebuilding `data` explicitly.
# NOTE(review): `lamb` is presumably a regularisation weight -- confirm in models.TADW.
model = TADW(data['graph'], data['features'], dim=160, lamb=0.2)
model.learn_embeddings()

Iteration  0
Iteration  1
Iteration  2
Iteration  3
Iteration  4
Iteration  5
Iteration  6
Iteration  7
Iteration  8
Iteration  9
Iteration  10
Iteration  11
Iteration  12
Iteration  13
Iteration  14
Iteration  15
Iteration  16
Iteration  17
Iteration  18
Iteration  19


In [7]:
# Assemble the learned embeddings into a frame indexed by node id, with the class
# label as the last column.
#
# The frame is built directly from its parts: ids and labels are no longer stacked
# into one float matrix and cast back to int afterwards, and the number of feature
# columns follows the embedding width instead of a hard-coded 160.
node_ids = np.asarray(data['ids']).ravel().astype(int)
node_labels = np.asarray(data['labels']).ravel().astype(int)
embedding_matrix = np.asarray(model.embeddings)

dev_df = pd.DataFrame(
    embedding_matrix,
    index=pd.Index(node_ids, name='index'),
    columns=[f'x_{i}' for i in range(embedding_matrix.shape[1])],
)
dev_df['label'] = node_labels  # kept last: later cells treat the final column as the target
dev_df.head()

Unnamed: 0_level_0,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,...,x_151,x_152,x_153,x_154,x_155,x_156,x_157,x_158,x_159,label
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.060713,0.151958,-0.041733,0.017015,0.038782,0.03538,0.201292,-0.057736,0.08912,0.024313,...,0.054816,0.082247,0.013817,0.304065,-0.114688,0.047508,0.014961,-0.000592,0.040578,1
1,0.287918,0.213673,0.134863,0.08904,-0.238716,0.0646,-0.067218,0.041489,0.208987,-0.016592,...,0.121426,0.125612,-0.076305,0.015349,0.131498,-0.182803,0.185406,-0.103876,0.053189,2
2,0.099785,0.138385,-0.013874,-0.115157,-0.119726,-0.005885,0.078712,0.087181,0.042229,-0.003713,...,0.048688,0.089827,-0.026748,-0.052219,-0.066039,-0.043265,-0.012229,0.03374,0.04667,1
3,0.040776,0.012628,0.157134,0.218746,0.0063,-0.025235,-0.042563,0.037129,0.083585,-0.036305,...,-0.087176,0.061044,0.007459,-0.030205,-0.135795,0.253941,-0.103839,0.116213,-0.050361,1
4,0.209236,0.139003,0.06477,0.067832,-0.025583,-0.040485,0.151339,-0.086564,0.115224,0.038109,...,0.102454,-0.268836,0.192754,0.145282,-0.047852,0.165609,0.063352,-0.102488,-0.234898,1


In [8]:
# Stratified 50/50 split over the frame's index labels (node ids).
# random_state is fixed so that Restart & Run All reproduces the same split and,
# therefore, the same scores in the cells below.
train_indx, test_indx = train_test_split(
    dev_df.index, stratify=dev_df['label'], test_size=0.5, random_state=1
)

In [9]:
# `train_indx` / `test_indx` hold index *labels* taken from `dev_df.index`, so rows
# must be selected with label-based `.loc`. Positional `.iloc` only gave the same
# rows while the index happened to be 0..n-1 in order.
train_df, test_df = dev_df.loc[train_indx], dev_df.loc[test_indx]

# Features are every column except the target, which is picked by name.
train_X, train_y = train_df.drop(columns='label').values, train_df['label'].values
test_X, test_y = test_df.drop(columns='label').values, test_df['label'].values

In [10]:
# Downstream classifier: a one-vs-rest linear SVM trained on the embedding features.
# (GradientBoostingClassifier can be swapped in as the base estimator instead.)
base_estimator = LinearSVC(C=15, max_iter=10000)
clf = OneVsRestClassifier(base_estimator)

# Left as the last expression so the fitted estimator's repr is shown as the cell output.
clf.fit(train_X, train_y)

OneVsRestClassifier(estimator=LinearSVC(C=15, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=10000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=1)

In [11]:
# Predict a class label for every row of the held-out test features.
pred_y = clf.predict(test_X)

In [18]:
# Fraction of test rows whose predicted label matches the true label.
# NOTE(review): this cell's execution count (18) is out of sequence with its
# neighbours -- re-run the notebook top to bottom before sharing.
accuracy_score(test_y, pred_y)

0.8382570162481536

In [13]:
# Micro-averaged F1. For single-label multiclass predictions micro-F1 is equal to
# plain accuracy, so this value is expected to match accuracy_score on the same arrays.
f1_score(test_y, pred_y, average='micro')

0.8382570162481536

In [14]:
# Eyeball the true labels of the test split (NumPy abbreviates long arrays).
test_y

array([3, 2, 4, ..., 2, 5, 6])

In [15]:
# Eyeball the predicted labels for the same test rows, for comparison with test_y.
pred_y

array([3, 2, 1, ..., 2, 5, 6])

In [16]:
# Distribution of predicted classes: (sorted unique labels, number of predictions per label).
np.unique(pred_y, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6]), array([141, 210, 420, 223,  91,  95, 174]))

In [17]:
# Distribution of true classes in the test split, to compare against the predicted counts.
np.unique(test_y, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6]), array([149, 209, 409, 213, 109,  90, 175]))