In [1]:
from tqdm import tqdm_notebook as tqdm

from models import TADW, TriDnr
from text_transformers import SBert, LDA, W2V, Sent2Vec, Doc2Vec 
from datasets import Cora, CiteseerM10

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from collections import defaultdict

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
[nltk_data] Downloading package wordnet to /Users/mikhail-
[nltk_data]     makarov/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mikhail-makarov/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /Users/mikhail-
[nltk_data]     makarov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
candidates = [
    (TriDnr, None, 'TriDnr'),
#     (TADW, SBert, 'TADW + SBert'),
#     (TADW, LDA, 'TADW + LDA'),
#     (TADW, W2V, 'TADW + W2V'),
#     (TADW, Sent2Vec, 'TADW + Sent2Vec'),
#     (TADW, Doc2Vec, 'TADW + Doc2Vec'),
#     (TADW, CountVectorizer, 'TADW + BOW'),
#    (TADW, TfidfVectorizer, 'TADW + TFIDF')
]

In [3]:
ds = CiteseerM10()

In [4]:
d = 160
seeds = [1] # [1, 10, 100]

res = defaultdict(list)
for constr, transf, name in tqdm(candidates, desc='candidates'):
    if transf is not None:
        transformer = transf()
        ds.transform_features(transformer)
        
    data = ds.get_data()
    
    if name != 'TriDnr':
        model = constr(data['graph'], data['features'], dim=d)
        model.learn_embeddings()
    
    for seed in tqdm(seeds, desc='seeds'):
        train_indx, test_indx =  train_test_split(ds.ids, stratify=ds.labels, test_size=0.5, random_state=seed)
        
        if  name == 'TriDnr':
            labels = []
            for i, label in enumerate(data['labels']):
                if i in train_indx:
                    labels.append(label)
                else:
                    labels.append(-1)
                    
            model = constr(data['graph'], data['features'], labels, dim=d)
            model.learn_embeddings()
            
        y = data['labels'].reshape(-1, 1)
        ids = data['ids'].reshape(-1, 1)
        dev_df = pd.DataFrame(np.hstack((ids, model.embeddings, y)),columns=['index']+[f'{i}' for i in range(d)]+['label'])
        dev_df = dev_df.set_index('index')

        train_X, train_y = dev_df.iloc[train_indx,:-1].values, dev_df.iloc[train_indx,-1].values 
        test_X, test_y = dev_df.iloc[test_indx,:-1].values, dev_df.iloc[test_indx,-1].values 

        clf = OneVsRestClassifier(GradientBoostingClassifier())
        clf.fit(train_X, train_y)
        pred_y = clf.predict(test_X)
        f1 = f1_score(test_y, pred_y, average='micro')
        
        print(name, f1)

        res[name].append(f1)

HBox(children=(IntProgress(value=0, description='candidates', max=1, style=ProgressStyle(description_width='in…

HBox(children=(IntProgress(value=0, description='seeds', max=1, style=ProgressStyle(description_width='initial…

IndexError: positional indexers are out-of-bounds

In [6]:
res

defaultdict(list,
            {'TADW + SBert': [0.8183161004431314],
             'TADW + LDA': [0.808714918759232],
             'TADW + W2V': [0.8028064992614475],
             'TADW + Sent2Vec': [0.810192023633678],
             'TADW + Doc2Vec': [0.8020679468242247],
             'TADW + BOW': [0.8360413589364845],
             'TADW + TFIDF': [0.829394387001477]})

In [7]:
model = TADW(data['graph'], data['features'], dim=160, lamb=0.2)
model.learn_embeddings()

In [8]:
dev_df = pd.DataFrame(np.hstack((data['ids'].reshape(-1, 1), model.embeddings, data['labels'].reshape(-1, 1))),
             columns=['index'] + [f'x_{i}' for i in range(160)] + ['label'])
dev_df['index'] = dev_df['index'].astype(int)
dev_df['label'] = dev_df['label'].astype(int)
dev_df = dev_df.set_index('index')
dev_df.head()

Unnamed: 0_level_0,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,...,x_151,x_152,x_153,x_154,x_155,x_156,x_157,x_158,x_159,label
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.010287,0.09552,0.131457,0.006771,-0.022687,-0.092674,0.033099,-0.023358,0.060377,0.045605,...,-0.067376,0.068518,-0.145466,0.120817,-0.062843,-0.136969,0.209437,0.024365,0.110691,1
1,0.155282,0.038824,-0.136268,0.01588,-0.083128,0.140333,0.055508,0.034682,-0.053863,0.091431,...,0.144738,-0.066099,-0.242651,-0.042949,0.055133,-0.105944,-0.144466,0.105623,-0.127992,2
2,0.22932,0.045439,0.04378,0.065687,-0.192891,0.036342,-0.081255,-0.018805,0.000654,-0.107198,...,0.084271,0.089491,0.102177,0.084382,0.017083,0.033472,0.154587,-0.195677,0.026129,1
3,0.079565,0.114998,0.023563,0.087321,-0.053356,-0.083922,0.22572,-0.097864,0.028489,-0.100142,...,0.048706,-0.032517,-0.218631,-0.05193,-0.129329,-0.039216,0.029626,0.007424,-0.083579,1
4,-0.103364,0.139213,0.024008,-0.082825,0.054482,-0.186933,0.122936,0.118399,0.181015,-0.010883,...,0.033675,0.055739,0.056271,0.15196,-0.017802,-0.031828,-0.057853,-0.082694,0.056314,1


In [9]:
train_indx, test_indx =  train_test_split(dev_df.index, stratify=dev_df['label'], test_size=0.5)

In [10]:
train_X, train_y = dev_df.iloc[train_indx,:-1].values, dev_df.iloc[train_indx,-1].values 
test_X, test_y = dev_df.iloc[test_indx,:-1].values, dev_df.iloc[test_indx,-1].values 

In [11]:
clf = OneVsRestClassifier(LinearSVC(C=15, max_iter=10000))
# clf = OneVsRestClassifier(GradientBoostingClassifier())

clf.fit(train_X, train_y)

OneVsRestClassifier(estimator=LinearSVC(C=15, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=10000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=1)

In [12]:
pred_y = clf.predict(test_X)

In [13]:
accuracy_score(test_y, pred_y)

0.8301329394387001

In [14]:
f1_score(test_y, pred_y, average='micro')

0.8301329394387

In [15]:
test_y

array([2, 1, 0, ..., 6, 3, 0])

In [16]:
pred_y

array([2, 6, 4, ..., 2, 3, 0])

In [17]:
np.unique(pred_y, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6]), array([148, 207, 397, 201, 126,  99, 176]))

In [18]:
np.unique(test_y, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6]), array([149, 209, 409, 213, 108,  90, 176]))