In [1]:
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
import pandas as pd

import re

## Features 

In [2]:
import re

indexes = []
features = []
with open('../homework2_exercise2/data/features', 'r') as f:
    for line in f.readlines():
        index, *feat = line.split(':')
        feat = ''.join(feat)
        feat = re.findall('"([^\"]*)"', feat)
        indexes.append(index)
    
        
        feat += [f.split('_')[0] for f in feat]
        feat = list(set(feat))
        
        features.append(feat)

In [3]:
one_hot = MultiLabelBinarizer()
mhe = one_hot.fit_transform(features)
df = pd.DataFrame(mhe)

In [4]:
df.rename(columns=lambda x: f'feat_{x}', inplace=True)
df = df.T.drop_duplicates().T
df = df.astype(bool)
df['theorem'] = indexes

In [5]:
del mhe
del features
del indexes

# Dataset

## Ones

In [6]:
indexes_train = []
premises = []

ones = []

with open('../homework2_exercise2/data/dependencies_train', 'r') as f:
    for line in f.readlines():
        index, *prem = line.split(':')
        prem = ''.join(prem).split()
        
        indexes_train.append(index)
        premises.append(prem)
        
        for p in prem:
            ones.append((index, p))

In [7]:
ones_df = pd.DataFrame(ones, columns=['theorem', 'premise']).drop_duplicates()

In [8]:
del ones
del premises
del indexes_train

## Zeros

In [9]:
chronology = pd.read_csv('../homework2_exercise2/data/chronology_backup', squeeze=True, names=['chronology'])
def get_available_theorems(name):
    return chronology.loc[:(chronology == name).argmax()-1]

In [10]:
used_premises = ones_df.groupby('theorem')['premise'].unique()

In [11]:
FALSE_EXAMPLES_RATIO = 2

dataset_zeros = []
for theorem, theorem_premises in ones_df.groupby('theorem')['premise']:
    available_theorems = get_available_theorems(theorem)
    zero_premises = np.setdiff1d(available_theorems, theorem_premises)
    
    zeros_chosen = zero_premises[np.random.randint(0, len(zero_premises), FALSE_EXAMPLES_RATIO * theorem_premises.shape[0])]
    dataset_zeros += list(zip(
        [theorem] * len(zeros_chosen),
        zeros_chosen
    ))

The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
  This is separate from the ipykernel package so we can avoid doing imports until


In [12]:
zeros_df = pd.DataFrame(dataset_zeros, columns=['theorem', 'premise'])

In [13]:
del dataset_zeros
del used_premises
del chronology

##  Merging

In [14]:
zeros_df['useful'] = 0
ones_df['useful'] = 1
df_train = pd.concat([zeros_df, ones_df])

In [15]:
del zeros_df
del ones_df

In [16]:
df_train = pd.merge(
    df_train, df,
    left_on=['theorem'],
    right_on=['theorem']
)
df_train = pd.merge(
    df_train, df,
    left_on=['premise'],
    right_on=['theorem'],
    suffixes=('_prem', '_thm')
)
df_train.head()

Unnamed: 0,theorem_prem,premise,useful,feat_0_prem,feat_1_prem,feat_2_prem,feat_3_prem,feat_4_prem,feat_6_prem,feat_7_prem,...,feat_10353_thm,feat_10355_thm,feat_10358_thm,feat_10361_thm,feat_10362_thm,feat_10365_thm,feat_10368_thm,feat_10373_thm,feat_10376_thm,theorem_thm
0,t100_tmap_1,rc5_ordinal1,0,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,rc5_ordinal1
1,t10_tops_2,rc5_ordinal1,0,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,rc5_ordinal1
2,t13_tex_2,rc5_ordinal1,0,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,rc5_ordinal1
3,t175_funct_2,rc5_ordinal1,0,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,rc5_ordinal1
4,t19_waybel_9,rc5_ordinal1,0,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,rc5_ordinal1


# Training 

In [17]:
import lightgbm as lgb

In [18]:
# train test split
from sklearn.model_selection import train_test_split
train_theorems, test_theorems = train_test_split(df_train['theorem_thm'].unique(), shuffle=True)

train = df_train['theorem_thm'].isin(train_theorems)
test = df_train['theorem_thm'].isin(test_theorems)

features = list(df_train.columns.drop(['theorem_prem', 'theorem_thm', 'premise', 'useful']))
y_name = 'useful'

In [20]:
train_data = lgb.Dataset(df_train[train][features], df_train[train][y_name],
                        categorical_feature=features,
                         free_raw_data=False
                        )
test_data = lgb.Dataset(df_train[test][features], df_train[test][y_name],
                        categorical_feature=features,
                        free_raw_data=False
                        )

In [22]:
parameters = {
    'application': 'binary',
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': 'true',
    'boosting': 'gbdt',
    'num_leaves': 31,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.05,
    'verbose': 1,
    'num_boost_round': 2000,
    'early_stopping_rounds': 100
}
model = lgb.train(
    parameters,
    train_data,
    valid_sets=test_data
)

[1]	valid_0's auc: 0.632776
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's auc: 0.668865
[3]	valid_0's auc: 0.674691
[4]	valid_0's auc: 0.696467
[5]	valid_0's auc: 0.710521
[6]	valid_0's auc: 0.721161
[7]	valid_0's auc: 0.717432
[8]	valid_0's auc: 0.724664
[9]	valid_0's auc: 0.721192
[10]	valid_0's auc: 0.716494
[11]	valid_0's auc: 0.719734
[12]	valid_0's auc: 0.720286
[13]	valid_0's auc: 0.731049
[14]	valid_0's auc: 0.725743
[15]	valid_0's auc: 0.730484
[16]	valid_0's auc: 0.732694
[17]	valid_0's auc: 0.736616
[18]	valid_0's auc: 0.736883
[19]	valid_0's auc: 0.741997
[20]	valid_0's auc: 0.741361
[21]	valid_0's auc: 0.741125
[22]	valid_0's auc: 0.743966
[23]	valid_0's auc: 0.747668
[24]	valid_0's auc: 0.749079
[25]	valid_0's auc: 0.749175
[26]	valid_0's auc: 0.751164
[27]	valid_0's auc: 0.751807
[28]	valid_0's auc: 0.752354
[29]	valid_0's auc: 0.751807
[30]	valid_0's auc: 0.751994
[31]	valid_0's auc: 0.755815
[32]	valid_0's auc: 0.758054
[33]	valid_0's auc

[277]	valid_0's auc: 0.835288
[278]	valid_0's auc: 0.835408
[279]	valid_0's auc: 0.835563
[280]	valid_0's auc: 0.835623
[281]	valid_0's auc: 0.835754
[282]	valid_0's auc: 0.836102
[283]	valid_0's auc: 0.836446
[284]	valid_0's auc: 0.83659
[285]	valid_0's auc: 0.836783
[286]	valid_0's auc: 0.83691
[287]	valid_0's auc: 0.836997
[288]	valid_0's auc: 0.837293
[289]	valid_0's auc: 0.837337
[290]	valid_0's auc: 0.837629
[291]	valid_0's auc: 0.837645
[292]	valid_0's auc: 0.837691
[293]	valid_0's auc: 0.837632
[294]	valid_0's auc: 0.837705
[295]	valid_0's auc: 0.837537
[296]	valid_0's auc: 0.837579
[297]	valid_0's auc: 0.837667
[298]	valid_0's auc: 0.837701
[299]	valid_0's auc: 0.837912
[300]	valid_0's auc: 0.838049
[301]	valid_0's auc: 0.838083
[302]	valid_0's auc: 0.838333
[303]	valid_0's auc: 0.838281
[304]	valid_0's auc: 0.8384
[305]	valid_0's auc: 0.838593
[306]	valid_0's auc: 0.838788
[307]	valid_0's auc: 0.838896
[308]	valid_0's auc: 0.83909
[309]	valid_0's auc: 0.839108
[310]	valid_0's

[556]	valid_0's auc: 0.853948
[557]	valid_0's auc: 0.854058
[558]	valid_0's auc: 0.854155
[559]	valid_0's auc: 0.854042
[560]	valid_0's auc: 0.854122
[561]	valid_0's auc: 0.854217
[562]	valid_0's auc: 0.85439
[563]	valid_0's auc: 0.8547
[564]	valid_0's auc: 0.854916
[565]	valid_0's auc: 0.855053
[566]	valid_0's auc: 0.855128
[567]	valid_0's auc: 0.855199
[568]	valid_0's auc: 0.855256
[569]	valid_0's auc: 0.855331
[570]	valid_0's auc: 0.855398
[571]	valid_0's auc: 0.855451
[572]	valid_0's auc: 0.85553
[573]	valid_0's auc: 0.855686
[574]	valid_0's auc: 0.855723
[575]	valid_0's auc: 0.855761
[576]	valid_0's auc: 0.855818
[577]	valid_0's auc: 0.855814
[578]	valid_0's auc: 0.855841
[579]	valid_0's auc: 0.855829
[580]	valid_0's auc: 0.855873
[581]	valid_0's auc: 0.855965
[582]	valid_0's auc: 0.855874
[583]	valid_0's auc: 0.855826
[584]	valid_0's auc: 0.85562
[585]	valid_0's auc: 0.855588
[586]	valid_0's auc: 0.85546
[587]	valid_0's auc: 0.855383
[588]	valid_0's auc: 0.855237
[589]	valid_0's 

[840]	valid_0's auc: 0.863284
[841]	valid_0's auc: 0.863297
[842]	valid_0's auc: 0.863466
[843]	valid_0's auc: 0.86348
[844]	valid_0's auc: 0.863489
[845]	valid_0's auc: 0.863465
[846]	valid_0's auc: 0.863554
[847]	valid_0's auc: 0.863527
[848]	valid_0's auc: 0.863521
[849]	valid_0's auc: 0.863476
[850]	valid_0's auc: 0.863543
[851]	valid_0's auc: 0.863595
[852]	valid_0's auc: 0.863628
[853]	valid_0's auc: 0.863645
[854]	valid_0's auc: 0.863701
[855]	valid_0's auc: 0.863763
[856]	valid_0's auc: 0.863726
[857]	valid_0's auc: 0.863708
[858]	valid_0's auc: 0.863773
[859]	valid_0's auc: 0.863797
[860]	valid_0's auc: 0.863857
[861]	valid_0's auc: 0.86395
[862]	valid_0's auc: 0.863995
[863]	valid_0's auc: 0.864016
[864]	valid_0's auc: 0.864
[865]	valid_0's auc: 0.864034
[866]	valid_0's auc: 0.864037
[867]	valid_0's auc: 0.864041
[868]	valid_0's auc: 0.863981
[869]	valid_0's auc: 0.863983
[870]	valid_0's auc: 0.864019
[871]	valid_0's auc: 0.864047
[872]	valid_0's auc: 0.864038
[873]	valid_0's

[1116]	valid_0's auc: 0.869252
[1117]	valid_0's auc: 0.869261
[1118]	valid_0's auc: 0.869289
[1119]	valid_0's auc: 0.869301
[1120]	valid_0's auc: 0.869314
[1121]	valid_0's auc: 0.869374
[1122]	valid_0's auc: 0.869387
[1123]	valid_0's auc: 0.869413
[1124]	valid_0's auc: 0.869387
[1125]	valid_0's auc: 0.869372
[1126]	valid_0's auc: 0.869362
[1127]	valid_0's auc: 0.869465
[1128]	valid_0's auc: 0.869495
[1129]	valid_0's auc: 0.86942
[1130]	valid_0's auc: 0.869423
[1131]	valid_0's auc: 0.869432
[1132]	valid_0's auc: 0.869467
[1133]	valid_0's auc: 0.869434
[1134]	valid_0's auc: 0.869454
[1135]	valid_0's auc: 0.869553
[1136]	valid_0's auc: 0.869584
[1137]	valid_0's auc: 0.869609
[1138]	valid_0's auc: 0.869684
[1139]	valid_0's auc: 0.869702
[1140]	valid_0's auc: 0.869756
[1141]	valid_0's auc: 0.86979
[1142]	valid_0's auc: 0.869864
[1143]	valid_0's auc: 0.869784
[1144]	valid_0's auc: 0.869826
[1145]	valid_0's auc: 0.869925
[1146]	valid_0's auc: 0.870004
[1147]	valid_0's auc: 0.870005
[1148]	val

In [22]:
parameters = {
    'application': 'binary',
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': 'true',
    'boosting': 'gbdt',
    'num_leaves': 31,
    'feature_fraction': 0.65,
    'bagging_fraction': 0.65,
    'bagging_freq': 20,
    'learning_rate': 0.05,
    ''
    'verbose': 1,
    'num_boost_round': 2000,
    'early_stopping_rounds': 100
}
model = lgb.train(
    parameters,
    train_data,
    valid_sets=test_data
)

[1]	valid_0's auc: 0.632776
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's auc: 0.668865
[3]	valid_0's auc: 0.674691
[4]	valid_0's auc: 0.696467
[5]	valid_0's auc: 0.710521
[6]	valid_0's auc: 0.721161
[7]	valid_0's auc: 0.717432
[8]	valid_0's auc: 0.724664
[9]	valid_0's auc: 0.721192
[10]	valid_0's auc: 0.716494
[11]	valid_0's auc: 0.719734
[12]	valid_0's auc: 0.720286
[13]	valid_0's auc: 0.731049
[14]	valid_0's auc: 0.725743
[15]	valid_0's auc: 0.730484
[16]	valid_0's auc: 0.732694
[17]	valid_0's auc: 0.736616
[18]	valid_0's auc: 0.736883
[19]	valid_0's auc: 0.741997
[20]	valid_0's auc: 0.741361
[21]	valid_0's auc: 0.741125
[22]	valid_0's auc: 0.743966
[23]	valid_0's auc: 0.747668
[24]	valid_0's auc: 0.749079
[25]	valid_0's auc: 0.749175
[26]	valid_0's auc: 0.751164
[27]	valid_0's auc: 0.751807
[28]	valid_0's auc: 0.752354
[29]	valid_0's auc: 0.751807
[30]	valid_0's auc: 0.751994
[31]	valid_0's auc: 0.755815
[32]	valid_0's auc: 0.758054
[33]	valid_0's auc

[277]	valid_0's auc: 0.835288
[278]	valid_0's auc: 0.835408
[279]	valid_0's auc: 0.835563
[280]	valid_0's auc: 0.835623
[281]	valid_0's auc: 0.835754
[282]	valid_0's auc: 0.836102
[283]	valid_0's auc: 0.836446
[284]	valid_0's auc: 0.83659
[285]	valid_0's auc: 0.836783
[286]	valid_0's auc: 0.83691
[287]	valid_0's auc: 0.836997
[288]	valid_0's auc: 0.837293
[289]	valid_0's auc: 0.837337
[290]	valid_0's auc: 0.837629
[291]	valid_0's auc: 0.837645
[292]	valid_0's auc: 0.837691
[293]	valid_0's auc: 0.837632
[294]	valid_0's auc: 0.837705
[295]	valid_0's auc: 0.837537
[296]	valid_0's auc: 0.837579
[297]	valid_0's auc: 0.837667
[298]	valid_0's auc: 0.837701
[299]	valid_0's auc: 0.837912
[300]	valid_0's auc: 0.838049
[301]	valid_0's auc: 0.838083
[302]	valid_0's auc: 0.838333
[303]	valid_0's auc: 0.838281
[304]	valid_0's auc: 0.8384
[305]	valid_0's auc: 0.838593
[306]	valid_0's auc: 0.838788
[307]	valid_0's auc: 0.838896
[308]	valid_0's auc: 0.83909
[309]	valid_0's auc: 0.839108
[310]	valid_0's

[556]	valid_0's auc: 0.853948
[557]	valid_0's auc: 0.854058
[558]	valid_0's auc: 0.854155
[559]	valid_0's auc: 0.854042
[560]	valid_0's auc: 0.854122
[561]	valid_0's auc: 0.854217
[562]	valid_0's auc: 0.85439
[563]	valid_0's auc: 0.8547
[564]	valid_0's auc: 0.854916
[565]	valid_0's auc: 0.855053
[566]	valid_0's auc: 0.855128
[567]	valid_0's auc: 0.855199
[568]	valid_0's auc: 0.855256
[569]	valid_0's auc: 0.855331
[570]	valid_0's auc: 0.855398
[571]	valid_0's auc: 0.855451
[572]	valid_0's auc: 0.85553
[573]	valid_0's auc: 0.855686
[574]	valid_0's auc: 0.855723
[575]	valid_0's auc: 0.855761
[576]	valid_0's auc: 0.855818
[577]	valid_0's auc: 0.855814
[578]	valid_0's auc: 0.855841
[579]	valid_0's auc: 0.855829
[580]	valid_0's auc: 0.855873
[581]	valid_0's auc: 0.855965
[582]	valid_0's auc: 0.855874
[583]	valid_0's auc: 0.855826
[584]	valid_0's auc: 0.85562
[585]	valid_0's auc: 0.855588
[586]	valid_0's auc: 0.85546
[587]	valid_0's auc: 0.855383
[588]	valid_0's auc: 0.855237
[589]	valid_0's 

[840]	valid_0's auc: 0.863284
[841]	valid_0's auc: 0.863297
[842]	valid_0's auc: 0.863466
[843]	valid_0's auc: 0.86348
[844]	valid_0's auc: 0.863489
[845]	valid_0's auc: 0.863465
[846]	valid_0's auc: 0.863554
[847]	valid_0's auc: 0.863527
[848]	valid_0's auc: 0.863521
[849]	valid_0's auc: 0.863476
[850]	valid_0's auc: 0.863543
[851]	valid_0's auc: 0.863595
[852]	valid_0's auc: 0.863628
[853]	valid_0's auc: 0.863645
[854]	valid_0's auc: 0.863701
[855]	valid_0's auc: 0.863763
[856]	valid_0's auc: 0.863726
[857]	valid_0's auc: 0.863708
[858]	valid_0's auc: 0.863773
[859]	valid_0's auc: 0.863797
[860]	valid_0's auc: 0.863857
[861]	valid_0's auc: 0.86395
[862]	valid_0's auc: 0.863995
[863]	valid_0's auc: 0.864016
[864]	valid_0's auc: 0.864
[865]	valid_0's auc: 0.864034
[866]	valid_0's auc: 0.864037
[867]	valid_0's auc: 0.864041
[868]	valid_0's auc: 0.863981
[869]	valid_0's auc: 0.863983
[870]	valid_0's auc: 0.864019
[871]	valid_0's auc: 0.864047
[872]	valid_0's auc: 0.864038
[873]	valid_0's

[1116]	valid_0's auc: 0.869252
[1117]	valid_0's auc: 0.869261
[1118]	valid_0's auc: 0.869289
[1119]	valid_0's auc: 0.869301
[1120]	valid_0's auc: 0.869314
[1121]	valid_0's auc: 0.869374
[1122]	valid_0's auc: 0.869387
[1123]	valid_0's auc: 0.869413
[1124]	valid_0's auc: 0.869387
[1125]	valid_0's auc: 0.869372
[1126]	valid_0's auc: 0.869362
[1127]	valid_0's auc: 0.869465
[1128]	valid_0's auc: 0.869495
[1129]	valid_0's auc: 0.86942
[1130]	valid_0's auc: 0.869423
[1131]	valid_0's auc: 0.869432
[1132]	valid_0's auc: 0.869467
[1133]	valid_0's auc: 0.869434
[1134]	valid_0's auc: 0.869454
[1135]	valid_0's auc: 0.869553
[1136]	valid_0's auc: 0.869584
[1137]	valid_0's auc: 0.869609
[1138]	valid_0's auc: 0.869684
[1139]	valid_0's auc: 0.869702
[1140]	valid_0's auc: 0.869756
[1141]	valid_0's auc: 0.86979
[1142]	valid_0's auc: 0.869864
[1143]	valid_0's auc: 0.869784
[1144]	valid_0's auc: 0.869826
[1145]	valid_0's auc: 0.869925
[1146]	valid_0's auc: 0.870004
[1147]	valid_0's auc: 0.870005
[1148]	val

In [39]:
preds = model.predict(df_X)

In [42]:
preds

array([0.48027697, 0.36631135, 0.17748185, ..., 0.83642001, 0.83480839,
       0.83180305])

In [43]:
from sklearn.metrics import roc_auc_score

In [46]:
roc_auc_score(df_y, preds)

0.8991875557875608

In [2]:
from sklearn.model_selection import train_test_split

In [4]:
import pandas as pd

In [11]:
df = pd.DataFrame([[1,2], [1,3],[3,5], [2,4]])
df.head()

Unnamed: 0,0,1
0,1,2
1,1,3
2,3,5
3,2,4


In [26]:
split = train_test_split(df[0].unique(), shuffle=True)

In [27]:
df[df[0].isin(split[0])]

Unnamed: 0,0,1
2,3,5
3,2,4


In [28]:
df

Unnamed: 0,0,1
0,1,2
1,1,3
2,3,5
3,2,4
