# Packages and functions

In [1]:
#!pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip

In [2]:
import pandas as pd

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB 
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold
from pandas_profiling import ProfileReport

def lr(x,y):  
    """ logistic regression"""
    model = LogisticRegression(solver='liblinear', C=10.0,random_state=44)
    y_pred = cross_val_predict(model, x, y, cv=10)
    acc = cross_val_score(model, x, y, cv=10, scoring='precision')
    print('MEAN PRECISION', np.mean(acc))
    report = classification_report(y, y_pred)
    print('report:', report, sep='\n')
    return y_pred

def naive_bayes(XX,yy):
    """ Naive bayes"""
    # cross validation
    gb_model = GaussianNB()
    gb_acc = cross_val_score(gb_model, XX, yy, cv=10, scoring='precision')
    print('MEAN PRECISION', np.mean(gb_acc))
    yy_pred = cross_val_predict(gb_model, XX, yy, cv=10)
    gb_report = classification_report(yy, yy_pred)
    print('report:', gb_report, sep='\n')

    return yy_pred
  

def decision_tree(X,y, max_depth=4):
    """ Decision tree"""
    dt_model = DecisionTreeClassifier(max_depth=max_depth, random_state=44)
    acc = cross_val_score(dt_model, X, y, cv=10, scoring='precision')
    print('MEAN PRECISION', np.mean(acc))
    y_pred = cross_val_predict(dt_model, X, y, cv=10)
    report = classification_report(y, y_pred)
    print('report:', report, sep='\n')
    return y_pred

# Prepare data


In [3]:
# TOPICS
df = pd.read_csv("annotated_artists_periods.csv")
dfi= pd.read_csv("annotated_institutions.csv")

## 1. Prepare data for model selection - relations between historians

In [4]:
# TOPICS
d1 = df.copy()
# rename columns
d1.columns = ['art_hist_1', 'art_hist_2', 'period', 'target', 'bio', 'collab','archive_1','mention_1','archive_2','mention_2','mention_both']
# drop columns
d1 = d1[['art_hist_1', 'art_hist_2','bio','target','mention_1','mention_2']]
# remove 0,5
d1 = d1[d1['target'] != "0,5"] 
# merge mentions
d1["mention"] = d1["mention_1"] + d1["mention_2"] 
# replace '' with 0
for col in d1[["target","bio","mention"]]: 
    d1[col] = d1[col].replace('',0)
    d1[col] = d1[col].astype(float)
d1['mention'].values[d1['mention'] > 1] = 1

# group by number of topics
d1series = d1.groupby(["art_hist_1","art_hist_2","target","bio","mention"]).size()
d1 = d1series.to_frame(name = 'size').reset_index()


# INSTITUTIONS
d2 = dfi.copy()
# rename columns
d2.columns = ['art_hist_1', 'art_hist_2', 'institution', 'target', 'scope', 'archive_1','mention_1','archive_2','mention_2','notes']
# drop columns
d2 = d2[['art_hist_1', 'art_hist_2','institution','target','mention_1','mention_2']]
# remove 0,5
d2 = d2[d2['target'] != "0,5"] # remove 0,5
# merge mentions
d2["mention"] = d2["mention_1"] + d2["mention_2"] 
# replace '' with 0
for col in d2[["target","mention"]]: 
  d2[col] = d2[col].replace('',0)
  d2[col] = d2[col].astype(float)
d2['mention'].values[d2['mention'] > 1] = 1
# group by number of institutions
d2series = d2.groupby(["art_hist_1","art_hist_2","target","mention"]).size()
d2 = d2series.to_frame(name = 'size').reset_index()


# INSTITUTIONS AND TOPICS
topics_mentions = d1.copy()
topics_mentions.columns = ["art_hist_1","art_hist_2","target_topic","bio","mention_historian","size_topic"]
instits = d2.copy()
instits.columns = ['art_hist_1','art_hist_2', 'target_inst', 'mention_institution', 'size_inst']
# merge tables based on first two columns (considering pairs may appear in different order)
topics_mentions['hash'] = topics_mentions.apply(lambda x: min(x['art_hist_1'], x['art_hist_2']) + "_" + max(x['art_hist_1'], x['art_hist_2']), axis=1)
instits['hash'] = instits.apply(lambda x: min(x['art_hist_1'], x['art_hist_2']) + "_" + max(x['art_hist_1'], x['art_hist_2']), axis=1)
df_merged = topics_mentions.merge(instits, how='outer', indicator=True, on=["hash"])
# replace nan
df_merged = df_merged.replace(np.nan, 0, regex=True)
# merge target columns
df_merged["target"] = df_merged["target_topic"] + df_merged["target_inst"] 
df_merged['target'].values[df_merged['target'] > 1] = 1
# drop columns
df_merged = df_merged[['art_hist_1_x','art_hist_2_x', 'target', 'bio','mention_historian', 'size_topic','size_inst']]
df_merged = df_merged.groupby(['art_hist_1_x','art_hist_2_x'], as_index=False).sum()

# 2. Prepare data for model selection - contents of collections

d1coll = df.copy()
# rename columns
d1coll.columns = ['art_hist_1', 'art_hist_2', 'period', 'target', 'bio', 'collab','archive_1','mention_1','archive_2','mention_2','mention_both']
# drop columns
d1coll = d1coll[['art_hist_1','art_hist_2','period','bio','archive_1','archive_2']]
# replace '' with 0
for col in d1coll[["bio",'archive_1','archive_2']]: 
    d1coll[col] = d1coll[col].replace(np.nan,0).replace('',0)
    d1coll[col] = d1coll[col].astype(int)
# merge mentions
d1coll["target"] = d1coll["archive_1"] + d1coll["archive_2"] 
d1coll['target'] = d1coll['target'].astype(int)
d1coll['target'].values[d1coll['target'] > 1] = 1

# group by number of topics
d1collseries = d1coll.groupby(["art_hist_1","art_hist_2",'bio','target']).size()
d1coll = d1collseries.to_frame(name = 'size').reset_index()
# get number of institutions in common
d2['hash'] = d2.apply(lambda x: min(x['art_hist_1'], x['art_hist_2']) + "_" + max(x['art_hist_1'], x['art_hist_2']), axis=1)
d1coll['hash'] = d1coll.apply(lambda x: min(x['art_hist_1'], x['art_hist_2']) + "_" + max(x['art_hist_1'], x['art_hist_2']), axis=1)
d1coll_merged = d1coll.merge(d2, how='outer', indicator=True, on=["hash"])
d1coll_merged = d1coll_merged[['art_hist_1_x','art_hist_2_x','bio','target_x', 'size_x','size_y']]
d1coll_merged.columns = ['art_hist_1','art_hist_2','bio','target', 'size_topic','size_inst']
d1coll_merged['size_inst'] = d1coll_merged['size_inst'].replace(np.nan,0)
d1coll_merged = d1coll_merged[d1coll_merged.art_hist_1.notnull()]

## EDA


### Artists_periods

In [5]:
df_eda = df.copy()
df_eda.columns = ["art_hist_1", "art_hist_2", "topic", "target", "ref_in_bio", "collab_on_topic", "h2_archive1", "h2_mention", "h1_archive2", "h1_mention", "both_bio"]
df_eda = df_eda[df_eda['target'] != "0,5"] # remove 0,5
for col in df_eda[["target", "ref_in_bio", "collab_on_topic", "h2_archive1", "h2_mention", "h1_archive2", "h1_mention", "both_bio"]]: 
    df_eda[col] = df_eda[col].replace(np.nan,0).replace('',0)
    df_eda[col] = df_eda[col].astype(int)
profiledf = ProfileReport(df_eda, title="Artists and periods", html={'style': {'full_width': True}}, sort="ascending")
#profiledf.to_notebook_iframe()

In [6]:
# unique historians
unique = len(pd.unique(df_eda[['art_hist_1', 'art_hist_2']].values.ravel()))
# unique topics
uniquet = len(pd.unique(df_eda[['topic']].values.ravel()))
# unique pairs
count = len(df_eda.groupby(['art_hist_1','art_hist_2']).size().reset_index())
# valid relations
valid = df_eda.loc[df_eda["target"] >= 1]
# unique historians in valid relations
uniquev =  len(pd.unique(valid[['art_hist_1', 'art_hist_2']].values.ravel()))
# unique pairs in valid relations
countv = len(valid.groupby(['art_hist_1','art_hist_2']).size().reset_index())
# already recorded in bio
valid_in_bio = df_eda.loc[(df_eda["target"] >= 1) & (df_eda["ref_in_bio"] >= 1)]

# how many relations recorded in at least one bio are valid?
bio = df_eda.loc[df_eda["ref_in_bio"] >= 1]
bio_valid = len(valid_in_bio)*100/len(bio)
# how many relations recorded in both bios are valid?
both = df_eda.loc[df_eda["ref_in_bio"] >= 1]
both_bio_valid = len(valid_in_bio)*100/len(both)

# merge mentions
df_eda["mention"] = df_eda["h2_mention"] + df_eda["h1_mention"] 
df_eda['mention'].values[df_eda['mention'] > 1] = 1
# how many relations recorded in at least one archival description are valid?
mention = df_eda.loc[df_eda["mention"] >= 1]
valid_in_mention = df_eda.loc[(df_eda["target"] >= 1) & (df_eda["mention"] >= 1)]
mention_valid = len(valid_in_mention)*100/len(mention)

#unique, uniquet, count , len(valid), uniquev , countv , len(valid_in_bio), bio_valid, both_bio_valid, mention_valid

In [7]:
# valid collaborations on a topic
true_collab = df_eda.loc[(df_eda["target"] >= 1) & (df_eda["collab_on_topic"] >= 1)]

# unique historians
unique = len(pd.unique(true_collab[['art_hist_1', 'art_hist_2']].values.ravel()))

# unique topics
uniquet = len(pd.unique(true_collab[['topic']].values.ravel()))

# unique pairs of historians in valid collaborations on a topic
countv = len(true_collab.groupby(['art_hist_1','art_hist_2']).size().reset_index())

#len(true_collab), unique, uniquet, countv

### Institutions

In [8]:
dfi_eda = dfi.copy()
dfi_eda.columns = ["art_hist_1", "art_hist_2", "institution", "target", "relation", "inst_archive_1", "inst_mention_1", "inst_archive_2", "inst_mention_2", "notes"]
dfi_eda = dfi_eda[dfi_eda['target'] != "0,5"] # remove 0,5
for col in dfi_eda[["target", "inst_archive_1", "inst_mention_1", "inst_archive_2", "inst_mention_2"]]: 
    dfi_eda[col] = dfi_eda[col].replace(np.nan,0).replace('',0).replace('?',0)
    dfi_eda[col] = dfi_eda[col].astype(int)
profiledf = ProfileReport(dfi_eda, title="Institutions", html={'style': {'full_width': True}}, sort="ascending")
#profiledf.to_notebook_iframe()

In [9]:
# unique historians
unique = len(pd.unique(dfi_eda[['art_hist_1', 'art_hist_2']].values.ravel()))
# unique inst
uniquei = len(pd.unique(dfi_eda[['institution']].values.ravel()))
# unique pairs
count = len(dfi_eda.groupby(['art_hist_1','art_hist_2']).size().reset_index())
# valid relations
valid = dfi_eda.loc[dfi_eda["target"] >= 1]
# unique historians in valid relations
uniquev =  len(pd.unique(valid[['art_hist_1', 'art_hist_2']].values.ravel()))
# unique pairs in valid relations
countv = len(valid.groupby(['art_hist_1','art_hist_2']).size().reset_index())
# already recorded in bio
valid_in_bio = df_merged.loc[ (df_merged["bio"] >= 1) & (df_merged["size_inst"] >= 1)]


#unique, uniquei, count , len(valid), uniquev , countv , len(valid_in_bio)

### Merged tables

In [10]:
dfm_eda = df_merged.copy()
for col in dfm_eda[["art_hist_1_x", "art_hist_2_x"]]: 
    dfm_eda[col] = dfm_eda[col].astype(str)
dfm_eda = dfm_eda[dfm_eda["art_hist_1_x"] != "0"]
dfm_eda.drop_duplicates()
profiledfm = ProfileReport(dfm_eda, title="Institutions and topics", html={'style': {'full_width': True}}, sort="ascending")
#profiledfm.to_notebook_iframe()

In [11]:
# relations with at least a topic and an instit in common
both = dfm_eda.loc[(dfm_eda["size_inst"] != 0) & (dfm_eda["size_topic"] != 0)]

# valid relations with at least a topic and an instit in common
both_valid = dfm_eda.loc[(dfm_eda["target"] >= 1) & (dfm_eda["size_inst"] != 0) & (dfm_eda["size_topic"] != 0)]


# valid collaborations on a topic
true_colab = df_eda.loc[(dfm_eda["target"] >= 1) & (df_eda["collab_on_topic"] >= 1)]

# unique historians
unique = len(pd.unique(true_colab[['art_hist_1', 'art_hist_2']].values.ravel()))

# unique topics
uniquet = len(pd.unique(true_colab[['topic']].values.ravel()))

# unique pairs of historians in valid collaborations on a topic
countv = len(true_colab.groupby(['art_hist_1','art_hist_2']).size().reset_index())


#len(both), len(both_valid), len(true_colab)


### Merged tables for collections

In [12]:
# relations with at least a topic 
valid = d1coll_merged.loc[(d1coll_merged["target"] != 0)]
topic = d1coll_merged.loc[(d1coll_merged["size_topic"] != 0) & (d1coll_merged["target"] != 0)]
bio = d1coll_merged.loc[(d1coll_merged["size_topic"] != 0) & (d1coll_merged["target"] != 0) & (d1coll_merged["bio"] != 0)]
only_bio = d1coll_merged.loc[(d1coll_merged["bio"] != 0)]
valid_in_bio = d1coll_merged.loc[(d1coll_merged["bio"] != 0) & (d1coll_merged["target"] != 0)]
valid_inst = d1coll_merged.loc[(d1coll_merged["size_inst"] != 0) & (d1coll_merged["target"] != 0)]
valid_inst_topic = d1coll_merged.loc[(d1coll_merged["size_inst"] != 0) & (d1coll_merged["size_topic"] != 0) & (d1coll_merged["target"] != 0)]
valid_topic = d1coll_merged.loc[(d1coll_merged["size_topic"] != 0) & (d1coll_merged["target"] != 0)]
valid_inst_topic_bio = d1coll_merged.loc[(d1coll_merged["size_inst"] != 0) & (d1coll_merged["bio"] != 0) & (d1coll_merged["size_topic"] != 0) & (d1coll_merged["target"] != 0)]
#len(valid),len(topic), len(bio), len(only_bio), len(valid_in_bio), len(valid_inst), len(valid_inst_topic), len(valid_topic), len(valid_inst_topic_bio)

# MODEL SELECTION FOR RELATIONS BETWEEN HISTORIANS



## Predict generic relations based on mentions in bio only

**mean p= 0.7, p(1) = 1, r(1)= 0.19 (all)**

note: low support of references in bio. The precision is very high, meaning the relation between any historians mentioned in another historian's bio is 100% relevant. However, due to data completeness issues this aspect is not sufficient to detect actual interactions.

In [13]:
X = d1[['bio']].copy()
y = d1['target'].astype(float)

# logistic regression
print("\nLOGISTIC REGRESSION") 
lr_bio = lr(X,y)

# naive bayes
print("\nNAIVE BAYES")
nb_bio = naive_bayes(X,y)

# decision tree
print("\nDECISION TREE") 
tree_bio = decision_tree(X,y)


LOGISTIC REGRESSION
MEAN PRECISION 0.7
report:
              precision    recall  f1-score   support

         0.0       0.64      1.00      0.78       101
         1.0       1.00      0.19      0.33        72

    accuracy                           0.66       173
   macro avg       0.82      0.60      0.55       173
weighted avg       0.79      0.66      0.59       173


NAIVE BAYES
MEAN PRECISION 0.7
report:
              precision    recall  f1-score   support

         0.0       0.64      1.00      0.78       101
         1.0       1.00      0.19      0.33        72

    accuracy                           0.66       173
   macro avg       0.82      0.60      0.55       173
weighted avg       0.79      0.66      0.59       173


DECISION TREE
MEAN PRECISION 0.7
report:
              precision    recall  f1-score   support

         0.0       0.64      1.00      0.78       101
         1.0       1.00      0.19      0.33        72

    accuracy                           0.66       17

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [53]:
# train on merged tables
df_merged['target'].values[df_merged['target'] > 1] = 1
X = df_merged[['bio']].copy()
y = df_merged['target'].astype(float)

# logistic regression
print("\nLOGISTIC REGRESSION") 
lr_bio_merged = lr(X,y)

# naive bayes
print("\nNAIVE BAYES")
nb_bio_merged = naive_bayes(X,y)

# decision tree
print("\nDECISION TREE") 
tree_bio_merged = decision_tree(X,y)


LOGISTIC REGRESSION
MEAN PRECISION 0.7
report:
              precision    recall  f1-score   support

         0.0       0.63      1.00      0.77       101
         1.0       1.00      0.18      0.31        72

    accuracy                           0.66       173
   macro avg       0.82      0.59      0.54       173
weighted avg       0.78      0.66      0.58       173


NAIVE BAYES
MEAN PRECISION 0.7
report:
              precision    recall  f1-score   support

         0.0       0.63      1.00      0.77       101
         1.0       1.00      0.18      0.31        72

    accuracy                           0.66       173
   macro avg       0.82      0.59      0.54       173
weighted avg       0.78      0.66      0.58       173


DECISION TREE
MEAN PRECISION 0.7
report:
              precision    recall  f1-score   support

         0.0       0.63      1.00      0.77       101
         1.0       1.00      0.18      0.31        72

    accuracy                           0.66       17

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Predicted relations

In [54]:
results = pd.DataFrame(lr_bio_merged,columns =['results'])
results[['target', 'art_hist_1','art_hist_2']] = df_merged[["target", "art_hist_1_x","art_hist_2_x"]].values
# predicted correct relations
ppositive = results.loc[(results["results"] >= 1.0) & (results["target"] >= 1.0)]
# get whether the predictions were already in ARTchives or not
merged_piu = ppositive.merge(d1[["art_hist_1","art_hist_2","bio"]], on=["art_hist_1","art_hist_2"], how='left').drop_duplicates()
merged_piu = merged_piu.dropna()
known = len(merged_piu.loc[(merged_piu["bio"] >= 1.0)])/len(merged_piu)*100
unknown = len(merged_piu.loc[(merged_piu["bio"] == 0.0)])/len(merged_piu)*100

print(f"Known {known}; \n Unknown {unknown} \n New relations")
merged_piu.loc[(merged_piu["bio"] == 0.0)]

Known 100.0; 
 Unknown 0.0 
 New relations


Unnamed: 0,results,target,art_hist_1,art_hist_2,bio


## Predict generic relations based on mentions in archival description only

**mean p= 0.8, p(1) = 1, r(1)= 0.17**

note: same considerations for relations based on references in biographies

In [16]:
X = d1[['mention']].copy()
y = d1['target'].astype(float)

# logistic regression
print("\nLOGISTIC REGRESSION") 
lr_arch = lr(X,y)

# naive bayes
print("\nNAIVE BAYES")
nb_arch = naive_bayes(X,y)

# decision tree
print("\nDECISION TREE") 
tree_arch = decision_tree(X,y)


LOGISTIC REGRESSION
MEAN PRECISION 0.7
report:
              precision    recall  f1-score   support

         0.0       0.63      1.00      0.77       101
         1.0       1.00      0.17      0.29        72

    accuracy                           0.65       173
   macro avg       0.81      0.58      0.53       173
weighted avg       0.78      0.65      0.57       173


NAIVE BAYES
MEAN PRECISION 0.7
report:
              precision    recall  f1-score   support

         0.0       0.63      1.00      0.77       101
         1.0       1.00      0.17      0.29        72

    accuracy                           0.65       173
   macro avg       0.81      0.58      0.53       173
weighted avg       0.78      0.65      0.57       173


DECISION TREE
MEAN PRECISION 0.7
report:
              precision    recall  f1-score   support

         0.0       0.63      1.00      0.77       101
         1.0       1.00      0.17      0.29        72

    accuracy                           0.65       17

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**mean p= 0.7, p(1) = 1, r(1)= 0.14**

note: training on everything


In [17]:
X = df_merged[['mention_historian']].copy()
y = df_merged['target'].astype(float)

# logistic regression
print("\nLOGISTIC REGRESSION") 
lr_arch_merged = lr(X,y)

# naive bayes
print("\nNAIVE BAYES")
nb_arch_merged = naive_bayes(X,y)

# decision tree
print("\nDECISION TREE") 
tree_arch_merged = decision_tree(X,y)


LOGISTIC REGRESSION
MEAN PRECISION 0.7
report:
              precision    recall  f1-score   support

         0.0       0.63      1.00      0.77       101
         1.0       1.00      0.17      0.29        72

    accuracy                           0.65       173
   macro avg       0.81      0.58      0.53       173
weighted avg       0.78      0.65      0.57       173


NAIVE BAYES
MEAN PRECISION 0.7
report:
              precision    recall  f1-score   support

         0.0       0.63      1.00      0.77       101
         1.0       1.00      0.17      0.29        72

    accuracy                           0.65       173
   macro avg       0.81      0.58      0.53       173
weighted avg       0.78      0.65      0.57       173


DECISION TREE
MEAN PRECISION 0.7
report:
              precision    recall  f1-score   support

         0.0       0.63      1.00      0.77       101
         1.0       1.00      0.17      0.29        72

    accuracy                           0.65       17

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Predicted relations

In [18]:
results = pd.DataFrame(lr_arch_merged,columns =['results'])
results[['target', 'art_hist_1','art_hist_2']] = df_merged[["target", "art_hist_1_x","art_hist_2_x"]].values
# predicted correct relations
ppositive = results.loc[(results["results"] >= 1.0) & (results["target"] >= 1.0)]
# get whether the predictions were already in ARTchives or not
merged_piu = ppositive.merge(d1[["art_hist_1","art_hist_2","bio"]], on=["art_hist_1","art_hist_2"], how='left').drop_duplicates()
merged_piu = merged_piu.dropna()
known = len(merged_piu.loc[(merged_piu["bio"] >= 1.0)])/len(merged_piu)*100
unknown = len(merged_piu.loc[(merged_piu["bio"] == 0.0)])/len(merged_piu)*100

print(f"Known {known}; \n Unknown {unknown} \n New relations")
merged_piu.loc[(merged_piu["bio"] == 0.0)]

Known 66.66666666666666; 
 Unknown 33.33333333333333 
 New relations


Unnamed: 0,results,target,art_hist_1,art_hist_2,bio
1,1.0,1.0,Federico Zeri,Aby Warburg,0.0
5,1.0,1.0,Kornél Fabriczy,Ernst Steinmann,0.0
8,1.0,1.0,Roberto Longhi,Luisa Vertova,0.0
11,1.0,1.0,Ulrich Middeldorf,Everett Fahy,0.0


## Predict generic relations based on mentions in bio or archival description

**mean p= 0.9, p(1)=1, r(1)=0.25**

note: same considerations for relations based on references in bio/archival descriptions only

In [19]:
X = d1[['bio','mention']].copy()
y = d1['target'].astype(float)

# logistic regression
print("\nLOGISTIC REGRESSION") 
lr_biomention = lr(X,y)

# naive bayes
print("\nNAIVE BAYES")
nb_biomention = naive_bayes(X,y)

# decision tree
print("\nDECISION TREE") 
tree_biomention = decision_tree(X,y)


LOGISTIC REGRESSION
MEAN PRECISION 0.9
report:
              precision    recall  f1-score   support

         0.0       0.65      1.00      0.79       101
         1.0       1.00      0.25      0.40        72

    accuracy                           0.69       173
   macro avg       0.83      0.62      0.59       173
weighted avg       0.80      0.69      0.63       173


NAIVE BAYES
MEAN PRECISION 0.9
report:
              precision    recall  f1-score   support

         0.0       0.65      1.00      0.79       101
         1.0       1.00      0.25      0.40        72

    accuracy                           0.69       173
   macro avg       0.83      0.62      0.59       173
weighted avg       0.80      0.69      0.63       173


DECISION TREE
MEAN PRECISION 0.9
report:
              precision    recall  f1-score   support

         0.0       0.65      1.00      0.79       101
         1.0       1.00      0.25      0.40        72

    accuracy                           0.69       17

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**mean p= 0.9, p(1)=1, r(1)=0.22**

note: training on everything

In [20]:
X = df_merged[['bio','mention_historian']].copy()
y = df_merged['target'].astype(float)

# logistic regression
print("\nLOGISTIC REGRESSION") 
lr_biomention_merged = lr(X,y)

# naive bayes
print("\nNAIVE BAYES")
nb_biomention_merged = naive_bayes(X,y)

# decision tree
print("\nDECISION TREE") 
tree_biomention_merged = decision_tree(X,y)


LOGISTIC REGRESSION
MEAN PRECISION 0.8
report:
              precision    recall  f1-score   support

         0.0       0.65      1.00      0.79       101
         1.0       1.00      0.24      0.38        72

    accuracy                           0.68       173
   macro avg       0.82      0.62      0.58       173
weighted avg       0.79      0.68      0.62       173


NAIVE BAYES
MEAN PRECISION 0.8
report:
              precision    recall  f1-score   support

         0.0       0.65      1.00      0.79       101
         1.0       1.00      0.24      0.38        72

    accuracy                           0.68       173
   macro avg       0.82      0.62      0.58       173
weighted avg       0.79      0.68      0.62       173


DECISION TREE
MEAN PRECISION 0.8
report:
              precision    recall  f1-score   support

         0.0       0.65      1.00      0.79       101
         1.0       1.00      0.24      0.38        72

    accuracy                           0.68       17

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Predicted relations

In [55]:
results = pd.DataFrame(lr_biomention_merged,columns =['results'])
results[['target', 'art_hist_1','art_hist_2']] = df_merged[["target", "art_hist_1_x","art_hist_2_x"]].values
# predicted correct relations
ppositive = results.loc[(results["results"] >= 1.0) & (results["target"] >= 1.0)]
# get whether the predictions were already in ARTchives or not
merged_piu = ppositive.merge(d1[["art_hist_1","art_hist_2","bio"]], on=["art_hist_1","art_hist_2"], how='left').drop_duplicates()
merged_piu = merged_piu.dropna()
known = len(merged_piu.loc[(merged_piu["bio"] >= 1.0)])/len(merged_piu)*100
unknown = len(merged_piu.loc[(merged_piu["bio"] == 0.0)])/len(merged_piu)*100

print(f"Known {known}; \n Unknown {unknown} \n New relations")
merged_piu.loc[(merged_piu["bio"] == 0.0)]

Known 76.47058823529412; 
 Unknown 23.52941176470588 
 New relations


Unnamed: 0,results,target,art_hist_1,art_hist_2,bio
1,1.0,1.0,Federico Zeri,Aby Warburg,0.0
6,1.0,1.0,Kornél Fabriczy,Ernst Steinmann,0.0
10,1.0,1.0,Roberto Longhi,Luisa Vertova,0.0
14,1.0,1.0,Ulrich Middeldorf,Everett Fahy,0.0


## Predict generic relations based on number of topics only

**mean p=0.57, p(1)=0.65 , r(1)=0.18 (dt)**

notes: this feature alone is not suffient to predict an actual interaction happened. We cannot assume that scholars collaborated despite they were studying the same topics.


In [22]:
X = d1[['size']]
y = d1['target'].astype(float)

# logistic regression
print("\nLOGISTIC REGRESSION")
lr_topics = lr(X,y)

# naive bayes
print("\nNAIVE BAYES")
nb_topics = naive_bayes(X,y)

# decision tree
print("\nDECISION TREE")
tree_topics = decision_tree(X,y)


LOGISTIC REGRESSION
MEAN PRECISION 0.5616666666666668
report:
              precision    recall  f1-score   support

         0.0       0.62      0.87      0.73       101
         1.0       0.59      0.26      0.37        72

    accuracy                           0.62       173
   macro avg       0.61      0.57      0.55       173
weighted avg       0.61      0.62      0.58       173


NAIVE BAYES
MEAN PRECISION 0.61
report:
              precision    recall  f1-score   support

         0.0       0.62      0.90      0.73       101
         1.0       0.62      0.22      0.33        72

    accuracy                           0.62       173
   macro avg       0.62      0.56      0.53       173
weighted avg       0.62      0.62      0.56       173


DECISION TREE
MEAN PRECISION 0.5916666666666666
report:
              precision    recall  f1-score   support

         0.0       0.61      0.93      0.73       101
         1.0       0.61      0.15      0.24        72

    accuracy         

  _warn_prf(average, modifier, msg_start, len(result))


**mean p=0.8, p(1)=0.7 , r(1)=0.3 (dt)**

notes: training on everything

In [23]:
X = df_merged[['size_topic']]
y = df_merged['target'].astype(float)

# logistic regression
print("\nLOGISTIC REGRESSION")
lr_topics_merged = lr(X,y)

# naive bayes
print("\nNAIVE BAYES")
nb_topics_merged = naive_bayes(X,y)

# decision tree
print("\nDECISION TREE")
tree_topics_merged = decision_tree(X,y)


LOGISTIC REGRESSION
MEAN PRECISION 0.5616666666666668
report:
              precision    recall  f1-score   support

         0.0       0.62      0.86      0.72       101
         1.0       0.58      0.26      0.36        72

    accuracy                           0.61       173
   macro avg       0.60      0.56      0.54       173
weighted avg       0.60      0.61      0.57       173


NAIVE BAYES


  _warn_prf(average, modifier, msg_start, len(result))


MEAN PRECISION 0.625
report:
              precision    recall  f1-score   support

         0.0       0.62      0.91      0.74       101
         1.0       0.64      0.22      0.33        72

    accuracy                           0.62       173
   macro avg       0.63      0.57      0.53       173
weighted avg       0.63      0.62      0.57       173


DECISION TREE
MEAN PRECISION 0.5916666666666666
report:
              precision    recall  f1-score   support

         0.0       0.61      0.93      0.74       101
         1.0       0.65      0.18      0.28        72

    accuracy                           0.62       173
   macro avg       0.63      0.56      0.51       173
weighted avg       0.63      0.62      0.55       173



  _warn_prf(average, modifier, msg_start, len(result))


## Predicted relations

In [56]:
results = pd.DataFrame(tree_topics_merged,columns =['results'])
results[['target', 'art_hist_1','art_hist_2']] = df_merged[["target", "art_hist_1_x","art_hist_2_x"]].values
# predicted correct relations
ppositive = results.loc[(results["results"] >= 1.0) & (results["target"] >= 1.0)]
# get whether the predictions were already in ARTchives or not
merged_piu = ppositive.merge(d1[["art_hist_1","art_hist_2","bio"]], on=["art_hist_1","art_hist_2"], how='left').drop_duplicates()
merged_piu = merged_piu.dropna()
known = len(merged_piu.loc[(merged_piu["bio"] > 0.0)])/len(merged_piu)*100
unknown = len(merged_piu.loc[(merged_piu["bio"] == 0.0)])/len(merged_piu)*100

print(f"Known {known}; \n Unknown {unknown} \n New relations")
merged_piu.loc[(merged_piu["bio"] != 0.0)]


Known 23.076923076923077; 
 Unknown 76.92307692307693 
 New relations


Unnamed: 0,results,target,art_hist_1,art_hist_2,bio
2,1.0,1.0,Federico Zeri,Luisa Vertova,1.0
8,1.0,1.0,Roberto Longhi,Stefano Tumidei,1.0
9,1.0,1.0,Stefano Tumidei,Federico Zeri,1.0


## Predict generic relations based on number of topics and mentions in bio

**mean p= 0.8, p(1)=1, r(1)=0.19 (nb)**

notes: the precision is similar to the model considering references in the biography only, while the recall is even lower.

In [25]:
X = d1[['size', 'bio']].copy()
y = d1['target'].astype(float)

# logistic regression
print("\nLOGISTIC REGRESSION") 
lr_topicsbio = lr(X,y)

# naive bayes
print("\nNAIVE BAYES") 
nb_topicsbio = naive_bayes(X,y)

# decision tree
print("\nDECISION TREE") 
tree_topicsbio = decision_tree(X,y)


LOGISTIC REGRESSION
MEAN PRECISION 0.7533333333333333
report:
              precision    recall  f1-score   support

         0.0       0.67      0.93      0.78       101
         1.0       0.78      0.35      0.48        72

    accuracy                           0.69       173
   macro avg       0.72      0.64      0.63       173
weighted avg       0.71      0.69      0.65       173


NAIVE BAYES
MEAN PRECISION 0.7
report:
              precision    recall  f1-score   support

         0.0       0.64      1.00      0.78       101
         1.0       1.00      0.19      0.33        72

    accuracy                           0.66       173
   macro avg       0.82      0.60      0.55       173
weighted avg       0.79      0.66      0.59       173


DECISION TREE
MEAN PRECISION 0.6483333333333332
report:
              precision    recall  f1-score   support

         0.0       0.64      0.93      0.76       101
         1.0       0.72      0.25      0.37        72

    accuracy          

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**mean p= 0.8, p(1)=1, r(1)=0.17 (nb)**

note: training on everything

In [26]:
X = df_merged[['size_topic', 'bio']].copy()
y = df_merged['target'].astype(float)

# logistic regression
print("\nLOGISTIC REGRESSION") 
lr_topicsbio_merged = lr(X,y)

# naive bayes
print("\nNAIVE BAYES") 
nb_topicsbio_merged = naive_bayes(X,y)

# decision tree
print("\nDECISION TREE") 
tree_topicsbio_merged = decision_tree(X,y)


LOGISTIC REGRESSION
MEAN PRECISION 0.7483333333333333
report:
              precision    recall  f1-score   support

         0.0       0.66      0.93      0.77       101
         1.0       0.77      0.32      0.45        72

    accuracy                           0.68       173
   macro avg       0.71      0.63      0.61       173
weighted avg       0.70      0.68      0.64       173


NAIVE BAYES
MEAN PRECISION 0.7
report:
              precision    recall  f1-score   support

         0.0       0.63      1.00      0.77       101
         1.0       1.00      0.18      0.31        72

    accuracy                           0.66       173
   macro avg       0.82      0.59      0.54       173
weighted avg       0.78      0.66      0.58       173


DECISION TREE
MEAN PRECISION 0.6433333333333333
report:
              precision    recall  f1-score   support

         0.0       0.63      0.93      0.75       101
         1.0       0.71      0.24      0.35        72

    accuracy          

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Predicted relations

In [57]:
results = pd.DataFrame(nb_topicsbio_merged,columns =['results'])
results[['target', 'art_hist_1','art_hist_2']] = df_merged[["target", "art_hist_1_x","art_hist_2_x"]].values
# predicted correct relations
ppositive = results.loc[(results["results"] >= 1.0) & (results["target"] >= 1.0)]
# get whether the predictions were already in ARTchives or not
merged_piu = ppositive.merge(d1[["art_hist_1","art_hist_2","bio"]], on=["art_hist_1","art_hist_2"], how='left').drop_duplicates()
merged_piu = merged_piu.dropna()
known = len(merged_piu.loc[(merged_piu["bio"] >= 1.0)])/len(merged_piu)*100
unknown = len(merged_piu.loc[(merged_piu["bio"] == 0.0)])/len(merged_piu)*100

print(f"Known {known}; \n Unknown {unknown} \n New relations")
merged_piu.loc[(merged_piu["bio"] == 0.0)]

Known 100.0; 
 Unknown 0.0 
 New relations


Unnamed: 0,results,target,art_hist_1,art_hist_2,bio


## Predict generic relations based on number of topics and mentions in archival description

**mean p=0.8, p(1)=1, r(1)=0.17**

notes: the precision is similar to the model considering references in the archival description only, while the recall is even lower.

In [28]:
X = d1[['size', 'mention']].copy()
y = d1['target'].astype(float)

# logistic regression
print("\nLOGISTIC REGRESSION") 
lr_topicsarch = lr(X,y)

# naive bayes
print("\nNAIVE BAYES") 
nb_topicsarch = naive_bayes(X,y)

# decision tree
print("\nDECISION TREE") 
tree_topicsarch = decision_tree(X,y)


LOGISTIC REGRESSION
MEAN PRECISION 0.805
report:
              precision    recall  f1-score   support

         0.0       0.66      0.93      0.77       101
         1.0       0.77      0.32      0.45        72

    accuracy                           0.68       173
   macro avg       0.71      0.63      0.61       173
weighted avg       0.70      0.68      0.64       173


NAIVE BAYES
MEAN PRECISION 0.7
report:
              precision    recall  f1-score   support

         0.0       0.63      1.00      0.77       101
         1.0       1.00      0.17      0.29        72

    accuracy                           0.65       173
   macro avg       0.81      0.58      0.53       173
weighted avg       0.78      0.65      0.57       173


DECISION TREE
MEAN PRECISION 0.505
report:
              precision    recall  f1-score   support

         0.0       0.63      0.93      0.75       101
         1.0       0.71      0.24      0.35        72

    accuracy                           0.64     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**mean p=0.7, p(1)=1, r(1)=0.14**

note: training on everything

In [29]:
X = df_merged[['size_topic', 'mention_historian']].copy()
y = df_merged['target'].astype(float)

# logistic regression
print("\nLOGISTIC REGRESSION") 
lr_topicsarch_merged = lr(X,y)

# naive bayes
print("\nNAIVE BAYES") 
nb_topicsarch_merged = naive_bayes(X,y)

# decision tree
print("\nDECISION TREE") 
tree_topicsarch_merged = decision_tree(X,y)


LOGISTIC REGRESSION
MEAN PRECISION 0.755
report:
              precision    recall  f1-score   support

         0.0       0.66      0.93      0.77       101
         1.0       0.77      0.32      0.45        72

    accuracy                           0.68       173
   macro avg       0.71      0.63      0.61       173
weighted avg       0.70      0.68      0.64       173


NAIVE BAYES
MEAN PRECISION 0.7
report:
              precision    recall  f1-score   support

         0.0       0.63      1.00      0.77       101
         1.0       1.00      0.17      0.29        72

    accuracy                           0.65       173
   macro avg       0.81      0.58      0.53       173
weighted avg       0.78      0.65      0.57       173


DECISION TREE
MEAN PRECISION 0.5549999999999999
report:
              precision    recall  f1-score   support

         0.0       0.63      0.93      0.75       101
         1.0       0.71      0.24      0.35        72

    accuracy                       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Predicted relations

In [59]:
results = pd.DataFrame(nb_topicsarch_merged,columns =['results'])
results[['target', 'art_hist_1','art_hist_2']] = df_merged[["target", "art_hist_1_x","art_hist_2_x"]].values
# predicted correct relations
ppositive = results.loc[(results["results"] >= 1.0) & (results["target"] >= 1.0)]
# get whether the predictions were already in ARTchives or not
merged_piu = ppositive.merge(d1[["art_hist_1","art_hist_2","bio"]], on=["art_hist_1","art_hist_2"], how='left').drop_duplicates()
merged_piu = merged_piu.dropna()
known = len(merged_piu.loc[(merged_piu["bio"] >= 1.0)])/len(merged_piu)*100
unknown = len(merged_piu.loc[(merged_piu["bio"] == 0.0)])/len(merged_piu)*100

print(f"Known {known}; \n Unknown {unknown} \n New relations")
merged_piu.loc[(merged_piu["bio"] == 0.0)]

Known 66.66666666666666; 
 Unknown 33.33333333333333 
 New relations


Unnamed: 0,results,target,art_hist_1,art_hist_2,bio
1,1.0,1.0,Federico Zeri,Aby Warburg,0.0
5,1.0,1.0,Kornél Fabriczy,Ernst Steinmann,0.0
8,1.0,1.0,Roberto Longhi,Luisa Vertova,0.0
11,1.0,1.0,Ulrich Middeldorf,Everett Fahy,0.0


## Predict generic relations based on number of topics, mentions in bio or archival description

**mean p=0.9, p(1)=1, r(1)=0.25 (nb)**

notes: similar considerations as in prior models. We can assume the reason why historians are mentioned in others' biographies or archival descriptions is not always due to the amount of topics they share.

In [31]:
X = d1[['size', 'bio','mention']].copy()
y = d1['target'].astype(float)

# logistic regression
print("\nLOGISTIC REGRESSION") 
lr_all = lr(X,y)

# naive bayes
print("\nNAIVE BAYES") 
nb_all = naive_bayes(X,y)

# decision tree
print("\nDECISION TREE") 
tre_all = decision_tree(X,y)


LOGISTIC REGRESSION
MEAN PRECISION 0.8400000000000001
report:
              precision    recall  f1-score   support

         0.0       0.67      0.94      0.79       101
         1.0       0.81      0.36      0.50        72

    accuracy                           0.70       173
   macro avg       0.74      0.65      0.64       173
weighted avg       0.73      0.70      0.67       173


NAIVE BAYES
MEAN PRECISION 0.9
report:
              precision    recall  f1-score   support

         0.0       0.65      1.00      0.79       101
         1.0       1.00      0.25      0.40        72

    accuracy                           0.69       173
   macro avg       0.83      0.62      0.59       173
weighted avg       0.80      0.69      0.63       173


DECISION TREE
MEAN PRECISION 0.7466666666666667
report:
              precision    recall  f1-score   support

         0.0       0.64      0.95      0.77       101
         1.0       0.79      0.26      0.40        72

    accuracy          

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**mean p=0.9, p(1)=1, r(1)=0.22 (nb)**

note: training on everything

In [32]:
X = df_merged[['size_topic', 'bio','mention_historian']].copy()
y = df_merged['target'].astype(float)

# logistic regression
print("\nLOGISTIC REGRESSION") 
lr_all_merged = lr(X,y)

# naive bayes
print("\nNAIVE BAYES") 
nb_all_merged = naive_bayes(X,y)

# decision tree
print("\nDECISION TREE") 
tree_all_merged = decision_tree(X,y)


LOGISTIC REGRESSION
MEAN PRECISION 0.7849999999999999
report:
              precision    recall  f1-score   support

         0.0       0.66      0.94      0.78       101
         1.0       0.80      0.33      0.47        72

    accuracy                           0.69       173
   macro avg       0.73      0.64      0.62       173
weighted avg       0.72      0.69      0.65       173


NAIVE BAYES
MEAN PRECISION 0.8
report:
              precision    recall  f1-score   support

         0.0       0.65      1.00      0.79       101
         1.0       1.00      0.24      0.38        72

    accuracy                           0.68       173
   macro avg       0.82      0.62      0.58       173
weighted avg       0.79      0.68      0.62       173


DECISION TREE
MEAN PRECISION 0.6916666666666667
report:
              precision    recall  f1-score   support

         0.0       0.64      0.95      0.76       101
         1.0       0.78      0.25      0.38        72

    accuracy          

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Predicted relations

In [60]:
results = pd.DataFrame(nb_all_merged,columns =['results'])
results[['target', 'art_hist_1','art_hist_2']] = df_merged[["target", "art_hist_1_x","art_hist_2_x"]].values
# predicted correct relations
ppositive = results.loc[(results["results"] >= 1.0) & (results["target"] >= 1.0)]
# get whether the predictions were already in ARTchives or not
merged_piu = ppositive.merge(d1[["art_hist_1","art_hist_2","bio"]], on=["art_hist_1","art_hist_2"], how='left').drop_duplicates()
merged_piu = merged_piu.dropna()
known = len(merged_piu.loc[(merged_piu["bio"] >= 1.0)])/len(merged_piu)*100
unknown = len(merged_piu.loc[(merged_piu["bio"] == 0.0)])/len(merged_piu)*100

print(f"Known {known}; \n Unknown {unknown} \n New relations")
merged_piu.loc[(merged_piu["bio"] == 0.0)]

Known 76.47058823529412; 
 Unknown 23.52941176470588 
 New relations


Unnamed: 0,results,target,art_hist_1,art_hist_2,bio
1,1.0,1.0,Federico Zeri,Aby Warburg,0.0
6,1.0,1.0,Kornél Fabriczy,Ernst Steinmann,0.0
10,1.0,1.0,Roberto Longhi,Luisa Vertova,0.0
14,1.0,1.0,Ulrich Middeldorf,Everett Fahy,0.0


## Predict generic relations based on number of institutions

**mean p = 0.65, p(1)= 0.67, r(1)= 0.91 (nb)**

notes:  

In [34]:
x = d2[['size']]
y = d2['target']

# logistic regression
print("\nLOGISTIC REGRESSION") 
lr_inst = lr(x,y)

# naive bayes
print("NAIVE BAYES")
nb_inst = naive_bayes(x,y)

# decision tree
print("\nDECISION TREE")
tree_inst = decision_tree(x,y)


LOGISTIC REGRESSION
MEAN PRECISION 0.6466666666666666
report:
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        17
         1.0       0.66      0.94      0.78        35

    accuracy                           0.63        52
   macro avg       0.33      0.47      0.39        52
weighted avg       0.44      0.63      0.52        52

NAIVE BAYES
MEAN PRECISION 0.655
report:
              precision    recall  f1-score   support

         0.0       0.25      0.06      0.10        17
         1.0       0.67      0.91      0.77        35

    accuracy                           0.63        52
   macro avg       0.46      0.49      0.43        52
weighted avg       0.53      0.63      0.55        52


DECISION TREE
MEAN PRECISION 0.6399999999999999
report:
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        17
         1.0       0.65      0.91      0.76        35

    accuracy         

**mean p = 0.83, p(1)= 0.74, r(1)= 0.44 (nb)**

note: training on everything


In [35]:
x = df_merged[['size_inst']]
y = df_merged['target']

# logistic regression
print("\nLOGISTIC REGRESSION") 
inst_lr_model = lr(x,y)

# naive bayes
print("NAIVE BAYES")
inst_model = naive_bayes(x,y)

# decision tree
print("\nDECISION TREE")
inst_tree = decision_tree(x,y)


LOGISTIC REGRESSION
MEAN PRECISION 0.7675
report:
              precision    recall  f1-score   support

         0.0       0.65      0.90      0.76       101
         1.0       0.70      0.32      0.44        72

    accuracy                           0.66       173
   macro avg       0.67      0.61      0.60       173
weighted avg       0.67      0.66      0.62       173

NAIVE BAYES
MEAN PRECISION 0.39249999999999996
report:
              precision    recall  f1-score   support

         0.0       0.61      0.93      0.74       101
         1.0       0.65      0.18      0.28        72

    accuracy                           0.62       173
   macro avg       0.63      0.56      0.51       173
weighted avg       0.63      0.62      0.55       173


DECISION TREE
MEAN PRECISION 0.7571428571428571
report:
              precision    recall  f1-score   support

         0.0       0.63      0.90      0.74       101
         1.0       0.66      0.26      0.38        72

    accuracy       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Predicted relations

In [36]:
results = pd.DataFrame(inst_lr_model,columns =['results'])
results[['target', 'art_hist_1','art_hist_2']] = df_merged[["target", "art_hist_1_x","art_hist_2_x"]].values
# predicted correct relations
ppositive = results.loc[(results["results"] >= 1.0) & (results["target"] >= 1.0)]
# get whether the predictions were already in ARTchives or not
merged_piu = ppositive.merge(d1[["art_hist_1","art_hist_2","bio"]], on=["art_hist_1","art_hist_2"], how='left').drop_duplicates()
merged_piu = merged_piu.dropna()
known = len(merged_piu.loc[(merged_piu["bio"] >= 1.0)])/len(merged_piu)*100
unknown = len(merged_piu.loc[(merged_piu["bio"] == 0.0)])/len(merged_piu)*100

print(f"Known {known}; \n Unknown {unknown} \n New relations")
merged_piu.loc[(merged_piu["bio"] == 0.0)]

Known 45.45454545454545; 
 Unknown 54.54545454545454 
 New relations


Unnamed: 0,results,target,art_hist_1,art_hist_2,bio
2,1.0,1.0,Federico Zeri,Ellis Waterhouse,0.0
3,1.0,1.0,Federico Zeri,Ernst Kitzinger,0.0
7,1.0,1.0,Julian Kliemann,Richard Krautheimer,0.0
8,1.0,1.0,Julius S. Held,Federico Zeri,0.0
9,1.0,1.0,Leo Steinberg,John Pope-Hennessy,0.0
11,1.0,1.0,Luigi Salerno,Federico Zeri,0.0
12,1.0,1.0,Richard Krautheimer,Aby Warburg,0.0
17,1.0,1.0,Ulrich Middeldorf,Aby Warburg,0.0
18,1.0,1.0,Wolfgang Lotz,Ernst Steinmann,0.0
19,1.0,1.0,Wolfgang Lotz,Julian Kliemann,0.0


## Predict generic relations based on number of institutions and number of topics

**mean p=0.79, p(1)=0.73, r(1)=0.51 (dt)**



In [62]:
# Merge institutions and artists_periods tables
X = df_merged[['size_inst', 'size_topic']].copy()
y = df_merged['target']

# logistic regression
print("\nLOGISTIC REGRESSION") 
all_lr_model = lr(X,y)

# naive bayes
print("NAIVE BAYES")
all_model = naive_bayes(X,y)

# decision tree
print("\nDECISION TREE")
all_tree = decision_tree(X,y)


LOGISTIC REGRESSION
MEAN PRECISION 0.6983333333333334
report:
              precision    recall  f1-score   support

         0.0       0.67      0.86      0.75       101
         1.0       0.67      0.40      0.50        72

    accuracy                           0.67       173
   macro avg       0.67      0.63      0.63       173
weighted avg       0.67      0.67      0.65       173

NAIVE BAYES
MEAN PRECISION 0.6199999999999999
report:
              precision    recall  f1-score   support

         0.0       0.63      0.92      0.75       101
         1.0       0.69      0.25      0.37        72

    accuracy                           0.64       173
   macro avg       0.66      0.59      0.56       173
weighted avg       0.66      0.64      0.59       173


DECISION TREE
MEAN PRECISION 0.7138888888888888
report:
              precision    recall  f1-score   support

         0.0       0.66      0.87      0.75       101
         1.0       0.67      0.36      0.47        72

    accu

  _warn_prf(average, modifier, msg_start, len(result))


## Predicted relations

In [63]:
results = pd.DataFrame(all_tree,columns =['results'])
results[['target', 'art_hist_1','art_hist_2']] = df_merged[["target", "art_hist_1_x","art_hist_2_x"]].values
# predicted correct relations
ppositive = results.loc[(results["results"] >= 1.0) & (results["target"] >= 1.0)]
# get whether the predictions were already in ARTchives or not
merged_piu = ppositive.merge(d1[["art_hist_1","art_hist_2","bio"]], on=["art_hist_1","art_hist_2"], how='left').drop_duplicates()
merged_piu = merged_piu.dropna()
known = len(merged_piu.loc[(merged_piu["bio"] >= 1.0)])/len(merged_piu)*100
unknown = len(merged_piu.loc[(merged_piu["bio"] == 0.0)])/len(merged_piu)*100

print(f"Known {known}; \n Unknown {unknown} \n New relations")
merged_piu.loc[(merged_piu["bio"] == 0.0)]

Known 42.30769230769231; 
 Unknown 57.692307692307686 
 New relations


Unnamed: 0,results,target,art_hist_1,art_hist_2,bio
1,1.0,1.0,Federico Zeri,Ellis Waterhouse,0.0
2,1.0,1.0,Federico Zeri,Ernst Kitzinger,0.0
7,1.0,1.0,Julian Kliemann,Richard Krautheimer,0.0
8,1.0,1.0,Julius S. Held,Federico Zeri,0.0
9,1.0,1.0,Kurt Badt,Federico Zeri,0.0
10,1.0,1.0,Leo Steinberg,John Pope-Hennessy,0.0
12,1.0,1.0,Luigi Salerno,Federico Zeri,0.0
13,1.0,1.0,Roberto Longhi,Everett Fahy,0.0
15,1.0,1.0,Roberto Longhi,Luisa Vertova,0.0
19,1.0,1.0,Stefano Tumidei,Luisa Vertova,0.0


# MODEL SELECTION FOR RELATIONS BETWEEN COLLECTIONS 


## Predict historians relevant to collections based on their mention in biography

**mean p=0.75, p(1)=0.92, r(1)=0.39 (dt)**

In [39]:
X = d1coll[['bio']].copy()
y = d1coll['target']

# logistic regression
print("\nLOGISTIC REGRESSION") 
all_lr_model = lr(X,y)

# naive bayes
print("NAIVE BAYES")
all_model = naive_bayes(X,y)

# decision tree
print("\nDECISION TREE")
all_tree = decision_tree(X,y)


LOGISTIC REGRESSION
MEAN PRECISION 0.75
report:
              precision    recall  f1-score   support

           0       0.88      0.99      0.94       145
           1       0.92      0.39      0.55        31

    accuracy                           0.89       176
   macro avg       0.90      0.69      0.74       176
weighted avg       0.89      0.89      0.87       176

NAIVE BAYES
MEAN PRECISION 0.75
report:
              precision    recall  f1-score   support

           0       0.88      0.99      0.94       145
           1       0.92      0.39      0.55        31

    accuracy                           0.89       176
   macro avg       0.90      0.69      0.74       176
weighted avg       0.89      0.89      0.87       176


DECISION TREE
MEAN PRECISION 0.75
report:
              precision    recall  f1-score   support

           0       0.88      0.99      0.94       145
           1       0.92      0.39      0.55        31

    accuracy                           0.89       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**mean p=0.76, p(1)=0.93, r(1)=0.41 (dt)**

note: train on everything

In [40]:
X = d1coll_merged[['bio']].copy()
y = d1coll_merged['target']

# logistic regression
print("\nLOGISTIC REGRESSION") 
all_lr_model = lr(X,y)

# naive bayes
print("NAIVE BAYES")
all_model = naive_bayes(X,y)

# decision tree
print("\nDECISION TREE")
all_tree = decision_tree(X,y)


LOGISTIC REGRESSION
MEAN PRECISION 0.7666666666666666
report:
              precision    recall  f1-score   support

         0.0       0.88      0.99      0.94       145
         1.0       0.93      0.41      0.57        32

    accuracy                           0.89       177
   macro avg       0.91      0.70      0.75       177
weighted avg       0.89      0.89      0.87       177

NAIVE BAYES
MEAN PRECISION 0.7666666666666666
report:
              precision    recall  f1-score   support

         0.0       0.88      0.99      0.94       145
         1.0       0.93      0.41      0.57        32

    accuracy                           0.89       177
   macro avg       0.91      0.70      0.75       177
weighted avg       0.89      0.89      0.87       177


DECISION TREE
MEAN PRECISION 0.7666666666666666
report:
              precision    recall  f1-score   support

         0.0       0.88      0.99      0.94       145
         1.0       0.93      0.41      0.57        32

    accu

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Predicted relations

In [41]:
results = pd.DataFrame(all_tree,columns =['results'])
results[['target', 'art_hist_1','art_hist_2']] = d1coll_merged[["target", "art_hist_1","art_hist_2"]].values
# predicted correct relations
ppositive = results.loc[(results["results"] >= 1.0) & (results["target"] >= 1.0)]
# get whether the predictions were already in ARTchives or not
merged_piu = ppositive.merge(d1[["art_hist_1","art_hist_2","bio"]], on=["art_hist_1","art_hist_2"], how='left').drop_duplicates()
merged_piu = merged_piu.dropna()
known = len(merged_piu.loc[(merged_piu["bio"] >= 1.0)])/len(merged_piu)*100
unknown = len(merged_piu.loc[(merged_piu["bio"] == 0.0)])/len(merged_piu)*100

print(f"Known {known}; \n Unknown {unknown} \n New relations")
merged_piu.loc[(merged_piu["bio"] == 0.0)]

Known 100.0; 
 Unknown 0.0 
 New relations


Unnamed: 0,results,target,art_hist_1,art_hist_2,bio


## Predict historians relevant to collections based on their topics in common

**mean p=0, p(0)=0.82, r(0)=1**

note: performs very well in detecting non relevant historians

In [42]:

X = d1coll[['size']].copy()
y = d1coll['target']

# logistic regression
print("\nLOGISTIC REGRESSION") 
all_lr_model = lr(X,y)

# naive bayes
print("NAIVE BAYES")
all_model = naive_bayes(X,y)

# decision tree
print("\nDECISION TREE")
all_tree = decision_tree(X,y)


LOGISTIC REGRESSION
MEAN PRECISION 0.0
report:
              precision    recall  f1-score   support

           0       0.82      1.00      0.90       145
           1       0.00      0.00      0.00        31

    accuracy                           0.82       176
   macro avg       0.41      0.50      0.45       176
weighted avg       0.68      0.82      0.74       176

NAIVE BAYES
MEAN PRECISION 0.05
report:
              precision    recall  f1-score   support

           0       0.82      0.94      0.87       145
           1       0.10      0.03      0.05        31

    accuracy                           0.78       176
   macro avg       0.46      0.49      0.46       176
weighted avg       0.69      0.78      0.73       176


DECISION TREE
MEAN PRECISION 0.0
report:
              precision    recall  f1-score   support

           0       0.82      1.00      0.90       145
           1       0.00      0.00      0.00        31

    accuracy                           0.82       17

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

**mean p=0, p(0)=0.82, r(0)=1 (lr, dt)**


In [43]:
d1coll_merged_ = d1coll_merged.copy()
d1coll_merged_[['size_topic']].values[d1coll_merged_['size_topic'] > 1] = 1 
X = d1coll_merged_[['size_topic']].copy()
y = d1coll_merged_['target']

# logistic regression
print("\nLOGISTIC REGRESSION") 
all_lr_model = lr(X,y)

# naive bayes
print("NAIVE BAYES")
all_model = naive_bayes(X,y)

# decision tree
print("\nDECISION TREE")
all_tree = decision_tree(X,y)


LOGISTIC REGRESSION
MEAN PRECISION 0.0
report:
              precision    recall  f1-score   support

         0.0       0.82      1.00      0.90       145
         1.0       0.00      0.00      0.00        32

    accuracy                           0.82       177
   macro avg       0.41      0.50      0.45       177
weighted avg       0.67      0.82      0.74       177

NAIVE BAYES
MEAN PRECISION 0.0
report:
              precision    recall  f1-score   support

         0.0       0.81      0.97      0.88       145
         1.0       0.00      0.00      0.00        32

    accuracy                           0.79       177
   macro avg       0.41      0.48      0.44       177
weighted avg       0.67      0.79      0.72       177


DECISION TREE
MEAN PRECISION 0.0
report:
              precision    recall  f1-score   support

         0.0       0.82      1.00      0.90       145
         1.0       0.00      0.00      0.00        32

    accuracy                           0.82       177

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

## Predicted relations

In [44]:
results = pd.DataFrame(all_tree,columns =['results'])
results[['target', 'art_hist_1','art_hist_2']] = d1coll_merged[["target", "art_hist_1","art_hist_2"]].values
# predicted correct relations
ppositive = results.loc[(results["results"] >= 1.0) & (results["target"] >= 1.0)]
# get whether the predictions were already in ARTchives or not
merged_piu = ppositive.merge(d1[["art_hist_1","art_hist_2","bio"]], on=["art_hist_1","art_hist_2"], how='left').drop_duplicates()
merged_piu = merged_piu.dropna()
known = len(merged_piu.loc[(merged_piu["bio"] >= 1.0)])/len(merged_piu)*100 if len(merged_piu) > 0 else 0
unknown = len(merged_piu.loc[(merged_piu["bio"] == 0.0)])/len(merged_piu)*100 if len(merged_piu) > 0 else 0

print(f"Known {known}; \n Unknown {unknown} \n New relations")
merged_piu.loc[(merged_piu["bio"] == 0.0)]

Known 0; 
 Unknown 0 
 New relations


Unnamed: 0,results,target,art_hist_1,art_hist_2,bio


## Predict historians relevant to collections based on their mention in biography and topics in common

**mean p=0.75, p(1)=0.92, r(1)=0.39 (dt)**

notes: it seems having topics in common does not help to classify better

In [45]:
X = d1coll[['bio','size']].copy()
y = d1coll['target']

# logistic regression
print("\nLOGISTIC REGRESSION") 
all_lr_model = lr(X,y)

# naive bayes
print("NAIVE BAYES")
all_model = naive_bayes(X,y)

# decision tree
print("\nDECISION TREE")
all_tree = decision_tree(X,y)


LOGISTIC REGRESSION
MEAN PRECISION 0.75
report:
              precision    recall  f1-score   support

           0       0.88      0.99      0.94       145
           1       0.92      0.39      0.55        31

    accuracy                           0.89       176
   macro avg       0.90      0.69      0.74       176
weighted avg       0.89      0.89      0.87       176

NAIVE BAYES
MEAN PRECISION 0.75
report:
              precision    recall  f1-score   support

           0       0.88      0.99      0.94       145
           1       0.92      0.39      0.55        31

    accuracy                           0.89       176
   macro avg       0.90      0.69      0.74       176
weighted avg       0.89      0.89      0.87       176


DECISION TREE
MEAN PRECISION 0.65
report:
              precision    recall  f1-score   support

           0       0.87      0.99      0.93       145
           1       0.91      0.32      0.48        31

    accuracy                           0.88       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [46]:
X = d1coll_merged[['bio','size_topic']].copy()
y = d1coll_merged['target']

# logistic regression
print("\nLOGISTIC REGRESSION") 
all_lr_model = lr(X,y)

# naive bayes
print("NAIVE BAYES")
all_model = naive_bayes(X,y)

# decision tree
print("\nDECISION TREE")
all_tree = decision_tree(X,y)


LOGISTIC REGRESSION
MEAN PRECISION 0.7666666666666666
report:
              precision    recall  f1-score   support

         0.0       0.88      0.99      0.94       145
         1.0       0.93      0.41      0.57        32

    accuracy                           0.89       177
   macro avg       0.91      0.70      0.75       177
weighted avg       0.89      0.89      0.87       177

NAIVE BAYES
MEAN PRECISION 0.7666666666666666
report:
              precision    recall  f1-score   support

         0.0       0.88      0.99      0.94       145
         1.0       0.93      0.41      0.57        32

    accuracy                           0.89       177
   macro avg       0.91      0.70      0.75       177
weighted avg       0.89      0.89      0.87       177


DECISION TREE
MEAN PRECISION 0.7666666666666666
report:
              precision    recall  f1-score   support

         0.0       0.88      0.99      0.93       145
         1.0       0.92      0.38      0.53        32

    accu

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Predicted relations

In [47]:
results = pd.DataFrame(all_model,columns =['results'])
results[['target', 'art_hist_1','art_hist_2']] = d1coll_merged[["target", "art_hist_1","art_hist_2"]].values
# predicted correct relations
ppositive = results.loc[(results["results"] >= 1.0) & (results["target"] >= 1.0)]
# get whether the predictions were already in ARTchives or not
merged_piu = ppositive.merge(d1[["art_hist_1","art_hist_2","bio"]], on=["art_hist_1","art_hist_2"], how='left').drop_duplicates()
merged_piu = merged_piu.dropna()
known = len(merged_piu.loc[(merged_piu["bio"] >= 1.0)])/len(merged_piu)*100
unknown = len(merged_piu.loc[(merged_piu["bio"] == 0.0)])/len(merged_piu)*100

print(f"Known {known}; \n Unknown {unknown} \n New relations")
merged_piu.loc[(merged_piu["bio"] == 0.0)]

Known 100.0; 
 Unknown 0.0 
 New relations


Unnamed: 0,results,target,art_hist_1,art_hist_2,bio


## Predict historians relevant to collections based on shared institutions

**mean p=0.2, p(1)=0.6, r(1)=0.09 (lr)**

In [48]:
X = d1coll_merged[['size_inst']].copy()
y = d1coll_merged['target']

# logistic regression
print("\nLOGISTIC REGRESSION") 
all_lr_model = lr(X,y)

# naive bayes
print("NAIVE BAYES")
all_model = naive_bayes(X,y)

# decision tree
print("\nDECISION TREE")
all_tree = decision_tree(X,y)


LOGISTIC REGRESSION
MEAN PRECISION 0.2
report:
              precision    recall  f1-score   support

         0.0       0.83      0.99      0.90       145
         1.0       0.60      0.09      0.16        32

    accuracy                           0.82       177
   macro avg       0.72      0.54      0.53       177
weighted avg       0.79      0.82      0.77       177

NAIVE BAYES
MEAN PRECISION 0.15833333333333333
report:
              precision    recall  f1-score   support

         0.0       0.83      0.94      0.88       145
         1.0       0.33      0.12      0.18        32

    accuracy                           0.80       177
   macro avg       0.58      0.53      0.53       177
weighted avg       0.74      0.80      0.76       177


DECISION TREE
MEAN PRECISION 0.0
report:
              precision    recall  f1-score   support

         0.0       0.82      0.99      0.89       145
         1.0       0.00      0.00      0.00        32

    accuracy                         

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

## Predicted relations

In [49]:
results = pd.DataFrame(all_lr_model,columns =['results'])
results[['target', 'art_hist_1','art_hist_2']] = d1coll_merged[["target", "art_hist_1","art_hist_2"]].values
# predicted correct relations
ppositive = results.loc[(results["results"] >= 1.0) & (results["target"] >= 1.0)]
# get whether the predictions were already in ARTchives or not
merged_piu = ppositive.merge(d1[["art_hist_1","art_hist_2","bio"]], on=["art_hist_1","art_hist_2"], how='left').drop_duplicates()
merged_piu = merged_piu.dropna()
known = len(merged_piu.loc[(merged_piu["bio"] >= 1.0)])/len(merged_piu)*100
unknown = len(merged_piu.loc[(merged_piu["bio"] == 0.0)])/len(merged_piu)*100
print(f"Known {known}; \n Unknown {unknown} \n New relations")
merged_piu.loc[(merged_piu["bio"] == 0.0)]

Known 66.66666666666666; 
 Unknown 33.33333333333333 
 New relations


Unnamed: 0,results,target,art_hist_1,art_hist_2,bio
1,1.0,1.0,Leo Steinberg,Everett Fahy,0.0


## Predict historians relevant to collections based on their topics in common and institutions

**mean p=0.5, p(1)=1, r(1)=0.19 (dt)**


In [50]:
X = d1coll_merged[['size_topic','size_inst']].copy()
y = d1coll_merged['target']

# logistic regression
print("\nLOGISTIC REGRESSION") 
all_lr_model = lr(X,y)

# naive bayes
print("NAIVE BAYES")
all_model = naive_bayes(X,y)

# decision tree
print("\nDECISION TREE")
all_tree = decision_tree(X,y)


LOGISTIC REGRESSION
MEAN PRECISION 0.2
report:
              precision    recall  f1-score   support

         0.0       0.83      0.99      0.90       145
         1.0       0.60      0.09      0.16        32

    accuracy                           0.82       177
   macro avg       0.72      0.54      0.53       177
weighted avg       0.79      0.82      0.77       177

NAIVE BAYES
MEAN PRECISION 0.32
report:
              precision    recall  f1-score   support

         0.0       0.84      0.95      0.89       145
         1.0       0.42      0.16      0.23        32

    accuracy                           0.81       177
   macro avg       0.63      0.55      0.56       177
weighted avg       0.76      0.81      0.77       177


DECISION TREE
MEAN PRECISION 0.5
report:
              precision    recall  f1-score   support

         0.0       0.85      1.00      0.92       145
         1.0       1.00      0.19      0.32        32

    accuracy                           0.85       17

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Predicted relations

In [51]:
results = pd.DataFrame(all_tree,columns =['results'])
results[['target', 'art_hist_1','art_hist_2']] = d1coll_merged[["target", "art_hist_1","art_hist_2"]].values
# predicted correct relations
ppositive = results.loc[(results["results"] >= 1.0) & (results["target"] >= 1.0)]
# get whether the predictions were already in ARTchives or not
merged_piu = ppositive.merge(d1[["art_hist_1","art_hist_2","bio"]], on=["art_hist_1","art_hist_2"], how='left').drop_duplicates()
merged_piu = merged_piu.dropna()
known = len(merged_piu.loc[(merged_piu["bio"] >= 1.0)])/len(merged_piu)*100
unknown = len(merged_piu.loc[(merged_piu["bio"] == 0.0)])/len(merged_piu)*100
print(f"Known {known}; \n Unknown {unknown} \n New relations")
merged_piu.loc[(merged_piu["bio"] == 0.0)]

Known 83.33333333333334; 
 Unknown 16.666666666666664 
 New relations


Unnamed: 0,results,target,art_hist_1,art_hist_2,bio
3,1.0,1.0,Luigi Salerno,Federico Zeri,0.0
