# Packages and functions

In [None]:
!pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip

Collecting https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
  Downloading https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
[K     - 34.6 MB 661 kB/s
Collecting pydantic>=1.8.1
  Downloading pydantic-1.8.2-cp37-cp37m-manylinux2014_x86_64.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 5.2 MB/s 
[?25hCollecting PyYAML>=5.0.0
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 31.0 MB/s 
Collecting visions[type_image_path]==0.7.1
  Downloading visions-0.7.1-py3-none-any.whl (102 kB)
[K     |████████████████████████████████| 102 kB 55.1 MB/s 
Collecting htmlmin>=0.1.12
  Downloading htmlmin-0.1.12.tar.gz (19 kB)
Collecting phik>=0.11.1
  Downloading phik-0.12.0-cp37-cp37m-manylinux2010_x86_64.whl (675 kB)
[K     |████████████████████████████████| 675 kB 35.4 MB/s 
[?25hCollecting tangled-up-in-unicode==0.1.0
  Downloading tangled_up_in_unicode-0.1.0-py3-non

In [None]:
from google.colab import auth
import gspread
from oauth2client.client import GoogleCredentials
import pandas as pd

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB 
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold
from pandas_profiling import ProfileReport

def _cross_validate_and_report(model, X, y, cv=10, scoring='precision'):
  """Cross-validate ``model`` on ``(X, y)``, print a summary, return predictions.

  Prints the mean of the per-fold ``scoring`` values (labelled
  'MEAN PRECISION', which matches the default scorer) followed by a
  classification report computed on the cross-validated predictions.

  Args:
    model: Unfitted scikit-learn classifier.
    X: Features, forwarded unchanged to scikit-learn.
    y: Target labels, forwarded unchanged to scikit-learn.
    cv: Fold specification forwarded to both scikit-learn helpers.
    scoring: Scorer name forwarded to ``cross_val_score``.

  Returns:
    The predictions returned by ``cross_val_predict``.
  """
  # NOTE(review): the notebook output shows sklearn `_warn_prf` warnings, so
  # presumably some folds predict no positive at all and score 0, pulling the
  # mean below the pooled precision of the report - confirm before quoting it.
  fold_scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
  print('MEAN PRECISION', np.mean(fold_scores))
  y_pred = cross_val_predict(model, X, y, cv=cv)
  report = classification_report(y, y_pred)
  print('report:', report, sep='\n')
  return y_pred


def lr(x,y):
  """Evaluate a logistic regression (liblinear, C=10.0) with 10-fold CV.

  Prints mean precision and a classification report; returns the
  cross-validated predictions for ``y``.
  """
  model = LogisticRegression(solver='liblinear', C=10.0, random_state=44)
  return _cross_validate_and_report(model, x, y)


def naive_bayes(XX,yy):
  """Evaluate a Gaussian naive Bayes classifier with 10-fold CV.

  Prints mean precision and a classification report; returns the
  cross-validated predictions for ``yy``.
  """
  return _cross_validate_and_report(GaussianNB(), XX, yy)


def decision_tree(X,y, max_depth=4):
  """Evaluate a decision tree of depth ``max_depth`` with 10-fold CV.

  Prints mean precision and a classification report; returns the
  cross-validated predictions for ``y``.
  """
  dt_model = DecisionTreeClassifier(max_depth=max_depth, random_state=44)
  return _cross_validate_and_report(dt_model, X, y)

# Prepare data


In [None]:
# Parse google spreadsheet
# Authenticates the Colab user, then opens the first worksheet of the two
# Google Sheets ('artists_periods' and 'institutions') used by the notebook.

# authenticate
auth.authenticate_user()

# spreadsheet: Research topics
# NOTE: the client is bound to `gc`, which is also the name of a standard-library module.
gc = gspread.authorize(GoogleCredentials.get_application_default())
spreadsheet = gc.open('artists_periods')
topics = spreadsheet.get_worksheet(0)

# spreadsheet: institutions
spreadsheet2 = gc.open('institutions')
institutions = spreadsheet2.get_worksheet(0)

# Transform tables to dataframes. 

def make_header(df):
  """Promote the first row of ``df`` to column labels.

  Returns the rows after the first one (their index labels are kept),
  relabelled with the first row's values as column names.
  """
  body = df[1:]
  body.columns = df.iloc[0]
  return body

# TOPICS
# Read every cell of the worksheet and promote its first row to column header.
rows = topics.get_all_values()
df = pd.DataFrame.from_records(rows)
df = make_header(df)

# INSTITUTIONS
# Same treatment for the institutions worksheet.
rows_i = institutions.get_all_values()
dfi = pd.DataFrame.from_records(rows_i)
dfi = make_header(dfi)

# 1. Prepare data for model selection - relations between historians
# Builds three tables: d1 (topics, one row per pair/target/bio/mention group),
# d2 (institutions, one row per pair/target/mention group) and df_merged
# (outer merge of the two on an order-independent pair key).

# TOPICS
d1 = df.copy()
# rename columns
d1.columns = ['art_hist_1', 'art_hist_2', 'period', 'target', 'bio', 'collab','archive_1','mention_1','archive_2','mention_2','mention_both']
# drop columns
d1 = d1[['art_hist_1', 'art_hist_2','bio','target','mention_1','mention_2']]
# remove rows whose target is "0,5"; copy so the column assignments below
# write to an independent frame rather than to a slice of the previous one
d1 = d1[d1['target'] != "0,5"].copy()
# merge mentions
# presumably the sheet cells are strings, so `+` concatenates ('1' + '1' -> '11')
# and the cap below folds the result back to 1 - confirm
d1["mention"] = d1["mention_1"] + d1["mention_2"]
# replace '' with 0 and make the flags numeric
for col in ["target","bio","mention"]:
  d1[col] = d1[col].replace('',0).astype(float)
# cap the merged flag at 1 (label-based assignment instead of writing into
# `.values`, which is read-only when pandas Copy-on-Write is active)
d1.loc[d1['mention'] > 1, 'mention'] = 1

# group by number of topics
d1series = d1.groupby(["art_hist_1","art_hist_2","target","bio","mention"]).size()
d1 = d1series.to_frame(name = 'size').reset_index()

# INSTITUTIONS
d2 = dfi.copy()
# rename columns
d2.columns = ['art_hist_1', 'art_hist_2', 'institution', 'target', 'scope', 'archive_1','mention_1','archive_2','mention_2','notes']
# drop columns
d2 = d2[['art_hist_1', 'art_hist_2','institution','target','mention_1','mention_2']]
# remove rows whose target is "0,5" (copy for the same reason as above)
d2 = d2[d2['target'] != "0,5"].copy()
# merge mentions
d2["mention"] = d2["mention_1"] + d2["mention_2"]
# replace '' with 0 and make the flags numeric
for col in ["target","mention"]:
  d2[col] = d2[col].replace('',0).astype(float)
d2.loc[d2['mention'] > 1, 'mention'] = 1
# group by number of institutions
d2series = d2.groupby(["art_hist_1","art_hist_2","target","mention"]).size()
d2 = d2series.to_frame(name = 'size').reset_index()

# INSTITUTIONS AND TOPICS
topics_mentions = d1.copy()
topics_mentions.columns = ["art_hist_1","art_hist_2","target_topic","bio","mention_historian","size_topic"]
instits = d2.copy()
instits.columns = ['art_hist_1','art_hist_2', 'target_inst', 'mention_institution', 'size_inst']
# merge tables based on first two columns (considering pairs may appear in different order)
topics_mentions['hash'] = topics_mentions.apply(lambda x: min(x['art_hist_1'], x['art_hist_2']) + "_" + max(x['art_hist_1'], x['art_hist_2']), axis=1)
instits['hash'] = instits.apply(lambda x: min(x['art_hist_1'], x['art_hist_2']) + "_" + max(x['art_hist_1'], x['art_hist_2']), axis=1)
df_merged = topics_mentions.merge(instits, how='outer', indicator=True, on=["hash"])
# replace nan
# NOTE(review): for pairs that only occur in the institutions table the `_x`
# name columns are NaN after the outer merge and become 0 here, so those rows
# lose the historians' names - confirm this is intended.
df_merged = df_merged.replace(np.nan, 0, regex=True)
# merge target columns: a pair is positive if either table marks it so
df_merged["target"] = df_merged["target_topic"] + df_merged["target_inst"]
df_merged.loc[df_merged['target'] > 1, 'target'] = 1
# drop columns
df_merged = df_merged[['art_hist_1_x','art_hist_2_x', 'target', 'bio','mention_historian', 'size_topic','size_inst']]


# 2. Prepare data for model selection - contents of collections
# Here the target is derived from the two archive flags rather than from the
# sheet's own target column.

d1coll = df.copy()
# rename columns
d1coll.columns = ['art_hist_1', 'art_hist_2', 'period', 'target', 'bio', 'collab','archive_1','mention_1','archive_2','mention_2','mention_both']
# drop columns (copy so the assignments below write to an independent frame)
d1coll = d1coll[['art_hist_1','art_hist_2','period','bio','archive_1','archive_2']].copy()
# replace missing / '' with 0 and make the flags integers
for col in ["bio",'archive_1','archive_2']:
  d1coll[col] = d1coll[col].replace(np.nan,0).replace('',0).astype(int)
# merge the two archive flags into the target and cap it at 1
# (label-based assignment instead of writing into `.values`, which is
# read-only when pandas Copy-on-Write is active)
d1coll["target"] = d1coll["archive_1"] + d1coll["archive_2"]
d1coll.loc[d1coll['target'] > 1, 'target'] = 1

# group by number of topics
d1collseries = d1coll.groupby(["art_hist_1","art_hist_2",'bio','target']).size()
d1coll = d1collseries.to_frame(name = 'size').reset_index()
# get number of institutions in common
# (this adds a 'hash' column to the institutions table d2 built earlier)
d2['hash'] = d2.apply(lambda x: min(x['art_hist_1'], x['art_hist_2']) + "_" + max(x['art_hist_1'], x['art_hist_2']), axis=1)
d1coll['hash'] = d1coll.apply(lambda x: min(x['art_hist_1'], x['art_hist_2']) + "_" + max(x['art_hist_1'], x['art_hist_2']), axis=1)
d1coll_merged = d1coll.merge(d2, how='outer', indicator=True, on=["hash"])
# keep the topic-side columns (`_x`) plus the institution-side group size (`size_y`)
d1coll_merged = d1coll_merged[['art_hist_1_x','art_hist_2_x','bio','target_x', 'size_x','size_y']].copy()
d1coll_merged.columns = ['art_hist_1','art_hist_2','bio','target', 'size_topic','size_inst']
d1coll_merged['size_inst'] = d1coll_merged['size_inst'].replace(np.nan,0)
# drop rows that exist only on the institutions side of the outer merge
d1coll_merged = d1coll_merged[d1coll_merged.art_hist_1.notnull()]

In [None]:

# number of rows of the merged table whose `bio` flag equals 1
len(df_merged.loc[df_merged["bio"] == 1])

15

## EDA


### Artists_periods

In [None]:
# Copy of the topics sheet with EDA-oriented column names.
df_eda = df.copy()
df_eda.columns = ["art_hist_1", "art_hist_2", "topic", "target", "ref_in_bio", "collab_on_topic", "h2_archive1", "h2_mention", "h1_archive2", "h1_mention", "both_bio"]
df_eda = df_eda[df_eda['target'] != "0,5"] # remove 0,5
# Missing / blank flags become 0, then each flag column is cast to int.
for col in df_eda[["target", "ref_in_bio", "collab_on_topic", "h2_archive1", "h2_mention", "h1_archive2", "h1_mention", "both_bio"]]: 
  df_eda[col] = df_eda[col].replace(np.nan,0).replace('',0)
  df_eda[col] = df_eda[col].astype(int)
# Build the profiling report and embed it in the cell output.
profiledf = ProfileReport(df_eda, title="Artists and periods", html={'style': {'full_width': True}}, sort="ascending")
profiledf.to_notebook_iframe()


Summarize dataset:   0%|          | 0/25 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Summary counts for the topics table (df_eda). The last line displays them as a tuple.

# unique historians
unique = len(pd.unique(df_eda[['art_hist_1', 'art_hist_2']].values.ravel()))
# unique topics
uniquet = len(pd.unique(df_eda[['topic']].values.ravel()))
# unique pairs
count = len(df_eda.groupby(['art_hist_1','art_hist_2']).size().reset_index())
# valid relations
valid = df_eda.loc[df_eda["target"] == 1]
# unique historians in valid relations
uniquev =  len(pd.unique(valid[['art_hist_1', 'art_hist_2']].values.ravel()))
# unique pairs in valid relations
countv = len(valid.groupby(['art_hist_1','art_hist_2']).size().reset_index())
# already recorded in bio
valid_in_bio = df_eda.loc[(df_eda["target"] == 1) & (df_eda["ref_in_bio"] == 1)]

# how many relations recorded in at least one bio are valid?
# (max(..., 1) keeps the percentage at 0.0 instead of dividing by zero when no row matches)
bio = df_eda.loc[df_eda["ref_in_bio"] == 1]
bio_valid = len(valid_in_bio)*100/max(len(bio), 1)
# how many relations recorded in both bios are valid?
# Uses the `both_bio` flag; filtering on `ref_in_bio` again would merely repeat bio_valid.
# NOTE(review): the same sheet column is called 'mention_both' in the data-prep cell -
# confirm that it really encodes "referenced in both bios".
both = df_eda.loc[df_eda["both_bio"] == 1]
valid_in_both = df_eda.loc[(df_eda["target"] == 1) & (df_eda["both_bio"] == 1)]
both_bio_valid = len(valid_in_both)*100/max(len(both), 1)

# merge mentions: 1 if either historian is mentioned, capped at 1
df_eda["mention"] = (df_eda["h2_mention"] + df_eda["h1_mention"]).clip(upper=1)
# how many relations recorded in at least one archival description are valid?
mention = df_eda.loc[df_eda["mention"] == 1]
valid_in_mention = df_eda.loc[(df_eda["target"] == 1) & (df_eda["mention"] == 1)]
mention_valid = len(valid_in_mention)*100/max(len(mention), 1)

unique, uniquet, count , len(valid), uniquev , countv , len(valid_in_bio), bio_valid, both_bio_valid, mention_valid

(23, 24, 173, 162, 22, 71, 38, 100.0, 100.0, 100.0)

In [None]:
# valid collaborations on a topic
true_collab = df_eda.loc[(df_eda["target"] == 1) & (df_eda["collab_on_topic"] == 1)]

# unique historians
unique = len(pd.unique(true_collab[['art_hist_1', 'art_hist_2']].values.ravel()))

# unique topics
uniquet = len(pd.unique(true_collab[['topic']].values.ravel()))

# unique pairs of historians in valid collaborations on a topic
countv = len(true_collab.groupby(['art_hist_1','art_hist_2']).size().reset_index())

len(true_collab), unique, uniquet, countv

(52, 18, 12, 28)

### Institutions

In [None]:
# Copy of the institutions sheet with EDA-oriented column names.
dfi_eda = dfi.copy()
dfi_eda.columns = ["art_hist_1", "art_hist_2", "institution", "target", "relation", "inst_archive_1", "inst_mention_1", "inst_archive_2", "inst_mention_2", "notes"]
dfi_eda = dfi_eda[dfi_eda['target'] != "0,5"] # remove 0,5
# Missing, blank and '?' cells are all treated as 0 before casting the flags to int.
for col in dfi_eda[["target", "inst_archive_1", "inst_mention_1", "inst_archive_2", "inst_mention_2"]]: 
  dfi_eda[col] = dfi_eda[col].replace(np.nan,0).replace('',0).replace('?',0)
  dfi_eda[col] = dfi_eda[col].astype(int)
# Build the profiling report and embed it in the cell output.
# NOTE: this rebinds `profiledf`, previously the topics report.
profiledf = ProfileReport(dfi_eda, title="Institutions", html={'style': {'full_width': True}}, sort="ascending")
profiledf.to_notebook_iframe()

Summarize dataset:   0%|          | 0/24 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# unique historians
unique = len(pd.unique(dfi_eda[['art_hist_1', 'art_hist_2']].values.ravel()))
# unique inst
uniquei = len(pd.unique(dfi_eda[['institution']].values.ravel()))
# unique pairs
count = len(dfi_eda.groupby(['art_hist_1','art_hist_2']).size().reset_index())
# valid relations
valid = dfi_eda.loc[dfi_eda["target"] == 1]
# unique historians in valid relations
uniquev =  len(pd.unique(valid[['art_hist_1', 'art_hist_2']].values.ravel()))
# unique pairs in valid relations
countv = len(valid.groupby(['art_hist_1','art_hist_2']).size().reset_index())
# already recorded in bio
valid_in_bio = df_merged.loc[ (df_merged["bio"] == 1) & (df_merged["size_inst"] == 1)]


unique, uniquei, count , len(valid), uniquev , countv , len(valid_in_bio)

(23, 19, 49, 39, 21, 33, 10)

### Merged tables

In [None]:
# Profile of the merged topics + institutions table.
dfm_eda = df_merged.copy()
# Cast the two name columns to str (the merge step fills missing names with 0,
# so they can hold a mix of strings and numbers).
for col in dfm_eda[["art_hist_1_x", "art_hist_2_x"]]: 
  dfm_eda[col] = dfm_eda[col].astype(str)
profiledfm = ProfileReport(dfm_eda, title="Institutions and topics", html={'style': {'full_width': True}}, sort="ascending")
profiledfm.to_notebook_iframe()

Summarize dataset:   0%|          | 0/20 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# relations with at least a topic and an instit in common
both = dfm_eda.loc[(dfm_eda["size_inst"] != 0) & (dfm_eda["size_topic"] != 0)]

# valid relations with at least a topic and an instit in common
both_valid = dfm_eda.loc[(dfm_eda["target"] == 1) & (dfm_eda["size_inst"] != 0) & (dfm_eda["size_topic"] != 0)]

len(both), len(both_valid)

(34, 24)

### Merged tables for collections

In [None]:
# relations with at least a topic 
valid = d1coll_merged.loc[(d1coll_merged["target"] != 0)]
topic = d1coll_merged.loc[(d1coll_merged["size_topic"] != 0) & (d1coll_merged["target"] != 0)]
bio = d1coll_merged.loc[(d1coll_merged["size_topic"] != 0) & (d1coll_merged["target"] != 0) & (d1coll_merged["bio"] != 0)]
only_bio = d1coll_merged.loc[(d1coll_merged["bio"] != 0)]
valid_in_bio = d1coll_merged.loc[(d1coll_merged["bio"] != 0) & (d1coll_merged["target"] != 0)]
valid_inst = d1coll_merged.loc[(d1coll_merged["size_inst"] != 0) & (d1coll_merged["target"] != 0)]
valid_inst_topic = d1coll_merged.loc[(d1coll_merged["size_inst"] != 0) & (d1coll_merged["size_topic"] != 0) & (d1coll_merged["target"] != 0)]
valid_topic = d1coll_merged.loc[(d1coll_merged["size_topic"] != 0) & (d1coll_merged["target"] != 0)]
valid_inst_topic_bio = d1coll_merged.loc[(d1coll_merged["size_inst"] != 0) & (d1coll_merged["bio"] != 0) & (d1coll_merged["size_topic"] != 0) & (d1coll_merged["target"] != 0)]
len(valid),len(topic), len(bio), len(only_bio), len(valid_in_bio), len(valid_inst), len(valid_inst_topic), len(valid_topic), len(valid_inst_topic_bio)

(32, 32, 13, 14, 13, 14, 14, 32, 10)

# MODEL SELECTION FOR RELATIONS BETWEEN HISTORIANS



## Predict generic relations based on mentions in bio only

**mean p= 0.8, p(1) = 1, r(1)= 0.19 (all)**

note: few pairs have a reference in a bio (low support). Precision is very high: every pair in which one historian is mentioned in the other historian's bio turns out to be a real relation (p(1) = 1). However, recall is low (r(1) = 0.19) because of data completeness issues, so this feature alone is not sufficient to detect actual interactions.

In [None]:
# Single feature: the `bio` flag of the topics table d1; target: d1['target'].
X = d1[['bio']].copy()
y = d1['target'].astype(float)

# Despite the `_model` / `_tree` suffixes, the names below hold the values
# returned by the helpers (their cross-validated predictions), not estimators.
# logistic regression
print("\nLOGISTIC REGRESSION") 
topics_mentions_lr_model = lr(X,y)

# naive bayes
print("\nNAIVE BAYES")
topics_mentions_model = naive_bayes(X,y)

# decision tree
print("\nDECISION TREE") 
topics_mentions_tree = decision_tree(X,y)


LOGISTIC REGRESSION
MEAN PRECISION 0.8
report:
              precision    recall  f1-score   support

         0.0       0.64      1.00      0.78       102
         1.0       1.00      0.19      0.33        72

    accuracy                           0.67       174
   macro avg       0.82      0.60      0.55       174
weighted avg       0.79      0.67      0.59       174


NAIVE BAYES
MEAN PRECISION 0.8
report:
              precision    recall  f1-score   support

         0.0       0.64      1.00      0.78       102
         1.0       1.00      0.19      0.33        72

    accuracy                           0.67       174
   macro avg       0.82      0.60      0.55       174
weighted avg       0.79      0.67      0.59       174


DECISION TREE
MEAN PRECISION 0.8
report:
              precision    recall  f1-score   support

         0.0       0.64      1.00      0.78       102
         1.0       1.00      0.19      0.33        72

    accuracy                           0.67       17

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**mean p= 0.8, p(1) = 1, r(1)= 0.17 (all)**

notes: trained on the merged topics + institutions table; recall is slightly lower (0.17 vs 0.19), but the difference is not significant.

In [None]:
# train on merged tables
X = df_merged[['bio']].copy()
y = df_merged['target'].astype(float)

# logistic regression
print("\nLOGISTIC REGRESSION") 
topics_mentions_lr_model = lr(X,y)

# naive bayes
print("\nNAIVE BAYES")
topics_mentions_model = naive_bayes(X,y)

# decision tree
print("\nDECISION TREE") 
topics_mentions_tree = decision_tree(X,y)


LOGISTIC REGRESSION
MEAN PRECISION 0.8
report:
              precision    recall  f1-score   support

         0.0       0.59      1.00      0.74       106
         1.0       1.00      0.17      0.29        88

    accuracy                           0.62       194
   macro avg       0.80      0.59      0.52       194
weighted avg       0.78      0.62      0.54       194


NAIVE BAYES
MEAN PRECISION 0.8
report:
              precision    recall  f1-score   support

         0.0       0.59      1.00      0.74       106
         1.0       1.00      0.17      0.29        88

    accuracy                           0.62       194
   macro avg       0.80      0.59      0.52       194
weighted avg       0.78      0.62      0.54       194


DECISION TREE
MEAN PRECISION 0.8
report:
              precision    recall  f1-score   support

         0.0       0.59      1.00      0.74       106
         1.0       1.00      0.17      0.29        88

    accuracy                           0.62       19

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Predicted relations

In [None]:
# Predictions of the preceding Naive Bayes run; that run must have used
# df_merged so that the rows line up with the columns attached below.
results = pd.DataFrame(topics_mentions_model,columns =['results'])
results[['target', 'art_hist_1','art_hist_2']] = df_merged[["target", "art_hist_1_x","art_hist_2_x"]].values
# predicted correct relations
ppositive = results.loc[(results["results"] == 1.0) & (results["target"] == 1.0)]
# get whether the predictions were already in ARTchives or not
merged_piu = ppositive.merge(d1[["art_hist_1","art_hist_2","bio"]], on=["art_hist_1","art_hist_2"], how='left').drop_duplicates()

# Percentages of true positives with / without a bio reference; report 0.0
# instead of raising ZeroDivisionError when there is no true positive at all.
n_relations = len(merged_piu)
known = len(merged_piu.loc[(merged_piu["bio"] == 1.0)])/n_relations*100 if n_relations else 0.0
unknown = len(merged_piu.loc[(merged_piu["bio"] == 0.0)])/n_relations*100 if n_relations else 0.0

print(f"Known {known}; \n Unknown {unknown} \n New relations")
merged_piu.loc[(merged_piu["bio"] == 0.0)]

Known 100.0; 
 Unknown 0.0 
 New relations


Unnamed: 0,results,target,art_hist_1,art_hist_2,bio


## Predict generic relations based on mentions in archival description only

**mean p= 0.8, p(1) = 1, r(1)= 0.17**

note: the same considerations apply as for relations based on references in biographies.

In [None]:
# Single feature: the merged `mention` flag of the topics table d1; target: d1['target'].
X = d1[['mention']].copy()
y = d1['target'].astype(float)

# The names below hold the helpers' return values (cross-validated predictions).
# logistic regression
print("\nLOGISTIC REGRESSION") 
topics_mentions_lr_model = lr(X,y)

# naive bayes
print("\nNAIVE BAYES")
topics_mentions_model = naive_bayes(X,y)

# decision tree
print("\nDECISION TREE") 
topics_mentions_tree = decision_tree(X,y)


LOGISTIC REGRESSION
MEAN PRECISION 0.8
report:
              precision    recall  f1-score   support

         0.0       0.63      1.00      0.77       102
         1.0       1.00      0.17      0.29        72

    accuracy                           0.66       174
   macro avg       0.81      0.58      0.53       174
weighted avg       0.78      0.66      0.57       174


NAIVE BAYES
MEAN PRECISION 0.8
report:
              precision    recall  f1-score   support

         0.0       0.63      1.00      0.77       102
         1.0       1.00      0.17      0.29        72

    accuracy                           0.66       174
   macro avg       0.81      0.58      0.53       174
weighted avg       0.78      0.66      0.57       174


DECISION TREE
MEAN PRECISION 0.8
report:
              precision    recall  f1-score   support

         0.0       0.63      1.00      0.77       102
         1.0       1.00      0.17      0.29        72

    accuracy                           0.66       17

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**mean p= 0.7, p(1) = 1, r(1)= 0.14**

note: training on everything


In [None]:
# Single feature: `mention_historian` of df_merged; target: df_merged['target'].
X = df_merged[['mention_historian']].copy()
y = df_merged['target'].astype(float)

# The names below hold the helpers' return values (cross-validated predictions)
# and overwrite the ones produced by the previous run on d1.
# logistic regression
print("\nLOGISTIC REGRESSION") 
topics_mentions_lr_model = lr(X,y)

# naive bayes
print("\nNAIVE BAYES")
topics_mentions_model = naive_bayes(X,y)

# decision tree
print("\nDECISION TREE") 
topics_mentions_tree = decision_tree(X,y)


LOGISTIC REGRESSION
MEAN PRECISION 0.7
report:
              precision    recall  f1-score   support

         0.0       0.58      1.00      0.74       106
         1.0       1.00      0.14      0.24        88

    accuracy                           0.61       194
   macro avg       0.79      0.57      0.49       194
weighted avg       0.77      0.61      0.51       194


NAIVE BAYES
MEAN PRECISION 0.7
report:
              precision    recall  f1-score   support

         0.0       0.58      1.00      0.74       106
         1.0       1.00      0.14      0.24        88

    accuracy                           0.61       194
   macro avg       0.79      0.57      0.49       194
weighted avg       0.77      0.61      0.51       194


DECISION TREE
MEAN PRECISION 0.7
report:
              precision    recall  f1-score   support

         0.0       0.58      1.00      0.74       106
         1.0       1.00      0.14      0.24        88

    accuracy                           0.61       19

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Predicted relations

In [None]:
# Predictions of the preceding Naive Bayes run; that run must have used
# df_merged so that the rows line up with the columns attached below.
results = pd.DataFrame(topics_mentions_model,columns =['results'])
results[['target', 'art_hist_1','art_hist_2']] = df_merged[["target", "art_hist_1_x","art_hist_2_x"]].values
# predicted correct relations
ppositive = results.loc[(results["results"] == 1.0) & (results["target"] == 1.0)]
# get whether the predictions were already in ARTchives or not
merged_piu = ppositive.merge(d1[["art_hist_1","art_hist_2","bio"]], on=["art_hist_1","art_hist_2"], how='left').drop_duplicates()

# Percentages of true positives with / without a bio reference; report 0.0
# instead of raising ZeroDivisionError when there is no true positive at all.
n_relations = len(merged_piu)
known = len(merged_piu.loc[(merged_piu["bio"] == 1.0)])/n_relations*100 if n_relations else 0.0
unknown = len(merged_piu.loc[(merged_piu["bio"] == 0.0)])/n_relations*100 if n_relations else 0.0

print(f"Known {known}; \n Unknown {unknown} \n New relations")
merged_piu.loc[(merged_piu["bio"] == 0.0)]

Known 66.66666666666666; 
 Unknown 33.33333333333333 
 New relations


Unnamed: 0,results,target,art_hist_1,art_hist_2,bio
1,1.0,1.0,Federico Zeri,Aby Warburg,0.0
5,1.0,1.0,Kornél Fabriczy,Ernst Steinmann,0.0
8,1.0,1.0,Roberto Longhi,Luisa Vertova,0.0
11,1.0,1.0,Ulrich Middeldorf,Everett Fahy,0.0


## Predict generic relations based on mentions in bio or archival description

**mean p= 0.9, p(1)=1, r(1)=0.25**

note: the same considerations apply as for relations based on references in bios or archival descriptions only.

In [None]:
# Two features: the `bio` and `mention` flags of the topics table d1; target: d1['target'].
X = d1[['bio','mention']].copy()
y = d1['target'].astype(float)

# The names below hold the helpers' return values (cross-validated predictions).
# logistic regression
print("\nLOGISTIC REGRESSION") 
topics_mentions_lr_model = lr(X,y)

# naive bayes
print("\nNAIVE BAYES")
topics_mentions_model = naive_bayes(X,y)

# decision tree
print("\nDECISION TREE") 
topics_mentions_tree = decision_tree(X,y)


LOGISTIC REGRESSION
MEAN PRECISION 0.9
report:
              precision    recall  f1-score   support

         0.0       0.65      1.00      0.79       102
         1.0       1.00      0.25      0.40        72

    accuracy                           0.69       174
   macro avg       0.83      0.62      0.60       174
weighted avg       0.80      0.69      0.63       174


NAIVE BAYES
MEAN PRECISION 0.9
report:
              precision    recall  f1-score   support

         0.0       0.65      1.00      0.79       102
         1.0       1.00      0.25      0.40        72

    accuracy                           0.69       174
   macro avg       0.83      0.62      0.60       174
weighted avg       0.80      0.69      0.63       174


DECISION TREE
MEAN PRECISION 0.9
report:
              precision    recall  f1-score   support

         0.0       0.65      1.00      0.79       102
         1.0       1.00      0.25      0.40        72

    accuracy                           0.69       17

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**mean p= 0.9, p(1)=1, r(1)=0.22**

note: training on everything

In [None]:
# Two features: `bio` and `mention_historian` of df_merged; target: df_merged['target'].
X = df_merged[['bio','mention_historian']].copy()
y = df_merged['target'].astype(float)

# The names below hold the helpers' return values (cross-validated predictions)
# and overwrite the ones produced by the previous run on d1.
# logistic regression
print("\nLOGISTIC REGRESSION") 
topics_mentions_lr_model = lr(X,y)

# naive bayes
print("\nNAIVE BAYES")
topics_mentions_model = naive_bayes(X,y)

# decision tree
print("\nDECISION TREE") 
topics_mentions_tree = decision_tree(X,y)


LOGISTIC REGRESSION
MEAN PRECISION 0.9
report:
              precision    recall  f1-score   support

         0.0       0.61      1.00      0.75       106
         1.0       1.00      0.22      0.36        88

    accuracy                           0.64       194
   macro avg       0.80      0.61      0.55       194
weighted avg       0.78      0.64      0.57       194


NAIVE BAYES
MEAN PRECISION 0.9
report:
              precision    recall  f1-score   support

         0.0       0.61      1.00      0.75       106
         1.0       1.00      0.22      0.36        88

    accuracy                           0.64       194
   macro avg       0.80      0.61      0.55       194
weighted avg       0.78      0.64      0.57       194


DECISION TREE
MEAN PRECISION 0.9
report:
              precision    recall  f1-score   support

         0.0       0.61      1.00      0.75       106
         1.0       1.00      0.22      0.36        88

    accuracy                           0.64       19

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Predicted relations

In [None]:
# Predictions of the preceding Naive Bayes run; that run must have used
# df_merged so that the rows line up with the columns attached below.
results = pd.DataFrame(topics_mentions_model,columns =['results'])
results[['target', 'art_hist_1','art_hist_2']] = df_merged[["target", "art_hist_1_x","art_hist_2_x"]].values
# predicted correct relations
ppositive = results.loc[(results["results"] == 1.0) & (results["target"] == 1.0)]
# get whether the predictions were already in ARTchives or not
merged_piu = ppositive.merge(d1[["art_hist_1","art_hist_2","bio"]], on=["art_hist_1","art_hist_2"], how='left').drop_duplicates()

# Percentages of true positives with / without a bio reference; report 0.0
# instead of raising ZeroDivisionError when there is no true positive at all.
n_relations = len(merged_piu)
known = len(merged_piu.loc[(merged_piu["bio"] == 1.0)])/n_relations*100 if n_relations else 0.0
unknown = len(merged_piu.loc[(merged_piu["bio"] == 0.0)])/n_relations*100 if n_relations else 0.0

print(f"Known {known}; \n Unknown {unknown} \n New relations")
merged_piu.loc[(merged_piu["bio"] == 0.0)]

Known 76.47058823529412; 
 Unknown 23.52941176470588 
 New relations


Unnamed: 0,results,target,art_hist_1,art_hist_2,bio
1,1.0,1.0,Federico Zeri,Aby Warburg,0.0
6,1.0,1.0,Kornél Fabriczy,Ernst Steinmann,0.0
10,1.0,1.0,Roberto Longhi,Luisa Vertova,0.0
16,1.0,1.0,Ulrich Middeldorf,Everett Fahy,0.0


## Predict generic relations based on number of topics only

**mean p=0.57, p(1)=0.65 , r(1)=0.18 (dt)**

notes: this feature alone is not sufficient to predict that an actual interaction happened. We cannot assume that scholars collaborated just because they were studying the same topics.


In [None]:
# Single feature: `size`, the row count of each (pair, target, bio, mention)
# group built in the data-prep cell; target: d1['target'].
X = d1[['size']]
y = d1['target'].astype(float)

# The names below hold the helpers' return values (cross-validated predictions).
# logistic regression
print("\nLOGISTIC REGRESSION")
topics_lr_model = lr(X,y)

# naive bayes
print("\nNAIVE BAYES")
topics_model = naive_bayes(X,y)

# decision tree
print("\nDECISION TREE")
topics_tree = decision_tree(X,y)


LOGISTIC REGRESSION
MEAN PRECISION 0.5903030303030302
report:
              precision    recall  f1-score   support

         0.0       0.64      0.84      0.73       102
         1.0       0.59      0.32      0.41        72

    accuracy                           0.63       174
   macro avg       0.61      0.58      0.57       174
weighted avg       0.62      0.63      0.60       174


NAIVE BAYES
MEAN PRECISION 0.5736363636363636
report:
              precision    recall  f1-score   support

         0.0       0.63      0.88      0.73       102
         1.0       0.61      0.26      0.37        72

    accuracy                           0.63       174
   macro avg       0.62      0.57      0.55       174
weighted avg       0.62      0.63      0.58       174


DECISION TREE
MEAN PRECISION 0.5716666666666665
report:
              precision    recall  f1-score   support

         0.0       0.62      0.93      0.74       102
         1.0       0.65      0.18      0.28        72

    acc

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**mean p=0.8, p(1)=0.7 , r(1)=0.3 (dt)**

notes: training on everything

In [None]:
# Same experiment on the merged frame: single feature 'size_topic', labels cast to float.
X, y = df_merged[['size_topic']], df_merged['target'].astype(float)

# NOTE: these assignments overwrite the topics_* results of the d1 run above;
# the "Predicted relations" cell below reads topics_tree from this run.
print("\nLOGISTIC REGRESSION")
topics_lr_model = lr(X, y)

print("\nNAIVE BAYES")
topics_model = naive_bayes(X, y)

print("\nDECISION TREE")
topics_tree = decision_tree(X, y)


LOGISTIC REGRESSION
MEAN PRECISION 0.705
report:
              precision    recall  f1-score   support

         0.0       0.58      0.92      0.71       106
         1.0       0.67      0.18      0.29        88

    accuracy                           0.59       194
   macro avg       0.62      0.55      0.50       194
weighted avg       0.62      0.59      0.52       194


NAIVE BAYES
MEAN PRECISION 0.6900000000000001
report:
              precision    recall  f1-score   support

         0.0       0.58      0.90      0.70       106
         1.0       0.63      0.22      0.32        88

    accuracy                           0.59       194
   macro avg       0.61      0.56      0.51       194
weighted avg       0.60      0.59      0.53       194


DECISION TREE
MEAN PRECISION 0.8049999999999999
report:
              precision    recall  f1-score   support

         0.0       0.61      0.90      0.73       106
         1.0       0.72      0.32      0.44        88

    accuracy        

## Predicted relations

In [None]:
# Put the decision-tree predictions next to the true label and the two historian names.
# NOTE(review): assumes topics_tree holds one prediction per df_merged row, in row order
# (i.e. it comes from the df_merged run of the model cell) — confirm before re-running.
results = pd.DataFrame(topics_tree,columns =['results'])
results[['target', 'art_hist_1','art_hist_2']] = df_merged[["target", "art_hist_1_x","art_hist_2_x"]].values
# predicted correct relations (true positives: predicted 1 and labelled 1)
ppositive = results.loc[(results["results"] == 1.0) & (results["target"] == 1.0)]
# get whether the predictions were already in ARTchives or not
# (left join: pairs missing from d1 get bio = NaN and are counted as neither known nor unknown,
#  so the two percentages below need not add up to 100)
merged_piu = ppositive.merge(d1[["art_hist_1","art_hist_2","bio"]], on=["art_hist_1","art_hist_2"], how='left').drop_duplicates()

# Share of true positives with bio == 1 / bio == 0; the guard reports 0 instead of raising
# ZeroDivisionError when the model yields no true positives.
known = len(merged_piu.loc[(merged_piu["bio"] == 1.0)])/len(merged_piu)*100 if len(merged_piu) > 0 else 0
unknown = len(merged_piu.loc[(merged_piu["bio"] == 0.0)])/len(merged_piu)*100 if len(merged_piu) > 0 else 0

print(f"Known {known}; \n Unknown {unknown} \n New relations")
merged_piu.loc[(merged_piu["bio"] == 0.0)]

Known 21.428571428571427; 
 Unknown 71.42857142857143 
 New relations


Unnamed: 0,results,target,art_hist_1,art_hist_2,bio
0,1.0,1.0,Federico Zeri,Fritz Heinemann,0.0
1,1.0,1.0,Federico Zeri,Julian Kliemann,0.0
3,1.0,1.0,Gustav Ludwig,Fritz Heinemann,0.0
4,1.0,1.0,Kurt Badt,Federico Zeri,0.0
5,1.0,1.0,Luisa Vertova,Fritz Heinemann,0.0
6,1.0,1.0,Roberto Longhi,Everett Fahy,0.0
7,1.0,1.0,Roberto Longhi,Luisa Vertova,0.0
11,1.0,1.0,Stefano Tumidei,Luisa Vertova,0.0
12,1.0,1.0,Wolfgang Lotz,Federico Zeri,0.0
13,1.0,1.0,Wolfgang Lotz,Julian Kliemann,0.0


## Predict generic relations based on number of topics and mentions in bio

**mean p= 0.8, p(1)=1, r(1)=0.19 (nb)**

notes: the precision is similar to the model considering references in the biography only, while the recall is even lower.

In [None]:
# Two features from d1 ('size' and the 'bio' flag); labels cast to float.
X, y = d1[['size', 'bio']].copy(), d1['target'].astype(float)

# All three helpers receive the same (X, y) and print their own report.
print("\nLOGISTIC REGRESSION")
topics_mentions_lr_model = lr(X, y)

print("\nNAIVE BAYES")
topics_mentions_model = naive_bayes(X, y)

print("\nDECISION TREE")
topics_mentions_tree = decision_tree(X, y)


LOGISTIC REGRESSION
MEAN PRECISION 0.839047619047619
report:
              precision    recall  f1-score   support

         0.0       0.67      0.93      0.78       102
         1.0       0.78      0.35      0.48        72

    accuracy                           0.69       174
   macro avg       0.73      0.64      0.63       174
weighted avg       0.72      0.69      0.66       174


NAIVE BAYES
MEAN PRECISION 0.8
report:
              precision    recall  f1-score   support

         0.0       0.64      1.00      0.78       102
         1.0       1.00      0.19      0.33        72

    accuracy                           0.67       174
   macro avg       0.82      0.60      0.55       174
weighted avg       0.79      0.67      0.59       174


DECISION TREE
MEAN PRECISION 0.7340476190476191
report:
              precision    recall  f1-score   support

         0.0       0.65      0.93      0.77       102
         1.0       0.75      0.29      0.42        72

    accuracy           

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**mean p= 0.8, p(1)=1, r(1)=0.17 (nb)**

note: training on everything

In [None]:
# Same experiment on the merged frame: 'size_topic' and 'bio' as features, labels cast to float.
X, y = df_merged[['size_topic', 'bio']].copy(), df_merged['target'].astype(float)

# NOTE: overwrites the topics_mentions_* results of the d1 run above; the following
# "Predicted relations" cell reads topics_mentions_tree from this run.
print("\nLOGISTIC REGRESSION")
topics_mentions_lr_model = lr(X, y)

print("\nNAIVE BAYES")
topics_mentions_model = naive_bayes(X, y)

print("\nDECISION TREE")
topics_mentions_tree = decision_tree(X, y)


LOGISTIC REGRESSION


  _warn_prf(average, modifier, msg_start, len(result))


MEAN PRECISION 0.7333333333333333
report:
              precision    recall  f1-score   support

         0.0       0.59      0.97      0.73       106
         1.0       0.83      0.17      0.28        88

    accuracy                           0.61       194
   macro avg       0.71      0.57      0.51       194
weighted avg       0.70      0.61      0.53       194


NAIVE BAYES
MEAN PRECISION 0.8
report:
              precision    recall  f1-score   support

         0.0       0.59      1.00      0.74       106
         1.0       1.00      0.17      0.29        88

    accuracy                           0.62       194
   macro avg       0.80      0.59      0.52       194
weighted avg       0.78      0.62      0.54       194


DECISION TREE
MEAN PRECISION 0.8434523809523811
report:
              precision    recall  f1-score   support

         0.0       0.65      0.90      0.75       106
         1.0       0.77      0.42      0.54        88

    accuracy                           0.68

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Predicted relations

In [None]:
# Put the decision-tree predictions next to the true label and the two historian names.
# NOTE(review): assumes topics_mentions_tree holds one prediction per df_merged row, in row
# order (i.e. it comes from the df_merged run of the model cell) — confirm before re-running.
results = pd.DataFrame(topics_mentions_tree,columns =['results'])
results[['target', 'art_hist_1','art_hist_2']] = df_merged[["target", "art_hist_1_x","art_hist_2_x"]].values
# predicted correct relations (true positives: predicted 1 and labelled 1)
ppositive = results.loc[(results["results"] == 1.0) & (results["target"] == 1.0)]
# get whether the predictions were already in ARTchives or not
# (left join: pairs missing from d1 get bio = NaN and are counted as neither known nor unknown,
#  so the two percentages below need not add up to 100)
merged_piu = ppositive.merge(d1[["art_hist_1","art_hist_2","bio"]], on=["art_hist_1","art_hist_2"], how='left').drop_duplicates()

# Share of true positives with bio == 1 / bio == 0; the guard reports 0 instead of raising
# ZeroDivisionError when the model yields no true positives.
known = len(merged_piu.loc[(merged_piu["bio"] == 1.0)])/len(merged_piu)*100 if len(merged_piu) > 0 else 0
unknown = len(merged_piu.loc[(merged_piu["bio"] == 0.0)])/len(merged_piu)*100 if len(merged_piu) > 0 else 0

print(f"Known {known}; \n Unknown {unknown} \n New relations")
merged_piu.loc[(merged_piu["bio"] == 0.0)]

Known 61.904761904761905; 
 Unknown 33.33333333333333 
 New relations


Unnamed: 0,results,target,art_hist_1,art_hist_2,bio
2,1.0,1.0,Federico Zeri,Fritz Heinemann,0.0
4,1.0,1.0,Federico Zeri,Julian Kliemann,0.0
6,1.0,1.0,Gustav Ludwig,Fritz Heinemann,0.0
8,1.0,1.0,Kurt Badt,Federico Zeri,0.0
10,1.0,1.0,Luisa Vertova,Fritz Heinemann,0.0
13,1.0,1.0,Roberto Longhi,Luisa Vertova,0.0
19,1.0,1.0,Stefano Tumidei,Luisa Vertova,0.0


## Predict generic relations based on number of topics and mentions in archival description

**mean p=0.8, p(1)=1, r(1)=0.17**

notes: the precision is similar to the model considering references in the archival description only, while the recall is even lower.

In [None]:
# Two features from d1 ('size' and the 'mention' flag); labels cast to float.
X, y = d1[['size', 'mention']].copy(), d1['target'].astype(float)

# NOTE: reuses the topics_mentions_* names of the previous section, replacing those results.
print("\nLOGISTIC REGRESSION")
topics_mentions_lr_model = lr(X, y)

print("\nNAIVE BAYES")
topics_mentions_model = naive_bayes(X, y)

print("\nDECISION TREE")
topics_mentions_tree = decision_tree(X, y)


LOGISTIC REGRESSION
MEAN PRECISION 0.8383333333333333
report:
              precision    recall  f1-score   support

         0.0       0.66      0.93      0.77       102
         1.0       0.77      0.32      0.45        72

    accuracy                           0.68       174
   macro avg       0.71      0.63      0.61       174
weighted avg       0.70      0.68      0.64       174


NAIVE BAYES
MEAN PRECISION 0.8
report:
              precision    recall  f1-score   support

         0.0       0.63      1.00      0.77       102
         1.0       1.00      0.17      0.29        72

    accuracy                           0.66       174
   macro avg       0.81      0.58      0.53       174
weighted avg       0.78      0.66      0.57       174


DECISION TREE
MEAN PRECISION 0.6383333333333333
report:
              precision    recall  f1-score   support

         0.0       0.65      0.93      0.76       102
         1.0       0.74      0.28      0.40        72

    accuracy          

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**mean p=0.7, p(1)=1, r(1)=0.14**

note: training on everything

In [None]:
# Same experiment on the merged frame: 'size_topic' and 'mention_historian' as features.
X, y = df_merged[['size_topic', 'mention_historian']].copy(), df_merged['target'].astype(float)

# NOTE: overwrites the topics_mentions_* results of the d1 run above; the following
# "Predicted relations" cell reads topics_mentions_model from this run.
print("\nLOGISTIC REGRESSION")
topics_mentions_lr_model = lr(X, y)

print("\nNAIVE BAYES")
topics_mentions_model = naive_bayes(X, y)

print("\nDECISION TREE")
topics_mentions_tree = decision_tree(X, y)


LOGISTIC REGRESSION


  _warn_prf(average, modifier, msg_start, len(result))


MEAN PRECISION 0.575
report:
              precision    recall  f1-score   support

         0.0       0.57      0.95      0.71       106
         1.0       0.71      0.14      0.23        88

    accuracy                           0.58       194
   macro avg       0.64      0.54      0.47       194
weighted avg       0.63      0.58      0.49       194


NAIVE BAYES
MEAN PRECISION 0.7
report:
              precision    recall  f1-score   support

         0.0       0.58      1.00      0.74       106
         1.0       1.00      0.14      0.24        88

    accuracy                           0.61       194
   macro avg       0.79      0.57      0.49       194
weighted avg       0.77      0.61      0.51       194


DECISION TREE
MEAN PRECISION 0.7367857142857143
report:
              precision    recall  f1-score   support

         0.0       0.63      0.90      0.74       106
         1.0       0.75      0.38      0.50        88

    accuracy                           0.66       194
  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Predicted relations

In [None]:
# Put the naive-Bayes predictions next to the true label and the two historian names.
# NOTE(review): assumes topics_mentions_model holds one prediction per df_merged row, in row
# order (i.e. it comes from the df_merged run of the model cell) — confirm before re-running.
results = pd.DataFrame(topics_mentions_model,columns =['results'])
results[['target', 'art_hist_1','art_hist_2']] = df_merged[["target", "art_hist_1_x","art_hist_2_x"]].values
# predicted correct relations (true positives: predicted 1 and labelled 1)
ppositive = results.loc[(results["results"] == 1.0) & (results["target"] == 1.0)]
# get whether the predictions were already in ARTchives or not
# (left join: pairs missing from d1 get bio = NaN and are counted as neither known nor unknown,
#  so the two percentages below need not add up to 100)
merged_piu = ppositive.merge(d1[["art_hist_1","art_hist_2","bio"]], on=["art_hist_1","art_hist_2"], how='left').drop_duplicates()

# Share of true positives with bio == 1 / bio == 0; the guard reports 0 instead of raising
# ZeroDivisionError when the model yields no true positives.
known = len(merged_piu.loc[(merged_piu["bio"] == 1.0)])/len(merged_piu)*100 if len(merged_piu) > 0 else 0
unknown = len(merged_piu.loc[(merged_piu["bio"] == 0.0)])/len(merged_piu)*100 if len(merged_piu) > 0 else 0

print(f"Known {known}; \n Unknown {unknown} \n New relations")
merged_piu.loc[(merged_piu["bio"] == 0.0)]

Known 66.66666666666666; 
 Unknown 33.33333333333333 
 New relations


Unnamed: 0,results,target,art_hist_1,art_hist_2,bio
1,1.0,1.0,Federico Zeri,Aby Warburg,0.0
5,1.0,1.0,Kornél Fabriczy,Ernst Steinmann,0.0
8,1.0,1.0,Roberto Longhi,Luisa Vertova,0.0
11,1.0,1.0,Ulrich Middeldorf,Everett Fahy,0.0


## Predict generic relations based on number of topics, mentions in bio or archival description

**mean p=0.9, p(1)=1, r(1)=0.25 (nb)**

notes: similar considerations as in prior models. We can assume that the reason why historians are mentioned in others' biographies or archival descriptions is not always the number of topics they share.

In [None]:
# Three features from d1 ('size', 'bio', 'mention'); labels cast to float.
X, y = d1[['size', 'bio', 'mention']].copy(), d1['target'].astype(float)

# NOTE: reuses the topics_mentions_* names of the previous sections, replacing those results.
print("\nLOGISTIC REGRESSION")
topics_mentions_lr_model = lr(X, y)

print("\nNAIVE BAYES")
topics_mentions_model = naive_bayes(X, y)

print("\nDECISION TREE")
topics_mentions_tree = decision_tree(X, y)


LOGISTIC REGRESSION
MEAN PRECISION 0.8557142857142856
report:
              precision    recall  f1-score   support

         0.0       0.68      0.93      0.79       102
         1.0       0.79      0.38      0.51        72

    accuracy                           0.70       174
   macro avg       0.74      0.65      0.65       174
weighted avg       0.73      0.70      0.67       174


NAIVE BAYES
MEAN PRECISION 0.9
report:
              precision    recall  f1-score   support

         0.0       0.65      1.00      0.79       102
         1.0       1.00      0.25      0.40        72

    accuracy                           0.69       174
   macro avg       0.83      0.62      0.60       174
weighted avg       0.80      0.69      0.63       174


DECISION TREE
MEAN PRECISION 0.78
report:
              precision    recall  f1-score   support

         0.0       0.66      0.95      0.78       102
         1.0       0.81      0.29      0.43        72

    accuracy                        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**mean p=0.9, p(1)=1, r(1)=0.22 (nb)**

note: training on everything

In [None]:
# Same experiment on the merged frame: 'size_topic', 'bio' and 'mention_historian' as features.
X, y = df_merged[['size_topic', 'bio', 'mention_historian']].copy(), df_merged['target'].astype(float)

# NOTE: overwrites the topics_mentions_* results of the d1 run above; the following
# "Predicted relations" cell reads topics_mentions_model from this run.
print("\nLOGISTIC REGRESSION")
topics_mentions_lr_model = lr(X, y)

print("\nNAIVE BAYES")
topics_mentions_model = naive_bayes(X, y)

print("\nDECISION TREE")
topics_mentions_tree = decision_tree(X, y)


LOGISTIC REGRESSION
MEAN PRECISION 0.85
report:
              precision    recall  f1-score   support

         0.0       0.60      0.97      0.74       106
         1.0       0.86      0.22      0.35        88

    accuracy                           0.63       194
   macro avg       0.73      0.59      0.54       194
weighted avg       0.72      0.63      0.56       194


NAIVE BAYES
MEAN PRECISION 0.9
report:
              precision    recall  f1-score   support

         0.0       0.61      1.00      0.75       106
         1.0       1.00      0.22      0.36        88

    accuracy                           0.64       194
   macro avg       0.80      0.61      0.55       194
weighted avg       0.78      0.64      0.57       194


DECISION TREE
MEAN PRECISION 0.8551190476190476
report:
              precision    recall  f1-score   support

         0.0       0.66      0.90      0.76       106
         1.0       0.78      0.45      0.58        88

    accuracy                        

  _warn_prf(average, modifier, msg_start, len(result))


## Predicted relations

In [None]:
# Put the naive-Bayes predictions next to the true label and the two historian names.
# NOTE(review): assumes topics_mentions_model holds one prediction per df_merged row, in row
# order (i.e. it comes from the df_merged run of the model cell) — confirm before re-running.
results = pd.DataFrame(topics_mentions_model,columns =['results'])
results[['target', 'art_hist_1','art_hist_2']] = df_merged[["target", "art_hist_1_x","art_hist_2_x"]].values
# predicted correct relations (true positives: predicted 1 and labelled 1)
ppositive = results.loc[(results["results"] == 1.0) & (results["target"] == 1.0)]
# get whether the predictions were already in ARTchives or not
# (left join: pairs missing from d1 get bio = NaN and are counted as neither known nor unknown,
#  so the two percentages below need not add up to 100)
merged_piu = ppositive.merge(d1[["art_hist_1","art_hist_2","bio"]], on=["art_hist_1","art_hist_2"], how='left').drop_duplicates()

# Share of true positives with bio == 1 / bio == 0; the guard reports 0 instead of raising
# ZeroDivisionError when the model yields no true positives.
known = len(merged_piu.loc[(merged_piu["bio"] == 1.0)])/len(merged_piu)*100 if len(merged_piu) > 0 else 0
unknown = len(merged_piu.loc[(merged_piu["bio"] == 0.0)])/len(merged_piu)*100 if len(merged_piu) > 0 else 0

print(f"Known {known}; \n Unknown {unknown} \n New relations")
merged_piu.loc[(merged_piu["bio"] == 0.0)]

Known 76.47058823529412; 
 Unknown 23.52941176470588 
 New relations


Unnamed: 0,results,target,art_hist_1,art_hist_2,bio
1,1.0,1.0,Federico Zeri,Aby Warburg,0.0
6,1.0,1.0,Kornél Fabriczy,Ernst Steinmann,0.0
10,1.0,1.0,Roberto Longhi,Luisa Vertova,0.0
16,1.0,1.0,Ulrich Middeldorf,Everett Fahy,0.0


## Predict generic relations based on number of institutions

**mean p = 0.65, p(1)= 0.67, r(1)= 0.91 (nb)**

notes:  

In [None]:
# Single feature 'size' from d2 (the institution count this section is about);
# labels are used as stored, without a float cast.
x, y = d2[['size']], d2['target']

# The same (x, y) is passed to all three helpers; each prints its own report.
print("\nLOGISTIC REGRESSION")
inst_lr_model = lr(x, y)

print("NAIVE BAYES")
inst_model = naive_bayes(x, y)

print("\nDECISION TREE")
inst_tree = decision_tree(x, y)


LOGISTIC REGRESSION
MEAN PRECISION 0.6466666666666666
report:
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        17
         1.0       0.66      0.94      0.78        35

    accuracy                           0.63        52
   macro avg       0.33      0.47      0.39        52
weighted avg       0.44      0.63      0.52        52

NAIVE BAYES
MEAN PRECISION 0.655
report:
              precision    recall  f1-score   support

         0.0       0.25      0.06      0.10        17
         1.0       0.67      0.91      0.77        35

    accuracy                           0.63        52
   macro avg       0.46      0.49      0.43        52
weighted avg       0.53      0.63      0.55        52


DECISION TREE
MEAN PRECISION 0.6399999999999999
report:
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        17
         1.0       0.65      0.91      0.76        35

    accuracy         

**mean p = 0.83, p(1)= 0.74, r(1)= 0.44 (nb)**

note: training on everything


In [None]:
# Same experiment on the merged frame: single feature 'size_inst', labels as stored.
x, y = df_merged[['size_inst']], df_merged['target']

# NOTE: overwrites the inst_* results of the d2 run above; the following
# "Predicted relations" cell reads inst_model from this run.
print("\nLOGISTIC REGRESSION")
inst_lr_model = lr(x, y)

print("NAIVE BAYES")
inst_model = naive_bayes(x, y)

print("\nDECISION TREE")
inst_tree = decision_tree(x, y)


LOGISTIC REGRESSION
MEAN PRECISION 0.8316666666666667
report:
              precision    recall  f1-score   support

         0.0       0.65      0.87      0.74       106
         1.0       0.74      0.44      0.55        88

    accuracy                           0.68       194
   macro avg       0.69      0.66      0.65       194
weighted avg       0.69      0.68      0.66       194

NAIVE BAYES
MEAN PRECISION 0.8316666666666667
report:
              precision    recall  f1-score   support

         0.0       0.65      0.87      0.74       106
         1.0       0.74      0.44      0.55        88

    accuracy                           0.68       194
   macro avg       0.69      0.66      0.65       194
weighted avg       0.69      0.68      0.66       194


DECISION TREE
MEAN PRECISION 0.8251748251748252
report:
              precision    recall  f1-score   support

         0.0       0.64      0.88      0.74       106
         1.0       0.73      0.41      0.53        88

    accu

## Predicted relations

In [None]:
# Put the naive-Bayes predictions next to the true label and the two historian names.
# NOTE(review): assumes inst_model holds one prediction per df_merged row, in row order
# (i.e. it comes from the df_merged run of the model cell) — confirm before re-running.
results = pd.DataFrame(inst_model,columns =['results'])
results[['target', 'art_hist_1','art_hist_2']] = df_merged[["target", "art_hist_1_x","art_hist_2_x"]].values
# predicted correct relations (true positives: predicted 1 and labelled 1)
ppositive = results.loc[(results["results"] == 1.0) & (results["target"] == 1.0)]
# get whether the predictions were already in ARTchives or not
# (left join: pairs missing from d1 get bio = NaN and are counted as neither known nor unknown,
#  so the two percentages below need not add up to 100)
merged_piu = ppositive.merge(d1[["art_hist_1","art_hist_2","bio"]], on=["art_hist_1","art_hist_2"], how='left').drop_duplicates()

# Share of true positives with bio == 1 / bio == 0; the guard reports 0 instead of raising
# ZeroDivisionError when the model yields no true positives.
known = len(merged_piu.loc[(merged_piu["bio"] == 1.0)])/len(merged_piu)*100 if len(merged_piu) > 0 else 0
unknown = len(merged_piu.loc[(merged_piu["bio"] == 0.0)])/len(merged_piu)*100 if len(merged_piu) > 0 else 0

print(f"Known {known}; \n Unknown {unknown} \n New relations")
merged_piu.loc[(merged_piu["bio"] == 0.0)]

Known 43.47826086956522; 
 Unknown 52.17391304347826 
 New relations


Unnamed: 0,results,target,art_hist_1,art_hist_2,bio
1,1.0,1.0,Federico Zeri,Ellis Waterhouse,0.0
2,1.0,1.0,Federico Zeri,Ernst Kitzinger,0.0
6,1.0,1.0,Julian Kliemann,Richard Krautheimer,0.0
7,1.0,1.0,Julius S. Held,Federico Zeri,0.0
8,1.0,1.0,Leo Steinberg,John Pope-Hennessy,0.0
10,1.0,1.0,Luigi Salerno,Federico Zeri,0.0
11,1.0,1.0,Richard Krautheimer,Aby Warburg,0.0
18,1.0,1.0,Ulrich Middeldorf,Aby Warburg,0.0
19,1.0,1.0,Wolfgang Lotz,Ernst Steinmann,0.0
20,1.0,1.0,Wolfgang Lotz,Julian Kliemann,0.0


## Predict generic relations based on number of institutions and number of topics

**mean p=0.79, p(1)=0.73, r(1)=0.51 (dt)**



In [None]:
# df_merged already carries both counts, so no merge happens here: the cell only
# selects 'size_inst' and 'size_topic' as features and 'target' as labels.
X, y = df_merged[['size_inst', 'size_topic']].copy(), df_merged['target']

# The same (X, y) is passed to all three helpers; each prints its own report.
print("\nLOGISTIC REGRESSION")
all_lr_model = lr(X, y)

print("NAIVE BAYES")
all_model = naive_bayes(X, y)

print("\nDECISION TREE")
all_tree = decision_tree(X, y)


LOGISTIC REGRESSION
MEAN PRECISION 0.7494551282051282
report:
              precision    recall  f1-score   support

         0.0       0.67      0.81      0.74       106
         1.0       0.70      0.52      0.60        88

    accuracy                           0.68       194
   macro avg       0.68      0.67      0.67       194
weighted avg       0.68      0.68      0.67       194

NAIVE BAYES
MEAN PRECISION 0.7551694139194141
report:
              precision    recall  f1-score   support

         0.0       0.69      0.81      0.74       106
         1.0       0.71      0.56      0.62        88

    accuracy                           0.70       194
   macro avg       0.70      0.68      0.68       194
weighted avg       0.70      0.70      0.69       194


DECISION TREE
MEAN PRECISION 0.7936813186813186
report:
              precision    recall  f1-score   support

         0.0       0.67      0.84      0.75       106
         1.0       0.73      0.51      0.60        88

    accu

## Predicted relations

In [None]:
# Put the decision-tree predictions next to the true label and the two historian names.
# NOTE(review): assumes all_tree holds one prediction per df_merged row, in row order
# (all_tree is reassigned by later cells) — confirm before re-running.
results = pd.DataFrame(all_tree,columns =['results'])
results[['target', 'art_hist_1','art_hist_2']] = df_merged[["target", "art_hist_1_x","art_hist_2_x"]].values
# predicted correct relations (true positives: predicted 1 and labelled 1)
ppositive = results.loc[(results["results"] == 1.0) & (results["target"] == 1.0)]
# get whether the predictions were already in ARTchives or not
# (left join: pairs missing from d1 get bio = NaN and are counted as neither known nor unknown,
#  so the two percentages below need not add up to 100)
merged_piu = ppositive.merge(d1[["art_hist_1","art_hist_2","bio"]], on=["art_hist_1","art_hist_2"], how='left').drop_duplicates()

# Share of true positives with bio == 1 / bio == 0; the guard reports 0 instead of raising
# ZeroDivisionError when the model yields no true positives.
known = len(merged_piu.loc[(merged_piu["bio"] == 1.0)])/len(merged_piu)*100 if len(merged_piu) > 0 else 0
unknown = len(merged_piu.loc[(merged_piu["bio"] == 0.0)])/len(merged_piu)*100 if len(merged_piu) > 0 else 0

print(f"Known {known}; \n Unknown {unknown} \n New relations")
merged_piu.loc[(merged_piu["bio"] == 0.0)]

Known 36.666666666666664; 
 Unknown 60.0 
 New relations


Unnamed: 0,results,target,art_hist_1,art_hist_2,bio
1,1.0,1.0,Federico Zeri,Ellis Waterhouse,0.0
2,1.0,1.0,Federico Zeri,Ernst Kitzinger,0.0
4,1.0,1.0,Federico Zeri,Fritz Heinemann,0.0
6,1.0,1.0,Federico Zeri,Julian Kliemann,0.0
9,1.0,1.0,Julian Kliemann,Richard Krautheimer,0.0
10,1.0,1.0,Julius S. Held,Federico Zeri,0.0
11,1.0,1.0,Kurt Badt,Federico Zeri,0.0
12,1.0,1.0,Leo Steinberg,John Pope-Hennessy,0.0
14,1.0,1.0,Luigi Salerno,Federico Zeri,0.0
15,1.0,1.0,Richard Krautheimer,Aby Warburg,0.0


# MODEL SELECTION FOR RELATIONS BETWEEN COLLECTIONS 


## Predict historians relevant to collections based on their mention in biography

**mean p=0.75, p(1)=0.92, r(1)=0.39 (dt)**

In [None]:
# Collections experiment: single feature 'bio' from d1coll, labels as stored.
X, y = d1coll[['bio']].copy(), d1coll['target']

# NOTE: reuses the all_* names of the historians section, replacing those results.
print("\nLOGISTIC REGRESSION")
all_lr_model = lr(X, y)

print("NAIVE BAYES")
all_model = naive_bayes(X, y)

print("\nDECISION TREE")
all_tree = decision_tree(X, y)


LOGISTIC REGRESSION
MEAN PRECISION 0.75
report:
              precision    recall  f1-score   support

           0       0.88      0.99      0.94       145
           1       0.92      0.39      0.55        31

    accuracy                           0.89       176
   macro avg       0.90      0.69      0.74       176
weighted avg       0.89      0.89      0.87       176

NAIVE BAYES
MEAN PRECISION 0.75
report:
              precision    recall  f1-score   support

           0       0.88      0.99      0.94       145
           1       0.92      0.39      0.55        31

    accuracy                           0.89       176
   macro avg       0.90      0.69      0.74       176
weighted avg       0.89      0.89      0.87       176


DECISION TREE
MEAN PRECISION 0.75
report:
              precision    recall  f1-score   support

           0       0.88      0.99      0.94       145
           1       0.92      0.39      0.55        31

    accuracy                           0.89       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**mean p=0.76, p(1)=0.93, r(1)=0.41 (dt)**

note: train on everything

In [None]:
# Same experiment on the merged collections frame: single feature 'bio', labels as stored.
X, y = d1coll_merged[['bio']].copy(), d1coll_merged['target']

# NOTE: overwrites the all_* results of the d1coll run above; the following
# "Predicted relations" cell reads all_tree from this run.
print("\nLOGISTIC REGRESSION")
all_lr_model = lr(X, y)

print("NAIVE BAYES")
all_model = naive_bayes(X, y)

print("\nDECISION TREE")
all_tree = decision_tree(X, y)


LOGISTIC REGRESSION
MEAN PRECISION 0.7666666666666666
report:
              precision    recall  f1-score   support

         0.0       0.88      0.99      0.94       145
         1.0       0.93      0.41      0.57        32

    accuracy                           0.89       177
   macro avg       0.91      0.70      0.75       177
weighted avg       0.89      0.89      0.87       177

NAIVE BAYES
MEAN PRECISION 0.7666666666666666
report:
              precision    recall  f1-score   support

         0.0       0.88      0.99      0.94       145
         1.0       0.93      0.41      0.57        32

    accuracy                           0.89       177
   macro avg       0.91      0.70      0.75       177
weighted avg       0.89      0.89      0.87       177


DECISION TREE
MEAN PRECISION 0.7666666666666666
report:
              precision    recall  f1-score   support

         0.0       0.88      0.99      0.94       145
         1.0       0.93      0.41      0.57        32

    accu

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Predicted relations

In [None]:
# Put the decision-tree predictions next to the true label and the two name columns.
# NOTE(review): assumes all_tree holds one prediction per d1coll_merged row, in row order
# (i.e. it comes from the d1coll_merged run of the model cell) — confirm before re-running.
results = pd.DataFrame(all_tree,columns =['results'])
results[['target', 'art_hist_1','art_hist_2']] = d1coll_merged[["target", "art_hist_1","art_hist_2"]].values
# predicted correct relations (true positives: predicted 1 and labelled 1)
ppositive = results.loc[(results["results"] == 1.0) & (results["target"] == 1.0)]
# get whether the predictions were already in ARTchives or not
# (left join: pairs missing from d1 get bio = NaN and are counted as neither known nor unknown)
# NOTE(review): the lookup uses d1 rather than d1coll — confirm this is intended.
merged_piu = ppositive.merge(d1[["art_hist_1","art_hist_2","bio"]], on=["art_hist_1","art_hist_2"], how='left').drop_duplicates()

# Share of true positives with bio == 1 / bio == 0; the guard reports 0 instead of raising
# ZeroDivisionError when the model yields no true positives.
known = len(merged_piu.loc[(merged_piu["bio"] == 1.0)])/len(merged_piu)*100 if len(merged_piu) > 0 else 0
unknown = len(merged_piu.loc[(merged_piu["bio"] == 0.0)])/len(merged_piu)*100 if len(merged_piu) > 0 else 0

print(f"Known {known}; \n Unknown {unknown} \n New relations")
merged_piu.loc[(merged_piu["bio"] == 0.0)]

Known 100.0; 
 Unknown 0.0 
 New relations


Unnamed: 0,results,target,art_hist_1,art_hist_2,bio


## Predict historians relevant to collections based on their topics in common

**mean p=0, p(0)=0.82, r(0)=1**

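note: performs very well in detecting non-relevant historians, but only because (almost) every pair is assigned to class 0; no relevant historian is correctly identified (p(1) and r(1) are close to 0).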

In [None]:

# Collections experiment: single feature 'size' from d1coll, labels as stored.
X, y = d1coll[['size']].copy(), d1coll['target']

# NOTE: reuses the all_* names, replacing the results of the previous section.
print("\nLOGISTIC REGRESSION")
all_lr_model = lr(X, y)

print("NAIVE BAYES")
all_model = naive_bayes(X, y)

print("\nDECISION TREE")
all_tree = decision_tree(X, y)


LOGISTIC REGRESSION
MEAN PRECISION 0.0
report:
              precision    recall  f1-score   support

           0       0.82      1.00      0.90       145
           1       0.00      0.00      0.00        31

    accuracy                           0.82       176
   macro avg       0.41      0.50      0.45       176
weighted avg       0.68      0.82      0.74       176

NAIVE BAYES
MEAN PRECISION 0.05
report:
              precision    recall  f1-score   support

           0       0.82      0.94      0.87       145
           1       0.10      0.03      0.05        31

    accuracy                           0.78       176
   macro avg       0.46      0.49      0.46       176
weighted avg       0.69      0.78      0.73       176


DECISION TREE
MEAN PRECISION 0.0
report:
              precision    recall  f1-score   support

           0       0.82      1.00      0.90       145
           1       0.00      0.00      0.00        31

    accuracy                           0.82       17

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

**mean p=0, p(0)=0.82, r(0)=1 (lr, dt)**


In [None]:
# Work on a copy so the shared d1coll_merged frame is left untouched.
d1coll_merged_ = d1coll_merged.copy()
# Binarise the topic count: every value above 1 is capped to 1.
# .loc writes into the frame itself; assigning through `[['size_topic']].values[...]`
# would only modify a temporary copy and leave the column unchanged.
d1coll_merged_.loc[d1coll_merged_['size_topic'] > 1, 'size_topic'] = 1
X = d1coll_merged_[['size_topic']].copy()
y = d1coll_merged_['target']

# logistic regression
print("\nLOGISTIC REGRESSION")
all_lr_model = lr(X,y)

# naive bayes
print("NAIVE BAYES")
all_model = naive_bayes(X,y)

# decision tree
print("\nDECISION TREE")
all_tree = decision_tree(X,y)


LOGISTIC REGRESSION
MEAN PRECISION 0.0
report:
              precision    recall  f1-score   support

         0.0       0.82      1.00      0.90       145
         1.0       0.00      0.00      0.00        32

    accuracy                           0.82       177
   macro avg       0.41      0.50      0.45       177
weighted avg       0.67      0.82      0.74       177

NAIVE BAYES
MEAN PRECISION 0.0
report:
              precision    recall  f1-score   support

         0.0       0.81      0.97      0.88       145
         1.0       0.00      0.00      0.00        32

    accuracy                           0.79       177
   macro avg       0.41      0.48      0.44       177
weighted avg       0.67      0.79      0.72       177


DECISION TREE
MEAN PRECISION 0.0
report:
              precision    recall  f1-score   support

         0.0       0.82      1.00      0.90       145
         1.0       0.00      0.00      0.00        32

    accuracy                           0.82       177

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

## Predicted relations

In [None]:
# Put the decision-tree output next to the ground truth and the two names.
results = pd.DataFrame(all_tree, columns=['results'])
results[['target', 'art_hist_1', 'art_hist_2']] = (
    d1coll_merged[["target", "art_hist_1", "art_hist_2"]].values
)

# Rows where both the prediction and the target are 1.0.
is_true_positive = (results["results"] == 1.0) & (results["target"] == 1.0)
ppositive = results.loc[is_true_positive]

# Bring in the 'bio' flag from d1 to tell whether each predicted relation
# was already in ARTchives or not.
bio_lookup = d1[["art_hist_1", "art_hist_2", "bio"]]
merged_piu = ppositive.merge(
    bio_lookup, on=["art_hist_1", "art_hist_2"], how='left'
).drop_duplicates()

# Percentages of known (bio == 1.0) and unknown (bio == 0.0) relations;
# both fall back to 0 when nothing was predicted correctly.
is_new = merged_piu["bio"] == 0.0
if len(merged_piu) > 0:
    known = len(merged_piu.loc[merged_piu["bio"] == 1.0]) / len(merged_piu) * 100
    unknown = len(merged_piu.loc[is_new]) / len(merged_piu) * 100
else:
    known = 0
    unknown = 0

print(f"Known {known}; \n Unknown {unknown} \n New relations")
merged_piu.loc[is_new]

Known 0; 
 Unknown 0 
 New relations


Unnamed: 0,results,target,art_hist_1,art_hist_2,bio


## Predict historians relevant to collections based on their mention in biography and topics in common

**mean p=0.75, p(1)=0.92, r(1)=0.39 (lr, nb)**

Note: having topics in common does not seem to improve the classification.

In [None]:
# Labels come from 'target'; features are the 'bio' and 'size' columns of d1coll.
y = d1coll.loc[:, 'target']
X = d1coll.loc[:, ['bio', 'size']].copy()

# Run the three classifiers one after another. Each banner is printed first,
# so the helper's own printed output appears underneath it.

# -- logistic regression
print("\nLOGISTIC REGRESSION")
all_lr_model = lr(X, y)

# -- naive Bayes
print("NAIVE BAYES")
all_model = naive_bayes(X, y)

# -- decision tree
print("\nDECISION TREE")
all_tree = decision_tree(X, y)


LOGISTIC REGRESSION
MEAN PRECISION 0.75
report:
              precision    recall  f1-score   support

           0       0.88      0.99      0.94       145
           1       0.92      0.39      0.55        31

    accuracy                           0.89       176
   macro avg       0.90      0.69      0.74       176
weighted avg       0.89      0.89      0.87       176

NAIVE BAYES
MEAN PRECISION 0.75
report:
              precision    recall  f1-score   support

           0       0.88      0.99      0.94       145
           1       0.92      0.39      0.55        31

    accuracy                           0.89       176
   macro avg       0.90      0.69      0.74       176
weighted avg       0.89      0.89      0.87       176


DECISION TREE
MEAN PRECISION 0.65
report:
              precision    recall  f1-score   support

           0       0.87      0.99      0.93       145
           1       0.91      0.32      0.48        31

    accuracy                           0.88       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Same experiment on the merged frame: features 'bio' and 'size_topic',
# labels 'target'.
y = d1coll_merged.loc[:, 'target']
X = d1coll_merged.loc[:, ['bio', 'size_topic']].copy()

# Classifier 1 of 3: logistic regression
print("\nLOGISTIC REGRESSION")
all_lr_model = lr(X, y)

# Classifier 2 of 3: naive Bayes
print("NAIVE BAYES")
all_model = naive_bayes(X, y)

# Classifier 3 of 3: decision tree
print("\nDECISION TREE")
all_tree = decision_tree(X, y)


LOGISTIC REGRESSION
MEAN PRECISION 0.7666666666666666
report:
              precision    recall  f1-score   support

         0.0       0.88      0.99      0.94       145
         1.0       0.93      0.41      0.57        32

    accuracy                           0.89       177
   macro avg       0.91      0.70      0.75       177
weighted avg       0.89      0.89      0.87       177

NAIVE BAYES
MEAN PRECISION 0.7666666666666666
report:
              precision    recall  f1-score   support

         0.0       0.88      0.99      0.94       145
         1.0       0.93      0.41      0.57        32

    accuracy                           0.89       177
   macro avg       0.91      0.70      0.75       177
weighted avg       0.89      0.89      0.87       177


DECISION TREE
MEAN PRECISION 0.7666666666666666
report:
              precision    recall  f1-score   support

         0.0       0.88      0.99      0.93       145
         1.0       0.92      0.38      0.53        32

    accu

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Predicted relations

In [None]:
# Put the naive Bayes output (all_model) next to the ground truth and the
# two names of each pair.
results = pd.DataFrame(all_model,columns =['results'])
results[['target', 'art_hist_1','art_hist_2']] = d1coll_merged[["target", "art_hist_1","art_hist_2"]].values
# predicted correct relations (prediction and target both equal to 1.0)
ppositive = results.loc[(results["results"] == 1.0) & (results["target"] == 1.0)]
# get whether the predictions were already in ARTchives or not
merged_piu = ppositive.merge(d1[["art_hist_1","art_hist_2","bio"]], on=["art_hist_1","art_hist_2"], how='left').drop_duplicates()

# Percentages of known (bio == 1.0) and unknown (bio == 0.0) relations.
# When there are no true positives merged_piu is empty; report 0 instead of
# raising ZeroDivisionError.
total = len(merged_piu)
known = len(merged_piu.loc[(merged_piu["bio"] == 1.0)])/total*100 if total > 0 else 0
unknown = len(merged_piu.loc[(merged_piu["bio"] == 0.0)])/total*100 if total > 0 else 0

print(f"Known {known}; \n Unknown {unknown} \n New relations")
# Show the correctly predicted relations whose bio flag is 0.0.
merged_piu.loc[(merged_piu["bio"] == 0.0)]

Known 100.0; 
 Unknown 0.0 
 New relations


Unnamed: 0,results,target,art_hist_1,art_hist_2,bio


## Predict historians relevant to collections based on shared institutions

**mean p=0.2, p(1)=0.6, r(1)=0.09 (lr)**

In [None]:
# Single-feature run: only 'size_inst' is used to predict 'target'.
y = d1coll_merged.loc[:, 'target']
X = d1coll_merged.loc[:, ['size_inst']].copy()

# Logistic regression run
print("\nLOGISTIC REGRESSION")
all_lr_model = lr(X, y)

# Naive Bayes run
print("NAIVE BAYES")
all_model = naive_bayes(X, y)

# Decision tree run
print("\nDECISION TREE")
all_tree = decision_tree(X, y)


LOGISTIC REGRESSION
MEAN PRECISION 0.2
report:
              precision    recall  f1-score   support

         0.0       0.83      0.99      0.90       145
         1.0       0.60      0.09      0.16        32

    accuracy                           0.82       177
   macro avg       0.72      0.54      0.53       177
weighted avg       0.79      0.82      0.77       177

NAIVE BAYES
MEAN PRECISION 0.15833333333333333
report:
              precision    recall  f1-score   support

         0.0       0.83      0.94      0.88       145
         1.0       0.33      0.12      0.18        32

    accuracy                           0.80       177
   macro avg       0.58      0.53      0.53       177
weighted avg       0.74      0.80      0.76       177


DECISION TREE
MEAN PRECISION 0.0
report:
              precision    recall  f1-score   support

         0.0       0.82      0.99      0.89       145
         1.0       0.00      0.00      0.00        32

    accuracy                         

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

## Predicted relations

In [None]:
# Put the logistic regression output (all_lr_model) next to the ground truth
# and the two names of each pair.
results = pd.DataFrame(all_lr_model,columns =['results'])
results[['target', 'art_hist_1','art_hist_2']] = d1coll_merged[["target", "art_hist_1","art_hist_2"]].values
# predicted correct relations (prediction and target both equal to 1.0)
ppositive = results.loc[(results["results"] == 1.0) & (results["target"] == 1.0)]
# get whether the predictions were already in ARTchives or not
merged_piu = ppositive.merge(d1[["art_hist_1","art_hist_2","bio"]], on=["art_hist_1","art_hist_2"], how='left').drop_duplicates()

# Percentages of known (bio == 1.0) and unknown (bio == 0.0) relations.
# When there are no true positives merged_piu is empty; report 0 instead of
# raising ZeroDivisionError.
total = len(merged_piu)
known = len(merged_piu.loc[(merged_piu["bio"] == 1.0)])/total*100 if total > 0 else 0
unknown = len(merged_piu.loc[(merged_piu["bio"] == 0.0)])/total*100 if total > 0 else 0
print(f"Known {known}; \n Unknown {unknown} \n New relations")
# Show the correctly predicted relations whose bio flag is 0.0.
merged_piu.loc[(merged_piu["bio"] == 0.0)]

Known 66.66666666666666; 
 Unknown 33.33333333333333 
 New relations


Unnamed: 0,results,target,art_hist_1,art_hist_2,bio
1,1.0,1.0,Leo Steinberg,Everett Fahy,0.0


## Predict historians relevant to collections based on their topics in common and institutions

**mean p=0.5, p(1)=1, r(1)=0.19 (dt)**


In [None]:
# Two-feature run: 'size_topic' together with 'size_inst', predicting 'target'.
y = d1coll_merged.loc[:, 'target']
X = d1coll_merged.loc[:, ['size_topic', 'size_inst']].copy()

# (1) logistic regression
print("\nLOGISTIC REGRESSION")
all_lr_model = lr(X, y)

# (2) naive Bayes
print("NAIVE BAYES")
all_model = naive_bayes(X, y)

# (3) decision tree
print("\nDECISION TREE")
all_tree = decision_tree(X, y)


LOGISTIC REGRESSION
MEAN PRECISION 0.2
report:
              precision    recall  f1-score   support

         0.0       0.83      0.99      0.90       145
         1.0       0.60      0.09      0.16        32

    accuracy                           0.82       177
   macro avg       0.72      0.54      0.53       177
weighted avg       0.79      0.82      0.77       177

NAIVE BAYES
MEAN PRECISION 0.32
report:
              precision    recall  f1-score   support

         0.0       0.84      0.95      0.89       145
         1.0       0.42      0.16      0.23        32

    accuracy                           0.81       177
   macro avg       0.63      0.55      0.56       177
weighted avg       0.76      0.81      0.77       177


DECISION TREE
MEAN PRECISION 0.5
report:
              precision    recall  f1-score   support

         0.0       0.85      1.00      0.92       145
         1.0       1.00      0.19      0.32        32

    accuracy                           0.85       17

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Predicted relations

In [None]:
# Put the decision tree output (all_tree) next to the ground truth and the
# two names of each pair.
results = pd.DataFrame(all_tree,columns =['results'])
results[['target', 'art_hist_1','art_hist_2']] = d1coll_merged[["target", "art_hist_1","art_hist_2"]].values
# predicted correct relations (prediction and target both equal to 1.0)
ppositive = results.loc[(results["results"] == 1.0) & (results["target"] == 1.0)]
# get whether the predictions were already in ARTchives or not
merged_piu = ppositive.merge(d1[["art_hist_1","art_hist_2","bio"]], on=["art_hist_1","art_hist_2"], how='left').drop_duplicates()

# Percentages of known (bio == 1.0) and unknown (bio == 0.0) relations.
# When there are no true positives merged_piu is empty; report 0 instead of
# raising ZeroDivisionError.
total = len(merged_piu)
known = len(merged_piu.loc[(merged_piu["bio"] == 1.0)])/total*100 if total > 0 else 0
unknown = len(merged_piu.loc[(merged_piu["bio"] == 0.0)])/total*100 if total > 0 else 0
print(f"Known {known}; \n Unknown {unknown} \n New relations")
# Show the correctly predicted relations whose bio flag is 0.0.
merged_piu.loc[(merged_piu["bio"] == 0.0)]

Known 83.33333333333334; 
 Unknown 16.666666666666664 
 New relations


Unnamed: 0,results,target,art_hist_1,art_hist_2,bio
3,1.0,1.0,Luigi Salerno,Federico Zeri,0.0
