In [85]:
import numpy as np
import pandas as pd
from wikidataquery import WikidataQuery

## First, let's search Wikidata for all items that are an instance of (P31) proceedings (Q1143604) and that are part of a series (P179)

In [86]:
# SPARQL: every ?proceeding that is an instance of (P31) wd:Q1143604 and has a
# ?series via P179; the label service supplies the ?...Label variables.
proceedings_sparql = '''SELECT ?proceeding ?proceedingLabel ?series ?seriesLabel
WHERE {
  ?proceeding wdt:P31 wd:Q1143604.
  ?proceeding wdt:P179 ?series
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}'''

# NOTE(review): queryWikiData is presumably returning the decoded JSON response
# (a later cell indexes it as temp['results']['bindings']) — confirm against the helper.
temp = WikidataQuery.queryWikiData(proceedings_sparql)

In [87]:
# Flatten the query response into four parallel NumPy arrays (one per selected
# variable) so they can be combined into a DataFrame afterwards.

results = temp['results']['bindings']

# Entity values are split on 'entity/' and only the trailing part is kept;
# label values are taken as-is.
proc_entity = np.array([
    row['proceeding']['value'].split('entity/')[1]
    for row in results
])
proc_labels = np.array([
    row['proceedingLabel']['value']
    for row in results
])
series_entity = np.array([
    row['series']['value'].split('entity/')[1]
    for row in results
])
series_labels = np.array([
    row['seriesLabel']['value']
    for row in results
])

In [88]:
# Combine the four parallel arrays row-wise into a DataFrame, then display a
# seeded random sample of 200 rows out of the original 6742 (as of 07.12.).

together = zip(proc_entity, proc_labels, series_entity, series_labels)
df = pd.DataFrame(
    together,
    columns=['proc_entity', 'proc_labels', 'series_entity', 'series_labels'],
)
# random_state is fixed so the same 200 rows are shown on every run.
df.sample(n=200, random_state=42)

Unnamed: 0,proc_entity,proc_labels,series_entity,series_labels
4515,Q21898129,Advances in Cryptology — EUROCRYPT 2005,Q924044,Lecture Notes in Computer Science
734,Q113542117,Joint Proceedings of the 2nd International Wor...,Q27230297,CEUR Workshop Proceedings
2648,Q113543240,Anais do III Congresso sobre Tecnologias na Ed...,Q27230297,CEUR Workshop Proceedings
2314,Q113545332,Proceedings of the 1st CUBIST (Combining and U...,Q27230297,CEUR Workshop Proceedings
3804,Q115118105,"Proceedings of the NetSciLA 2022 Workshop ""Net...",Q27230297,CEUR Workshop Proceedings
...,...,...,...,...
1419,Q113542593,WiP Proceedings of the International Conferenc...,Q27230297,CEUR Workshop Proceedings
1313,Q113544648,Proceedings of the First DIACHRON Workshop on ...,Q27230297,CEUR Workshop Proceedings
291,Q113541760,Proceedings of the 20th Belgium-Netherlands So...,Q27230297,CEUR Workshop Proceedings
4461,Q58821196,Proceedings of the 10th International Conferen...,Q27230297,CEUR Workshop Proceedings


## Now let's consider the total number of academic conferences on Wikidata that are part of a series (P31 = 'academic conference', Q2020153; series via P179)

In [89]:
# SPARQL: every ?conferences item that is an instance of (P31) wd:Q2020153 and
# has a ?series via P179; the label service supplies the ?...Label variables.
conferences_sparql = '''SELECT ?conferences ?conferencesLabel ?series ?seriesLabel
WHERE {
  ?conferences wdt:P31 wd:Q2020153.
  ?conferences wdt:P179 ?series
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}'''

# NOTE(review): queryWikiData is presumably returning the decoded JSON response
# (a later cell indexes it as temp2['results']['bindings']) — confirm against the helper.
temp2 = WikidataQuery.queryWikiData(conferences_sparql)

In [90]:
# Flatten the conference query response into four parallel NumPy arrays (one per
# selected variable) so they can be combined into a DataFrame afterwards.
# Note: this rebinds `results`, which previously held the proceedings bindings.

results = temp2['results']['bindings']

# Entity values are split on 'entity/' and only the trailing part is kept;
# label values are taken as-is.
conf_entity = np.array([
    row['conferences']['value'].split('entity/')[1]
    for row in results
])
conf_labels = np.array([
    row['conferencesLabel']['value']
    for row in results
])
series2_entity = np.array([
    row['series']['value'].split('entity/')[1]
    for row in results
])
series2_labels = np.array([
    row['seriesLabel']['value']
    for row in results
])

In [91]:
# Combine the four parallel conference arrays row-wise into a DataFrame, then
# display a seeded random sample of 200 rows.

together2 = zip(conf_entity, conf_labels, series2_entity, series2_labels)
df2 = pd.DataFrame(
    together2,
    columns=['conf_entity', 'conf_labels', 'series_entity', 'series_labels'],
)
# random_state is fixed so the same 200 rows are shown on every run.
df2.sample(n=200, random_state=42)

Unnamed: 0,conf_entity,conf_labels,series_entity,series_labels
6669,Q116657073,"19th International Conference on Science, Tech...",Q116257252,"International Conference on Science, Technolog..."
2166,Q105943021,"INNS Conference on Big Data 2015, San Francisc...",Q105700082,International Neural Network Society Winter Co...
3422,Q106243617,Functional and Logic Programming - 12th Intern...,Q105693690,International Symposium on Functional and Logi...
1905,Q106242915,24th Asia-Pacific Software Engineering Confere...,Q105693374,Asia-Pacific Software Engineering Conference
1454,Q106245013,Tools and Algorithms for the Construction and ...,Q105693137,International Conference on Tools and Algorith...
...,...,...,...,...
4861,Q106068288,"Static Analysis, 15th International Symposium,...",Q105694643,International Static Analysis Symposium
1872,Q106242834,2016 IEEE International Conference on Pervasiv...,Q105693372,Annual IEEE International Conference and Works...
2669,Q106245847,Mobile Networks and Management - 5th Internati...,Q105696226,Mobile Lightweight Wireless Systems
4304,Q106068038,Information and Software Technologies - 24th I...,Q105699857,International Conference on Information and So...


## Let's check how many unique series labels and entities I have:

In [92]:
# Distinct series labels among the conference rows.
# NOTE(review): the displayed output contains a bare item ID ('Q115475014') among
# the labels — presumably a series without a label in the requested languages; confirm.
df2['series_labels'].unique()

array(['International Conference on Quantum Communication, Measurement and Computing (QCMC)',
       'International Symposium on String Processing and Information Retrieval',
       'Extended Semantic Web Conference', ...,
       'ACM conference on Web Science',
       'International Conference on Computer Communication', 'Q115475014'],
      dtype=object)

# Questions that I have:
- Do I create one or many benchmarks?
- What should the benchmark be based on?
- Should I create a list where all conference series are mapped to an identifier (deleting duplicates)?