In [1]:
import pandas as pd

## Testset 1

This notebook creates a sampled testset to compute / estimate the precision and the recall of the methods.
It is created as follows:
1. Filter for entries that have both a `WikiCFP_identifier` <b>and</b> a `DBLP_identifier`, since these entries tend to contain the most information.
2. Randomly sample 100 entries from the filtered data (`filtered_data`), using a fixed random seed so the sample is reproducible.
3. Save the sample as a semicolon-separated CSV file under datasets/wikidata/testset_v1_opt.csv

In [2]:
# Full Wikidata conference dataset; the path is relative to this notebook.
path = "../../datasets/wikidata/wikidata_conf_data.csv"
# Row 0 of the file supplies the column names and column 0 becomes the index.
full_data = pd.read_csv(
    path,
    index_col=0,
    header=0,
)

In [3]:
# Keep only the rows in which both identifier columns are present (non-NaN);
# a row is dropped as soon as either of the two identifiers is missing.
filtered_data = full_data.dropna(subset=['WikiCFP_identifier', 'DBLP_identifier'])

In [4]:
# Draw 100 rows at random; the fixed random_state makes the sample reproducible.
sampled_data = filtered_data.sample(n=100, random_state=42)

In [5]:
# Summarize the sample: per-column non-null counts and dtypes.
sampled_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 4507 to 6908
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   conf_label          100 non-null    object 
 1   title               88 non-null     object 
 2   country             96 non-null     object 
 3   location            97 non-null     object 
 4   main_subject        29 non-null     object 
 5   start_time          91 non-null     object 
 6   end_time            91 non-null     object 
 7   series_label        100 non-null    object 
 8   series_short_name   95 non-null     object 
 9   beginnings          40 non-null     float64
 10  WikiCFP_identifier  100 non-null    float64
 11  DBLP_identifier     100 non-null    object 
dtypes: float64(2), object(10)
memory usage: 10.2+ KB


In [6]:
# Output location of the sampled test set, relative to this notebook.
path_for_sampled_data = "../../datasets/wikidata/testset_v1_opt.csv"

In [7]:
# Write the sample to disk as a semicolon-separated CSV file.
sampled_data.to_csv(path_for_sampled_data, sep=';')

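## Testset 2

This testset is sampled from the proceedings.com data (`all-nov-23.xlsx`): only rows without any missing value are kept, 100 of them are sampled randomly with a fixed seed, and the result is saved as a semicolon-separated CSV file under datasets/proceedings.com/testset_v2.csv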

In [2]:
import pandas as pd

In [26]:
# Excel file from the proceedings.com dataset folder; path is relative to this notebook.
path = "../../datasets/proceedings.com/all-nov-23.xlsx"
# Load the workbook with openpyxl and discard the listed columns in one chain.
full_data = pd.read_excel(path, engine='openpyxl').drop(
    columns=[
        "Subject1",
        "Subject2",
        "Subject3",
        "Subject4",
        "List Price",
        "ISBN",
    ]
)

In [27]:
# Keep only complete rows, i.e. rows without a single missing value.
filtered_data = full_data.dropna()
# Show (rows, columns) of the filtered frame as the cell output.
filtered_data.shape

(14475, 11)

In [28]:
# Draw 100 rows at random; the fixed random_state makes the sample reproducible.
sampled_data = filtered_data.sample(n=100, random_state=42)

In [29]:
# Remove the columns that should not appear in the saved test set.
# NOTE: this overwrites `sampled_data`, so re-running only this cell fails with
# a KeyError once the columns are gone; re-run the sampling cell first.
sampled_data = sampled_data.drop(
    columns=[
        "Publisher",
        "Editor",
        "Pages",
        "Format",
    ],
)

In [30]:
# Write the reduced sample to disk as a semicolon-separated CSV file.
sampled_data.to_csv("../../datasets/proceedings.com/testset_v2.csv", sep=";")