## Import libraries

In [56]:
import pathlib as path
import pandas as pd
import plotly.express as px
from collections import  Counter
import scipy.stats as stats
from IPython.display import display, Markdown

## Define constants

In [57]:
# Candidate data folders. The server folder is probed first, the local one second.
LOCAL_FOLDER = path.Path("/mnt/c/Users/Professional/Desktop/Works/MEANING/Sem-Covid19/data/")

SERVER_FOLDER = path.Path("/home/jovyan/data/")

# Select the first candidate that exists on disk.
if SERVER_FOLDER.exists():
    WORK_DIR = SERVER_FOLDER
    print("Work with distant directory.")
elif LOCAL_FOLDER.exists():
    WORK_DIR = LOCAL_FOLDER
    print("Work with local directory.")
else:
    # Fail fast with an explicit error: continuing with a placeholder string
    # would only crash below with an opaque TypeError when the file paths
    # are built with the `/` operator.
    raise FileNotFoundError(
        "ERROR: Invalid directory! "
        f"Neither {SERVER_FOLDER} nor {LOCAL_FOLDER} exists."
    )

# Names and full paths of the two source files inside the selected folder.
SRC_CORE_FILE_NAME = "eurlex.json"
SRC_EXT_FILE_NAME = "eurlex-extended.json"
SRC_CORE_FILE_PATH = WORK_DIR / SRC_CORE_FILE_NAME
SRC_EXT_FILE_PATH = WORK_DIR / SRC_EXT_FILE_NAME

Work with distant directory.


## Load data sets Cellar core and Cellar Extended

In [58]:
# Load both source files into DataFrames.
# A missing file raises immediately: only printing a message would leave
# df_core / df_ext undefined and surface later as a confusing NameError.
if not SRC_CORE_FILE_PATH.exists():
    raise FileNotFoundError(
        f"Source path for core dataset is invalid! ({SRC_CORE_FILE_PATH})"
    )
df_core = pd.read_json(SRC_CORE_FILE_PATH)

if not SRC_EXT_FILE_PATH.exists():
    raise FileNotFoundError(
        f"Source path for extended dataset is invalid! ({SRC_EXT_FILE_PATH})"
    )
df_ext = pd.read_json(SRC_EXT_FILE_PATH)

## Analyze collision rate between Cellar core and Cellar Extended datasets

In [59]:
# Distinct "work" identifiers found in each dataset.
work_id_core = df_core["work"]
work_id_ext = df_ext["work"]
set_core, set_ext = set(work_id_core), set(work_id_ext)

# Identifiers that occur in both datasets.
set_common = set_core & set_ext

# Percentage of core identifiers that also occur in the extended dataset,
# rounded to two decimals.
shared_percentage = 100 * len(set_common) / len(set_core)
collision_rate = round(shared_percentage, 2)

display(
    Markdown(
        f"Collision between Cellar core and extend datastet is {collision_rate}%"
    )
)

Collision between Cellar core and extend datastet is 93.62%

## Analyze differences between colliding rows

In [75]:
# Compare, column by column, the rows that share a "work" identifier in both
# datasets, and collect every column where at least one such work differs.
common_columns = set(df_core.columns).intersection(set(df_ext.columns))

# pandas no longer accepts a set as a column indexer (TypeError since 2.0),
# so select with a list; following df_core's column order also makes the
# iteration order reproducible between runs.
ordered_common_columns = [
    column for column in df_core.columns if column in common_columns
]
slice_core = df_core[ordered_common_columns]
slice_ext = df_ext[ordered_common_columns]

# Row lookups depend only on the work id, not on the column, so compute them
# once instead of once per (column, work) pair.
rows_by_work = {
    work_id: (
        slice_core.loc[slice_core["work"] == work_id],
        slice_ext.loc[slice_ext["work"] == work_id],
    )
    for work_id in set_common
}

column_with_difference = set()
for column in ordered_common_columns:
    for work_id, (row_core, row_ext) in rows_by_work.items():
        # explode() flattens list-valued cells so values are compared as
        # multisets, ignoring their order inside the cell.
        values_core = row_core[column].explode()
        values_ext = row_ext[column].explode()
        if Counter(values_core) != Counter(values_ext):
            print("------------------------")
            print(values_core)
            print(values_ext)
            print("------------------------")
            column_with_difference.add(column)
            # One differing work is enough to flag the column.
            break

print("Columns with difference is : ",column_with_difference)


------------------------
338                 Slovakia
338       commercial vehicle
338                 epidemic
338       infectious disease
338            motor vehicle
338              road safety
338     roadworthiness tests
338    technical regulations
Name: eurovoc_concept_labels, dtype: object
868              epidemic
868    infectious disease
Name: eurovoc_concept_labels, dtype: object
------------------------
------------------------
338    http://eurovoc.europa.eu/1759
338    http://eurovoc.europa.eu/3117
338    http://eurovoc.europa.eu/3641
338    http://eurovoc.europa.eu/4047
338    http://eurovoc.europa.eu/4654
338    http://eurovoc.europa.eu/4658
338    http://eurovoc.europa.eu/5859
338     http://eurovoc.europa.eu/837
Name: eurovoc_concepts, dtype: object
868    http://eurovoc.europa.eu/1759
868     http://eurovoc.europa.eu/837
Name: eurovoc_concepts, dtype: object
------------------------
Columns with difference is :  {'eurovoc_concept_labels', 'eurovoc_concepts'}


# The result of this analysis is :
- The collision between Cellar core and Cellar Extended is 93.62%.
- The Cellar core and Cellar Extended datasets can easily be combined.
  The only point to keep in mind is that, for the columns
  `eurovoc_concept_labels` and `eurovoc_concepts`, the values of matching
  rows must be combined with a set union.
