In [None]:
import zipfile
import pandas as pd
from io import TextIOWrapper

def parse_rrf_data_line(line):
    """Turn one pipe-delimited RRF record into a ``{'Col_<n>': value}`` mapping.

    Leading/trailing whitespace (including the newline) is stripped before the
    line is split on '|'. Keys are numbered from 1 in field order, so a line
    that ends with '|' produces a final key whose value is the empty string.
    """
    fields = line.strip().split('|')
    return {f'Col_{position}': field for position, field in enumerate(fields, start=1)}

def process_files_in_zip(zip_file_path,f_name):
    """Parse the '.RRF' members of a zip archive whose path starts with ``f_name``.

    Parameters
    ----------
    zip_file_path : str or path-like
        Location of the zip archive.
    f_name : str
        Prefix that a member's path inside the archive must start with.

    Returns
    -------
    pandas.DataFrame
        One row per line of the matching member(s), with columns named
        ``Col_1 .. Col_n`` by ``parse_rrf_data_line``. If several members
        match, their rows are concatenated in archive order. If nothing
        matches, an empty DataFrame is returned (previously an empty list,
        which broke DataFrame-style indexing in callers).
    """
    frames = []  # one DataFrame per matching archive member

    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        for file_info in zip_ref.infolist():
            file_name = file_info.filename

            if not (file_name.endswith('.RRF') and file_name.startswith(f_name)):
                continue

            print(file_name)
            with zip_ref.open(file_name) as file:
                # Members are decoded as UTF-8 text and parsed line by line.
                with TextIOWrapper(file, encoding='utf-8') as text_file:
                    parsed_content = [parse_rrf_data_line(line) for line in text_file]

            frames.append(pd.DataFrame(parsed_content))

    if not frames:
        return pd.DataFrame()
    if len(frames) == 1:
        return frames[0]
    # Keep every matching member instead of silently discarding all but the last.
    return pd.concat(frames, ignore_index=True)


In [None]:
# Load the archive member whose path starts with '2023AB/META/MRCONSO.RRF'
# into a DataFrame; its columns are named Col_1..Col_n by the RRF line parser.
# NOTE(review): the archive path is an absolute, machine-specific location.
zip_file_path = '/home/jovyan/work/umls-2023AB-metathesaurus-full.zip'
file_n = '2023AB/META/MRCONSO.RRF'
parsed_data_name = process_files_in_zip(zip_file_path,file_n)

In [None]:
# Keep only rows whose second field equals 'ENG' (presumably the language
# code - confirm against the MRCONSO column layout), then keep the columns at
# positions 0, 7 and 14, i.e. Col_1, Col_8 and Col_15.
# NOTE(review): this cell overwrites its own input and drops Col_2, so running
# it a second time raises KeyError on 'Col_2'; reload the data before re-running.
parsed_data_name = parsed_data_name[parsed_data_name['Col_2'] == 'ENG']
columns_to_keep = [0, 7, 14]
parsed_data_name = parsed_data_name.iloc[:, columns_to_keep]

In [None]:
# Load the archive member whose path starts with '2023AB/META/MRHIER.RRF'
# into a DataFrame (same archive and same parser as the MRCONSO load).
zip_file_path = '/home/jovyan/work/umls-2023AB-metathesaurus-full.zip'
file_n = '2023AB/META/MRHIER.RRF'
parsed_data_hier = process_files_in_zip(zip_file_path,file_n)

In [None]:
# Keep the columns at positions 0, 1 and 6, i.e. Col_1, Col_2 and Col_7.
# NOTE(review): the selection is positional and overwrites parsed_data_hier,
# so re-running this cell on the already-reduced frame fails (position 6 no
# longer exists).
columns_to_keep = [0, 1, 6]
parsed_data_hier = parsed_data_hier.iloc[:, columns_to_keep]

In [None]:
# Keep rows whose Col_7 is not NaN.
# NOTE(review): fields produced by splitting a line on '|' are strings, so an
# empty field is '' rather than NaN; NaN only appears for lines with fewer
# than 7 fields. If the goal is to drop empty Col_7 values, compare against ''
# as well - confirm intent.
df_hier_no_nan = parsed_data_hier[parsed_data_hier['Col_7'].notna()]

In [None]:
# Print the column dtypes, non-null counts and memory usage of the reduced MRHIER frame.
parsed_data_hier.info()

In [None]:
!pip install rdfpandas

In [None]:
!pip install rdflib

In [None]:
import rdflib
import pandas as pd
from rdfpandas.graph import to_dataframe
from collections import defaultdict

In [None]:
def load_ttl_files(pathfilename):
    """Parse an RDF file with rdflib and return its triples as a DataFrame.

    The returned frame has one row per triple in the parsed graph and three
    columns: 's' (subject), 'p' (predicate) and 'o' (object). The cells hold
    the rdflib term objects produced by iterating the graph.
    """
    graph = rdflib.Graph()
    graph.parse(pathfilename)

    # Iterating a Graph yields (subject, predicate, object) tuples.
    triples = [(subj, pred, obj) for subj, pred, obj in graph]

    return pd.DataFrame(triples, columns=['s', 'p', 'o'])


In [None]:
def has_hierarchy_associative_clash(df):
    """Find and remove triples where skos:related clashes with skos:broader.

    A pair of concepts clashes when it is linked both by ``skos:related`` and
    by ``skos:broader``, in either direction. Every ``related`` or ``broader``
    triple that takes part in such a clash is removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Triples with columns 's', 'p' and 'o'. Predicates are recognised by
        their string value, so both rdflib ``URIRef`` objects and plain
        strings work. Subjects and objects must be hashable.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame or None)
        The frame without the clashing triples (index reset when something
        was removed), and a frame of the removed triples with columns
        's', 'p', 'o', or ``None`` when no clash exists. The violation frame
        lists each clashing triple once, in input order, and contains only
        triples that are actually present in ``df``. Earlier versions also
        emitted a same-direction counterpart triple that might not exist in
        the data, and listed each clash twice.
    """
    related_iri = "http://www.w3.org/2004/02/skos/core#related"
    broader_iri = "http://www.w3.org/2004/02/skos/core#broader"

    triples = list(zip(df['s'], df['p'], df['o']))

    # Index the (subject, object) pairs of each relation for O(1) lookups.
    related_pairs = set()
    broader_pairs = set()
    for subj, pred, obj in triples:
        pred_text = str(pred)
        if pred_text == related_iri:
            related_pairs.add((subj, obj))
        elif pred_text == broader_iri:
            broader_pairs.add((subj, obj))

    def _clashes(subj, pred, obj):
        # A triple clashes when the opposite relation links the same two
        # concepts in either direction.
        pred_text = str(pred)
        if pred_text == related_iri:
            return (subj, obj) in broader_pairs or (obj, subj) in broader_pairs
        if pred_text == broader_iri:
            return (subj, obj) in related_pairs or (obj, subj) in related_pairs
        return False

    is_violation = [_clashes(subj, pred, obj) for subj, pred, obj in triples]

    if not any(is_violation):
        print("Hierarchy is consistent in terms of hierarchical & associative links clashes")
        return df, None

    # Positional boolean mask: immune to a non-unique or non-default index.
    mask = pd.Series(is_violation, dtype=bool).to_numpy()
    violating_df = df.loc[mask, ['s', 'p', 'o']].drop_duplicates().reset_index(drop=True)
    # Single filtering pass instead of re-filtering the frame per violation.
    cleaned_df = df[~mask].reset_index(drop=True)
    return cleaned_df, violating_df


In [None]:
def remove_and_update_kg(kg_df, negative_df):
    """Return ``kg_df`` without the triples listed in ``negative_df``.

    Parameters
    ----------
    kg_df : pandas.DataFrame
        Knowledge-graph triples with columns 's', 'p' and 'o'.
    negative_df : pandas.DataFrame or None
        Triples to remove, with the same three columns. ``None`` or an empty
        frame means there is nothing to remove, and ``kg_df`` is returned
        unchanged.

    Returns
    -------
    pandas.DataFrame
        Rows of ``kg_df`` whose (s, p, o) does not occur in ``negative_df``,
        with a fresh 0..n-1 index.

    Notes
    -----
    Matching uses set membership on (s, p, o) tuples, so the values must be
    hashable. The frame is filtered once rather than once per negative triple.
    """
    if negative_df is None or len(negative_df) == 0:
        return kg_df

    to_remove = set(zip(negative_df['s'], negative_df['p'], negative_df['o']))
    keep = [
        triple not in to_remove
        for triple in zip(kg_df['s'], kg_df['p'], kg_df['o'])
    ]
    # Positional boolean mask so a non-default index on kg_df cannot misalign.
    keep_mask = pd.Series(keep, dtype=bool).to_numpy()
    return kg_df[keep_mask].reset_index(drop=True)

In [None]:
# Train, validation and test set splits
def split_dataset(df, train_frac=0.8, valid_frac=0.1, random_state=42):
    """Shuffle ``df`` and split it into train, validation and test parts.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to split.
    train_frac : float, default 0.8
        Fraction of rows assigned to the training part (rounded down).
    valid_frac : float, default 0.1
        Fraction of rows assigned to the validation part (rounded down).
    random_state : int, default 42
        Seed passed to ``DataFrame.sample`` so the shuffle is reproducible.

    Returns
    -------
    (df_train, df_valid, df_test)
        Three disjoint slices of the shuffled frame; the test part receives
        every row not taken by the other two. Original index labels are kept.

    Raises
    ------
    ValueError
        If a fraction is negative or the two fractions sum to more than 1.
    """
    if train_frac < 0 or valid_frac < 0 or train_frac + valid_frac > 1 + 1e-9:
        raise ValueError(
            "train_frac and valid_frac must be non-negative and sum to at most 1"
        )

    df_shuffled = df.sample(frac=1, random_state=random_state)

    total_samples = df_shuffled.shape[0]
    train_size = int(train_frac * total_samples)
    valid_size = int(valid_frac * total_samples)
    valid_end = train_size + valid_size

    df_train = df_shuffled.iloc[:train_size]
    df_valid = df_shuffled.iloc[train_size:valid_end]
    df_test = df_shuffled.iloc[valid_end:]

    return df_train, df_valid, df_test



In [None]:
def convert_to_id_files(df, entities_path='entities.dict', relations_path='relations.dict'):
    """Assign integer ids to entities and relations and write them to disk.

    Entities are the distinct values of columns 's' and 'o' (all subjects
    first, then objects, in order of first appearance); relations are the
    distinct values of column 'p'. Ids start at 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Triples with columns 's', 'p' and 'o'.
    entities_path : str, default 'entities.dict'
        Output file for the entity table.
    relations_path : str, default 'relations.dict'
        Output file for the relation table.

    Returns
    -------
    (dict, dict)
        ``e_to_id`` mapping entity -> id and ``rel_to_id`` mapping
        relation -> id.

    Side effects
    ------------
    Overwrites both output files. Each line is ``"<id>\\t<value>\\n"``. Files
    are written as UTF-8 so non-ASCII identifiers or labels do not depend on
    the platform's default encoding.
    """
    # Unique entities over subjects followed by objects, first occurrence wins.
    entities = pd.concat([df['s'], df['o']], ignore_index=True).drop_duplicates()
    relations = df['p'].drop_duplicates()

    e_to_id = {}
    rel_to_id = {}

    # Save entity IDs to a text file
    with open(entities_path, 'w', encoding='utf-8') as entity_file:
        for idx, entity in enumerate(entities):
            entity_file.write(f"{idx}\t{entity}\n")
            e_to_id[entity] = idx

    # Save relation IDs to a text file
    with open(relations_path, 'w', encoding='utf-8') as relation_file:
        for idx, relation in enumerate(relations):
            relation_file.write(f"{idx}\t{relation}\n")
            rel_to_id[relation] = idx

    return e_to_id, rel_to_id

In [None]:
def convert_triples_to_files(entity_to_id, relation_to_id,text_df,df):
    """Write the triples in ``df`` to a tab-separated text file.

    Parameters
    ----------
    entity_to_id, relation_to_id : dict
        Accepted for interface compatibility but not used: the values are
        written as they appear in ``df``, not translated to ids.
        NOTE(review): confirm that raw values (not ids) are what the consumer
        of this file expects.
    text_df : str or path-like
        Output file path; overwritten if it exists.
    df : pandas.DataFrame
        Triples with columns 's', 'p' and 'o'.

    Side effects
    ------------
    Writes one ``"<s>\\t<p>\\t<o>\\n"`` line per row, encoded as UTF-8 so the
    output does not depend on the platform's default encoding.
    """
    with open(text_df, 'w', encoding='utf-8') as triples_file:
        # Iterate the three columns in lockstep; avoids building a Series per row.
        for subject, relation, o in zip(df['s'], df['p'], df['o']):
            triples_file.write(f"{subject}\t{relation}\t{o}\n")



In [None]:
# Load all triples, remove related/broader clashes, build the id dictionaries
# and split the cleaned graph into train / validation / test sets.
# NOTE(review): has_hierarchy_associative_clash_s2 is not defined anywhere in
# the visible notebook - on a fresh kernel the third line raises NameError
# unless it is defined elsewhere.
# NOTE(review): both clash checks are given all_df, and the second call
# overwrites updated_kg, so the removals of the first call are discarded.
# Confirm whether the second call should receive updated_kg instead.
# NOTE(review): the id dictionaries are built from all_df, i.e. including the
# triples that were removed from updated_kg - confirm this is intended.
all_df = load_ttl_files("/home/jovyan/work/Medicine_allTriples.ttl")
updated_kg, negative_df = has_hierarchy_associative_clash(all_df)
updated_kg, negative_df_s2 = has_hierarchy_associative_clash_s2(all_df)
e_to_id, rel_to_id = convert_to_id_files(all_df)
df_train, df_valid, df_test = split_dataset (updated_kg)



In [None]:
# Collect every distinct subject/object that appears in a violating triple.
# NOTE(review): negative_df is None when the clash check finds no violation,
# in which case the indexing below raises TypeError.
selected_columns = ["s", "o"]

# Extracting values from selected columns (2-D array, one row per triple)
selected_values = negative_df[selected_columns].values

# Creating a homogeneous list by flattening the array
homogeneous_list = selected_values.flatten().tolist()

# Creating a list with unique values (order of the result is arbitrary because it goes through a set)
unique_values_list = list(set(homogeneous_list))



In [None]:
from rdflib import Graph, URIRef, Literal, Namespace
from rdflib.plugins.sparql import prepareQuery
# Look up the skos:prefLabel of every entity involved in a violating triple.
data = []

g = rdflib.Graph()
g.parse("/home/jovyan/work/Medicine_allTriples.ttl")
# Define SKOS namespace
skos = Namespace("http://www.w3.org/2004/02/skos/core#")

# SPARQL query returning the skos:prefLabel of a skos:Concept. ?concept_uri is
# supplied per item through initBindings, so only ?prefLabel is selected.
sparql_query = """
    SELECT ?prefLabel
    WHERE {
        ?concept_uri a <http://www.w3.org/2004/02/skos/core#Concept> ;
                 <http://www.w3.org/2004/02/skos/core#prefLabel> ?prefLabel .
    }
"""
# Parse the query text once instead of once per item.
pref_label_query = prepareQuery(sparql_query)

for item in unique_values_list:
    results = g.query(pref_label_query, initBindings={'concept_uri': item})

    # One output row per (concept, label) pair; concepts without a label are skipped.
    for row in results:
        data.append((item, row["prefLabel"]))
df_med = pd.DataFrame(data, columns=['Id','STR'])

In [None]:
# Rename Col_15 to 'STR' so this frame exposes its text under the same column
# name as df_med, then preview the first rows.
parsed_data_name = parsed_data_name.rename(columns={'Col_15': 'STR'})
parsed_data_name.head()

In [None]:
!pip install fuzzywuzzy

In [None]:
!pip install distance

In [None]:
from fuzzywuzzy import fuzz
from scipy.spatial.distance import cosine, jaccard
import distance

def _jaro_winkler(first, second, prefix_weight=0.1):
    """Return the Jaro-Winkler similarity (0.0 to 1.0) of two strings."""
    if first == second:
        return 1.0
    len_first, len_second = len(first), len(second)
    if len_first == 0 or len_second == 0:
        return 0.0

    # Characters match when equal and no further apart than this window.
    window = max(max(len_first, len_second) // 2 - 1, 0)
    matched_first = [False] * len_first
    matched_second = [False] * len_second
    matches = 0
    for i, char in enumerate(first):
        start = max(0, i - window)
        stop = min(len_second, i + window + 1)
        for j in range(start, stop):
            if not matched_second[j] and second[j] == char:
                matched_first[i] = matched_second[j] = True
                matches += 1
                break
    if matches == 0:
        return 0.0

    # Transpositions: matched characters that appear in a different order.
    seq_first = [c for c, hit in zip(first, matched_first) if hit]
    seq_second = [c for c, hit in zip(second, matched_second) if hit]
    transpositions = sum(a != b for a, b in zip(seq_first, seq_second)) // 2

    jaro = (
        matches / len_first
        + matches / len_second
        + (matches - transpositions) / matches
    ) / 3

    # Winkler bonus for a shared prefix of up to four characters.
    prefix_len = 0
    for a, b in zip(first[:4], second[:4]):
        if a != b:
            break
        prefix_len += 1
    return jaro + prefix_len * prefix_weight * (1 - jaro)


def _token_set_similarities(tokens_a, tokens_b):
    """Return (Jaccard, cosine) similarity of two token sets; 0.0 if either is empty."""
    if not tokens_a or not tokens_b:
        return 0.0, 0.0
    shared = len(tokens_a & tokens_b)
    jaccard_similarity = shared / len(tokens_a | tokens_b)
    # Cosine of the two binary bag-of-words vectors.
    cosine_similarity = shared / (len(tokens_a) * len(tokens_b)) ** 0.5
    return jaccard_similarity, cosine_similarity


def evaluate_matching_methods(df1, df2):
    """Score every pair of 'STR' values from ``df1`` and ``df2`` with four matchers.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames that each contain a text column named 'STR'. Missing values are
        skipped and the remaining values are converted to ``str``.

    Returns
    -------
    pandas.DataFrame
        One row per (df1 row, df2 row) pair with columns:

        * 'index_1', 'index_2' - index labels of the compared rows
        * 'STR_1', 'STR_2' - the compared strings
        * 'Levenshtein_Distance' - ``fuzz.ratio`` score; despite the column
          name this is a 0-100 similarity ratio (higher = more similar)
        * 'Jaro_Winkler' - Jaro-Winkler similarity in [0, 1]
        * 'Jaccard_Similarity' - Jaccard similarity of the lower-cased,
          whitespace-separated token sets
        * 'Cosine_Similarity' - cosine similarity of the same token sets
          treated as binary vectors

    Notes
    -----
    The result has ``len(df1) * len(df2)`` rows and is built with a Python
    double loop; restrict the inputs to candidate subsets before calling this
    on large tables.
    """
    key_column_name_df1 = 'STR'
    key_column_name_df2 = 'STR'

    tokenize = lambda text: set(text.lower().split())

    texts_df1 = df1[key_column_name_df1].dropna().astype(str)
    texts_df2 = df2[key_column_name_df2].dropna().astype(str)
    # Tokenize the second frame once; it is reused for every row of df1.
    tokens_df2 = [tokenize(text) for text in texts_df2]

    records = []
    for idx1, text1 in texts_df1.items():
        tokens1 = tokenize(text1)
        for (idx2, text2), tokens2 in zip(texts_df2.items(), tokens_df2):
            jaccard_similarity, cosine_similarity = _token_set_similarities(tokens1, tokens2)
            records.append({
                'index_1': idx1,
                'index_2': idx2,
                'STR_1': text1,
                'STR_2': text2,
                'Levenshtein_Distance': fuzz.ratio(text1, text2),
                'Jaro_Winkler': _jaro_winkler(text1, text2),
                'Jaccard_Similarity': jaccard_similarity,
                'Cosine_Similarity': cosine_similarity,
            })

    results = pd.DataFrame(records, columns=[
        'index_1', 'index_2', 'STR_1', 'STR_2',
        'Levenshtein_Distance', 'Jaro_Winkler',
        'Jaccard_Similarity', 'Cosine_Similarity',
    ])

    return results

# Usage: compare the 'STR' column of the MRCONSO-derived frame with the 'STR'
# column of df_med (the prefLabel table), then print the scores.
# NOTE(review): parsed_data_name must already have had Col_15 renamed to 'STR'.
# NOTE(review): print() dumps the whole result as plain text; consider showing
# only evaluation_results.head() for large inputs.
evaluation_results = evaluate_matching_methods(parsed_data_name, df_med)
print("Evaluation Results:")
print(evaluation_results)
