In [None]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize

In [None]:
# Load the raw hospitalization and operation exports; the paths are relative
# to the notebook's working directory.
hospitalizations = pd.read_csv(filepath_or_buffer="Hospitalization.csv")
operations = pd.read_csv(filepath_or_buffer="Operations.csv")

In [None]:
# Replace every missing value in the hospitalization table with an empty
# string, then display the result.
hospitalizations = hospitalizations.fillna(value='')
hospitalizations

In [None]:
# Replace every missing value in the operations table with an empty string,
# then display the result.
operations = operations.fillna(value='')
operations

In [None]:
# Check whether any hospitalization has more than one related operation:
# count how many operation rows reference each hosp_id.
hospitalization_operations_count = operations['hosp_id'].value_counts()
multiple_operations_for_hospitalization = hospitalization_operations_count[hospitalization_operations_count > 1]

if not multiple_operations_for_hospitalization.empty:
    print("Some hospitalizations have multiple related operations.")
    print(multiple_operations_for_hospitalization)
else:
    print("Each hospitalization has at most one related operation.")

# Check whether any operation is related to more than one hospitalization.
# An operation points at a hospitalization through hosp_id, so it can only
# match several hospitalizations when that hosp_id is repeated in
# `hospitalizations`. Comparing the number of matched hospitalizations with
# len(operations) cannot detect that, so count the matches per operation.
hospitalizations_per_hosp_id = hospitalizations['hosp_id'].value_counts()
# For every operation row: the number of hospitalization rows sharing its
# hosp_id (0 when the operation references an unknown hosp_id).
operations_hospitalization_count = (
    operations['hosp_id'].map(hospitalizations_per_hosp_id).fillna(0).astype(int)
)
if (operations_hospitalization_count > 1).any():
    print("Some operations are related to multiple hospitalizations.")
else:
    print("Each operation is related to at most one hospitalization.")

In [None]:
# Left-join the operation details onto the hospitalizations via hosp_id, so
# hospitalizations without a matching operation are still kept.
operation_details = operations[['hosp_id', 'description', 'oper_proc']]
merged_df = hospitalizations.merge(operation_details, how='left', on='hosp_id')

merged_df

In [None]:
# Number of merged rows for each hosp_id value.
hosp_id_column = merged_df['hosp_id']
duplicate_count = hosp_id_column.value_counts()
duplicate_count

In [None]:
# Number of distinct hosp_id values that appear on more than one merged row.
rows_per_hosp_id = merged_df['hosp_id'].value_counts()
multiple_operations_count = rows_per_hosp_id.gt(1).sum()
multiple_operations_count

In [None]:
# Left-join the operation details onto the hospitalizations (keyed by hosp_id).
merged_df = hospitalizations.merge(
    operations[['hosp_id', 'description', 'oper_proc']],
    how='left',
    on='hosp_id',
)

# Collapse the joined rows to one row per hosp_id, collecting the operation
# descriptions and procedures into Python lists.
list_aggregations = {'description': list, 'oper_proc': list}
per_hospitalization = merged_df.groupby('hosp_id').agg(list_aggregations)
grouped_df = per_hospitalization.reset_index()

# Attach the aggregated lists back onto the hospitalization rows.
result_df = hospitalizations.merge(grouped_df, how='left', on='hosp_id')
result_df

In [None]:
# Spot-check: aggregated operation descriptions of the row labelled 140.
result_df.loc[140, "description"]

In [None]:
# Spot-check: how many descriptions were collected for the row labelled 140.
len(result_df.loc[140, "description"])

In [None]:
# Work on an independent copy: a plain assignment only aliases result_df, so
# adding a column to `test` would also modify the already displayed result_df.
test = result_df.copy()
# Function to format pairs of elements
def format_pairs(row):
    """Render the operations of one row as a single human-readable string.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'oper_proc' and 'description', each an iterable holding
        one value per operation, aligned by position.

    Returns
    -------
    str
        "Procedura: <procedure>, Popis operace: <description>" for every
        operation, joined by ", ". The empty string is returned when any
        procedure value is missing (NaN).
    """
    procedures = row['oper_proc']
    # A missing procedure marks a row with nothing to report.
    if any(pd.isna(procedure) for procedure in procedures):
        return ''
    # Plain f-string placeholders: the earlier "${...}" form emitted a literal
    # "$" in front of every interpolated value.
    pairs = [
        f"Procedura: {procedure}, Popis operace: {description}"
        for procedure, description in zip(procedures, row['description'])
    ]
    return ', '.join(pairs)

# Format the operations of every row and store the text as 'MergedColumn'.
merged_column = test.apply(format_pairs, axis='columns')
test['MergedColumn'] = merged_column
test

In [None]:
# Keep only the rows whose MergedColumn text is not the empty string.
has_operations_text = test['MergedColumn'].ne('')
non_empty_rows = test.loc[has_operations_text]
non_empty_rows

In [None]:
# Spot-check: formatted operations text of the row labelled 168.
non_empty_rows.loc[168, "MergedColumn"]

In [None]:
# Drop the list columns and the join key; the formatted text lives in
# MergedColumn, which is kept.
pre_merge_df = test.drop(columns=["description", "oper_proc", "hosp_id"])
pre_merge_df

In [None]:
# Function to merge columns
def merge_columns(row):
    """Concatenate the non-empty text fields of a row into one labelled string.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys 'adm_cur_problems', 'adm_findings',
        'adm_conclusion', 'dis_hosp_reason', 'dis_opers', 'dis_exams' and
        'MergedColumn'.

    Returns
    -------
    str
        "<label>: <value>" for every field whose value is truthy, in the
        order listed below, joined by ", ". Empty when all fields are falsy.

    Raises
    ------
    KeyError
        If one of the expected keys is missing from ``row``.
    """
    # (column, label) pairs in output order. The first label previously read
    # "příjetí"; it now uses "přijetí", the spelling the other labels use.
    labelled_fields = (
        ('adm_cur_problems', 'Problémy pacienta při přijetí'),
        ('adm_findings', 'Nálezy při přijetí'),
        ('adm_conclusion', 'Závěr při přijetí'),
        ('dis_hosp_reason', 'Důvod hospitalizace'),
        ('dis_opers', 'Operace při propuštění'),
        ('dis_exams', 'Testy při propuštění'),
        ('MergedColumn', 'Provedené operace'),
    )
    merged_text = [
        f"{label}: {row[column]}"
        for column, label in labelled_fields
        if row[column]  # skip empty/falsy fields entirely
    ]
    return ', '.join(merged_text)

# Build the combined text for every row and store it as 'merged_text'.
merged_text_column = pre_merge_df.apply(merge_columns, axis='columns')
pre_merge_df['merged_text'] = merged_text_column

In [None]:
# Display pre_merge_df to inspect its current contents.
pre_merge_df

In [None]:
# Keep only the combined input text and the discharge summary column.
kept_columns = ["merged_text", "dis_hosp_summary"]
limited_df = pre_merge_df.loc[:, kept_columns]
limited_df

In [None]:
# Old column name -> new column name for the exported dataset.
column_mapping = dict(merged_text='information', dis_hosp_summary='summary')

# rename() returns a new frame; limited_df itself is left unchanged.
final_df = limited_df.rename(columns=column_mapping)
final_df

In [None]:
# Write the final frame to disk without the index column.
final_df.to_csv(path_or_buf='preprocessed_data.csv', index=False)