In [83]:
import pandas as pd
import numpy as np
import random
import re


# Load the integrated questionnaire plus the per-column type and label tables.
# Each CSV carries an 'Unnamed: 0' column (presumably a saved index -- TODO confirm)
# that is dropped right after reading.
df = pd.read_csv('integrated_data/Fragebogen_integrated.csv').drop(columns=['Unnamed: 0'])
types = pd.read_csv('integrated_data/labels_type_integrated.csv').drop(columns=['Unnamed: 0'])
labels = pd.read_csv('integrated_data/labels_instances_integrated.csv').drop(columns=['Unnamed: 0'])

# Collapse each table to {column name: value found in its first row}.
types_dict = {
    column: list(cells.values())[0]
    for column, cells in types.to_dict().items()
}
labels_dict = {
    column: list(cells.values())[0]
    for column, cells in labels.to_dict().items()
}

def adjust_unchecked_and_textual_cols(df, types, labels):
    """Return a copy of ``df`` with placeholder values filled into two kinds of columns.

    Args:
        df: DataFrame to normalise; it is copied and never modified.
        types: Mapping of column name to a type code (``'F3'`` and ``'A1000'``
            are the only codes acted on here).
        labels: Mapping of column name to its set of allowed labels.

    Returns:
        A new DataFrame where
        * ``'F3'`` columns whose label set is exactly ``{'Unchecked', 'Checked'}``
          have missing values replaced by ``'Unchecked'``;
        * ``'A1000'`` columns have missing values, blank strings and strings made
          only of characters outside ``[a-zA-Z0-9]``/whitespace replaced by
          ``'Keine Angabe'``.
    """
    def _blank_to_placeholder(x):
        # Blank or symbol-only strings carry no information; non-strings pass through.
        if isinstance(x, str) and (x.strip() == '' or re.match(r'^[^a-zA-Z0-9\s]+$', x)):
            return 'Keine Angabe'
        return x

    df = df.copy()
    for col in df.columns:
        col_type = types.get(col)
        if col_type == 'F3' and labels.get(col) == {'Unchecked', 'Checked'}:
            df[col] = df[col].fillna('Unchecked')
        if col_type == 'A1000':
            # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
            # the in-place call works on a temporary Series and leaves the frame
            # untouched when pandas Copy-on-Write is active.
            df[col] = df[col].fillna('Keine Angabe').apply(_blank_to_placeholder)
    return df

def lowercase_dict_values(input_dict):
    """Parse stringified set literals in ``input_dict`` into real ``set`` objects.

    Despite its name, this function does not change the case of any item; the
    name is kept so existing callers continue to work.

    Args:
        input_dict: Mapping whose values are expected to be strings holding a
            Python set literal, e.g. ``"{'Checked', 'Unchecked'}"``.

    Returns:
        A new dict with the same keys. Values that parse to a set are returned
        as sets; anything else (non-set literals, unparsable text, non-string
        values such as NaN) is reported via ``print`` and passed through unchanged.
    """
    # Local import so the block stays self-contained.
    import ast

    result = {}
    for key, value in input_dict.items():
        try:
            # literal_eval replaces the former eval(): it accepts only Python
            # literals, so text read from the CSV can never execute code.
            parsed = ast.literal_eval(value)
            if not isinstance(parsed, set):
                raise ValueError("Value is not a set.")
            result[key] = set(parsed)
        except Exception as e:
            # Best effort: keep the raw value so downstream code still sees the key.
            print(f"Error processing key {key}: {e}")
            result[key] = value
    return result
# Turn the stringified label sets read from CSV into Python sets.
labels_dict = lowercase_dict_values(labels_dict)
# Disabled preprocessing step, kept for reference:
#df = adjust_unchecked_and_textual_cols(df,types_dict,labels_dict)
# Re-key the labels stored under "schlaf_2.66" as "schlaf_2.65".
labels_dict["schlaf_2.65"] = labels_dict.pop("schlaf_2.66")
# Remove two columns from the questionnaire frame.
df = df.drop(columns=['vorerkrankungen_other.71', 'schlaf_2.72'])


In [84]:
import pandas as pd
import re

def clean_string(input_string):
    """Normalise text for token comparison.

    Every character that is neither a word character nor whitespace becomes a
    space; whitespace runs are then collapsed to single spaces and leading or
    trailing whitespace is removed.
    """
    without_punctuation = re.sub(r'[^\w\s]', ' ', input_string)
    tokens = without_punctuation.split()
    # Joining the tokens already yields a string without outer whitespace.
    return ' '.join(tokens)

def dice_similarity(str1, str2):
    """Return the Dice coefficient of the lower-cased word sets of two strings.

    Both inputs are normalised with ``clean_string`` and split into words.
    The score is ``2 * |A & B| / (|A| + |B|)``, a float in ``[0, 1]``.

    Returns ``0.0`` when neither string contains a word (both empty or made
    only of punctuation).
    """
    # split() without a separator returns [] for an empty string, whereas
    # split(' ') returns [''] -- that spurious empty token made two empty
    # strings score 1.0 and left the zero-denominator guard unreachable.
    tokens1 = set(clean_string(str1).lower().split())
    tokens2 = set(clean_string(str2).lower().split())
    total = len(tokens1) + len(tokens2)
    if total == 0:
        return 0.0
    return (2 * len(tokens1 & tokens2)) / total

def remove_duplicates_and_update_df(df, labels, threshold=0.65):
    """Merge near-duplicate category labels and rewrite ``df`` to match.

    For every key in ``labels`` the categories are scanned in order; a category
    whose ``dice_similarity`` to an already accepted category exceeds
    ``threshold`` is mapped onto that accepted category, otherwise it is
    accepted itself.

    Args:
        df: DataFrame whose columns named like keys of ``labels`` are rewritten.
            The frame is modified in place (pass a copy to keep the original).
        labels: Mapping of column name to an iterable of category labels.
        threshold: Similarity above which two labels are treated as duplicates.

    Returns:
        ``(updated_labels, df)`` where ``updated_labels`` maps each key to the
        set of accepted categories and ``df`` is the same frame object passed in.
    """
    updated_labels = {}
    replacement_map = {}
    for key, categories in labels.items():
        # Sets of strings iterate in an order that depends on per-process hash
        # randomisation, so the chosen representative used to vary between
        # kernel restarts. Sorting makes the outcome reproducible; ordered
        # inputs (lists, tuples, ...) keep the caller's order.
        if isinstance(categories, (set, frozenset)):
            ordered_categories = sorted(categories, key=str)
        else:
            ordered_categories = categories

        unique_categories = []
        category_map = {}
        for category in ordered_categories:
            for unique_category in unique_categories:
                if dice_similarity(category, unique_category) > threshold:
                    category_map[category] = unique_category
                    break
            else:
                # No accepted category is similar enough: accept this one.
                unique_categories.append(category)
                category_map[category] = category

        updated_labels[key] = set(unique_categories)
        replacement_map[key] = category_map

    for key, mapping in replacement_map.items():
        if key in df.columns:
            df[key] = df[key].replace(mapping)

    return updated_labels, df
# Deduplicate near-identical labels on a copy of `df`; the explicit threshold of 0.75
# overrides the function's default of 0.65.
updated_labels, updated_df = remove_duplicates_and_update_df(df.copy(), labels_dict, threshold=0.75)

In [89]:
import uuid

def generate_unique_value(row, length):
    """Replace an all-missing blocking key with a random UUID string.

    Args:
        row: Blocking-key string built for one record.
        length: Number of columns that contributed to the key.

    Returns:
        A fresh ``uuid4`` string when ``row`` represents a record whose every
        contributing cell was missing, otherwise ``row`` unchanged.

    The key is considered all-missing when it equals ``'nan' * length`` (full
    ``str(NaN)`` per cell) or ``'n' * length`` (only the first character of each
    cell kept, which is how the blocker is built in this notebook). The second
    form was previously not recognised, so the UUID branch could never trigger
    for first-character keys. A record whose every cell merely starts with
    'n' is indistinguishable from an all-missing one and also gets a UUID.

    Note: an earlier, shadowed definition of this function that operated on an
    iterable of cell strings was removed; only this string-based version was
    ever in effect.
    """
    if row in ('nan' * length, 'n' * length):
        return str(uuid.uuid4())
    return row
# Build a blocking key per record: the lower-cased first character of every
# column value (spaces removed), concatenated in column order.
# 'blocker' itself is excluded so that re-running this cell does not feed the
# previous key back into the new one.
blockers = [col for col in updated_df.columns if col not in ('source', 'blocker')]
updated_df['blocker'] = (
        updated_df[blockers]
        .astype(str)
        # val[:1] instead of val[0]: an empty string contributes nothing rather
        # than raising IndexError.
        .agg(lambda row: ''.join(val[:1] for val in row), axis=1)
        .str.lower()
        .str.replace(' ', '', regex=True)
        .apply(generate_unique_value, args=(len(blockers),))
    )

In [90]:
# Display the deduplicated frame, which now also carries the `blocker` column.
updated_df

Unnamed: 0,long_covid_post_covid_patient_timestamp,alter,groe_e,geschlecht,impfung_01_wirkstoff,impfung_02_wirkstoff,impfung_03_wirkstoff,impfung_04_wirkstoff,impfung_01_charge,impfung_02_charge,...,wahr_2,lesewort_2,schlaf_2.65,sinn_2,sport_2,soz_2,long_covid_post_covid_patient_complete,menstruation,source,blocker
0,2022-06-13 11:19:01,29.0,175.0,Männlich,Johnson&Johnson,,,,XE395,,...,Ja,Nein,Ja,Nein,Extrem stark eingeschränkt,Eingeschränkt,Complete,Unchecked,Dataframe_2,221mjnnnxnnn2nnn1nuuuuuuuuuuuuuuuuuuuuuuuuuuuu...
1,2022-06-13 11:21:15,29.0,175.0,Männlich,Moderna,,,,3003606,,...,Nein,Ja,Ja,Nein,Extrem stark eingeschränkt,Eingeschränkt,Complete,Unchecked,Dataframe_2,221mmnnn3nnn2nnn1ncuuuuuuuuuuuuuuuuuuuuuuuuuuu...
2,2022-06-13 11:22:22,29.0,165.0,Weiblich,Biontech/Pfizer,Biontech/Pfizer,,,Scvc6,Scrp9,...,Ja,Nein,Nein,Nein,Eingeschränkt,Uneingeschränkt,Complete,Checked,Dataframe_2,221wbbnnssnn22nn1nuuuuuuuuuuuuuuuuuuuuuuuuuuuu...
3,2022-06-13 11:24:42,39.0,155.0,Weiblich,Biontech/Pfizer,Biontech/Pfizer,,,EW8904,1C008A,...,Ja,Ja,Nein,Ja,Extrem stark eingeschränkt,Eingeschränkt,Complete,Unchecked,Dataframe_2,231wbbnne1nn22nn1nuuuuuuuuuuuuuuuuuuuuuuuuuuuu...
4,2022-06-13 11:25:16,39.0,175.0,Weiblich,Biontech/Pfizer,Biontech/Pfizer,,,EX8680,FD7985,...,Ja,Nein,Ja,Ja,Extrem stark eingeschränkt,Eingeschränkt,Complete,Checked,Dataframe_2,231wbbnnefnn22nn1nuuuuuuuuuuuuuuuuuuuuuuuuuuuu...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5048,,78.0,194.0,Männlich,AstraZeneca,AstraZeneca,Moderna,Moderna,,,...,,,,,Eingeschränkt,Eingeschränkt,Complete,Unchecked,Dataframe_1,n71maammnnnn22221juuuuuuuuuuuuuuuuuuuuuuuuuuuu...
5049,,,158.0,Männlich,,,,,,,...,Nie,Manchmal,Nie,Nie,Uneingeschränkt,Uneingeschränkt,Complete,Unchecked,Dataframe_1,nn1mnnnnnnnnnnnn3nuuuuuuuucuuuuuuuuuuuuuuuuuuu...
5050,,,,,Biontech/Pfizer,Biontech/Pfizer,Biontech/Pfizer,,,,...,Manchmal,Manchmal,Nie,Nie,Eingeschränkt,Uneingeschränkt,Complete,Unchecked,Dataframe_1,nnnnbbbnnnnn222n1nuuuuuuuuuuuuuuuuuuuuuuuuuuuu...
5051,,,,,Biontech/Pfizer,Biontech/Pfizer,,,,,...,,,,,Extrem stark eingeschränkt,Extrem stark eingeschränkt,Complete,Unchecked,Dataframe_1,nnnnbbnnnnnn22nn1nuuuuuuuuuuuuuuuuuuuuuuuuuuuu...


In [111]:
def errors_first_col(df):
    """Flag every missing value in the timestamp column.

    Returns a dict keyed by ``(row label, column position)`` whose value is
    always the string ``'N/A'``.
    """
    timestamp_col = 'long_covid_post_covid_patient_timestamp'
    is_missing = df[timestamp_col].isna()
    position = df.columns.get_loc(timestamp_col)
    return {(row_label, position): 'N/A' for row_label in df.index[is_missing]}

def errors_second_third_col(df):
    """Flag missing or zero entries in the 'alter' and 'groe_e' columns.

    Returns a dict keyed by ``(row label, column position)``. The value is
    ``'N/A'`` for a missing cell and the cell's own value (i.e. 0) otherwise.
    """
    flagged = {}
    for name in ('alter', 'groe_e'):
        position = df.columns.get_loc(name)
        suspicious = df[name].isna() | df[name].eq(0)
        for row_label in df.index[suspicious]:
            cell = df.at[row_label, name]
            if pd.isna(cell):
                flagged[(row_label, position)] = 'N/A'
            else:
                flagged[(row_label, position)] = cell
    return flagged

def errors_categorical(df, labels, columns=None):
    """Flag categorical cells that are missing or not among the allowed labels.

    Args:
        df: DataFrame to check; must contain every column in ``columns``.
        labels: Mapping of column name to an iterable of allowed labels. A
            column without an entry has no allowed labels, so every non-missing
            cell in it is flagged.
        columns: Column names to check. Defaults to the notebook's fixed list
            of categorical questionnaire columns.

    Returns:
        Dict keyed by ``(row label, column position)``. The value is ``'N/A'``
        for a missing cell, otherwise the offending cell value. Comparison is
        case-insensitive.
    """
    if columns is None:
        columns = ['geschlecht','grippeimpfung','allergien','darm_2',
           'muskelschm_2', 'konz_2', 'allg_krank_2', 'lagewechsel_2', 'kopf_2',
           'gelenk_2', 'wahr_2', 'lesewort_2', 'schlaf_2.65', 'sinn_2', 'sport_2',
           'soz_2', 'long_covid_post_covid_patient_complete']

    errors_dict = {}
    for col in columns:
        # Lower-case the allowed labels once per column (it used to be rebuilt
        # for every row) and keep them in a set for O(1) membership tests.
        valid_lower = {str(valid).lower() for valid in labels.get(col, set())}
        col_idx = df.columns.get_loc(col)
        for idx, value in df[col].items():
            if pd.isna(value):
                errors_dict[(idx, col_idx)] = 'N/A'
            # str() so that a non-string cell (e.g. a number) is reported as an
            # error instead of raising AttributeError on .lower().
            elif str(value).lower() not in valid_lower:
                errors_dict[(idx, col_idx)] = value
    return errors_dict

def errors_menstruation(df):
    """Flag problematic cells in the 'menstruation' column.

    A cell is flagged with ``'N/A'`` when it is missing, and with ``'Checked'``
    when it equals 'Checked' while 'geschlecht' in the same row is 'Männlich'.
    Keys are ``(row label, column position)``.
    """
    position = df.columns.get_loc('menstruation')

    flagged = {
        (row_label, position): 'N/A'
        for row_label in df.index[df['menstruation'].isna()]
    }

    contradictory = (df['menstruation'] == 'Checked') & (df['geschlecht'] == 'Männlich')
    flagged.update(
        {(row_label, position): 'Checked' for row_label in df.index[contradictory]}
    )
    return flagged

In [103]:
def is_valid_charge(charge):
    """Return True when ``charge`` looks like a vaccine batch identifier.

    A value is accepted (case-insensitively) when it
    * starts with ``'ch'``, or
    * contains at least one letter and at least one digit, or
    * consists of digits only.

    Missing values (``None``/NaN) and everything else yield False.
    """
    # Test for missing values before the str() conversion; afterwards NaN has
    # become the text 'nan' and pd.isna can no longer detect it.
    if pd.api.types.is_scalar(charge) and pd.isna(charge):
        return False

    text = str(charge).lower()
    if text.startswith('ch'):
        return True
    has_letter = any(c.isalpha() for c in text)
    has_digit = any(c.isdigit() for c in text)
    if has_letter and has_digit:
        return True
    return text.isdigit()

def impfung_error(df, col_number):
    """Flag inconsistent substance/date/batch triples for one vaccination dose.

    Args:
        df: DataFrame containing ``impfung_0<n>_wirkstoff``, ``..._datum`` and
            ``..._charge`` for ``n = col_number``.
        col_number: Dose number inserted into the column names.

    Returns:
        Dict keyed by ``(row label, column position)``; the value is the
        offending cell, or ``'N/A'`` when that cell is missing.

    Rules (D = date missing, V = ``is_valid_charge(charge)``):
        * wirkstoff == 'keine':
            - date present          -> wirkstoff flagged (+ charge if not V)
            - D and V               -> date and wirkstoff flagged
            - D and not V           -> consistent, nothing flagged
        * wirkstoff is a known vaccine:
            - D                     -> date flagged
            - not V                 -> charge flagged
        * wirkstoff missing:
            - wirkstoff always flagged
            - D and V               -> date flagged
            - date present, not V   -> charge and date flagged
        * any other wirkstoff value -> nothing flagged by this function.
    """
    wirkstoff_col = f'impfung_0{col_number}_wirkstoff'
    datum_col = f'impfung_0{col_number}_datum'
    charge_col = f'impfung_0{col_number}_charge'

    error_dict = {}
    if len(df) == 0:
        # Nothing to inspect; also avoids column look-ups on an empty frame.
        return error_dict

    valid_wirkstoffe = {
        'AstraZeneca',
        'Biontech/Pfizer',
        'Johnson&Johnson',
        'Moderna',
        'Novavax',
    }

    # Column positions do not change while iterating: resolve them once.
    wirkstoff_loc = df.columns.get_loc(wirkstoff_col)
    datum_loc = df.columns.get_loc(datum_col)
    charge_loc = df.columns.get_loc(charge_col)

    def get_value_or_na(value):
        return 'N/A' if pd.isna(value) else value

    for row_index, row in df.iterrows():
        wirkstoff = row[wirkstoff_col]
        datum = row[datum_col]
        charge = row[charge_col]

        wirkstoff_missing = pd.isna(wirkstoff)
        if not (wirkstoff_missing
                or wirkstoff == 'keine'
                or wirkstoff in valid_wirkstoffe):
            continue

        datum_missing = pd.isna(datum)
        charge_ok = is_valid_charge(charge)

        if wirkstoff_missing:
            error_dict[(row_index, wirkstoff_loc)] = get_value_or_na(wirkstoff)
            if datum_missing and charge_ok:
                error_dict[(row_index, datum_loc)] = get_value_or_na(datum)
            elif not datum_missing and not charge_ok:
                error_dict[(row_index, charge_loc)] = get_value_or_na(charge)
                error_dict[(row_index, datum_loc)] = get_value_or_na(datum)

        elif wirkstoff == 'keine':
            if not datum_missing:
                # "No vaccine" contradicts a recorded date.
                error_dict[(row_index, wirkstoff_loc)] = get_value_or_na(wirkstoff)
                if not charge_ok:
                    error_dict[(row_index, charge_loc)] = get_value_or_na(charge)
            elif charge_ok:
                # "No vaccine" contradicts a plausible batch number.
                error_dict[(row_index, datum_loc)] = get_value_or_na(datum)
                error_dict[(row_index, wirkstoff_loc)] = get_value_or_na(wirkstoff)

        else:
            # Known vaccine: both a date and a plausible batch are expected.
            # (The former duplicate `elif` for "date missing and charge invalid"
            # was unreachable and has been folded into these two checks.)
            if datum_missing:
                error_dict[(row_index, datum_loc)] = get_value_or_na(datum)
            if not charge_ok:
                error_dict[(row_index, charge_loc)] = get_value_or_na(charge)

    return error_dict


def all_impfung_error(df):
    """Run ``impfung_error`` for doses 1 through 4 and merge the results into one dict."""
    combined = {}
    for dose in (1, 2, 3, 4):
        combined.update(impfung_error(df, dose))
    return combined

def is_valid_date(value, min_year=2020, max_year=2024):
    """Return True when ``value`` is a ``YYYY-MM-DD`` date within the year range.

    Args:
        value: Candidate date, normally a string.
        min_year: Earliest accepted year (inclusive).
        max_year: Latest accepted year (inclusive).

    Returns:
        True when ``value`` parses with the format ``%Y-%m-%d`` and its year
        lies in ``[min_year, max_year]``; False for unparsable, missing or
        out-of-range input.
    """
    try:
        date = pd.to_datetime(value, format='%Y-%m-%d', errors='raise')
        # Missing input parses to None/NaT, which has no usable year.
        if date is None or date is pd.NaT:
            return False
        return bool(min_year <= date.year <= max_year)
    except (ValueError, TypeError):
        return False


def check_vaccination_dates(df):
    """Flag vaccination dates that are invalid or out of chronological order.

    For each row, every non-missing value of the four ``impfung_0<n>_datum``
    columns is checked with ``is_valid_date``; failures are flagged with their
    raw value. The remaining dates are compared pairwise in dose order, and the
    later dose is flagged whenever an earlier dose's date is on or after it.

    Returns a dict keyed by ``(row label, column position)``.
    """
    dose_columns = ['impfung_01_datum', 'impfung_02_datum', 'impfung_03_datum', 'impfung_04_datum']
    flagged = {}

    for row_label, record in df.iterrows():
        # (column name, parsed timestamp) for every valid date, in dose order.
        parsed = []
        for name in dose_columns:
            raw = record[name]
            if pd.isna(raw):
                continue
            if not is_valid_date(raw):
                flagged[(row_label, df.columns.get_loc(name))] = raw
                continue
            parsed.append((name, pd.to_datetime(raw, format='%Y-%m-%d')))

        for earlier in range(len(parsed)):
            for later in range(earlier + 1, len(parsed)):
                if parsed[earlier][1] >= parsed[later][1]:
                    later_name = parsed[later][0]
                    flagged[(row_label, df.columns.get_loc(later_name))] = record[later_name]

    return flagged

In [104]:
def detect_error_textual(df):
    """Flag free-text cells containing special characters or the words 'und'/'mit'.

    Only the columns 'vorerkrankungen_other.47', 'allergien_other' and
    'nahrung_others' are inspected, and only if present. Cells are converted to
    ``str`` first; a cell is flagged when it contains a character other than
    letters (including German umlauts and ß), digits, commas and whitespace, or
    the standalone word 'und' or 'mit' in any case.

    Returns a dict keyed by ``(row label, column position)`` holding the
    stringified cell.
    """
    patterns = (
        re.compile(r'[^a-zA-ZäöüÄÖÜß0-9,\s]'),      # anything but letters, digits, comma, whitespace
        re.compile(r'\bund\b', re.IGNORECASE),       # standalone 'und'
        re.compile(r'\bmit\b', re.IGNORECASE),       # standalone 'mit'
    )

    flagged = {}
    for name in ('vorerkrankungen_other.47', 'allergien_other', 'nahrung_others'):
        if name not in df.columns:
            continue
        position = df.columns.get_loc(name)
        for row_label, text in df[name].astype(str).items():
            if any(pattern.search(text) for pattern in patterns):
                flagged[(row_label, position)] = text

    return flagged
    

In [105]:
# Run every detector on the deduplicated frame and merge the findings.
# Later detectors overwrite earlier entries for the same (row, column) key.
errors = {
    **errors_first_col(updated_df),
    **errors_second_third_col(updated_df),
    **errors_categorical(updated_df, updated_labels),
    **errors_menstruation(updated_df),
    **all_impfung_error(updated_df),
    **check_vaccination_dates(updated_df),
    **detect_error_textual(updated_df),
}

In [106]:
from collections import Counter
# Count flagged cells per row label; the keys of `errors` are (row, column) tuples.
row_error_counts = Counter(row for (row, col) in errors)
# NOTE(review): `errors` was computed from `updated_df`, but the row universe and the
# final selection below use `df` -- confirm that both frames share the same index.
all_rows = set(df.index)
rows_with_errors = set(row_error_counts.keys())
rows_with_zero_errors = all_rows - rows_with_errors
# Give rows without any finding an explicit count of 0 so they take part in the sort.
for row in rows_with_zero_errors:
    row_error_counts[row] = 0


# Ascending by error count; sorted() is stable, so ties keep the Counter's insertion order.
sorted_rows = sorted(row_error_counts.items(), key=lambda x: x[1])
# NOTE(review): despite the name, this keeps the first 443 rows, not 50. The value 443
# is a magic number -- presumably the number of error-free rows; verify before reuse.
cleanest_50_rows = [row for row, count in sorted_rows[:443]]
# Select those rows from `df` and renumber them from 0.
cleanest_df = df.loc[cleanest_50_rows].reset_index(drop=True)
cleanest_df

Unnamed: 0,long_covid_post_covid_patient_timestamp,alter,groe_e,geschlecht,impfung_01_wirkstoff,impfung_02_wirkstoff,impfung_03_wirkstoff,impfung_04_wirkstoff,impfung_01_charge,impfung_02_charge,...,gelenk_2,wahr_2,lesewort_2,schlaf_2.65,sinn_2,sport_2,soz_2,long_covid_post_covid_patient_complete,menstruation,source
0,2022-06-24 23:26:51,49.0,155.0,weiblich,keine,keine,keine,keine,Keine Angabe,Keine Angabe,...,Ja,Ja,Ja,Ja,Ja,Eingeschränkt,Uneingeschränkt,Complete,Checked,Dataframe_2
1,2022-06-13 11:25:29,39.0,165.0,weiblich,Biontech/Pfizer,keine,keine,keine,FE6975,Keine Angabe,...,Ja,Ja,Ja,Ja,Ja,Extrem stark eingeschränkt,Extrem stark eingeschränkt,Complete,Checked,Dataframe_2
2,2022-06-13 11:28:16,39.0,175.0,weiblich,Biontech/Pfizer,Biontech/Pfizer,Moderna,keine,FD9234,FC1440,...,Nein,Ja,Ja,Nein,Ja,Eingeschränkt,Eingeschränkt,Complete,Unchecked,Dataframe_2
3,2022-06-25 11:36:16,29.0,175.0,weiblich,Moderna,Moderna,keine,keine,3002186,3002918,...,Ja,Ja,Ja,Ja,Ja,Extrem stark eingeschränkt,Extrem stark eingeschränkt,Complete,Unchecked,Dataframe_2
4,2022-06-13 11:36:43,49.0,155.0,weiblich,Biontech/Pfizer,Biontech/Pfizer,keine,keine,SCTN4,FH9678,...,Ja,Ja,Ja,Nein,Ja,Eingeschränkt,Uneingeschränkt,Complete,Checked,Dataframe_2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438,2022-06-24 18:09:48,29.0,165.0,weiblich,keine,keine,keine,keine,Keine Angabe,Keine Angabe,...,Nein,Ja,Ja,Ja,Ja,Eingeschränkt,Eingeschränkt,Complete,Checked,Dataframe_2
439,2022-06-24 19:44:16,29.0,185.0,Männlich,Biontech/Pfizer,Biontech/Pfizer,keine,keine,EX 7823,FC3095,...,Nein,Nein,Nein,Nein,Nein,Eingeschränkt,Uneingeschränkt,Complete,Unchecked,Dataframe_2
440,2022-06-24 20:19:27,59.0,175.0,Männlich,keine,keine,keine,keine,Keine Angabe,Keine Angabe,...,Nein,Nein,Nein,Nein,Nein,Uneingeschränkt,Uneingeschränkt,Complete,Unchecked,Dataframe_2
441,[not completed],49.0,175.0,weiblich,keine,keine,keine,keine,Keine Angabe,Keine Angabe,...,Nein,Nein,Nein,Nein,Nein,Uneingeschränkt,Uneingeschränkt,Incomplete,Unchecked,Dataframe_2


In [107]:

# Sanity check: run every detector on the selected subset and show the findings.
errors_cleanest = {
    **errors_first_col(cleanest_df),
    **errors_second_third_col(cleanest_df),
    **errors_categorical(cleanest_df, updated_labels),
    **errors_menstruation(cleanest_df),
    **all_impfung_error(cleanest_df),
    **check_vaccination_dates(cleanest_df),
    **detect_error_textual(cleanest_df),
}
errors_cleanest

{}

In [110]:
# (rows, columns) of the selected subset.
cleanest_df.shape

(443, 72)

# export clean data

In [109]:
# Write the selected rows to clean.csv in the working directory, without the index column.
cleanest_df.to_csv('clean.csv',index=False)

In [122]:

# Load the dirty data set and run every detector on it, merging the findings.
dirty = pd.read_csv('dirty.csv')
errors_dirty = {
    **errors_first_col(dirty),
    **errors_second_third_col(dirty),
    **errors_categorical(dirty, updated_labels),
    **errors_menstruation(dirty),
    **all_impfung_error(dirty),
    **check_vaccination_dates(dirty),
    **detect_error_textual(dirty),
}

In [ ]:
# values of our mimir evaluation
# Three recorded result sets follow. Each triple rebinds the same three names, so after
# running this cell only the last triple remains bound.
# NOTE(review): which configuration or run each triple belongs to is not recorded here -- TODO label them.
Precision = 0.72
Recall = 0.70
F1 = 0.71


Precision = 0.74
Recall = 0.72
F1 = 0.73


Precision = 0.76
Recall = 0.73
F1 = 0.75
