In [None]:
import os
import re

import numpy as np
import pandas as pd
import regex

In [None]:
# Load the raw records from the project's data directory.
data_path = os.path.join('..', 'data', "Louise-Prep-Data-SaraOgMonopolet.json")
df = pd.read_json(data_path)

# Discard the second record (positional index 1): its solutions do not match its dilemma.
mismatched_label = df.index[1]
df = df.drop(index=mismatched_label)

We need to do some data cleaning. Some solutions contain jokes, which are flagged as such in the text. Others contain stray metadata, such as 'Deltagere: {guest_sender}'. We use regular expressions to find and clean these:

In [None]:
def remove_jokes(df):
    """Return every row's ``acceptable_solutions`` with the joke answers removed.

    Candidate jokes are the individual solutions that contain a "jokey" phrase
    and sit in one of the hand-picked rows (``indices_to_clean``). Some of those
    candidates only mention humour without being jokes; they are spared via
    ``keep_positions``. The remaining texts are removed from every row by exact
    text match, because one row label holds several solutions and therefore
    cannot identify a single solution.

    Both hard-coded lists come from a manual inspection of this specific data
    set; they must be revisited if the input data changes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an ``acceptable_solutions`` column holding a list of strings
        per row. ``df`` itself is not modified.

    Returns
    -------
    list of list of str
        One cleaned solution list per row of ``df``, in row order.

    Raises
    ------
    IndexError
        If there are fewer candidate solutions than the largest entry in
        ``keep_positions`` requires.
    """
    jokey_phrases = ['humoristisk', 'joke', 'vits', 'for sjov', 'ironi', 'ironisk', 'sarkasme', 'sarkastisk', 'useriøs']
    pattern = '|'.join(jokey_phrases)

    # One entry per individual solution; the row labels of `df` are repeated.
    exploded_solutions = df['acceptable_solutions'].explode()
    jokey_solutions = exploded_solutions[
        exploded_solutions.str.contains(pattern, case=False, na=False)
    ]

    # Row labels that, after manual inspection, contain at least one real joke
    # answer (not just a solution that mentions a humorous approach).
    indices_to_clean = [107, 544, 685, 698, 1259, 1366, 1592, 1785, 1856, 2193, 2608, 2654, 2731, 2970, 3013,
                        73, 499, 1447, 1502, 1744, 1909, 2226, 2234, 2399]
    candidates = jokey_solutions[jokey_solutions.index.isin(indices_to_clean)]

    # Positions (within `candidates`) of solutions that share a row label with
    # a joke but are NOT jokes themselves, so they must be kept.
    keep_positions = [8, 15, 16, 21, 25]
    is_joke = np.ones(len(candidates), dtype=bool)
    is_joke[keep_positions] = False

    # Built once up front: membership is tested for every solution in every row.
    joke_texts = set(candidates[is_joke])

    solutions_all = []
    for row_label, solutions in df['acceptable_solutions'].items():
        # Report what is dropped so the removal can be checked by eye.
        removed = [s for s in solutions if s in joke_texts]
        if removed:
            print(f"Removing {removed} from row index {row_label}")

        solutions_all.append([s for s in solutions if s not in joke_texts])

    return solutions_all

In [None]:
def clean_solutions(lst):
    """Strip meta-comments and stray trailing punctuation from solution strings.

    Each string is processed in this order:

    0. parenthesised text (and the whitespace before it) is removed;
    1. everything from a ``deltager``/``deltagere``/``deltagerne`` label
       followed by ``:`` to the end of the string is removed;
    2. everything from one of the ``meta_labels`` followed by ``:`` to the end
       of the string is removed;
    3. if the result does not end in ``.``, ``?`` or ``!``, any trailing run of
       non-word characters or underscores is replaced by a single period.

    Relies on the standard-library ``re`` module being imported at the top of
    the notebook.

    Parameters
    ----------
    lst : iterable of str
        The solution texts of one row.

    Returns
    -------
    list of str
        The cleaned strings, in the same order and of the same length as
        ``lst`` (a string may end up empty).
    """
    # Compile every pattern up front so the loop below only applies them.
    parenthetical_pattern = re.compile(r'\s*\([^)]*\)')
    deltagere_pattern = re.compile(r'deltager(?:e|ne)?(?:/\w+)?\s*:.*$', flags=re.IGNORECASE)
    meta_labels = ['Orden', 'Betingelse', 'Rækkefølge', 'Aktør', 'Kræver', 'Ressourcer']
    meta_pattern = re.compile(r'(' + '|'.join(meta_labels) + r')\s*:.*$', flags=re.IGNORECASE)
    trailing_junk_pattern = re.compile(r'[\W_]+$')

    cleaned = []
    for s in lst:
        s = s.strip()
        # 0. Remove text in parentheses first
        s = parenthetical_pattern.sub('', s).strip()
        # 1. Remove deltager meta-comments
        s = deltagere_pattern.sub('', s).strip()
        # 2. Remove other meta-comments
        s = meta_pattern.sub('', s).strip()
        # 3. Replace trailing weird punctuation with a period, but keep ., ?, !
        if s and s[-1] not in {'.', '?', '!'}:
            s = trailing_junk_pattern.sub('.', s)
        cleaned.append(s)
    return cleaned

In [None]:
# Stage 1: drop the manually identified joke answers from every row.
joke_free_solutions = remove_jokes(df)
df['acceptable_solutions_cleaned'] = joke_free_solutions

# Stage 2: strip meta-comments and stray punctuation from the remaining solutions.
df['acceptable_solutions_cleaned'] = df['acceptable_solutions_cleaned'].map(clean_solutions)

In [None]:
# Write the cleaned frame to CSV, leaving out the original (uncleaned) solutions column.
df_cleaned = df.drop(columns=['acceptable_solutions'])

df_cleaned.to_csv('../data/monopolet_cleaned.csv')