In [1]:
import os
import re
import numpy as np
import pandas as pd
import warnings

In [2]:
def get_num_from_string(input_string):
    """
    Get digits from string and join

    Parameters:
    -----------
    input_string:   str
        string to extract numbers from

    Returns
    -------
     float built from the digits of the string,
     joined in their order of appearance, or nan
     if the input cannot be iterated (e.g. a float
     nan from an empty cell) or holds no usable digits
    """
    try:
        digits = ''.join(s for s in input_string if s.isdigit())
    except TypeError:
        # input is not iterable, e.g. nan / None
        return np.nan
    try:
        return float(digits)
    except ValueError:
        # no digits at all (float('') fails), or characters such as
        # superscripts that str.isdigit() accepts but float() rejects
        return np.nan
    
def remove_unwanted_char(input_str):
    """
    People enter various different 
    characters when they are meant to
    enter only numbers.
    Hence, this function ;)

    Parameters
    ----------
    input_str:  str
        string to process
    
    Returns
    -------
        nan if the input is not a string
        (TypeError from the regex search) or
        contains no digits at all; otherwise
        all digits of the input, joined in order,
        as a floating point number.
    """
    try:
        digit_runs = re.findall(r"\d+", input_str)
    except TypeError:
        # non-string input, e.g. nan from an empty cell
        return np.nan
    if not digit_runs:
        # nothing numeric was entered; float('') would raise ValueError
        return np.nan
    return float(''.join(digit_runs))

Specify the file paths and read in the files we need.

In [3]:
# input locations
randomisation_dir = r"P:\Spironolactone\screening_randomization"
main_qualtrics_dir = r"P:\Spironolactone\main_qualtrics"

# output directory
output_dir = os.path.join(randomisation_dir,"valid_screening_records")

# Only warn about overwriting when the directory really is there already.
# Any other failure to create it (permissions, unreachable drive, a file
# of the same name, ...) is raised instead of being reported as
# "already exists".
output_dir_existed = os.path.isdir(output_dir)
os.makedirs(output_dir, exist_ok=True)
if output_dir_existed:
    warnings.warn("Directory already exists. Files may be overwritten. Manual check advised.")


# cols we want from screening: items Q72..Q92 (presumably the BDI
# questions, going by the variable name) plus the column holding the ID
BDI_cols = [f"Q{num}" for num in range(72,93)]
BDI_cols.append("Telephone number")

# read in the dataframes
# skiprows = [1,2] drops the two file lines directly below the header
# (presumably extra header rows of the export - confirm against the raw file)

# study day qualtrics
qualtrics_df = pd.read_csv(
    os.path.join(main_qualtrics_dir,"main_dat21.csv"),
    usecols = ["DQ-1"],skiprows = [1,2]
    )

# randomisation
randomisation_df = pd.read_csv(
    os.path.join(randomisation_dir,"randomisation.csv")
    )

# screening - contains BDI score
# The ID column is forced to text: if pandas parsed (some of) it as
# numbers, the later regex clean-up would hit non-string values and
# turn perfectly valid IDs into nan.
screening_df = pd.read_csv(
    os.path.join(randomisation_dir,"online_screening.csv"),
    usecols = BDI_cols, skiprows = [1,2],
    dtype = {"Telephone number": str}
    )

# just for convenience, rename column that links
# randomisation and screening dfs.
screening_df = screening_df.rename(
    columns = {"Telephone number":"Screening ID"}
    )

# convert screening ID to number
# removes the two initials at the start
randomisation_df["Screening ID"] = randomisation_df["Screening ID"].apply(
    get_num_from_string
    )

# filter out people who haven't been assigned a study day date 
# and who have no check against the day 8 questionnaire
randomisation_df = randomisation_df[
    randomisation_df["Study day complete /date"].notnull()
    & randomisation_df["Check.7"].notnull()
    ]

# I was pleased to find that people have not stopped
# entering ALL SORTS of stuff when they should only
# enter a number...hence I needed to remove some special characters.
screening_df["Screening ID"] = screening_df["Screening ID"].apply(
    remove_unwanted_char
    )

# Finally, filter for screening records with a screening id that also
# exists in the randomisation file.
# Missing IDs are removed from the lookup first: isin() treats nan as
# equal to nan, so a single randomisation row without a usable ID would
# otherwise let every screening record with an unparseable ID through.
randomised_ids = randomisation_df["Screening ID"].dropna()
valid_participants_df = screening_df[
    screening_df["Screening ID"].isin(randomised_ids)
    ]

print(f"Found {valid_participants_df.shape[0]} records in total.")

# records beyond the first for any given screening ID
num_duplicates = (valid_participants_df.shape[0] - 
    valid_participants_df["Screening ID"].unique().shape[0]
    )

valid_participants_df.to_csv(os.path.join(output_dir,"valid_participants.csv"),index = False)

print(f"Number of duplicate screening IDs in records is {num_duplicates}.\n")

Found 49 records in total.
Number of duplicate screening IDs in records is 3.



