*__Note:__* Filepaths should be changed as necessary

In [None]:
import pandas as pd
import os
import numpy as np
import re
from itertools import compress
from IPython.display import display
from functools import reduce
from math import floor
from statistics import mode
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import subprocess
import sys
import time
from sklearn.preprocessing import OneHotEncoder
try:
    import fancyimpute
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", 'fancyimpute'])
finally:
    import fancyimpute
from fancyimpute import IterativeImputer

nltk.download('punkt')
nltk.download('wordnet')



In [2]:
# Step 1: Merge 48 studies (*.csv) from the /DATA/TEDDY
# Input: file_directory = "/home/NIDDK/DATA/TEDDY/"
# Output: data frame (817 rows stand for 817 participants, *** columns stand for the tranformed columns from 48 studies)

#calculate % completeness of column
#missing value check
def missing_value(val):
    if isinstance(val, str):
        val = val.lower()
        return val != val or val == "" or val == "null" or val == "" or val == "nan" or val == "not reported"
    else:
        return val != val or np.isnan(val)

def calculate_completeness(value_list):
    return(sum([not(missing_value(val)) for val in value_list])/len(value_list))

def remove_columns_low_representation(dataframe, threshold):
    column_list = list(dataframe.columns)
    column_list = column_list[1:len(column_list)]
    completeness_list = [calculate_completeness(dataframe[column]) for column in column_list]
    completeness_dict = dict(zip(column_list, completeness_list))
    for column in column_list:
        if completeness_dict[column] < threshold:
            column_list.remove(column)
    column_list.insert(0, "MaskID")
    return dataframe.loc[:, column_list]

def generate_split_frame(dataframe, key_column, split_column, split_value):
    dataframe = dataframe.drop(split_column, axis = 1)
    column_list = list(dataframe.columns)
    column_list.pop(0)
    modified_column_list = ["observation_" + str(split_value) + "_" + column \
                           for column in column_list]
    dataframe.columns = [key_column] + modified_column_list
    return dataframe

#used to mark the observation # of each record in files which have multiple records per case (different timepoints)
def number_of_appearances(element_list):
    element_count_list = []
    appearance_dict = {}
    for element in element_list:
        if element in appearance_dict.keys():
            count = appearance_dict[element] + 1
            appearance_dict[element] = count
            element_count_list = element_count_list + [count]
        else:
            appearance_dict[element] = 1
            element_count_list = element_count_list + [1]
    return element_count_list

#flattens a frame with multiple records per case after sorting by the sort_column, assigning observation #s
#and merging the observation # based frames
def sort_and_merge_by_observation(dataframe, sort_column = None):
    if not sort_column is None:
        dataframe.sort_values(sort_column, inplace = True)
    dataframe.loc[:,"to_split"] = number_of_appearances(dataframe["MaskID"])
    max_split = max(dataframe["to_split"])
    return reduce(lambda left, right: pd.merge(left, right, on = "MaskID",how = "outer"), \
                 [generate_split_frame(dataframe[dataframe["to_split"] == i], \
                                      "MaskID", "to_split", i) for i in range(1, max_split + 1)])

def split_and_merge_by_test_type(dataframe):
    distinct_test_types = np.unique(dataframe["TEST_NAME"])
    return reduce(lambda left, right: pd.merge(left, right, on = "MaskID",how = "outer"), \
                 [generate_split_frame_test_type(dataframe[dataframe["TEST_NAME"] == name], \
                                    "MaskID", "TEST_NAME", name) for name in distinct_test_types])

def generate_split_frame_test_type(dataframe, key_column, split_column, split_value):
    categorical_test_names = ["CTLA1", "CTLA2", "DPA1_Allele1", "DBA1_Allele2","DPB1_Allele1" "DPB1_Allele2", "DQA1", "DQA2", "DQB1", "DQB2", "DR1", "DR2", \
                         "DRB3_Allele1", "DRB3_Allele2", "DRB4_Allele1", "DRB4_Allele2", "DRB5_Allele1", "DRB5_Allele2", \
                          "INS1", "INS2", "PTPN221", "PTPN222", "HLA-B_Allele1", "HLA-B_Allele2"]
    if(split_value not in categorical_test_names):
        dataframe.loc[:,"RESULT"] = pd.to_numeric(dataframe.loc[:,"RESULT"], errors = 'coerce')
    split_value = re.sub("_", '-', str(split_value))
    dataframe = dataframe.drop(split_column, axis = 1)
    column_list = list(dataframe.columns)
    column_list.pop(0)
    modified_column_list = [split_value + "_" + column \
                           for column in column_list]
    dataframe.columns = [key_column] + modified_column_list
    
    case_list = np.unique(dataframe["MaskID"])
    if len(case_list) == len(dataframe):
        return dataframe
    return sort_and_merge_by_observation(dataframe, split_value + '_' + "DRAW_AGE")

                                 
def prepend_file_name(file, column_list):
    file_name = re.sub(".*/(.*)\.csv", r"\g<1>", file, flags = re.IGNORECASE)

    #Add file name to every column aside from the MaskID
    column_list[1:len(column_list)] = [file_name + '_' + column for column in column_list[1:len(column_list)]]
    return column_list

def retrieve_timepoint_variable(column_list):
    if "EVENT_AGE" in column_list:
        return "EVENT_AGE"
    elif "EFFECTIVE_AGE" in column_list:
        return "EFFECTIVE_AGE"
    elif "visit" in column_list:
        return "visit"
    elif "Evaluate_age" in column_list:
        return "Evaluate_Age"
    return False

def retrieve_sample_variable(column_list):
    if "sample_mask_id" in column_list:
        return "sample_mask_id"
    elif "SampleMaskID" in column_list:
        return "SampleMaskID"
    return False

def read_teddy_file_create_observations(file):
    base_frame = pd.read_csv(file, low_memory = False)
    
    #make MaskID a string
    base_frame["MaskID"] = base_frame["MaskID"].astype(str)
    
    #if the file is the "TEST_RESULTS" file, it needs to be processed differently
    if re.search("TEST_RESULTS", file) != None:
        return_frame = split_and_merge_by_test_type(base_frame)
        return_frame.columns = prepend_file_name(file, list(return_frame.columns))
        return return_frame
    
    #remove columns where 95+% of values are missing
    base_frame = remove_columns_low_representation(base_frame, .05)
    
    column_list = list(base_frame.columns)
    
    #determine if there are duplciate cases
    cases = base_frame["MaskID"]
    row_count = len(base_frame)
    number_unique_cases = len(np.unique(cases))
    duplicate_cases = row_count != number_unique_cases
    
    #if there are no duplicates, return the frame
    if not duplicate_cases:
        base_frame.columns = prepend_file_name(file, column_list)
        return base_frame
    
    #if there are, identify a column that defines the observation
    #SampleMaskID is treated like a timepoint as all IDs are integers.
    timepoint_column = retrieve_timepoint_variable(column_list)
    sample_column = retrieve_sample_variable(column_list)
    
    
    if (sample_column and timepoint_column) or ((not timepoint_column) and (not sample_column)):
        return_frame = sort_and_merge_by_observation(base_frame)
     
    if sample_column:
        #placeholder in case there are other columns that can be used for sorting
        return_frame = sort_and_merge_by_observation(base_frame, sample_column)
        
    elif timepoint_column:
        return_frame = sort_and_merge_by_observation(base_frame, timepoint_column)

    #add the file name to the beginning of each column name
    return_frame.columns = prepend_file_name(file, list(return_frame.columns))
    return return_frame

def get_timepoint(dataframe_columns, file):
    #first, filter the columns by the file
    pattern = ("^" + file + "_")
    pattern_filter = [(re.search(pattern, column) != None) for column in dataframe_columns]
    pattern_filter = [int(x) for x in pattern_filter]
    dataframe_columns = list(compress(dataframe_columns, pattern_filter))
    
    #look for which column is present in the column list, with a preference for "EVENT_AGE", and so on
    if sum([re.search("EVENT_AGE", column) != None for column in dataframe_columns]) > 0:
        return "EVENT_AGE"
    elif sum([re.search("visit", column) != None for column in dataframe_columns]) > 0:
        return "visit"
    elif sum([re.search("EFFECTIVE_AGE", column) != None for column in dataframe_columns]) > 0:
        return "EFFECTIVE_AGE"
    elif sum([re.search("DRAW_AGE", column) != None for column in dataframe_columns]) > 0:
        return "DRAW_AGE"
    return None

#function for getting the mode disregaring missing values. The first value is taken in case of a tie
def get_mode(value_list):
    value_list = list(compress(value_list, [np.invert(missing_value(val)) for val in value_list]))
    return mode(value_list)

#function for flattening nested lists into a single list
def flatten(list_of_lists):
    if not isinstance(list_of_lists, list):
        yield list_of_lists
    else:
        for entry in list_of_lists:
            yield from flatten(entry)

#calculates the boundries for the bins
def divide_by_bins(start, end, bins):
    time_range = end - start
    bin_length = time_range/bins
    return [start + x * bin_length for x in range(0, bins + 1)]

#assigns a bin based on the list of boundries. NA timepoints are assigned the first bin
def assign_bin(value_list, bin_list):
    bin_value_list = []
    for value in value_list:
        for i in range(0, len(bin_list) - 1):
            if value != value or value <= bin_list[i + 1]:
                bin_value_list.append(i + 1)
                break
    return bin_value_list

#function for determining which bin (which has data) is closest to the current bin
def closest_bin_with_data(bin_, bin_timepoint, bins_with_data, bin_timepoint_dict):
    lowest_bin_number = None
    current_distance = None
    for bin_num in bins_with_data:
        if len(bin_timepoint_dict[bin_num]) == 0 or sum([missing_value(val) for val in bin_timepoint_dict[bin_num]]) == len(bin_timepoint_dict[bin_num]):
            continue
        distance = abs(np.nanmean(bin_timepoint_dict[bin_num]) - bin_timepoint)
        if current_distance == None or current_distance > distance:
            current_distance = distance
            lowest_bin_number = bin_num
    return lowest_bin_number

def gather_observations_and_bin(dataframe, timepoint_column, max_bins, name):
    ### Input: A Dataframe, the timepoint, the manually-set max # of bins and the file name of the original study
    ### Output: A Dataframe containing the binned_columns to be merged to the original frame
    
    #add the "_" to the file name
    name = name + "_"
    
    #remove the file name from the column list
    column_list = list(dataframe.columns)
    column_list = [re.sub(name, '', column) for column in column_list]
    dataframe.columns = column_list
    #isolate the columns without observations this indicates a subsection of the study with one observation per case
    columns_no_observations = list(compress(column_list, \
                                           [re.search("observation_[0-9]+_", column) == None for column in column_list]))
    
    column_list = list(compress(column_list, \
                               [re.search("observation_[0-9]+_", column) != None for column in column_list]))
    
    
    
    #remove MaskID from the column list, as we will not manipulate that
    #column_list = column_list[1:len(column_list)]
    
    #get the unique observation #s for this particular study
    observation_list = np.unique([re.search("observation_[0-9]+_", column)[0] for column in column_list])
    
    #get the list of cases for this study
    case_list = dataframe["MaskID"]
    
    #get the list of column types; the type of column observation_3_a would be a
    column_type_list = np.unique([re.sub("observation_[0-9]+_", '', column) for column in column_list])
    
    
        #get a flat list of timepoints across all observations
    timepoint_set = list(compress(column_list, \
                                    [re.search(timepoint_column, column) != None for column in column_list]))
    
    timepoint_lists = [(list(dataframe.loc[:,timepoint_set].iloc[i])) for i in range(0, len(dataframe))]
    timepoint_list = list(flatten(timepoint_lists))
    if name == "TEST_RESULTS_":
        number_of_bins = max_bins
    else:
        #create a list of the number of observations per row
        observation_completeness_list = [len(observation_list) - sum(pd.isna(list(dataframe.loc[:,timepoint_set].iloc[i]))) for i in range(0, len(dataframe))]
        observation_completeness_list = list(compress(observation_completeness_list, [completeness != 0  for completeness in observation_completeness_list]))
        #set the number of bins to either the median number of observations per row with NAs removed, or the user set max
        number_of_bins = min(max_bins, int(np.nanmedian(observation_completeness_list)))
    
    #create the timepoint dividers. There will be # of bins _ 1 dividers, with the earliest timepoint representing the "start"
    #every timepoint thereafter will be i * (latest timepoint - earliest timepoint)/i
    bin_list = divide_by_bins(np.nanmin(timepoint_list), np.nanmax(timepoint_list), number_of_bins)
    
    bin_numbers = range(1, number_of_bins + 1)
    bin_strings = ["bin_" + str(bin_number) for bin_number in bin_numbers]
    
    #create a dictionary of the mean of the border for each timepoint to reference later
    bin_time_dict = {}
    for i in range(0, len(bin_list) - 1):
        bin_time_dict["bin_" + str(i + 1)] = np.nanmean([bin_list[i], bin_list[i + 1]])
    
    #create a dictionary of various properties of each colum type to reference later
    column_type_dict = {}
    
 
    for column_type in column_type_list:
        subdict = {}
        subdict["column_subset"] = list(compress(column_list, \
                                         [re.search((column_type + "$"), column) != None for column in column_list]))
        subdict["observation_subset"] = np.unique([re.search("observation_[0-9]+_", column)[0] for column in subdict["column_subset"]])
        subdict["timepoint_subset"] = list(compress(subdict["column_subset"], \
                                            [(re.search(timepoint_column, column) != None) for column in subdict["column_subset"]]))
        column_type_dict[column_type] = subdict
        
    #create an empty list to store the dicts generated below
    dict_list = []
    #for every row
    for i in range(0, len(dataframe)):
        #get the MaskID
        newdf={"MaskID": case_list[i]}
        for column_type in column_type_list:
            #get the subset of columns that match that type
            column_subset = column_type_dict[column_type]["column_subset"]
            timepoint_subset = column_type_dict[column_type]["timepoint_subset"]
            #make a list of the data
            data_list = list(dataframe.loc[:,column_subset].iloc[i])
            #make a list of the timepoints
            timepoint_list = list(dataframe.loc[:,timepoint_subset].iloc[i])
            
            #assign bins for each timepoint. Add the data and timepoint to their respective bin
            binned_timepoints = assign_bin(timepoint_list, bin_list)
            bin_dict = dict(zip(bin_strings, [[] for j in range(0, len(bin_list))]))
            
            bin_timepoints = dict(zip(bin_strings, [[] for j in range(0, len(bin_list))]))
            for x in range(0, len(binned_timepoints)):
                bin_ = "bin_" + str(binned_timepoints[x])
                bin_dict[bin_].append(data_list[x])
                bin_timepoints[bin_].append(timepoint_list[x])
                
            #find which bins do not have data
            empty_bins = list(compress(bin_strings, \
                                            [len(bin_dict[bin_]) == 0 or \
                                             sum([(missing_value(value)) for value in bin_dict[bin_]]) == len(bin_dict[bin_]) for bin_ in bin_strings]))
            #find which bins have data
            bins_with_data = list(compress(bin_strings, [np.invert(bin_ in empty_bins) for bin_ in bin_strings]))
            
            #for the bins which do not have data, find the closest bin by abs(midpoint_of_bin - average_of_other_bin_timepoints)
            for bin_ in empty_bins:
                closest_bin_value = closest_bin_with_data(bin_, bin_time_dict[bin_], bins_with_data, bin_timepoints)
                if closest_bin_value != None:
                    bin_dict[bin_] = bin_dict[closest_bin_value]
            
            #for each bin, add the element to the dictionary
            for bin_ in bin_strings:
                new_column = bin_ +"_" + column_type
                bin_data = bin_dict[bin_]
                
                if len(bin_data) == 0 or sum(value != value for value in bin_data) == len(bin_data):
                    continue
                else:
                    try:
                        newdf[new_column] = np.nanmean(bin_data)
                    except TypeError:
                        newdf[new_column] = get_mode(bin_data)
        #append the dictionary to the main list
        dict_list.append(newdf)
    
    #add the file name back to the beginning of each column, aside from the "MaskID"
    return_frame = pd.DataFrame(dict_list)
    return_frame = pd.merge(dataframe.loc[:,columns_no_observations], return_frame, on = "MaskID", how = "outer")
    return_frame_columns = list(return_frame.columns)
    return_frame_columns = return_frame_columns[1:len(return_frame_columns)]
    return_frame_columns = [name + "_" + column for column in return_frame_columns]
    return_frame_columns.insert(0, 'MaskID')
    return_frame.columns = return_frame_columns
    return return_frame

def remove_columns_with_all_missing(data_frame):
    """
    Removes columns in the given DataFrame where all values are missing.
 
    Parameters:
    - data_frame (pd.DataFrame): The input DataFrame.
 
    Returns:
    - pd.DataFrame: A new DataFrame with columns containing all missing values removed.
    """
    # Find columns with all missing values
    columns_with_all_missing = data_frame.columns[data_frame.isnull().all()].tolist()
 
    # Drop columns with all missing values
    new_data_frame = data_frame.drop(columns=columns_with_all_missing)
 
    return new_data_frame

def summary_stat(value_list):
    if len(value_list) == 0:
        return np.nan
    if sum([missing_value(val) for val in value_list]) == len(value_list):
        return np.nan
    try:
        return_value = np.nanmean(value_list)
    except TypeError:
        return_value = get_mode(value_list)
    return return_value

def summarize_observations(dataframe, name):
    #add the "_" to the file name
    name = name + "_"
    
    #remove the file name from the column list
    column_list = list(dataframe.columns)
    column_list = [re.sub(name, '', column) for column in column_list]
    dataframe.columns = column_list
    #remove MaskID from the column list, as we will not manipulate that
    
    column_list = column_list[1:len(column_list)]
    column_list = list(dataframe.columns)
    column_list = column_list[1:len(column_list)]
    
    column_type_list = np.unique([re.sub('observation_[0-9]+_', '', column) for column in column_list])
    return_frame = pd.DataFrame(data = dataframe["MaskID"], columns = ["MaskID"])
    for column_type in column_type_list:
        column_set = list(compress(column_list, \
                                  [re.search(column_type, column) != None for column in column_list]))
        return_frame[column_type] = dataframe[column_set].apply(summary_stat, 1)
    
    return_frame_columns = list(return_frame.columns)
    return_frame_columns = return_frame_columns[1:len(return_frame_columns)]
    return_frame_columns = [name + column for column in return_frame_columns]
    return_frame_columns.insert(0, 'MaskID')
    #drop sample_mask_id if it exists
    sample_id_columns = list(compress(return_frame_columns, [re.search("sample_mask_id", column, re.IGNORECASE) != None for column in return_frame_columns]))
    
    return_frame.columns = return_frame_columns
    return_frame.drop(sample_id_columns, axis = 1, inplace = True)
    return return_frame


In [3]:
#file_directory = "DATA/TEDDY/"
file_directory = "/home/NIDDK/DATA/TEDDY/"
file_list = os.listdir(file_directory)

#comment/remove the line below to include these files. Will take a very long time
#exclusion_list = ["TEST_RESULTS.csv", "FAMILY_HISTORY.CSV", "teddybook2_5.csv","teddybook6_12.csv", "TEDDYBOOK.CSV"]
#file_exclusion_mask = np.invert([(file in exclusion_list) for file in file_list])
#file_list = ["TEST_RESULTS.csv"]
#file_list = list(compress(file_list, file_exclusion_mask))
time1 = time.time()
file_list = [file_directory + file for file in file_list]
file_list = file_list
file_list_mask = [".csv" in file.lower() for file in file_list]# for catching non-csv files
file_list = list(compress(file_list, file_list_mask))

dataframe_list = [read_teddy_file_create_observations(file) for file in file_list]
dataframe_mask = [len(x) > 0 for x in dataframe_list] # for catching empty dataframes
dataframe_list = list(compress(dataframe_list, dataframe_mask))

merged_data = reduce(lambda left, right: pd.merge(left, right, on = "MaskID",how = "outer"), dataframe_list)
merged_data.to_csv("Raw-Dataset.csv", index = False)
print(merged_data.shape)
print("The number of the numeric features of the raw data set is: %d"%(len(merged_data.select_dtypes(include='number').columns)))
print("The running time of generating the raw merged dataset is: %.2fs"%(time.time()-time1))

time2 = time.time()
filtered_dataframe = remove_columns_with_all_missing(merged_data)

merged_column_list = list(filtered_dataframe.columns)
columns_with_observations = list(compress(merged_column_list, [re.search("_observation_", column) for column in merged_column_list]))
files_with_observations = np.unique([re.search("(.*)_observation.*", column)[1] for column in columns_with_observations])
files_with_observations

filtered_and_binned_dataframe = filtered_dataframe
files_with_no_timepoints = []
for file in files_with_observations:
    timepoint_columns = get_timepoint(filtered_and_binned_dataframe, file)
    if timepoint_columns == None:
        print("No timepoint found in: " + file)
        files_with_no_timepoints.append(file)
        continue
    print("processing: " + file +" on: " + timepoint_columns)
    filtered_columns = list(compress(filtered_and_binned_dataframe.columns, [re.search("^" + file + "_", column) != None for column in filtered_and_binned_dataframe.columns] ))
    filtered_columns.insert(0, "MaskID")
    binned_frame = gather_observations_and_bin(filtered_and_binned_dataframe.loc[:,filtered_columns].copy(), timepoint_columns, 3, file)
    #drop the original columns
    columns_to_drop = filtered_columns[1:len(filtered_columns)]
    filtered_and_binned_dataframe.drop(columns_to_drop, axis = 1, inplace = True)
    filtered_and_binned_dataframe = pd.merge(filtered_and_binned_dataframe, binned_frame, on = "MaskID", how = "outer")

for file in files_with_no_timepoints:
    filtered_columns = list(compress(filtered_and_binned_dataframe.columns, [re.search("^" + file + "_", column) != None for column in filtered_and_binned_dataframe.columns]))
    filtered_columns.insert(0, "MaskID")
    summarized_data = summarize_observations(filtered_and_binned_dataframe.loc[:, filtered_columns], file)
    columns_to_drop = filtered_columns[1:len(filtered_columns)]
    filtered_and_binned_dataframe.drop(columns_to_drop, axis = 1, inplace = True)
    filtered_and_binned_dataframe = pd.merge(filtered_and_binned_dataframe, summarized_data, on = "MaskID", how = "outer")

filtered_and_binned_dataframe.to_csv("Binned_Dataframe.csv", index = False)
print(filtered_and_binned_dataframe.shape)
print("The running time of generating the binned merged dataset is: %.2fs"%(time.time()-time2))


(817, 71928)
The running time of generating the raw merged dataset is: 120.78s
processing: ANNUAL_CHILD_QUESTIONNAIRE on: EVENT_AGE
processing: ANNUAL_QUESTIONNAIRE on: EVENT_AGE
No timepoint found in: ASCORBIC_ACID_NIDDK
No timepoint found in: CAROTENOIDS_NIDDK
processing: CBCL on: EVENT_AGE
processing: CELIAC_DISEASE_DIAGNOSIS_FORM on: EVENT_AGE
processing: CHANGE_IN_STUDY_PARTICIPATION on: EFFECTIVE_AGE
No timepoint found in: CHOLESTEROL_NIDDK
processing: DIABETES_MANAGEMENT on: EVENT_AGE
processing: FAMILY_HISTORY on: EVENT_AGE
No timepoint found in: FAMILY_RELATIVE
No timepoint found in: FATTY_ACIDS_NIDDK
processing: GLUTEN_FREE_DIET_UPDATE on: EVENT_AGE
processing: LAST_QUESTIONNAIRE on: EVENT_AGE
processing: MMTT_PROCEDURE_FORM on: EVENT_AGE
processing: NINE_MONTH_PARENT_QUESTIONNAIRE on: EVENT_AGE
processing: NON_TEDDY_RESEARCH_FORM on: EVENT_AGE
No timepoint found in: OLINK_INFLAMMATION_NIDDK
processing: PARENT_PEDSQL_5_7 on: EVENT_AGE
processing: PARENT_PEDSQL_8_12 on: EVENT_

In [4]:
# Step 2: Process or clean the strings (texts) in the data frame, including convert strings to lowercase and lemmatization.
# Input: data frame generated in Step 1
# Output: data frame
def preprocess_text(text):
    # detect language
    #lang = detect(text.lower())

    # Tokenize
    #try:
    #    tokens = word_tokenize(text)
    #except TypeError:
    #    try:
    #        text = text.decode()
    #    except AttributeError:
    #        print(text)
    #        print(type(text))
    #    tokens = word_tokenize(text)
    try:
        tokens = word_tokenize(text)
    except:
        text = str(text)
        tokens = word_tokenize(text)
    #tokens = word_tokenize(text)
    # Remove punctuation and convert to lowercase
    tokens = [word.lower() for word in tokens if word.isalpha()]
    
    #if lang != 'en':
    #    tokens = [translator.translate(word, dest='en') for word in tokens]
    
    # Remove stop words
    #stop_words = set(stopwords.words('english'))
    #tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    

    # Check if the list of tokens is not empty
    if tokens:
        # Reassemble the text
        cleaned_text = ' '.join(tokens)
        return cleaned_text
    else:
        # If the list is empty, return the original text
        return text
    
    return cleaned_text


In [5]:
#cleaned_text_df = pd.read_csv('NoEmptyColumns-Dataset.csv', low_memory=False)
time3 = time.time()
cleaned_text_df = filtered_and_binned_dataframe.copy()
for column in cleaned_text_df.select_dtypes(include=['object']).columns:
    if column != 'MaskID':
        cleaned_text = cleaned_text_df[column].fillna('not reported').apply(preprocess_text)
        cleaned_text_df[column] = cleaned_text

cleaned_text_df.to_csv('PreprocessedText_Dataset.csv', index=False)
print(cleaned_text_df.shape)
print("The running time of cleaning the text in dataset is: %.2fs"%(time.time()-time3))

(817, 1433)
The running time of cleaning the text in dataset is: 25.13s


In [19]:
# Step 3.1: Impute the missing values in the numeric columns with the completeness is larger than the threshold using Multiple Imputation by Chained Equation algorithm (MICE)
# Input: data frame with only numeri columns generated in Step 2
# Output: data frame
def completeness_criteria(column, threshold):
    non_null_percentage = column.count() / len(column)
    return non_null_percentage > threshold

def filter_df_by_completeness(df, completeness_threshold):
    # Apply the custom criteria function to each column
    selected_columns = df.apply(completeness_criteria, threshold=completeness_threshold)
    # Extract the column names that meet the completeness criteria
    selected_columns = selected_columns[selected_columns].index
    return df[selected_columns]
    

def impute_missing_numeric_values(df, ignored_categorical_columns, completeness_threshold):
    numeric_cols_org = df.select_dtypes(include='number').columns
    print("The number of the original numeric features from the binned dataframe is: %d"%(len(numeric_cols_org)))
    df_filtered = filter_df_by_completeness(df, completeness_threshold)
    print("data frame is filtered by the completness.")
    impute_data = df_filtered.copy()
    impute_data_v2_tmp = df_filtered.copy()
    
    # Separate numeric and categorical columns
    numeric_cols = impute_data.select_dtypes(include='number').columns
    categorical_cols = impute_data.select_dtypes(exclude='number').columns
    print("The number of the numeric features filtered by completeness threshold %.2f is: %d"%(completeness_threshold, len(numeric_cols)))
    #print(len(numeric_cols))
    #print(numeric_cols)
    # Impute missing values in numeric columns using IterativeImputer with default estimator
    numeric_imputer = IterativeImputer(max_iter=10, random_state=100)
    # Train the imputor model
    numeric_imputer.fit(impute_data[numeric_cols])
    # Predict the missing values. This is done using the `transform` method.
    impute_data[numeric_cols] = numeric_imputer.transform(impute_data[numeric_cols])
    impute_data_v2_tmp[numeric_cols] = impute_data_v2_tmp[numeric_cols].fillna("Imputed")
    impute_data_v2 = impute_data_v2_tmp[numeric_cols]
    impute_data_v2.insert(0, "MaskID", list(impute_data_v2_tmp.loc[:,"MaskID"]))
    #impute_data_v2 = impute_data_v2_tmp
    return (impute_data, impute_data_v2)


In [20]:
Completeness_threshold = 0.4
ignored_categorical_columns = ["MaskID"]
time4 = time.time()
merged_data_imputed, merged_data_imputed_labeled = impute_missing_numeric_values(cleaned_text_df, ignored_categorical_columns, Completeness_threshold)
merged_data_imputed.to_csv('Imputed_Dataset.csv', index=False)
merged_data_imputed_labeled.to_csv('Imputed_Labeled_Dataset.csv', index=False)
print(merged_data_imputed.shape)
print("The running time of imputing missing numeric values is: %.2fs"%(time.time()-time4))

The number of the original numeric features from the binned dataframe is: 936
data frame is filtered by the completness.
The number of the numeric features filtered by completeness threshold 0.40 is: 299


  impute_data_v2.insert(0, "MaskID", list(impute_data_v2_tmp.loc[:,"MaskID"]))


(817, 796)
The running time of imputing missing numeric values is: 141.48s


In [24]:
# Step 3.2: Encode the categorical columns in the data frame
# Input: data frame with only categorical columns generated in Step 2
# Output: data frame
def one_hot_encode_dataframe(data_frame):
    """
    Objective: Performs one-hot encoding on categorical columns in the dataframe
    
    Parameters:
        - data_frame (pd.DataFrame): Input dataframe
 
    Returns:
        - pd.DataFrame: A new DataFrame with one-hot encoded categorical columns and original numeric columns
    """
   
    #Save the first column then drop it from the dataframe to avoid encoding
    first_column = data_frame.iloc[:, 0]
    data_frame = data_frame.iloc[: , 1:]
    
    #Filter categorical columns within the dataframe
    categorical_cols_df = data_frame.select_dtypes(include=['object']).copy()
    
    #Fill-in missing values in categorical columns
    categorical_cols_df = categorical_cols_df.fillna('not reported')
    
    #Initialize OneHotEncoding
    one_hot_encoding = OneHotEncoder(sparse_output=False, dtype=np.int64)
    
    #Dictionary to store mapping of encoded vallues
    #ohe_dict = {}

    #Apply one hot encoding to all categorical columns
    for col in categorical_cols_df.columns:
        ohe_data = pd.DataFrame(one_hot_encoding.fit_transform(categorical_cols_df[categorical_cols_df.columns]),
                            columns= one_hot_encoding.get_feature_names_out(categorical_cols_df.columns))
        #ohe_dict[col] = dict(zip(  ,one_hot_encoding.categories_))
  
    #Concat original numerical columns to one-hot encoded columns only
    ohe_data = pd.concat([ohe_data, data_frame.drop(columns= categorical_cols_df.columns)], axis=1)
    
    #Place ID column back in the dataframe
    ohe_data.insert(0, 'MaskID', first_column)
    
    return ohe_data


In [25]:
time5 = time.time()
one_hot_encoded_data = one_hot_encode_dataframe(merged_data_imputed)
print(one_hot_encoded_data.shape)
print("The running time one hot encoding is: %.2fs"%(time.time()-time5))

(817, 6832)
The running time one hot encoding is: 106.74s


In [33]:
# Step 4: Write a csv file from the data frame genreated in Step 3.2
one_hot_encoded_data.to_csv('TEDDY_data_final.csv', index=False)