## Optimizing the dataset for the big merge
source: https://www.dataquest.io/blog/pandas-big-data/ 

In [None]:
import pandas as pd
import numpy as np

# Read the two input tables from CSV files at hard-coded local paths.
# df1 and df2 are the frames that get memory-optimized and merged further down.
df1 = pd.read_csv("C:/Users/Maggie/OneDrive/UW-BHI/2018Fall/CSE583/Project/mimic_merge4.csv")
df2 = pd.read_csv("C:/Users/Maggie/OneDrive/UW-BHI/2018Fall/CSE583/Project/mimic_prescriptions.csv")

In [None]:
# Baseline for df1: column dtypes plus deep memory usage (object contents are measured too).
df1.info(memory_usage='deep')

In [None]:
# Baseline for df2: column dtypes plus deep memory usage (object contents are measured too).
df2.info(memory_usage='deep')

In [None]:
#optimize numeric columns by downcasting them to smaller numeric subtypes
def mem_usage(pandas_obj):
    """Return the deep memory footprint of a pandas object as a string.

    Args:
        pandas_obj: A DataFrame, or any other object exposing
            ``memory_usage(deep=True)`` (treated like a Series).

    Returns:
        The size in megabytes (bytes / 1024**2) formatted with two
        decimals and an ``" MB"`` suffix, e.g. ``"1.00 MB"``.
    """
    footprint_bytes = pandas_obj.memory_usage(deep=True)
    if isinstance(pandas_obj, pd.DataFrame):
        # A DataFrame reports one figure per column (plus the index);
        # collapse them into a single total.
        footprint_bytes = footprint_bytes.sum()
    footprint_mb = footprint_bytes / 1024 ** 2
    return "{:03.2f} MB".format(footprint_mb)

# Take the int64 columns of df1 and let pandas choose the smallest unsigned
# integer dtype able to hold each column's values.
df1_int = df1.select_dtypes(include=['int64'])
df1_converted_int = df1_int.apply(pd.to_numeric, downcast='unsigned')

# Memory footprint before (first line) and after (second line) downcasting.
print(mem_usage(df1_int), mem_usage(df1_converted_int), sep='\n')

# Tally how many columns carry each dtype before vs. after.
compare_ints = pd.concat(
    [df1_int.dtypes, df1_converted_int.dtypes],
    axis=1,
    keys=['before', 'after'],
)
compare_ints.apply(pd.Series.value_counts)

In [None]:
# Take the int64 columns of df2 and let pandas choose the smallest unsigned
# integer dtype able to hold each column's values.
df2_int = df2.select_dtypes(include=['int64'])
df2_converted_int = df2_int.apply(pd.to_numeric, downcast='unsigned')

# Memory footprint before (first line) and after (second line) downcasting.
print(mem_usage(df2_int), mem_usage(df2_converted_int), sep='\n')

# Tally how many columns carry each dtype before vs. after.
compare_ints = pd.concat(
    [df2_int.dtypes, df2_converted_int.dtypes],
    axis=1,
    keys=['before', 'after'],
)
compare_ints.apply(pd.Series.value_counts)

In [None]:
# Same procedure for the float columns of df1: downcast each one to the
# smallest float dtype pandas considers sufficient.
df1_float = df1.select_dtypes(include=['float'])
df1_converted_float = df1_float.apply(pd.to_numeric, downcast='float')

# Memory footprint before (first line) and after (second line) downcasting.
print(mem_usage(df1_float), mem_usage(df1_converted_float), sep='\n')

# Tally how many columns carry each dtype before vs. after.
compare_floats = pd.concat(
    [df1_float.dtypes, df1_converted_float.dtypes],
    axis=1,
    keys=['before', 'after'],
)
compare_floats.apply(pd.Series.value_counts)

In [None]:
# Same procedure for the float columns of df2: downcast each one to the
# smallest float dtype pandas considers sufficient.
df2_float = df2.select_dtypes(include=['float'])
df2_converted_float = df2_float.apply(pd.to_numeric, downcast='float')

# Memory footprint before (first line) and after (second line) downcasting.
print(mem_usage(df2_float), mem_usage(df2_converted_float), sep='\n')

# Tally how many columns carry each dtype before vs. after.
compare_floats = pd.concat(
    [df2_float.dtypes, df2_converted_float.dtypes],
    axis=1,
    keys=['before', 'after'],
)
compare_floats.apply(pd.Series.value_counts)

In [None]:
# Build the optimized frame from a copy so the original df1 stays untouched
# for comparison, then overwrite the int and float columns with their
# downcast counterparts.
optimized_df1 = df1.copy()
optimized_df1[df1_converted_int.columns] = df1_converted_int
optimized_df1[df1_converted_float.columns] = df1_converted_float

# Memory footprint: original first, optimized second.
print(mem_usage(df1), mem_usage(optimized_df1), sep='\n')

In [None]:
# Build the optimized frame from a copy so the original df2 stays untouched
# for comparison, then overwrite the int and float columns with their
# downcast counterparts.
optimized_df2 = df2.copy()
optimized_df2[df2_converted_int.columns] = df2_converted_int
optimized_df2[df2_converted_float.columns] = df2_converted_float

# Memory footprint: original first, optimized second.
print(mem_usage(df2), mem_usage(optimized_df2), sep='\n')

In [None]:
# Pull the object-dtype columns of df1 into their own frame; .copy() makes it
# independent of df1.
df1_obj = df1.select_dtypes(include=['object']).copy()
# Summary of those columns (shown as the cell output).
df1_obj.describe()

In [None]:
# Pull the object-dtype columns of df2 into their own frame; .copy() makes it
# independent of df2.
df2_obj = df2.select_dtypes(include=['object']).copy()
# Summary of those columns (shown as the cell output).
df2_obj.describe()

In [None]:
# For every object column of df1, compare the number of distinct values with
# the number of rows. Columns whose distinct/total ratio is below 50% are
# stored as `category`; all other columns are carried over unchanged.

# Start from a copy so the result keeps df1_obj's index and column order;
# converting a column is then a plain column assignment rather than `.loc`
# enlargement of an initially empty DataFrame.
converted_obj_df1 = df1_obj.copy()

num_total_values = len(df1_obj)
for col in df1_obj.columns:
    # dropna=False counts NaN as a distinct value, like len(Series.unique()).
    num_unique_values = df1_obj[col].nunique(dropna=False)
    # The row-count check avoids a ZeroDivisionError when the frame is empty.
    if num_total_values and num_unique_values / num_total_values < 0.5:
        converted_obj_df1[col] = df1_obj[col].astype('category')

In [None]:
# For every object column of df2, compare the number of distinct values with
# the number of rows. Columns whose distinct/total ratio is below 50% are
# stored as `category`; all other columns are carried over unchanged.

# Start from a copy so the result keeps df2_obj's index and column order;
# converting a column is then a plain column assignment rather than `.loc`
# enlargement of an initially empty DataFrame.
converted_obj_df2 = df2_obj.copy()

num_total_values = len(df2_obj)
for col in df2_obj.columns:
    # dropna=False counts NaN as a distinct value, like len(Series.unique()).
    num_unique_values = df2_obj[col].nunique(dropna=False)
    # The row-count check avoids a ZeroDivisionError when the frame is empty.
    if num_total_values and num_unique_values / num_total_values < 0.5:
        converted_obj_df2[col] = df2_obj[col].astype('category')

In [None]:
# Memory footprint of df1's object columns before (first line) and after
# (second line) the category conversion.
print(mem_usage(df1_obj), mem_usage(converted_obj_df1), sep='\n')

# Tally how many columns carry each dtype before vs. after.
compare_obj_df1 = pd.concat(
    [df1_obj.dtypes, converted_obj_df1.dtypes],
    axis=1,
    keys=['before', 'after'],
)
compare_obj_df1.apply(pd.Series.value_counts)

In [None]:
# Memory footprint of df2's object columns before (first line) and after
# (second line) the category conversion.
print(mem_usage(df2_obj), mem_usage(converted_obj_df2), sep='\n')

# Tally how many columns carry each dtype before vs. after.
compare_obj_df2 = pd.concat(
    [df2_obj.dtypes, converted_obj_df2.dtypes],
    axis=1,
    keys=['before', 'after'],
)
compare_obj_df2.apply(pd.Series.value_counts)

In [None]:
# Write the converted object/category columns back into the optimized frame.
optimized_df1[converted_obj_df1.columns] = converted_obj_df1

# Memory footprint: optimized first, original second.
print(mem_usage(optimized_df1), mem_usage(df1), sep='\n')

In [None]:
# Write the converted object/category columns back into the optimized frame.
optimized_df2[converted_obj_df2.columns] = converted_obj_df2

# Memory footprint: optimized first, original second.
print(mem_usage(optimized_df2), mem_usage(df2), sep='\n')

In [None]:
# Final dtypes and deep memory usage of the optimized copy of df1.
optimized_df1.info(memory_usage='deep')

In [None]:
# Final dtypes and deep memory usage of the optimized copy of df2.
optimized_df2.info(memory_usage='deep')

In [None]:
# Inner-join the two optimized frames on the hadm_id and subject_id columns.
# Both sides use the same key names, so one `on=` list covers left and right.
df_merge5 = optimized_df1.merge(
    optimized_df2,
    how='inner',
    on=['hadm_id', 'subject_id'],
)

In [None]:
#Export the merged table to CSV (currently disabled; uncomment the line below to write the file)
#df_merge5.to_csv("C:/Users/Maggie/OneDrive/UW-BHI/2018Fall/CSE583/Project/mimic_merge5.csv")

In [None]:
# Number of rows produced by the inner merge.
len(df_merge5)

In [None]:
# Deep memory footprint of the merged frame, as an "<x> MB" string.
mem_usage(df_merge5)

In [None]:
# Dtype of every column in the merged frame, to see which optimized dtypes the merge kept.
df_merge5.dtypes