In [81]:
import os, glob
import pandas as pd
import warnings
import matplotlib.pyplot as plt

%matplotlib inline
warnings.filterwarnings('ignore')

## Get Data  
  
Currently our script uses data X,..... 

In [82]:
# Get the directory path and collect the child and mom files
# --> We may need/want to differentiate b/w text files 04 and 09 (both related to Nutrition intake)
# --> For `glob` we need to know naming of text files - (any convention?)

In [83]:
data_dir_path = "/Users/nikkibytes/Documents/summer2019/example_data"

child04_text_files = glob.glob(os.path.join(data_dir_path, "*Child*04*"))
mom04_text_files = glob.glob(os.path.join(data_dir_path, "*Mom*04*"))

print("Text Files Found:  \nFor Mom: {} \nFor Child: {}".format(','.join(child04_text_files), ','.join(mom04_text_files)))
      
      

Text Files Found:  
For Mom: /Users/nikkibytes/Documents/summer2019/example_data/UMNUNCChild04.txt,/Users/nikkibytes/Documents/summer2019/example_data/UMNUNCChild04_.txt 
For Child: /Users/nikkibytes/Documents/summer2019/example_data/UMNUNCMom04.txt


In [84]:
# 'Sphingomyelin (mg)', 'Phosphatidycholine (mg)' -- these two not found in current dataset

## Child Data

In [None]:
# Here we concatenate the children 
#

df_list = []
for file in child04_text_files:
    orig_df=pd.read_csv(file, sep="\t")
    updated_df=orig_df[['Participant ID', 'Visit Number', 'Site ID', 'Total Grams',
                 'Energy (kcal)', 'Total Fat (g)', 'Total Carbohydrate (g)', 
                 'Total Protein (g)', 'Animal Protein (g)', 'Vegetable Protein (g)',
                 'Cholesterol (mg)', 'Total Saturated Fatty Acids (SFA) (g)',
                 'Total Monounsaturated Fatty Acids (MUFA) (g)', 'Total Polyunsaturated Fatty Acids (PUFA) (g)',
                 'Total Vitamin A Activity (International Units) (IU)', 'Vitamin D (calciferol) (mcg)', 
                 'Total Folate (mcg)', 'Vitamin B-12 (cobalamin) (mcg)', 'Magnesium (mg)',
                 'Iron (mg)', 'Zinc (mg)', 'Copper (mg)', 'Selenium (mcg)', 'PUFA 20:4 (arachidonic acid) (g)',
                 'PUFA 20:5 (eicosapentaenoic acid [EPA]) (g)', 'PUFA 22:6 (docosahexaenoic acid [DHA]) (g)', 
                 '% Calories from Fat', '% Calories from Carbohydrate', '% Calories from Protein', 'Polyunsaturated to Saturated Fat Ratio',
                 'Lutein + Zeaxanthin (mcg)', 'Choline (mg)']]
    for val in updated_df["Participant ID"]:
        _id = val.split("_")[0]
        #print(_id)
        updated_df.replace(val, _id, inplace=True)

    #print(updated_df.head())
    df_list.append(updated_df)
    #print(updated_df.shape)


print("List of dataframes made")

In [None]:
#print(df_list)
concat_df = pd.concat(df_list, ignore_index=True)
concat_df = concat_df.sort_values(by="Participant ID")
#print(final_df.head())
#print(final_df.tail())
#print(final_df.shape)
concat_df.head()


### Write child concatenated file (all available data on a single spread sheet, with reduced columns)

In [None]:
concat_filename = 'Child_concat_test1'
csv_filename = "{}.csv".format(concat_filename)
xlsx_filename = "{}.xlsx".format(concat_filename)
concat_filepath = os.path.join(data_dir_path, "outputs", csv_filename)
xlsx_filepath = os.path.join(data_dir_path, "outputs", xlsx_filename)


#print( new_filepath)
# write concatentation to file
writer = pd.ExcelWriter(xlsx_filepath, engine='xlsxwriter')
concat_df.to_excel(writer, sheet_name='Sheet1', index=False)
#concat_df.to_excel(concat_filepath)
concat_df.to_csv(concat_filepath, index=False)
print("Written concat file, ", concat_filepath)
umn_df = concat_df[concat_df['Site ID'] == "UMN"]
umn_df.head()

In [None]:
umn_df['Total Fat (g)'].plot(kind='hist',colormap="plasma")


In [None]:
unc_df = concat_df[concat_df['Site ID'] == "UNC"]
unc_df.head()

In [None]:
unc_df['Total Fat (g)'].plot(kind='hist',colormap="plasma")


In [None]:
child_mean_df=concat_df.groupby("Participant ID").mean()
child_mean_df=child_mean_df.drop(columns="Visit Number")
child_mean_df.head()

## Looking at Site Differences

In [None]:
child_mean_site_df=concat_df.groupby("Site ID").mean()
child_mean_site_df.head()

In [None]:
child_df_T = child_mean_site_df.T
child_df_T.head()


In [None]:
child_df_T.plot(kind="scatter", x="UMN", y='UNC')

### Write child mean file (all available data on a single spread sheet from concatenated file with subjects averaged across visits)

In [None]:
ch_mean_filename = 'Child_mean_test1'
csv_filename = "{}.csv".format(ch_mean_filename)
csv_filepath = os.path.join(data_dir_path, "outputs", csv_filename)
xlsx_filename = "{}.xlsx".format(ch_mean_filename)
xlsx_filepath = os.path.join(data_dir_path, "outputs", xlsx_filename)



writer = pd.ExcelWriter(xlsx_filepath, engine='xlsxwriter')
concat_df.to_excel(writer, sheet_name='Sheet1', index=False)
#print( new_filepath)
# write concatentation to file
child_mean_df.to_csv(xlsx_filepath)
print("Written mean file, ", xlsx_filepath)

In [None]:
child_mean_df.columns

In [None]:
child_mean_df['Cholesterol (mg)'].plot(kind='hist',colormap="plasma")



In [None]:
child_mean_df.plot(kind="scatter", x="Copper (mg)", y='Animal Protein (g)')

In [None]:
child_umn_df_mean = concat_df[concat_df['Site ID'] == "UMN"]


## Mom data

In [None]:
# Here we concatenate the children 
# **We won't want the original filename when we concatenate all the files

mom_df_list = []
for file in mom04_text_files:
    orig_df=pd.read_csv(file, sep="\t")
    updated_df=orig_df[['Participant ID', 'Visit Number', 'Site ID', 'Total Grams',
                 'Energy (kcal)', 'Total Fat (g)', 'Total Carbohydrate (g)', 
                 'Total Protein (g)', 'Animal Protein (g)', 'Vegetable Protein (g)',
                 'Cholesterol (mg)', 'Total Saturated Fatty Acids (SFA) (g)',
                 'Total Monounsaturated Fatty Acids (MUFA) (g)', 'Total Polyunsaturated Fatty Acids (PUFA) (g)',
                 'Total Vitamin A Activity (International Units) (IU)', 'Vitamin D (calciferol) (mcg)', 
                 'Total Folate (mcg)', 'Vitamin B-12 (cobalamin) (mcg)', 'Magnesium (mg)',
                 'Iron (mg)', 'Zinc (mg)', 'Copper (mg)', 'Selenium (mcg)', 'PUFA 20:4 (arachidonic acid) (g)',
                 'PUFA 20:5 (eicosapentaenoic acid [EPA]) (g)', 'PUFA 22:6 (docosahexaenoic acid [DHA]) (g)', 
                 '% Calories from Fat', '% Calories from Carbohydrate', '% Calories from Protein', 'Polyunsaturated to Saturated Fat Ratio',
                 'Lutein + Zeaxanthin (mcg)', 'Choline (mg)']]
    for val in updated_df["Participant ID"]:
        _id = val.split("_")[0]
        #print(_id)
        updated_df.replace(val, _id, inplace=True)

    #print(updated_df.head())
    mom_df_list.append(updated_df)
    #print(updated_df.shape)


print("List of dataframes made")

In [None]:
#print(df_list)
concat_df = pd.concat(mom_df_list, ignore_index=True)
concat_df = concat_df.sort_values(by="Participant ID")
#print(final_df.head())
#print(final_df.tail())
#print(final_df.shape)
concat_df.head()


In [None]:
umn_df = concat_df[concat_df['Site ID'] == "UMN"]
umn_df.head()

In [None]:
unc_df = concat_df[concat_df['Site ID'] == "UNC"]
unc_df.head()

### Write mom concatenated file (all available data on a single spread sheet, with reduced columns)

In [None]:
concat_filename='Mom_concat_test1'
csv_filename = "{}.csv".format(concat_filename)
csv_filepath = os.path.join(data_dir_path, "outputs", csv_filename)
xlsx_filename = "{}.xlsx".format(concat_filename)
xlsx_filepath = os.path.join(data_dir_path, "outputs", xlsx_filename)


#print( new_filepath)
# write concatentation to file
# write concatentation to file
writer = pd.ExcelWriter(xlsx_filepath, engine='xlsxwriter')
concat_df.to_excel(writer, sheet_name='Sheet1', index=False)
concat_df.to_csv(csv_filepath, index=False)
print("Written concat file, ", concat_filepath)

In [None]:
mom_mean_df=concat_df.groupby("Participant ID").mean()
mom_mean_df=mom_mean_df.drop(columns="Visit Number")
mom_mean_df.head()

### Write mom mean file (all available data on a single spread sheet from concatenated file with subjects averaged across visits)

In [None]:
mean_filename = "Mom_mean_test1"
csv_filename = "{}.csv".format(mean_filename)
mean_filepath = os.path.join(data_dir_path, "outputs", csv_filename)
xlsx_filename = "{}.xlsx".format(mean_filename)
xlsx_filepath = os.path.join(data_dir_path, "outputs", xlsx_filename)

writer = pd.ExcelWriter(xlsx_filepath, engine='xlsxwriter')
concat_df.to_excel(writer, sheet_name='Sheet1', index=False)
#print( new_filepath)
# write concatentation to file
mom_mean_df.to_csv(mean_filepath)
print("Written mean file, ", mean_filepath)

In [None]:
mom_mean_df['Total Grams'].plot(kind='hist', colormap="spring")


In [None]:
mom_mean_df.plot(kind="scatter", x="Copper (mg)", y='Animal Protein (g)')

## Looking at Site Differences

In [None]:
mom_mean_site_df=concat_df.groupby("Site ID").mean()
mom_mean_site_df.head()

In [None]:
mom_df_T = mom_mean_site_df.T
mom_df_T.head()


In [None]:
mom_df_T.plot(kind="scatter", x="UMN", y='UNC')

## Doing more....

In [None]:
grams_df_mom = mom_mean_df[['Total Grams']]
grams_df_child = child_mean_df[["Total Grams"]]

grams_df_child.head()

In [None]:
## Look at text file 09


child09_text_files = glob.glob(os.path.join(data_dir_path, "*Child*09*"))
mom09_text_files = glob.glob(os.path.join(data_dir_path, "*Mom*09*"))

print("Text Files Found:  \nFor Mom: {} \nFor Child: {}".format(','.join(child09_text_files), ','.join(mom09_text_files)))
      
      

### Gather dataframes from individual files and concatenate into a large dataframe for analysis

In [None]:
for file in child09_text_files:
    child_df = pd.read_csv(file, encoding='latin1', sep='\t')
child_df.head()
## concat multiple files together like above 
# ..........

In [None]:
# list column names
#for i in child_df.columns:
#    print(i)
# list categories of food from row 0 
#for i in child_df.iloc[[0]]:
#    print(child_df.loc[0, i])

In [None]:
for file in mom09_text_files:
    mom_df = pd.read_csv(file, encoding='latin1', sep='\t')
mom_df.head()