In [3]:
import requests
import json
import pandas as pd
import seaborn as sns
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import urllib
from pprint import pprint

In [4]:
def get_data(url,**kwargs):
    """Download every page of a paginated API endpoint into one DataFrame.

    Parameters
    ----------
    url : str
        Endpoint to query.
    **kwargs
        Query parameters sent with every request (for example ``format="json"``).

    Returns
    -------
    pandas.DataFrame
        One row per record; nested dictionaries are flattened with
        ``flatten_json``.

    Raises
    ------
    requests.HTTPError
        If any request returns a 4xx/5xx status.
    """
    # Let requests build the query string: this stays valid when no parameters
    # are given and does not depend on urllib.parse having been imported.
    response = requests.get(url, params=kwargs)
    response.raise_for_status()
    # The first response is only used to learn how many pages exist.
    num_pages = response.json()["last_page"]

    data = []
    for page in range(1, num_pages + 1):
        response = requests.get(url, params={**kwargs, "page": page})
        # Fail loudly here instead of with a confusing KeyError further down.
        response.raise_for_status()
        data.extend(response.json()["data"]["data"])

    return pd.DataFrame([flatten_json(d) for d in data])

In [5]:
def flatten_json(y):
    """Flatten nested dictionaries into a single-level dictionary.

    Keys of nested dictionaries are joined with ``_`` (``{"a": {"b": 1}}``
    becomes ``{"a_b": 1}``). Only values whose type is exactly ``dict`` are
    descended into; everything else, lists included, is stored unchanged as a
    leaf value.
    """
    flat = {}

    def _walk(node, prefix=''):
        # Leaf: store it under the accumulated key, minus the trailing "_".
        if type(node) is not dict:
            flat[prefix[:-1]] = node
            return
        for key, value in node.items():
            _walk(value, prefix + key + '_')

    _walk(y)
    return flat

In [6]:
def my_tuple(data):
    """Collapse a collection of values into a scalar, a tuple or NaN.

    Parameters
    ----------
    data : sized iterable
        The values to aggregate.

    Returns
    -------
    object
        The single element if ``data`` holds exactly one value, a tuple of all
        values if it holds several, and ``numpy.nan`` if no element of
        ``data`` is truthy (this includes empty input).
    """
    # NOTE(review): any() also treats groups made only of falsy values
    # (0, "", None) as empty -- confirm that this is intended.
    if any(data):
        if len(data) == 1:
            return tuple(data)[0]
        return tuple(data)
    # The value has to be returned explicitly, otherwise the function yields
    # None; np.nan is used because the np.NaN alias no longer exists in NumPy 2.
    return np.nan

In [9]:
# Root of the REST API. Change this single value to point the notebook at
# another server; every endpoint below is derived from it.
API_BASE = "http://0.0.0.0:8000/api/v1/"

url_characteristica = API_BASE + "characteristica_elastic/"
url_outputs = API_BASE + "outputs_elastic/"
url_timecourses = API_BASE + "timecourses_elastic/"
url_interventions = API_BASE + "interventions_elastic/"
url_individuals = API_BASE + "individuals_elastic/"
url_groups = API_BASE + "groups_elastic/"

# Query parameters sent with the requests for outputs, timecourses and interventions.
base_params = {"format":"json", "final":"true"}


## Outputs

Preprocess output data.

In [34]:
# load output data
df_outputs = get_data(url_outputs, **base_params)

# drop unimportant columns (cleaning): first the columns that are entirely empty,
# then the listed ones. errors="ignore" keeps this from raising a KeyError when
# one of the listed columns was already removed because it was entirely empty.
df_outputs = df_outputs.dropna(how="all", axis=1).drop(
    ["final", "individual_name", "group_name"], axis=1, errors="ignore"
)

# reduce the content of interventions to a list of the pk values of the interventions
df_outputs["interventions"] = df_outputs["interventions"].apply(
    lambda interventions: [x["pk"] for x in interventions]
)

# unstack outputs by the list of pks in the interventions column:
# every other column is repeated once per pk, so a row with an empty list is dropped
lst_col = 'interventions'
column_order = df_outputs.columns.tolist()
repeats = df_outputs[lst_col].str.len()
repeated = {
    col: np.repeat(df_outputs[col].values, repeats)
    for col in df_outputs.columns.difference([lst_col])
}
flat_pks = np.concatenate(df_outputs[lst_col].values)
df_outputs = pd.DataFrame(repeated).assign(**{lst_col: flat_pks})[column_order]

# change the format of the intervention pks to int
df_outputs["interventions"] = df_outputs["interventions"].astype(int)

# sort columns by number of non-NaN values
df_outputs = df_outputs[df_outputs.count().sort_values(ascending=False).index]
df_outputs = df_outputs.set_index("pk")

Save output data.

In [35]:
# Write the processed outputs to disk as a tab-separated file and as an Excel workbook.
df_outputs.to_csv("outputs.tsv", sep="\t")
df_outputs.to_excel("outputs.xlsx")

## Timecourses

In [10]:
# load timecourse data
df_timecourses = get_data(url_timecourses, **base_params)

# drop unimportant columns (cleaning): first the columns that are entirely empty,
# then the listed ones. errors="ignore" keeps this from raising a KeyError when
# one of the listed columns was already removed because it was entirely empty.
df_timecourses = df_timecourses.dropna(how="all", axis=1).drop(
    ["final", "individual_name", "group_name"], axis=1, errors="ignore"
)

# reduce the content of interventions to a list of the pk values of the interventions
df_timecourses["interventions"] = df_timecourses["interventions"].apply(
    lambda interventions: [x["pk"] for x in interventions]
)

# unstack timecourses by the list of pks in the interventions column:
# every other column is repeated once per pk, so a row with an empty list is dropped
lst_col = 'interventions'
column_order = df_timecourses.columns.tolist()
repeats = df_timecourses[lst_col].str.len()
repeated = {
    col: np.repeat(df_timecourses[col].values, repeats)
    for col in df_timecourses.columns.difference([lst_col])
}
flat_pks = np.concatenate(df_timecourses[lst_col].values)
df_timecourses = pd.DataFrame(repeated).assign(**{lst_col: flat_pks})[column_order]

# change the format of the intervention pks to int
df_timecourses["interventions"] = df_timecourses["interventions"].astype(int)

# sort columns by number of non-NaN values
df_timecourses = df_timecourses[df_timecourses.count().sort_values(ascending=False).index]
df_timecourses = df_timecourses.set_index("pk")

In [12]:
# Write the processed timecourses to disk as a tab-separated file and as an Excel workbook.
df_timecourses.to_csv("timecourses.tsv", sep="\t")
df_timecourses.to_excel("timecourses.xlsx")

## Interventions

In [36]:
# load intervention data
df_interventions = get_data(url_interventions, **base_params)

# drop unimportant columns (cleaning): first the columns that are entirely empty,
# then "final". errors="ignore" keeps this from raising a KeyError when "final"
# was already removed because it was entirely empty.
df_interventions = df_interventions.dropna(how="all", axis=1).drop(
    "final", axis=1, errors="ignore"
)

# sort columns by number of non-NaN values
df_interventions = df_interventions[df_interventions.count().sort_values(ascending=False).index]
df_interventions = df_interventions.set_index("pk")

Save intervention data.

In [37]:
# Write the processed interventions to disk as a tab-separated file and as an Excel workbook.
df_interventions.to_csv("interventions.tsv", sep="\t")
df_interventions.to_excel("interventions.xlsx")

## Characteristica

In [13]:
# load individual and group data
# A dedicated parameter dict is used instead of rebinding `base_params`:
# overwriting the shared name made the other loading cells return different
# data depending on whether this cell had already been executed.
subject_params = {"format": "json"}
df_individuals = get_data(url_individuals, **subject_params)
df_groups = get_data(url_groups, **subject_params)

In [14]:
def preprocess_characteristica(df_subject):
    """Pivot the characteristica of subjects into one row per subject.

    Parameters
    ----------
    df_subject : pandas.DataFrame
        Needs the columns ``characteristica_all_final``, ``study_name``,
        ``pk`` and ``name``. Each ``characteristica_all_final`` cell is
        expanded as a list of dictionaries that contain at least the keys
        ``pk``, ``ctype`` and ``category``.

    Returns
    -------
    pandas.DataFrame
        Indexed by (study, subject_pk, subject_name) with two column levels
        (category, field). Categories are ordered by their number of non-null
        entries, most populated first; columns without any value are removed.
    """
    lst_col = 'characteristica_all_final'

    # One row per (subject, characteristic): every other column is repeated
    # once per list entry, so subjects with an empty list are dropped.
    repeats = df_subject[lst_col].str.len()
    repeated = {
        col: np.repeat(df_subject[col].values, repeats)
        for col in df_subject.columns.difference([lst_col])
    }
    df_subject = pd.DataFrame(repeated).assign(
        **{lst_col: np.concatenate(df_subject[lst_col].values)}
    )[df_subject.columns.tolist()]

    # Expand each characteristic dictionary into its own columns.
    df = df_subject[lst_col].apply(pd.Series)
    df["study"] = df_subject["study_name"]
    df = df.drop(["pk", "ctype"], axis=1)
    df["subject_pk"] = df_subject["pk"]
    df["subject_name"] = df_subject["name"]

    df = df.pivot_table(index=["study","subject_pk","subject_name"], columns=["category"], aggfunc=my_tuple)
    # Put the category on the outer column level.
    df.columns = df.columns.swaplevel(0, 1)

    # Rank categories by the non-null count of their best-filled field.
    # Series.max(level=0) no longer exists in pandas 2; groupby(level=0, sort=False)
    # is the replacement that keeps the categories in order of first appearance.
    counts_per_category = (
        df.groupby(level=0).count().sum().groupby(level=0, sort=False).max()
    )
    df = df[counts_per_category.sort_values(ascending=False).index]

    # dropna returns a new frame; without using its result nothing is removed.
    return df.dropna(how="all", axis=1)


In [15]:
def merge_groups_individuals(df_groups_pivot,df_individuals_pivot):
    """Stack the pivoted group and individual tables into one table.

    Parameters
    ----------
    df_groups_pivot, df_individuals_pivot : pandas.DataFrame
        Frames indexed by (study, subject_pk, subject_name) with two column
        levels.

    Returns
    -------
    pandas.DataFrame
        Indexed by (study, subject_type, subject_pk, subject_name), where
        ``subject_type`` is ``"group"`` or ``"individual"``. Top-level columns
        are ordered by their number of non-null entries, most populated
        first; columns without any value are removed and rows are sorted by
        index.
    """
    df = (
        pd.concat([df_groups_pivot, df_individuals_pivot], keys=["group", "individual"])
        # The key level added by concat has no name; reset_index calls it "level_0".
        .reset_index()
        .rename(columns={"level_0": "subject_type"})
        .set_index(["study", "subject_type", "subject_pk", "subject_name"])
    )

    # Rank top-level columns by the non-null count of their best-filled field.
    # Series.max(level=0) no longer exists in pandas 2; groupby(level=0, sort=False)
    # is the replacement that keeps the labels in order of first appearance.
    counts_per_category = (
        df.groupby(level=0).count().sum().groupby(level=0, sort=False).max()
    )
    df = df[counts_per_category.sort_values(ascending=False).index]
    df = df.dropna(how="all", axis=1)
    return df.sort_index()
    


In [16]:
# Pivot individuals and groups separately, then stack both results into a single table.
df_individuals_pivot = preprocess_characteristica(df_individuals)
df_groups_pivot = preprocess_characteristica(df_groups)
df_all_subjects = merge_groups_individuals(df_groups_pivot,df_individuals_pivot)

In [17]:
# Show the raw group rows of a single study.
df_groups.loc[df_groups["study_name"] == "Bonati1982"]

Unnamed: 0,characteristica_all_final,count,name,parent,parent_name,parent_pk,pk,study_name,study_pk
0,"[{'pk': 1899, 'count': 4, 'category': 'age', '...",4,S1,,,,195,Bonati1982,7083737


In [18]:
# Write each subject table to disk twice: as an Excel workbook and as a tab-separated file.
# Groups
df_groups_pivot.to_excel("groups.xlsx")
df_groups_pivot.to_csv("groups.tsv", sep="\t")

# Individuals
df_individuals_pivot.to_excel("individuals.xlsx")
df_individuals_pivot.to_csv("individuals.tsv", sep="\t")

# Groups and individuals combined
df_all_subjects.to_csv("all_subjects.tsv", sep="\t")
df_all_subjects.to_excel("all_subjects.xlsx")