# Functions to handle nested lists & dictionaries in dataframes

changelog:
- change 'dataframe' parameters to 'df'
- change return statements (remove parens)
- columns_to_expand: change row index 1 to 0 (0-based python indexing), make list comprehension
- unnest_column: modify df & assign back to df instead of new df with more complex name (saves from creating variables just to change and not use them)
- expand_columns: 
    - remove \_i\_ variables to save space (instead of making new variables for every loop)
    - change loop index to be col instead of i (better readability)
    - remove code that casts each label as a string, since that is already the default for the code
    - change vars for column name expansion to be more readable
    - add 'remove' flag for removing source column (would be redundant info)

### Flatten nested DataFrame

In [11]:
def is_nested(col):
    return any([True for x in col if type(x) in (dict, list)])

In [8]:
def expand_series(series):
    # Ensure variable is of proper type
    try:
        assert isinstance(series, pd.Series)
    except AssertionError:
        raise ValueError(f'series argument must be of type pd.Series, not {type(series)}')
        
    ## Expand column
    col_expand = series.apply(pd.Series).dropna(axis=1, how='all')
    col_name = series.name

    ## Rename expanded columns to clarify source
    #add source column name prefix to each new column to identify source
    col_expand_names = [f'{col_name}_{nested_col}' for nested_col in col_expand.columns]
    #rename columns
    col_expand.columns = col_expand_names

    return col_expand

In [9]:
def flatten_nested_df(df):
    """
    Takes in a DataFrame and flattens any nested columns.
    
    Params:
    - df: pd.DataFrame
    """
    # If the df is empty, then we want to return
    if df.empty:
        return df
    # If the df is a Series, convert to DataFrame
    if isinstance(df, pd.Series):
        df = pd.DataFrame(df)
    # If there's only one column & it's not nested, return it
    if df.shape[1] == 1:
        if not is_nested(df.iloc[:,0]):
            return df
    
    # Slice the first column off of the DataFrame
    first_col, df = df.iloc[:, 0], df.iloc[:, 1:]
    
    # If the first column is nested, unnest it
    if is_nested(first_col):
        first_col = expand_series(first_col)
    
    # Want to flatten the first column again (in case nested nested) and combine with the rest of the flattened df
    return flatten_nested_df(first_col).join(flatten_nested_df(df))

In [18]:
%%timeit
df = pd.DataFrame(output['hits']['hits'])
flatten_nested_df(df).head()

KeyboardInterrupt: 

In [21]:
pd.DataFrame([flatten(result) for result in output['hits']['hits']])

Unnamed: 0,conceptdoi,conceptrecid,created,doi,files_0_bucket,files_0_checksum,files_0_key,files_0_links_self,files_0_size,files_0_type,...,files_206_key,files_206_links_self,files_206_size,files_206_type,files_207_bucket,files_207_checksum,files_207_key,files_207_links_self,files_207_size,files_207_type
0,10.5281/zenodo.3662112,3662112,2020-02-11T09:00:48.461959+00:00,10.5281/zenodo.3662113,54dc6794-9135-4217-af9e-cc1649960b90,md5:c4da02730a59a3dea2f1cbad807ac198,opium-sh/prl-v0.1.0.zip,https://zenodo.org/api/files/54dc6794-9135-421...,54220.0,zip,...,,,,,,,,,,
1,10.5281/zenodo.3530883,3530883,2019-11-06T21:34:30.631777+00:00,10.5281/zenodo.3530884,0b2a69fa-3fcf-46d6-b8e5-fbffe4118952,md5:643878148673ee42bd3dba97317de02a,kratzert/ealstm_regional_modeling-v1.0.zip,https://zenodo.org/api/files/0b2a69fa-3fcf-46d...,13877577.0,zip,...,,,,,,,,,,
2,10.5281/zenodo.4738769,4738769,2021-05-05T10:21:43.604973+00:00,10.5281/zenodo.4738770,fdefeabc-7897-4130-9628-438795c877c2,md5:0c8ea118118b0300a150b7f54ffc56e8,kratzert/multiple_forcing-v1.0.zip,https://zenodo.org/api/files/fdefeabc-7897-413...,133317.0,zip,...,,,,,,,,,,
3,,4768051,2021-05-17T17:53:16.165204+00:00,10.1007/s10994-021-05968-x,a43e8b77-a43a-488c-8e02-489f02047271,md5:82cb35e198d55ae12aef1e51f1aefb10,Škrlj2021_Article_AutoBOTEvolvingNeuro-symbol...,https://zenodo.org/api/files/a43e8b77-a43a-488...,3000278.0,pdf,...,,,,,,,,,,
4,,3461067,2019-09-26T00:12:55.671236+00:00,10.1007/s10994-019-05800-7,9503c5c7-366f-42f6-8400-136e06048bf5,md5:1154a531d98a264cf0be1c2ff5c6df58,mlsvm-1.1.0.tar.gz,https://zenodo.org/api/files/9503c5c7-366f-42f...,15351705.0,gz,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,10.5281/zenodo.3885379,3885379,2020-06-08T13:45:13.634996+00:00,10.5281/zenodo.3885380,7f3afc61-6fd0-490d-b33c-4ebb9f4114f3,md5:eac2b256689593bb3e64b00086ac3180,Report_Ravi_Charan Nudurupati.pdf,https://zenodo.org/api/files/7f3afc61-6fd0-490...,1222283.0,pdf,...,,,,,,,,,,
996,10.5281/zenodo.4330625,4330625,2020-12-17T00:12:53.596641+00:00,10.5281/zenodo.4330626,f153353f-3eb6-42ae-8c39-499472ccdf18,md5:257084366693d11c1ac4095d7dd9d06b,czi_2020.pptx,https://zenodo.org/api/files/f153353f-3eb6-42a...,89373190.0,pptx,...,,,,,,,,,,
997,10.5281/zenodo.3982839,3982839,2020-08-13T16:31:08.841046+00:00,10.5281/zenodo.3982840,0f00582c-eeff-4529-b26a-e57d03f17487,md5:70be06a4f15f50744b9a8ab1d0c87b93,01 Paper 01072023 IJCSIS Camera Ready pp1-5.pdf,https://zenodo.org/api/files/0f00582c-eeff-452...,709009.0,pdf,...,,,,,,,,,,
998,10.5281/zenodo.4738546,4738546,2021-05-05T08:27:32.575962+00:00,10.5281/zenodo.4738547,715bcc1b-7643-4f77-bd70-d1af56c99f92,md5:fe0e423ffde47ff3c9bc4132622bfd72,STOCK_MARKET_PREDICTION_USING_ANFIS_MACHINE_LE...,https://zenodo.org/api/files/715bcc1b-7643-4f7...,426325.0,pdf,...,,,,,,,,,,


### Example using Zenodo API output

In [2]:
#import libraries
import os
import requests
import pandas as pd
import pickle
import pprint as pp
from flatten_json import flatten

In [3]:
#load credentials
with open('credentials.pkl', 'rb') as credentials:
        ZENODO_TOKEN = pickle.load(credentials)['ZENODO_TOKEN']

In [15]:
#Specify search term
SEARCH_TERM = 'machine learning'

#Specify parameters
PARAMS = {'q': SEARCH_TERM,
         'access_token': ZENODO_TOKEN,
         'page': 1,
         'size': 1000}

#Search
response = requests.get('https://zenodo.org/api/records', 
                        params = PARAMS)

In [16]:
#Put output into json format
output = response.json()

In [6]:
#View output
pp.pprint(output)

{'aggregations': {'access_right': {'buckets': [{'doc_count': 28, 'key': 'open'},
                                               {'doc_count': 3,
                                                'key': 'closed'}],
                                   'doc_count_error_upper_bound': 0,
                                   'sum_other_doc_count': 0},
                  'file_type': {'buckets': [{'doc_count': 6, 'key': 'txt'},
                                            {'doc_count': 5, 'key': 'csv'},
                                            {'doc_count': 5, 'key': 'pdf'},
                                            {'doc_count': 4, 'key': 'html'},
                                            {'doc_count': 4, 'key': 'xlsx'},
                                            {'doc_count': 3, 'key': 'zip'},
                                            {'doc_count': 2, 'key': 'gz'},
                                            {'doc_count': 2, 'key': 'nex'},
                                            {'do

                                                'generalized linear mixed '
                                                'models and a model averaging '
                                                'approach, we examined the '
                                                'impacts of environmental and '
                                                'individual phenotypic '
                                                'parameters on FGMs. We found '
                                                'pronounced interspecific '
                                                'differences, with '
                                                'environmental parameters '
                                                'being better predictors of '
                                                'FGMs in T. alpinus. FGMs in '
                                                'this species were '
                                                'particularly elevated in less '
      

In [None]:
#Convert output to pd dataframe and see table format
output_df = pd.json_normalize(output)

output_df

In [None]:
## Extract 'hits.hits' column and convert to dataframe
# note that name of column needs to be as string
hits_df = unnest_column(output_df, 'hits.hits')

In [None]:
output_df['hits.hits']

In [None]:
hits_df.head()

In [None]:
## Determine which columns are themselves lists or dictionaries and need to be unnested
cols_nested = columns_to_expand(hits_df)

In [None]:
cols_nested

In [None]:
## For each list or dict column, expand column, rename columns to clarify source, and merge
unnested_output_ = expand_columns(hits_df, cols_nested)

In [None]:
unnested_output.head()

In [None]:
unnested_output[['files_0', 'files_1', 'files_2', 'files_3']]

In [None]:
test_dict = {'a': [test_dict, test_dict], 'b': [4,5,6]}
df = pd.DataFrame(test_dict)

In [None]:
df