# dp_dgfp_summary

In [83]:
import os
import glob
import numpy as np
import pandas as pd

In [97]:
# Relative path to the root directory that is walked for DGFP data files.
DATA = '../../data/dgfp/data/'
# Glob pattern matching CSV files.
# NOTE(review): not referenced by the cells visible here, which pass '/*.csv'
# directly -- confirm whether this constant is still needed.
PATTERN = '*.csv'

In [173]:
def get_paths(path, pattern):
    """Return the paths matched by globbing ``pattern`` appended to ``path``.

    The two strings are joined by plain concatenation, so the caller is
    responsible for supplying any separator between them (for example a
    pattern such as ``'/*.csv'``).

    Args:
        path: Directory prefix of the glob expression.
        pattern: Glob pattern appended verbatim to ``path``.

    Returns:
        List of matching paths, in the order ``glob.glob`` yields them.
    """
    glob_expression = path + pattern
    matches = glob.glob(glob_expression)
    return matches

def list_lowest_dirs(path):
    """Return every directory under ``path`` that has no subdirectories.

    Args:
        path: Root directory handed to ``os.walk``.

    Returns:
        List of directory paths, in ``os.walk`` order, whose list of child
        directories is empty. ``path`` itself is included when it contains
        no subdirectories.
    """
    return [
        current_dir
        for current_dir, child_dirs, _files in os.walk(path)
        if not child_dirs
    ]

def file_summary(files_list, sep):
    """Build a one-row-per-file summary table for delimited text files.

    Each file is parsed with ``pandas.read_csv`` and described by its path,
    base name, shape, row and column counts, memory usage and number of
    dimensions. Files that hold no data are left out of the table: anything
    smaller than 2 bytes is skipped without being parsed, and files that
    pandas reports as empty (for example files consisting only of blank
    lines) are skipped as well instead of aborting the whole scan.

    Args:
        files_list: Iterable of paths to the files to summarise.
        sep: Field delimiter passed to ``pandas.read_csv``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``file_path``, ``file_name``,
        ``file_shape``, ``file_nrows``, ``file_ncols``, ``file_memoryusage``
        and ``file_ndim``. The columns are present even when no file
        qualifies.
    """
    columns = [
        'file_path',
        'file_name',
        'file_shape',
        'file_nrows',
        'file_ncols',
        'file_memoryusage',
        'file_ndim',
    ]
    records = []
    for file_path in files_list:
        # Cheap pre-filter: a file this small cannot hold a header and data.
        if os.stat(file_path).st_size < 2:
            continue
        try:
            frame = pd.read_csv(file_path, sep=sep)
        except pd.errors.EmptyDataError:
            # Larger than the size cut-off but still nothing to parse
            # (e.g. only blank lines); treat it like an empty file.
            continue
        n_rows, n_cols = frame.shape
        records.append({
            'file_path': file_path,
            'file_name': os.path.basename(file_path),
            'file_shape': frame.shape,
            'file_nrows': n_rows,
            'file_ncols': n_cols,
            # pandas' default (shallow) accounting, index included.
            'file_memoryusage': frame.memory_usage().sum(),
            'file_ndim': frame.ndim,
        })
    # Passing the column list keeps the schema stable when records is empty.
    return pd.DataFrame(records, columns=columns)

def check_size(input_df, min_memory=1000, min_rows=1):
    """Return a copy of a file summary with a boolean ``large_enough`` column.

    A file counts as large enough when its ``file_memoryusage`` is strictly
    greater than ``min_memory`` and its ``file_nrows`` is strictly greater
    than ``min_rows``. The input frame is never modified.

    Args:
        input_df: DataFrame with ``file_memoryusage`` and ``file_nrows``
            columns, such as the one produced by ``file_summary``.
        min_memory: Exclusive lower bound on ``file_memoryusage``.
        min_rows: Exclusive lower bound on ``file_nrows``.

    Returns:
        A deep copy of ``input_df`` with the extra ``large_enough`` column.
    """
    df = input_df.copy(deep=True)
    if df.empty:
        # Nothing to compare; still add the column, with a boolean dtype so
        # empty and non-empty results share the same schema.
        df['large_enough'] = np.zeros(len(df), dtype=bool)
        return df
    enough_memory = df['file_memoryusage'] > min_memory
    enough_rows = df['file_nrows'] > min_rows
    df['large_enough'] = enough_memory & enough_rows
    return df


def check_size_dirs(dirs_list, sep, pattern):
    """Summarise and size-check the files of several directories.

    For each directory the path is printed, the files matching ``pattern``
    are collected with ``get_paths``, summarised with ``file_summary`` and
    flagged with ``check_size``.

    Args:
        dirs_list: Iterable of directory paths to process.
        sep: Field delimiter forwarded to ``file_summary``.
        pattern: Glob pattern forwarded to ``get_paths``; it is appended
            directly to each directory path.

    Returns:
        Dict mapping each directory path to its size-checked summary frame.
    """
    summaries = {}
    for directory in dirs_list:
        print(directory)
        matched_files = get_paths(path=directory, pattern=pattern)
        summary = file_summary(matched_files, sep=sep)
        summaries[directory] = check_size(summary)
    return summaries

def select_dirs(data_dict):
    """Return the keys whose summary frame has at least one large-enough file.

    Args:
        data_dict: Mapping from a key (a directory path in this notebook) to
            a DataFrame carrying a ``large_enough`` column.

    Returns:
        List of keys, in the mapping's iteration order, whose frame is
        non-empty and has at least one truthy ``large_enough`` value.
    """
    return [
        directory
        for directory, summary in data_dict.items()
        if not summary.empty and any(summary['large_enough'])
    ]

In [176]:
# Collect every directory under DATA that has no subdirectories, then show
# how many were found and list them.
dirs_list = list_lowest_dirs(path=DATA)
print(len(dirs_list))
dirs_list

13


['../../data/dgfp/data/importance12_subdistricts/distributionGO_thanaconsuprocess',
 '../../data/dgfp/data/importance12_subdistricts/distributionNGO_thanamonthProcess',
 '../../data/dgfp/data/importance12_subdistricts/distributionNGO_thanaconsuprocess',
 '../../data/dgfp/data/importance12_subdistricts/distributionGO_thanamonthProcess',
 '../../data/dgfp/data/importance12_districts/mchgo_distmonthprocess',
 '../../data/dgfp/data/importance12_districts/ngodistrict_monthprocess',
 '../../data/dgfp/data/importance12_districts/distributionGO_distmonthProcess',
 '../../data/dgfp/data/importance12_districts/district_monthprocess',
 '../../data/dgfp/data/importance11_districts/district_process',
 '../../data/dgfp/data/importance11_districts/ngodistrict_process',
 '../../data/dgfp/data/importance11_subdistricts/ngothana_process',
 '../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess',
 '../../data/dgfp/data/importance11_subdistricts/thana_process']

In [170]:
# Summarise the CSV files of each leaf directory, parsing them as
# tab-separated. The pattern starts with '/' because get_paths concatenates
# it directly onto the directory path.
# NOTE(review): several directories report file_ncols == 1 in the output
# below, which may mean those files are not tab-separated -- confirm.
test = check_size_dirs(dirs_list=dirs_list, sep='\t', pattern='/*.csv')

../../data/dgfp/data/importance12_subdistricts/distributionGO_thanaconsuprocess
../../data/dgfp/data/importance12_subdistricts/distributionNGO_thanamonthProcess
../../data/dgfp/data/importance12_subdistricts/distributionNGO_thanaconsuprocess
../../data/dgfp/data/importance12_subdistricts/distributionGO_thanamonthProcess
../../data/dgfp/data/importance12_districts/mchgo_distmonthprocess
../../data/dgfp/data/importance12_districts/ngodistrict_monthprocess
../../data/dgfp/data/importance12_districts/distributionGO_distmonthProcess
../../data/dgfp/data/importance12_districts/district_monthprocess
../../data/dgfp/data/importance11_districts/district_process
../../data/dgfp/data/importance11_districts/ngodistrict_process
../../data/dgfp/data/importance11_subdistricts/ngothana_process
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess
../../data/dgfp/data/importance11_subdistricts/thana_process


In [172]:
# Print each directory path followed by the first rows of its summary table.
for key, value in test.items():
    print(key)
    # display is not imported in the visible cells; presumably the
    # IPython/Jupyter built-in.
    display(value.head())

../../data/dgfp/data/importance12_subdistricts/distributionGO_thanaconsuprocess


Unnamed: 0,file_path,file_name,file_shape,file_nrows,file_ncols,file_memoryusage,file_ndim,large_enough
0,../../data/dgfp/data/importance12_subdistricts...,2018-12-28-0305-03.csv,"(7, 27)",7,27,1592,2,True
1,../../data/dgfp/data/importance12_subdistricts...,2018-12-28-0114-01.csv,"(6, 27)",6,27,1376,2,True
2,../../data/dgfp/data/importance12_subdistricts...,2018-12-28-0209-02.csv,"(9, 27)",9,27,2024,2,True
3,../../data/dgfp/data/importance12_subdistricts...,2018-12-28-0205-02.csv,"(4, 27)",4,27,944,2,False
4,../../data/dgfp/data/importance12_subdistricts...,2018-12-28-0108-07.csv,"(5, 27)",5,27,1160,2,True


../../data/dgfp/data/importance12_subdistricts/distributionNGO_thanamonthProcess


Unnamed: 0,file_path,file_name,file_shape,file_nrows,file_ncols,file_memoryusage,file_ndim,large_enough


../../data/dgfp/data/importance12_subdistricts/distributionNGO_thanaconsuprocess


Unnamed: 0,file_path,file_name,file_shape,file_nrows,file_ncols,file_memoryusage,file_ndim,large_enough
0,../../data/dgfp/data/importance12_subdistricts...,2018-12-28-0305-03.csv,"(6, 22)",6,22,1136,2,True
1,../../data/dgfp/data/importance12_subdistricts...,2018-12-28-0114-01.csv,"(4, 22)",4,22,784,2,False
2,../../data/dgfp/data/importance12_subdistricts...,2018-12-28-0209-02.csv,"(5, 22)",5,22,960,2,False
3,../../data/dgfp/data/importance12_subdistricts...,2018-12-28-0205-02.csv,"(2, 22)",2,22,432,2,False
4,../../data/dgfp/data/importance12_subdistricts...,2018-12-28-0108-07.csv,"(2, 22)",2,22,432,2,False


../../data/dgfp/data/importance12_subdistricts/distributionGO_thanamonthProcess


Unnamed: 0,file_path,file_name,file_shape,file_nrows,file_ncols,file_memoryusage,file_ndim,large_enough


../../data/dgfp/data/importance12_districts/mchgo_distmonthprocess


Unnamed: 0,file_path,file_name,file_shape,file_nrows,file_ncols,file_memoryusage,file_ndim,large_enough
0,../../data/dgfp/data/importance12_districts/mc...,2018-12-28-0305-03.csv,"(33, 72)",33,72,19088,2,True
1,../../data/dgfp/data/importance12_districts/mc...,2018-12-28-0114-01.csv,"(33, 72)",33,72,19088,2,True
2,../../data/dgfp/data/importance12_districts/mc...,2018-12-28-0209-02.csv,"(33, 72)",33,72,19088,2,True
3,../../data/dgfp/data/importance12_districts/mc...,2018-12-28-0205-02.csv,"(33, 72)",33,72,19088,2,True
4,../../data/dgfp/data/importance12_districts/mc...,2018-12-28-0108-07.csv,"(33, 72)",33,72,19088,2,True


../../data/dgfp/data/importance12_districts/ngodistrict_monthprocess


Unnamed: 0,file_path,file_name,file_shape,file_nrows,file_ncols,file_memoryusage,file_ndim,large_enough
0,../../data/dgfp/data/importance12_districts/ng...,2018-12-28-0305-03.csv,"(133, 1)",133,1,15505,2,True
1,../../data/dgfp/data/importance12_districts/ng...,2018-12-28-0114-01.csv,"(133, 1)",133,1,16542,2,True
2,../../data/dgfp/data/importance12_districts/ng...,2018-12-28-0209-02.csv,"(128, 1)",128,1,15048,2,True
3,../../data/dgfp/data/importance12_districts/ng...,2018-12-28-0205-02.csv,"(124, 1)",124,1,13956,2,True
4,../../data/dgfp/data/importance12_districts/ng...,2018-12-28-0108-07.csv,"(53, 1)",53,1,4305,2,True


../../data/dgfp/data/importance12_districts/distributionGO_distmonthProcess


Unnamed: 0,file_path,file_name,file_shape,file_nrows,file_ncols,file_memoryusage,file_ndim,large_enough
0,../../data/dgfp/data/importance12_districts/di...,2018-12-28-0305-03.csv,"(135, 27)",135,27,29240,2,True
1,../../data/dgfp/data/importance12_districts/di...,2018-12-28-0114-01.csv,"(135, 27)",135,27,29240,2,True
2,../../data/dgfp/data/importance12_districts/di...,2018-12-28-0209-02.csv,"(135, 27)",135,27,29240,2,True
3,../../data/dgfp/data/importance12_districts/di...,2018-12-28-0205-02.csv,"(135, 27)",135,27,29240,2,True
4,../../data/dgfp/data/importance12_districts/di...,2018-12-28-0108-07.csv,"(136, 27)",136,27,29456,2,True


../../data/dgfp/data/importance12_districts/district_monthprocess


Unnamed: 0,file_path,file_name,file_shape,file_nrows,file_ncols,file_memoryusage,file_ndim,large_enough
0,../../data/dgfp/data/importance12_districts/di...,2018-12-28-0305-03.csv,"(137, 1)",137,1,17817,2,True
1,../../data/dgfp/data/importance12_districts/di...,2018-12-28-0114-01.csv,"(137, 1)",137,1,17681,2,True
2,../../data/dgfp/data/importance12_districts/di...,2018-12-28-0209-02.csv,"(136, 1)",136,1,17696,2,True
3,../../data/dgfp/data/importance12_districts/di...,2018-12-28-0205-02.csv,"(137, 1)",137,1,17649,2,True
4,../../data/dgfp/data/importance12_districts/di...,2018-12-28-0108-07.csv,"(137, 1)",137,1,17681,2,True


../../data/dgfp/data/importance11_districts/district_process


Unnamed: 0,file_path,file_name,file_shape,file_nrows,file_ncols,file_memoryusage,file_ndim,large_enough
0,../../data/dgfp/data/importance11_districts/di...,2018-1-28-07.csv,"(1, 1)",1,1,308,2,False
1,../../data/dgfp/data/importance11_districts/di...,2012-6-28-07.csv,"(1, 1)",1,1,308,2,False
2,../../data/dgfp/data/importance11_districts/di...,2011-12-28-04.csv,"(1, 1)",1,1,308,2,False
3,../../data/dgfp/data/importance11_districts/di...,2012-11-28-04.csv,"(1, 1)",1,1,308,2,False
4,../../data/dgfp/data/importance11_districts/di...,2014-7-28-06.csv,"(1, 1)",1,1,308,2,False


../../data/dgfp/data/importance11_districts/ngodistrict_process


Unnamed: 0,file_path,file_name,file_shape,file_nrows,file_ncols,file_memoryusage,file_ndim,large_enough
0,../../data/dgfp/data/importance11_districts/ng...,2018-1-28-07.csv,"(1, 1)",1,1,308,2,False
1,../../data/dgfp/data/importance11_districts/ng...,2015-2-28-06(1).csv,"(1, 1)",1,1,308,2,False
2,../../data/dgfp/data/importance11_districts/ng...,2012-6-28-07.csv,"(1, 1)",1,1,308,2,False
3,../../data/dgfp/data/importance11_districts/ng...,2011-12-28-04.csv,"(1, 1)",1,1,308,2,False
4,../../data/dgfp/data/importance11_districts/ng...,2008-6-28-01(1).csv,"(1, 1)",1,1,308,2,False


../../data/dgfp/data/importance11_subdistricts/ngothana_process


Unnamed: 0,file_path,file_name,file_shape,file_nrows,file_ncols,file_memoryusage,file_ndim,large_enough
0,../../data/dgfp/data/importance11_subdistricts...,2012-9-28-0103-07.csv,"(4, 1)",4,1,656,2,False
1,../../data/dgfp/data/importance11_subdistricts...,2013-1-28-0204-02.csv,"(3, 1)",3,1,540,2,False
2,../../data/dgfp/data/importance11_subdistricts...,2010-4-28-0602-06.csv,"(1, 1)",1,1,308,2,False
3,../../data/dgfp/data/importance11_subdistricts...,2013-9-28-0304-03.csv,"(1, 1)",1,1,308,2,False
4,../../data/dgfp/data/importance11_subdistricts...,2011-1-28-0209-02.csv,"(5, 1)",5,1,772,2,False


../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess


Unnamed: 0,file_path,file_name,file_shape,file_nrows,file_ncols,file_memoryusage,file_ndim,large_enough
0,../../data/dgfp/data/importance11_subdistricts...,2018-9-28-0415-08.csv,"(12, 72)",12,72,6992,2,True
1,../../data/dgfp/data/importance11_subdistricts...,2016-2-28-0306-03.csv,"(7, 72)",7,72,4112,2,True
2,../../data/dgfp/data/importance11_subdistricts...,2017-2-28-0101-07.csv,"(13, 72)",13,72,7568,2,True
3,../../data/dgfp/data/importance11_subdistricts...,2018-5-28-0508-05.csv,"(8, 72)",8,72,4688,2,True
4,../../data/dgfp/data/importance11_subdistricts...,2018-7-28-0109-01.csv,"(12, 72)",12,72,6992,2,True


../../data/dgfp/data/importance11_subdistricts/thana_process


Unnamed: 0,file_path,file_name,file_shape,file_nrows,file_ncols,file_memoryusage,file_ndim,large_enough
0,../../data/dgfp/data/importance11_subdistricts...,2012-9-28-0103-07.csv,"(6, 1)",6,1,888,2,False
1,../../data/dgfp/data/importance11_subdistricts...,2013-1-28-0204-02.csv,"(7, 1)",7,1,1004,2,True
2,../../data/dgfp/data/importance11_subdistricts...,2010-4-28-0602-06.csv,"(9, 1)",9,1,1236,2,True
3,../../data/dgfp/data/importance11_subdistricts...,2013-9-28-0304-03.csv,"(5, 1)",5,1,772,2,False
4,../../data/dgfp/data/importance11_subdistricts...,2011-1-28-0209-02.csv,"(10, 1)",10,1,1352,2,True


In [177]:
# Keep only the directories whose summary has at least one file flagged
# large_enough, then show how many remain and list them.
selected_dirs_list = select_dirs(test)
print(len(selected_dirs_list))
selected_dirs_list

9


['../../data/dgfp/data/importance12_subdistricts/distributionGO_thanaconsuprocess',
 '../../data/dgfp/data/importance12_subdistricts/distributionNGO_thanaconsuprocess',
 '../../data/dgfp/data/importance12_districts/mchgo_distmonthprocess',
 '../../data/dgfp/data/importance12_districts/ngodistrict_monthprocess',
 '../../data/dgfp/data/importance12_districts/distributionGO_distmonthProcess',
 '../../data/dgfp/data/importance12_districts/district_monthprocess',
 '../../data/dgfp/data/importance11_subdistricts/ngothana_process',
 '../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess',
 '../../data/dgfp/data/importance11_subdistricts/thana_process']