In [1]:
import glob
import pandas as pd
import pytz
from tzlocal import get_localzone
from dateutil.relativedelta import relativedelta
import datetime
%matplotlib inline

In [2]:
def get_utc(date, subtract_months=0):
    """
    Get date in utc, based on timezone of computer where script is running on

    Parameters
    ----------
    date : datetime object
        naive datetime expressed in local time,
        example date = datetime.strptime ("2001-2-3 10:11:12", "%Y-%m-%d %H:%M:%S")
    subtract_months : integer
        months to subtract from the date (applied after the conversion to UTC)

    Returns
    -------
    datetime object
        timezone-aware datetime in UTC

    Raises
    ------
    ValueError
        if date already carries a tzinfo
    """
    tz = get_localzone()
    if hasattr(tz, 'localize'):
        # pytz-style zone: localize() attaches the zone with the right offset;
        # is_dst=None makes ambiguous / non-existent wall times raise instead
        # of being guessed.
        local_dt = tz.localize(date, is_dst=None)
    else:
        # Zone object without localize() (e.g. a zoneinfo zone): attach it
        # directly. Keep the same contract as localize() for aware input.
        if date.tzinfo is not None:
            raise ValueError('Not naive datetime (tzinfo is already set)')
        local_dt = date.replace(tzinfo=tz)
    utc_dt = local_dt.astimezone(pytz.utc)
    if subtract_months != 0:
        utc_dt = utc_dt - relativedelta(months=+subtract_months)
    return utc_dt

In [3]:
def gpmlist2dataframe(gpm_files):
    """
    gpm files to pandas dataframe with columns for the start and end date
    start date is set as index

    The date and the S/E times are read from the file-name part of each path
    (pattern YYYYMMDD-SHHMMSS-EHHMMSS), so dots or the letters S/E elsewhere
    in the directory path do not disturb the parsing.

    Parameters
    ----------
    gpm_files : list
        list with gpm file paths

    Returns
    -------
    pandas.DataFrame
        one row per file, in sorted path order; the index 'date_start' is
        UTC-aware, column 'date_end' is a naive timestamp and column 'name'
        holds the file path. An empty list gives an empty frame.

    Raises
    ------
    ValueError
        if a path does not contain the expected date/time stamp
    """
    paths = pd.Series(sorted(gpm_files), dtype=object)

    # The trailing [^\\/]*$ pins the match to the last path component.
    stamp = r'(\d{8})-S(\d{6})-E(\d{6})[^\\/]*$'
    parts = paths.str.extract(stamp, expand=True)

    unmatched = parts.isna().any(axis=1)
    if unmatched.any():
        raise ValueError(
            'no GPM date/time stamp in: {0}'.format(paths[unmatched].tolist())
        )

    # Explicit format instead of letting pandas guess from a bare digit string.
    fmt = '%Y%m%d%H%M%S'
    df = pd.DataFrame({
        'date_start': pd.to_datetime(parts[0] + parts[1], format=fmt),
        'date_end': pd.to_datetime(parts[0] + parts[2], format=fmt),
        'name': paths,
    })
    return df.set_index('date_start').tz_localize('UTC')

In [13]:
# Number of days subtracted from "now" to get the cut-off date further down,
# and the folder that is scanned for GPM files.
days_no = 1
path_in = r'D:\My Projects\gpm2thredds\trunk\test\test_fetch_gpm'
gpm_files = glob.glob(path_in + '/*')

In [14]:
# Show every path returned by the glob call.
gpm_files

['D:\\My Projects\\gpm2thredds\\trunk\\test\\test_fetch_gpm\\3B-HHR-E.MS.MRG.3IMERG.20180110-S073000-E075959.0450.V05B.RT-H5',
 'D:\\My Projects\\gpm2thredds\\trunk\\test\\test_fetch_gpm\\3B-HHR-E.MS.MRG.3IMERG.20180110-S080000-E082959.0480.V05B.RT-H5',
 'D:\\My Projects\\gpm2thredds\\trunk\\test\\test_fetch_gpm\\3B-HHR-E.MS.MRG.3IMERG.20180110-S083000-E085959.0510.V05B.RT-H5',
 'D:\\My Projects\\gpm2thredds\\trunk\\test\\test_fetch_gpm\\3B-HHR-E.MS.MRG.3IMERG.20180110-S090000-E092959.0540.V05B.RT-H5',
 'D:\\My Projects\\gpm2thredds\\trunk\\test\\test_fetch_gpm\\3B-HHR-E.MS.MRG.3IMERG.20180110-S093000-E095959.0570.V05B.RT-H5',
 'D:\\My Projects\\gpm2thredds\\trunk\\test\\test_fetch_gpm\\3B-HHR-E.MS.MRG.3IMERG.20180110-S100000-E102959.0600.V05B.RT-H5',
 'D:\\My Projects\\gpm2thredds\\trunk\\test\\test_fetch_gpm\\3B-HHR-E.MS.MRG.3IMERG.20180110-S103000-E105959.0630.V05B.RT-H5',
 'D:\\My Projects\\gpm2thredds\\trunk\\test\\test_fetch_gpm\\3B-HHR-E.MS.MRG.3IMERG.20180110-S110000-E112959.06

In [15]:
# Parse the GPM file names into a frame indexed by start time (UTC).
df = gpmlist2dataframe(gpm_files)
# Cut-off date: current local time converted to UTC, minus days_no days.
t_date = get_utc(datetime.datetime.now()) - relativedelta(days=days_no)

In [16]:
# Sample file name used to try out the string split in the next cell.
string = 'NC.S20171127-0830.AM-E20171127-1030.PM-P02.HOUR-bd.nc'

In [17]:
# Keep the text after the 'M-P' separator and put the leading 'P' back.
'P{0}'.format(string.split('M-P')[1])

'P02.HOUR-bd.nc'

In [18]:
# Position of the row whose start time is closest to the cut-off date.
# Index.get_loc(..., method='nearest') no longer exists in pandas 2.x;
# get_indexer with method='nearest' returns the same position.
index_date = int(df.index.get_indexer([t_date], method='nearest')[0])
# Rows before that position are the ones selected for clean-up.
sel_df = df.iloc[:index_date]

for name_path in sel_df['name']:
    print(name_path)
    # Deletion is intentionally disabled here; only the candidates are listed.
    #os.remove(name_path)

D:\My Projects\gpm2thredds\trunk\test\test_fetch_gpm\3B-HHR-E.MS.MRG.3IMERG.20180110-S073000-E075959.0450.V05B.RT-H5
D:\My Projects\gpm2thredds\trunk\test\test_fetch_gpm\3B-HHR-E.MS.MRG.3IMERG.20180110-S080000-E082959.0480.V05B.RT-H5
D:\My Projects\gpm2thredds\trunk\test\test_fetch_gpm\3B-HHR-E.MS.MRG.3IMERG.20180110-S083000-E085959.0510.V05B.RT-H5
D:\My Projects\gpm2thredds\trunk\test\test_fetch_gpm\3B-HHR-E.MS.MRG.3IMERG.20180110-S090000-E092959.0540.V05B.RT-H5
D:\My Projects\gpm2thredds\trunk\test\test_fetch_gpm\3B-HHR-E.MS.MRG.3IMERG.20180110-S093000-E095959.0570.V05B.RT-H5
D:\My Projects\gpm2thredds\trunk\test\test_fetch_gpm\3B-HHR-E.MS.MRG.3IMERG.20180110-S100000-E102959.0600.V05B.RT-H5
D:\My Projects\gpm2thredds\trunk\test\test_fetch_gpm\3B-HHR-E.MS.MRG.3IMERG.20180110-S103000-E105959.0630.V05B.RT-H5
D:\My Projects\gpm2thredds\trunk\test\test_fetch_gpm\3B-HHR-E.MS.MRG.3IMERG.20180110-S110000-E112959.0660.V05B.RT-H5
D:\My Projects\gpm2thredds\trunk\test\test_fetch_gpm\3B-HHR-E.MS

In [19]:
# Preview the first rows of the selection.
sel_df.head()

Unnamed: 0_level_0,date_end,name
date_start,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-10 07:30:00+00:00,2018-01-10 07:59:59,D:\My Projects\gpm2thredds\trunk\test\test_fet...
2018-01-10 08:00:00+00:00,2018-01-10 08:29:59,D:\My Projects\gpm2thredds\trunk\test\test_fet...
2018-01-10 08:30:00+00:00,2018-01-10 08:59:59,D:\My Projects\gpm2thredds\trunk\test\test_fet...
2018-01-10 09:00:00+00:00,2018-01-10 09:29:59,D:\My Projects\gpm2thredds\trunk\test\test_fet...
2018-01-10 09:30:00+00:00,2018-01-10 09:59:59,D:\My Projects\gpm2thredds\trunk\test\test_fet...


In [21]:
# Number of days subtracted from "now" to get the cut-off date further down,
# and the folder that is scanned for converted nc files.
days_no = 1
path_in = r'D:\My Projects\gpm2thredds\trunk\test\test_to_thredds'
nc_files = glob.glob(path_in + '/*')

In [23]:
# Sorted nc file paths as a Series, so the .str accessor can be used on them.
s = pd.Series(sorted(nc_files))

In [66]:
# Strip the S/E/A/P/M marker letters, keep the two dot-separated pieces that
# hold the start and end stamps, and split them into date/time columns.
# regex=True and n= are spelled out because pandas 2.x treats the replace
# pattern literally by default and only accepts n as a keyword in str.split.
df = (
    s.str.replace('[SEAPM]', '', regex=True)
    .str.split('.', n=7)
    .str[1:3]
    .str.join('')
    .str.split('-', n=3, expand=True)
)

In [67]:
# First row of the split result: start date, start time, end date, end time.
df.iloc[0]

0    20180107
1        1130
2    20180108
3        1130
Name: 0, dtype: object

In [70]:
# NOTE(review): nclist2dataframe is defined in a cell further down this
# notebook; on a fresh kernel that cell has to run before this one.
df_nc = nclist2dataframe(nc_files)
# Cut-off date: current local time converted to UTC, minus days_no days.
t_date = get_utc(datetime.datetime.now()) - relativedelta(days=days_no)

In [71]:
# Position of the row whose start time is closest to the cut-off date.
# Index.get_loc(..., method='nearest') no longer exists in pandas 2.x;
# get_indexer with method='nearest' returns the same position.
index_date = int(df_nc.index.get_indexer([t_date], method='nearest')[0])
# Rows before that position are the ones selected for clean-up.
sel_df_nc = df_nc.iloc[:index_date]

for name_path in sel_df_nc['name']:
    print(name_path)
    # Deletion is intentionally disabled here; only the candidates are listed.
    #os.remove(name_path)

D:\My Projects\gpm2thredds\trunk\test\test_to_thredds\NC.S20180107-1130.AM-E20180108-1130.AM-P24.HOUR-bangladesh.nc
D:\My Projects\gpm2thredds\trunk\test\test_to_thredds\NC.S20180107-2330.PM-E20180108-1130.AM-P12.HOUR-bangladesh.nc
D:\My Projects\gpm2thredds\trunk\test\test_to_thredds\NC.S20180108-0330.AM-E20180108-1130.AM-P08.HOUR-bangladesh.nc
D:\My Projects\gpm2thredds\trunk\test\test_to_thredds\NC.S20180108-0930.AM-E20180108-1130.AM-P02.HOUR-bangladesh.nc
D:\My Projects\gpm2thredds\trunk\test\test_to_thredds\NC.S20180110-0730.AM-E20180111-0730.AM-P24.HOUR-bangladesh.nc


In [68]:
def nclist2dataframe(nc_files):
    """
    nc files to pandas dataframe with columns for the start and end date
    start date is set as index

    The start and end stamps are read from the file-name part of each path
    (pattern SYYYYMMDD-HHMM.AM-EYYYYMMDD-HHMM.AM, the AM/PM marker is
    ignored), so dots, hyphens or the letters S/E/A/P/M elsewhere in the
    directory path do not disturb the parsing.

    Parameters
    ----------
    nc_files : list
        list with converted nc file paths

    Returns
    -------
    pandas.DataFrame
        one row per file, in sorted path order; the index 'date_start' is
        UTC-aware, column 'date_end' is a naive timestamp and column 'name'
        holds the file path. An empty list gives an empty frame.

    Raises
    ------
    ValueError
        if a path does not contain the expected start/end stamps
    """
    paths = pd.Series(sorted(nc_files), dtype=object)

    # The trailing [^\\/]*$ pins the match to the last path component.
    stamp = (
        r'S(\d{8})-(\d{4})(?:\.[AP]M)?'
        r'-E(\d{8})-(\d{4})(?:\.[AP]M)?[^\\/]*$'
    )
    parts = paths.str.extract(stamp, expand=True)

    unmatched = parts.isna().any(axis=1)
    if unmatched.any():
        raise ValueError(
            'no nc start/end stamp in: {0}'.format(paths[unmatched].tolist())
        )

    # Explicit format instead of letting pandas guess from a bare digit string.
    fmt = '%Y%m%d%H%M'
    df = pd.DataFrame({
        'date_start': pd.to_datetime(parts[0] + parts[1], format=fmt),
        'date_end': pd.to_datetime(parts[2] + parts[3], format=fmt),
        'name': paths,
    })
    return df.set_index('date_start').tz_localize('UTC')

In [69]:
# Display the parsed frame for all nc files.
nclist2dataframe(nc_files)

Unnamed: 0_level_0,date_end,name
date_start,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-07 11:30:00+00:00,2018-01-08 11:30:00,D:\My Projects\gpm2thredds\trunk\test\test_to_...
2018-01-07 23:30:00+00:00,2018-01-08 11:30:00,D:\My Projects\gpm2thredds\trunk\test\test_to_...
2018-01-08 03:30:00+00:00,2018-01-08 11:30:00,D:\My Projects\gpm2thredds\trunk\test\test_to_...
2018-01-08 09:30:00+00:00,2018-01-08 11:30:00,D:\My Projects\gpm2thredds\trunk\test\test_to_...
2018-01-10 07:30:00+00:00,2018-01-11 07:30:00,D:\My Projects\gpm2thredds\trunk\test\test_to_...
2018-01-10 19:30:00+00:00,2018-01-11 07:30:00,D:\My Projects\gpm2thredds\trunk\test\test_to_...
2018-01-10 23:30:00+00:00,2018-01-11 07:30:00,D:\My Projects\gpm2thredds\trunk\test\test_to_...
2018-01-11 05:30:00+00:00,2018-01-11 07:30:00,D:\My Projects\gpm2thredds\trunk\test\test_to_...
