In [1]:
import os
import time
import sys
import datetime
from pathlib import Path
from stat import S_ISREG, ST_CTIME, ST_MODE
import requests, re
from bs4 import BeautifulSoup as BS
from pprint import pprint
import pandas as pd

In [17]:
# Work out which NOAA daily-summary files for one year must be downloaded
# (or re-downloaded) by comparing the local directory with the cloud listing.
base_url = 'https://www.ncei.noaa.gov/data/global-summary-of-the-day/access'
year = '1929'

# identify filename and date of each file in the local directory
dir_path = Path.home() / 'data_downloads' / 'noaa_daily_avg_temps' / year
local_list = []
for file_name in os.listdir(dir_path):
    # st_ctime is read as an epoch timestamp and stored as its str() form
    date = os.stat(os.path.join(dir_path, file_name)).st_ctime
    local_list.append(('local', file_name, str(datetime.datetime.fromtimestamp(date))))

# identify filename and date of each file in the cloud
cloud_list = []
file, date = None, None

# Build the listing URL from base_url/year instead of repeating the literal.
# timeout (seconds) keeps a stalled connection from hanging the cell, and
# raise_for_status() stops an error page from being parsed as an empty listing.
response = requests.get(base_url + '/' + year, timeout=30)
response.raise_for_status()
parsed_html = BS(response.content, 'html.parser')
for item in parsed_html('tr'):
    # a row is kept only when it has a link whose text contains '.csv' ...
    href = item('a')
    file = href[0].get_text() if href and '.csv' in href[0].get_text() else None
    # ... and a right-aligned cell whose text starts with a YYYY-MM-DD date
    td = item('td', {'align': 'right'})
    date = td[0].get_text() if td and re.match(r'\d\d\d\d-\d\d-\d\d', td[0].get_text()) else None
    if date and file:
        cloud_list.append(('cloud', file, date))

# TEST only
# add a new tuple to the list to create a difference (for testing)
local_list.append(('local', 'newname.csv', '2020-08-17 21:33:33.544504'))

# convert both lists into dataframes
cloud_df = pd.DataFrame(cloud_list, columns = ['type', 'filename', 'cloud_date'])
local_df = pd.DataFrame(local_list, columns = ['type', 'filename', 'local_date'])

# find filenames that exist on only one side.
# NOTE: keep=False drops every filename that occurs in both frames, so the
# result holds cloud-only files and also local-only ones (e.g. the TEST entry).
diff_df = pd.concat([local_df, cloud_df]).drop_duplicates(subset = ['filename'], keep=False)
print('new files', len(diff_df))

# start file_df as an independent copy of the cloud listing
# (DataFrame.append is not available in pandas >= 2.0)
file_df = cloud_df.copy()
print(file_df)
# add the local date as a new column on file_df where filenames match;
# cloud files without a local copy get a missing value here
file_df['local_date'] = file_df['filename'].map(local_df.set_index('filename')['local_date'])
print(file_df)

list_of_names = file_df['filename'].to_list()

# evaluate if any files in the cloud have changed since they were originally downloaded locally
# (both columns hold date strings, so this is a string comparison)
changes_df = file_df[(file_df['cloud_date'] > file_df['local_date'])]
print('changes to exising', len(changes_df))

# combine changes_df and diff_df to get all files that need to be downloaded, or re-downloaded
download_df = pd.concat([changes_df, diff_df])
print('number to download', len(download_df))

# a top-level `return` is a SyntaxError; leave the set as the cell's last
# expression so the notebook displays it
set(download_df['filename'].to_list())

new files 1
     type         filename          cloud_date
0   cloud  03005099999.csv  2019-01-19 12:37  
1   cloud  03075099999.csv  2019-01-19 12:37  
2   cloud  03091099999.csv  2019-01-19 12:37  
3   cloud  03159099999.csv  2019-01-19 12:37  
4   cloud  03262099999.csv  2019-01-19 12:37  
5   cloud  03311099999.csv  2019-01-19 12:37  
6   cloud  03379099999.csv  2019-01-19 12:37  
7   cloud  03396099999.csv  2019-01-19 12:37  
8   cloud  03497099999.csv  2019-01-19 12:37  
9   cloud  03601099999.csv  2019-01-19 12:37  
10  cloud  03777099999.csv  2019-01-19 12:37  
11  cloud  03795099999.csv  2019-01-19 12:37  
12  cloud  03804099999.csv  2019-01-19 12:37  
13  cloud  03811099999.csv  2019-01-19 12:37  
14  cloud  03856099999.csv  2019-01-19 12:37  
15  cloud  03864099999.csv  2019-01-19 12:37  
16  cloud  03894099999.csv  2019-01-19 12:37  
17  cloud  03953099999.csv  2019-01-19 12:37  
18  cloud  03973099999.csv  2019-01-19 12:37  
19  cloud  03980099999.csv  2019-01-19 12:37  
2

{'newname.csv'}

In [30]:
# Collect one ('local', filename, ctime-as-text) record per entry found in
# the local download directory.
dir_path = Path.home() / 'data_downloads' / 'noaa_daily_avg_temps' / '1929'
local_list = []
for file_name in os.listdir(dir_path):
    # st_ctime is an epoch timestamp; it is stored as the str() of a datetime
    stamp = datetime.datetime.fromtimestamp((dir_path / file_name).stat().st_ctime)
    local_list.append(('local', file_name, str(stamp)))

In [37]:
# identify filename and date of each file in the cloud
cloud_list = []
file, date = None, None
base_url = 'https://www.ncei.noaa.gov/data/global-summary-of-the-day/access'
year = '1929'

# Build the listing URL from base_url/year instead of repeating the literal.
# timeout (seconds) keeps a stalled connection from hanging the cell, and
# raise_for_status() stops an error page from being parsed as an empty listing.
response = requests.get(base_url + '/' + year, timeout=30)
response.raise_for_status()
parsed_html = BS(response.content, 'html.parser')
for item in parsed_html('tr'):
    # a row is kept only when it has a link whose text contains '.csv' ...
    href = item('a')
    file = href[0].get_text() if href and '.csv' in href[0].get_text() else None
    # ... and a right-aligned cell whose text starts with a YYYY-MM-DD date
    td = item('td', {'align': 'right'})
    date = td[0].get_text() if td and re.match(r'\d\d\d\d-\d\d-\d\d', td[0].get_text()) else None
    if date and file:
        cloud_list.append(('cloud', file, date))

In [32]:
# TEST only
# add a new tuple to the list to create a difference (for testing)
test_entry = ('local', 'newname.csv', '2020-08-17 21:33:33.544504')
# Guard against re-running this cell: a second copy of the entry would count
# as a duplicate filename in the later drop_duplicates(keep=False) step and
# vanish from the diff.
if test_entry not in local_list:
    local_list.append(test_entry)

In [33]:
# Turn the two record lists into dataframes that share one column layout.
cloud_df, local_df = (
    pd.DataFrame(records, columns=['type', 'filename', 'date'])
    for records in (cloud_list, local_list)
)

In [34]:
# Stack the local and cloud rows, then keep only rows whose filename occurs
# exactly once overall, i.e. files present on one side but not the other.
combined_df = pd.concat([local_df, cloud_df])
appears_once = ~combined_df.duplicated(subset=['filename'], keep=False)
diff_df = combined_df[appears_once]
len(diff_df)

1

In [18]:
# start file_df as an independent copy of the cloud listing
# (DataFrame.append is not available in pandas >= 2.0)
file_df = cloud_df.copy()
# add the local date as a new column on file_df where filenames match;
# cloud files without a local copy get a missing value here
file_df['local_date'] = file_df['filename'].map(local_df.set_index('filename')['date'])

In [19]:
# display the cloud listing together with the matched local_date column
file_df

Unnamed: 0,type,filename,date,local_date
0,cloud,03005099999.csv,2019-01-19 12:37,2020-08-17 21:33:35.744025
1,cloud,03075099999.csv,2019-01-19 12:37,2020-08-17 21:33:32.744734
2,cloud,03091099999.csv,2019-01-19 12:37,2020-08-17 21:33:32.469024
3,cloud,03159099999.csv,2019-01-19 12:37,2020-08-17 21:33:33.838742
4,cloud,03262099999.csv,2019-01-19 12:37,2020-08-17 21:33:34.383386
5,cloud,03311099999.csv,2019-01-19 12:37,2020-08-17 21:33:31.891959
6,cloud,03379099999.csv,2019-01-19 12:37,2020-08-17 21:33:34.906430
7,cloud,03396099999.csv,2019-01-19 12:37,2020-08-17 21:33:34.115695
8,cloud,03497099999.csv,2019-01-19 12:37,2020-08-17 21:33:36.032973
9,cloud,03601099999.csv,2019-01-19 12:37,2020-08-17 21:33:34.649159


In [12]:
# Select cloud files whose listing date sorts after the date recorded for the
# local copy; both columns hold date strings, so this is a string comparison.
cloud_is_newer = file_df['date'] > file_df['local_date']
changes_df = file_df[cloud_is_newer]
len(changes_df)

0

In [13]:
# combine changes_df and diff_df to get all files that need to be downloaded, or re-downloaded
# (pd.concat replaces DataFrame.append, which is not available in pandas >= 2.0)
download_df = pd.concat([changes_df, diff_df])
print(len(download_df))
download_df

1


Unnamed: 0,type,filename,date,local_date
21,local,newname.csv,2020-08-17 21:33:33.544504,


In [38]:
# Fetch one station file for the configured year; the displayed response
# shows whether the URL resolves.
requests.get('/'.join([base_url, year, '99006199999.csv']))

<Response [200]>