# Check downloaded data for completeness
Natalia Vélez, February 2021

In [1]:
import glob
import os, re, requests, urllib
from os.path import join as opj
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup
from tqdm import notebook

## List local files

Walk through data directory and get file sizes:

In [2]:
def gsearch(*args):
    """Return the paths matching the glob pattern formed by joining `args`."""
    return glob.glob(opj(*args))

data_dir = '../../data/'

# Record every local .txt file as (path relative to data_dir, size on disk).
# The relative path is rendered with forward slashes regardless of platform so
# it can be matched against the '/'-separated paths built from the server listing.
downloaded_files = sorted(
    (path_obj.relative_to(data_dir).as_posix(), path_obj.stat().st_size)
    for path_obj in Path(data_dir).rglob('*.txt')
)
print(*downloaded_files[:10], sep='\n')
print('...')

('familyDataLog.txt', 196608)
('foodLogs/foodLog_bigserver1.onehouronelife.com/2019_01January_25_Friday.txt', 851)
('foodLogs/foodLog_bigserver1.onehouronelife.com/2019_01January_26_Saturday.txt', 50503)
('foodLogs/foodLog_bigserver1.onehouronelife.com/2019_01January_27_Sunday.txt', 52075)
('foodLogs/foodLog_bigserver1.onehouronelife.com/2019_01January_28_Monday.txt', 6813)
('foodLogs/foodLog_bigserver1.onehouronelife.com/2019_01January_29_Tuesday.txt', 50932)
('foodLogs/foodLog_bigserver1.onehouronelife.com/2019_01January_30_Wednesday.txt', 2274)
('foodLogs/foodLog_bigserver2.onehouronelife.com/2019_01January_29_Tuesday.txt', 144)
('foodLogs/foodLog_bigserver2.onehouronelife.com/2019_01January_30_Wednesday.txt', 43965)
('foodLogs/foodLog_bigserver2.onehouronelife.com/2019_01January_31_Thursday.txt', 51790)
...


Assemble into dataframe:

In [3]:
# One row per local file: its path relative to the data directory and its size
download_df = pd.DataFrame(
    data=downloaded_files,
    columns=['path', 'local_size'],
)
download_df.head()

Unnamed: 0,path,local_size
0,familyDataLog.txt,196608
1,foodLogs/foodLog_bigserver1.onehouronelife.com...,851
2,foodLogs/foodLog_bigserver1.onehouronelife.com...,50503
3,foodLogs/foodLog_bigserver1.onehouronelife.com...,52075
4,foodLogs/foodLog_bigserver1.onehouronelife.com...,6813


## List server files

Connect to server:

In [4]:
# Root URL of the public data server's directory listing
ohol_url = 'http://publicdata.onehouronelife.com/'

# Directories not to download: any directory whose name contains one of
# these substrings is skipped
exclude_keywords = [
    '..',
    'foodLogs',
    'curseLog',
]

# Server to download from
server = 'bigserver2'

Find all files on server:

In [5]:
# Kept from the original cell; nothing visible in this notebook reads these lists.
f_names = []
f_sizes = []

def server_file_sizes(url, parent='', timeout=30):
    """Recursively list the files under a server directory index, with sizes.

    Fetches the listing page at `url`, extracts its text, and keeps only the
    lines that split into exactly three columns on runs of three or more
    spaces. Column 0 is taken as the entry name and column 2 as its size.
    Entries whose name ends in '/' are directories; they are crawled
    recursively unless the name contains one of `exclude_keywords`.

    Parameters
    ----------
    url : str
        URL of the directory listing to read; entry names are appended to it
        directly, so it is expected to end in '/'.
    parent : str
        Path prefix prepended to every entry name found at this level.
    timeout : float or tuple
        Passed to `requests.get` so that a stalled connection raises instead
        of hanging the crawl indefinitely.

    Returns
    -------
    list of (str, int)
        One `(path, size)` pair per file found, in listing order. The size is
        the integer parsed from the listing (presumably bytes -- it is later
        compared against local file sizes).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If the server does not respond within `timeout`.
    ValueError
        If a file row's size column is not an integer.
    """
    # print('new url: %s' % url) # debug only
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Get all valid lines from file
    soup = BeautifulSoup(response.text, 'html.parser')
    lines = soup.get_text().replace('\r', '').split('\n')
    lines = [re.split(r' {3,}', l) for l in lines]
    lines = [l for l in lines if len(l) == 3]

    paths = []
    for l in lines:
        f = l[0]

        if not f.endswith('/'):
            # Regular file: record its full relative path and listed size
            paths.append((parent + f, int(l[2])))
        elif not any(kwd in f for kwd in exclude_keywords):
            # Parenthesised so the whole path is formatted, rather than
            # relying on '%' binding tighter than '+'
            print('crawl —> %s' % (parent + f))

            # Recursive search
            paths += server_file_sizes(url + f, parent + f, timeout)

    return paths

In [6]:
# Crawl the whole public data server starting from its root listing,
# then preview the first few (path, size) pairs
server_files = server_file_sizes(ohol_url)
print('\n'.join(map(str, server_files[:10])))

crawl —> publicLifeLogData/
crawl —> publicLifeLogData/lifeLog/
crawl —> publicLifeLogData/lifeLog_bigserver1.onehouronelife.com/
crawl —> publicLifeLogData/lifeLog_bigserver2.onehouronelife.com/
crawl —> publicLifeLogData/lifeLog_server1.onehouronelife.com/
crawl —> publicLifeLogData/lifeLog_server10.onehouronelife.com/
crawl —> publicLifeLogData/lifeLog_server11.onehouronelife.com/
crawl —> publicLifeLogData/lifeLog_server12.onehouronelife.com/
crawl —> publicLifeLogData/lifeLog_server13.onehouronelife.com/
crawl —> publicLifeLogData/lifeLog_server14.onehouronelife.com/
crawl —> publicLifeLogData/lifeLog_server15.onehouronelife.com/
crawl —> publicLifeLogData/lifeLog_server2.onehouronelife.com/
crawl —> publicLifeLogData/lifeLog_server3.onehouronelife.com/
crawl —> publicLifeLogData/lifeLog_server4.onehouronelife.com/
crawl —> publicLifeLogData/lifeLog_server5.onehouronelife.com/
crawl —> publicLifeLogData/lifeLog_server6.onehouronelife.com/
crawl —> publicLifeLogData/lifeLog_server7

In [7]:
# One row per server file: its path relative to the server root and its listed size
server_df = pd.DataFrame(
    data=server_files,
    columns=['path', 'server_size'],
)
server_df.head()

Unnamed: 0,path,server_size
0,publicLifeLogData/lifeLog_bigserver1.onehouron...,118050
1,publicLifeLogData/lifeLog_bigserver1.onehouron...,4355
2,publicLifeLogData/lifeLog_bigserver1.onehouron...,3377971
3,publicLifeLogData/lifeLog_bigserver1.onehouron...,146267
4,publicLifeLogData/lifeLog_bigserver1.onehouron...,3559492


## Merge dataframes

Line up file sizes on local directory and server:

In [8]:
# Inner join on path: only files present both locally and in the server listing
# are kept. NOTE(review): files that exist on only one side are dropped here,
# so this comparison cannot reveal files that were never downloaded.
file_comparison = download_df.merge(server_df, how='inner', on='path')
file_comparison.head()

Unnamed: 0,path,local_size,server_size
0,familyDataLog.txt,196608,196608
1,publicLifeLogData/lifeLog_bigserver1.onehouron...,118050,118050
2,publicLifeLogData/lifeLog_bigserver1.onehouron...,4355,4355
3,publicLifeLogData/lifeLog_bigserver1.onehouron...,3377971,3377971
4,publicLifeLogData/lifeLog_bigserver1.onehouron...,146267,146267


Check for discrepancies:

In [9]:
# Keep the rows whose local size differs from the size listed on the server
file_discrepancies = file_comparison.loc[
    file_comparison['local_size'] != file_comparison['server_size']
]
print('%i discrepancies found' % len(file_discrepancies))
print('\n'.join(map(str, file_discrepancies['path'])))

65 discrepancies found
publicLifeLogData/lifeLog_server1.onehouronelife.com/2019_12December_02_Monday.txt
publicLifeLogData/lifeLog_server1.onehouronelife.com/2019_12December_02_Monday_names.txt
publicLifeLogData/lifeLog_server1.onehouronelife.com/2020_01January_07_Tuesday_names.txt
publicLifeLogData/lifeLog_server1.onehouronelife.com/2020_06June_14_Sunday_names.txt
publicLifeLogData/lifeLog_server10.onehouronelife.com/2019_12December_02_Monday.txt
publicLifeLogData/lifeLog_server10.onehouronelife.com/2020_01January_09_Thursday.txt
publicLifeLogData/lifeLog_server10.onehouronelife.com/2020_01January_09_Thursday_names.txt
publicLifeLogData/lifeLog_server10.onehouronelife.com/2020_06June_12_Friday_names.txt
publicLifeLogData/lifeLog_server11.onehouronelife.com/2019_12December_02_Monday.txt
publicLifeLogData/lifeLog_server11.onehouronelife.com/2019_12December_02_Monday_names.txt
publicLifeLogData/lifeLog_server11.onehouronelife.com/2020_01January_09_Thursday.txt
publicLifeLogData/lifeLog_

Remove files with discrepancies for re-download:

In [10]:
# Select the mismatched files that belong to the configured `server` (set in the
# configuration cell) instead of a hard-coded server name, so changing the
# configuration changes what is cleaned up here.
files_to_delete = [
    opj(data_dir, f) for f in file_discrepancies.path if server in f
]
print('Deleting %i files:' % len(files_to_delete))
print(*files_to_delete, sep='\n')

# Remove the local copies so they can be downloaded again
for f in files_to_delete:
    os.remove(f)

Deleting 0 files:

