In [37]:
import pandas as pd
from pathlib import Path
from datetime import datetime, timedelta
from dateutil.parser import parse

In [43]:
def is_date(string, fuzzy=False):
    """
    Return whether the string can be interpreted as a date.

    The value is handed to ``dateutil.parser.parse``; it counts as a date
    when the parser accepts it and as a non-date when the parser rejects it.

    :param string: str, string to check for date
    :param fuzzy: bool, ignore unknown tokens in string if True
    :return: bool, True if the parser accepts the value, False otherwise
    """
    try:
        parse(string, fuzzy=fuzzy)
    except (ValueError, TypeError, OverflowError):
        # ValueError: the parser does not recognise the format.
        # TypeError: the value is not a string / character stream.
        # OverflowError: a numeric field is too large for the platform's
        # C integer, which the parser reports separately from ValueError.
        return False
    return True

In [44]:
# bucket news first
news_dir = Path('data/in/news/')
bucket_news_dir = Path('data/bucket/news/')
bucket_news_dir.mkdir(exist_ok=True, parents=True)


def _bucket_news_date(date):
    """
    Map one raw 'Date' cell to the value written to the bucketed file.

    :param date: raw cell value taken from the 'Date' column
    :return: None when the value is not a date (the row is dropped);
        the preceding Friday formatted as 'YYYY-MM-DD' when the date falls
        on a Saturday or Sunday; the original value unchanged otherwise
    """
    if not is_date(date):
        return None

    # Convert with the same parser is_date validates against, so a value
    # that passed the check cannot fail here on a format mismatch.
    parsed = parse(date)

    # weekday(): Monday == 0 ... Friday == 4, Saturday == 5, Sunday == 6
    days_past_friday = parsed.weekday() - 4
    if days_past_friday > 0:
        friday = parsed - timedelta(days=days_past_friday)
        return friday.strftime('%Y-%m-%d')
    return date


# get file paths
files = list(news_dir.rglob('*.csv'))
for file in files:
    # load file
    news_raw = pd.read_csv(file)

    # Build the new column separately instead of assigning to the rows that
    # iterrows() yields: those rows are copies, so writes to them never
    # reach the frame that gets saved.
    bucketed_dates = news_raw['Date'].map(_bucket_news_date)

    # drop rows with no dates, replace the rest with their bucketed date
    has_date = bucketed_dates.notna()
    news_bucketed = news_raw.loc[has_date].assign(Date=bucketed_dates[has_date])

    # save new file
    # NOTE(review): rglob() also descends into sub-directories, but only
    # file.name is used here, so two inputs with the same file name would
    # overwrite each other - confirm the input tree has unique names.
    news_bucketed.to_csv(Path(bucket_news_dir, file.name), index=False)

In [None]:
# bucket tweets