In [1]:
import pandas as pd
from pathlib import Path
from datetime import datetime, timedelta
from dateutil.parser import parse
from tqdm import tqdm

In [2]:
# Functions
def is_date(string, fuzzy=False):
    """
    Return whether the string can be interpreted as a date.

    The value is handed to ``dateutil.parser.parse``; any of the errors that
    parser documents for unusable input is reported as ``False`` rather than
    being raised to the caller.

    :param string: str, string to check for date
    :param fuzzy: bool, ignore unknown tokens in string if True
    :return: bool, True when ``parse`` accepted the value, False otherwise
    """
    try:
        parse(string, fuzzy=fuzzy)
    except (ValueError, TypeError, OverflowError):
        # ValueError: text is not a recognisable date.
        # TypeError: input is not a string/stream (e.g. None or a float NaN).
        # OverflowError: a numeric field is too large to build a date from.
        return False
    return True
    
def bucket_csvs(src_path, bucket_path):
    """
    Copy every CSV under ``src_path`` into ``bucket_path`` with weekend dates
    moved back to the preceding Friday.

    For each file: rows whose ``Date`` value is rejected by ``is_date`` or
    does not start with a ``YYYY-MM-DD`` token are dropped, Saturday/Sunday
    dates are rewritten as the preceding Friday in ``YYYY-MM-DD`` form, and
    weekday rows keep their original ``Date`` text. The result is written
    under the same file name directly inside ``bucket_path``, so two source
    files with the same name in different sub-folders overwrite each other.

    :param src_path: str or path-like, directory searched recursively for ``*.csv``
    :param bucket_path: str or path-like, output directory (created if missing)
    :raises KeyError: if a CSV has no ``Date`` column
    """
    src_dir = Path(src_path)
    bucket_dir = Path(bucket_path)
    bucket_dir.mkdir(exist_ok=True, parents=True)

    # get file paths
    files = list(src_dir.rglob('*.csv'))
    for file in files:
        print('Processing: {}'.format(file.name))

        # load file
        df = pd.read_csv(file)

        # Collect the new values in a plain list and assign the column back
        # once. Writing to the rows yielded by DataFrame.iterrows() only
        # changes a copy, so the shifted dates would never reach the frame.
        dates = df['Date'].tolist()
        keep = [False] * len(dates)

        for pos, date in enumerate(tqdm(dates, total=len(dates), desc='Rows')):
            # drop rows with no dates
            if not is_date(date):
                continue

            try:
                date_time_obj = datetime.strptime(date.strip().split(' ')[0], '%Y-%m-%d')
            except ValueError:
                # is_date accepts formats other than YYYY-MM-DD; such rows
                # cannot be bucketed here, so drop them instead of aborting.
                continue

            keep[pos] = True

            # weekday(): Saturday == 5, Sunday == 6, so this is 1 or 2 on a
            # weekend and <= 0 from Monday to Friday.
            days_past_friday = date_time_obj.weekday() - 4
            if days_past_friday > 0:
                new_date = date_time_obj - timedelta(days=days_past_friday)
                dates[pos] = new_date.strftime('%Y-%m-%d')

        df['Date'] = dates
        # An index-aligned boolean Series keeps every column even when the
        # frame is empty (a bare empty list would select zero columns).
        df = df[pd.Series(keep, index=df.index, dtype=bool)]

        # save new file
        df.to_csv(Path(bucket_dir, file.name), index=False)

    print('Finished bucketing: {}\n'.format(src_path))
    
def change_date(date):
    """
    Reduce a date value to its ``YYYY-MM-DD`` part, moving weekends to Friday.

    Only the text before the first space is used, so a trailing time of day
    is discarded. Saturdays and Sundays are replaced by the preceding Friday;
    other days are returned as they were written.

    :param date: value to convert, expected to be a string that starts with
        a ``YYYY-MM-DD`` date
    :return: str, the (possibly shifted) date, or the int ``0`` when the
        value is rejected by ``is_date`` or its leading token is not in
        ``YYYY-MM-DD`` form
    """
    # 0 is the sentinel for "no usable date"
    if not is_date(date):
        return 0

    # strip() so that leading whitespace does not leave an empty first token
    day_text = date.strip().split(' ')[0]
    try:
        date_time_obj = datetime.strptime(day_text, '%Y-%m-%d')
    except ValueError:
        # is_date accepts formats other than YYYY-MM-DD; treat those as
        # unusable instead of raising.
        return 0

    # weekday(): Saturday == 5, Sunday == 6, so this is 1 or 2 on a weekend
    # and <= 0 from Monday to Friday.
    days_past_friday = date_time_obj.weekday() - 4
    if days_past_friday > 0:
        new_date = date_time_obj - timedelta(days=days_past_friday)
        return new_date.strftime('%Y-%m-%d')
    return day_text
    
def bucket_csvs2(src_path, bucket_path):
    """
    Rewrite every CSV found under ``src_path`` into ``bucket_path``.

    Each file's ``Date`` column is passed value by value through
    ``change_date``; rows for which it returns ``0`` are removed. The result
    is saved under the same file name directly inside ``bucket_path``.

    :param src_path: str or path-like, directory searched recursively for ``*.csv``
    :param bucket_path: str or path-like, output directory (created if missing)
    """
    source_root = Path(src_path)
    target_root = Path(bucket_path)
    target_root.mkdir(parents=True, exist_ok=True)

    # Take the full listing before any output is written.
    csv_paths = [*source_root.rglob('*.csv')]

    for csv_path in csv_paths:
        print('Processing: {}'.format(csv_path.name))

        frame = pd.read_csv(csv_path)
        frame['Date'] = frame['Date'].map(change_date)

        # change_date marks unusable values with 0
        has_date = frame['Date'] != 0
        frame[has_date].to_csv(target_root / csv_path.name, index=False)

    print('Finished bucketing: {}\n'.format(src_path))

In [3]:
# Run bucketing: the same source -> bucket conversion for each data set
for src_path, bucket_path in (
    ('data/in/news/', 'data/bucket/news/'),
    ('data/in/twitter/', 'data/bucket/twitter/'),
):
    bucket_csvs2(src_path, bucket_path)

Processing: Business Insider.csv
Processing: CNN.csv
Processing: CNN_sentiment.csv
Processing: New York Times.csv
Processing: New York Times_sentiment.csv
Processing: NYT_ALL.csv
Processing: Washington Post.csv
Processing: Washington Post_sentiment.csv
Finished bucketing: data/in/news/

Processing: ArianaGrande.csv
Processing: BarackObama.csv
Processing: BarackObama_sentiment.csv
Processing: britneyspears.csv
Processing: cnnbrk.csv
Processing: cnnbrk_sentiment.csv
Processing: Cristiano.csv
Processing: ddlovato.csv
Processing: instagram.csv
Processing: jimmyfallon.csv
Processing: jtimberlake.csv
Processing: justinbieber.csv
Processing: katyperry.csv
Processing: KimKardashian.csv
Processing: KimKardashian_sentiment.csv
Processing: ladygaga.csv
Processing: rihanna.csv
Processing: selenagomez.csv
Processing: shakira.csv
Processing: taylorswift13.csv
Processing: TheEllenShow.csv
Processing: Trump.csv
Processing: Trump_sentiment.csv
Processing: Twitter.csv
Processing: YouTube.csv
Finished bu