# Data Collection and Uploading to AWS
In this notebook, I collect Reddit posts with the Pushshift API and upload them to an AWS S3 bucket.
Collect data with the Pushshift API and upload the data to an AWS S3 bucket.

In [6]:
import pandas as pd
from pmaw import PushshiftAPI
from scrape_reddit import *

# Module-level Pushshift client; scrape() below reads it as a global.
api = PushshiftAPI()

In [3]:
def scrape(subreddit, limit, before_date, after_date):
    '''
    Scrape submissions of one subreddit through Pushshift and save them as CSV.

    Input:
        subreddit(str): the subreddit name, e.g. 'personalfinance'
        limit(int): the maximum number of submissions to retrieve
        before_date(str): end of the time window, format '2021-02-01'
        after_date(str): start of the time window, format '2020-12-01'
    Output: A csv file named '<subreddit>_<after_date>_<before_date>.csv',
        written relative to the current working directory
    Return: Pandas dataframe of scraped result
    '''
    # Imported locally so the function does not depend on `dt` leaking into
    # the namespace through the star import at the top of the notebook.
    import datetime as dt

    # The API is queried with integer epoch seconds. strptime() yields naive
    # datetimes, which .timestamp() interprets in the machine's local timezone.
    before = int(dt.datetime.strptime(before_date, '%Y-%m-%d').timestamp())
    after = int(dt.datetime.strptime(after_date, '%Y-%m-%d').timestamp())

    submissions = api.search_submissions(subreddit=subreddit,
                                         limit=limit,
                                         before=before,
                                         after=after)

    print(f'Retrieved {len(submissions)} submissions from Pushshift')

    f = f'{subreddit}_{after_date}_{before_date}.csv'
    submissions_df = pd.DataFrame(submissions)
    submissions_df.to_csv(f, header=True, index=False)

    # The docstring has always promised the dataframe; actually hand it back
    # so callers do not have to re-read the CSV.
    return submissions_df

In [4]:
# Run this only once to scrape the data -- it takes a long time to run
# scrape(subreddit='personalfinance', limit=10000000, before_date='2021-01-01', after_date='2020-01-01')

## Upload Files to S3
In this part, I use boto3 to create a bucket named `large-scale-computing-personal-finance` and upload files to it. (Though I found that manually uploading files is very convenient, too.)

In [14]:
import boto3
import os

In [15]:
# S3 client object; the cells below call it to create the bucket, upload
# files and list objects.
s3 = boto3.client('s3')
# S3 resource object; not referenced by any of the cells visible here.
s3_resource = boto3.resource('s3')

In [9]:
# Create the project bucket; the API response of the call is kept in `bucket`.
# NOTE(review): no region configuration is passed -- presumably this relies on
# the default us-east-1 region; confirm before running in another region.
bucket = s3.create_bucket(Bucket='large-scale-computing-personal-finance')

In [11]:
# Check that the bucket is there: fetch the bucket listing through the client
# and print its 'Buckets' entry.
bucket_response = s3.list_buckets()
buckets = bucket_response['Buckets']
print(buckets)

[{'Name': 'aws-emr-resources-580185021087-us-east-1', 'CreationDate': datetime.datetime(2021, 5, 12, 12, 17, 32, tzinfo=tzutc())}, {'Name': 'aws-logs-580185021087-us-east-1', 'CreationDate': datetime.datetime(2021, 5, 12, 12, 17, 32, tzinfo=tzutc())}, {'Name': 'large-scale-computing-personal-finance', 'CreationDate': datetime.datetime(2021, 6, 2, 13, 52, 56, tzinfo=tzutc())}]


In [13]:
directory = 'data/flair'

# Upload every CSV file found in `directory` to the bucket. The object key is
# the bare file name, so the local folder structure is not mirrored in S3.
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        key = filename
        # Build the path from `directory` (not a second hard-coded copy of the
        # folder name) so listing and uploading always use the same location.
        local_path = os.path.join(directory, filename)
        s3.upload_file(Filename=local_path,
                       Bucket='large-scale-computing-personal-finance',
                       Key=key)

In [16]:
# List up to 20 objects in the bucket and report how many came back.
response = s3.list_objects(
            Bucket='large-scale-computing-personal-finance',
            MaxKeys=20)
# S3 leaves the 'Contents' key out of the response when the bucket holds no
# objects, so fall back to an empty list instead of raising KeyError.
print(len(response.get('Contents', [])))

20


In [17]:
# Display the 'Contents' entry of the list_objects response fetched above.
response['Contents']

[{'Key': 'Auto.csv',
  'LastModified': datetime.datetime(2021, 6, 2, 15, 4, 48, tzinfo=tzutc()),
  'ETag': '"81177abeff9f859ef267a19a836f87f6"',
  'Size': 7675148,
  'StorageClass': 'STANDARD',
  'Owner': {'DisplayName': 'awslabsc0w2127751t1616889910',
   'ID': '787d08c84da3b2a4683006801a9d870fccb9d403324a6b5b6c6b37e8687c7895'}},
 {'Key': 'Budgeting.csv',
  'LastModified': datetime.datetime(2021, 6, 2, 15, 5, 21, tzinfo=tzutc()),
  'ETag': '"1afd4fb7c6df0387e6f7272255a68913"',
  'Size': 4315060,
  'StorageClass': 'STANDARD',
  'Owner': {'DisplayName': 'awslabsc0w2127751t1616889910',
   'ID': '787d08c84da3b2a4683006801a9d870fccb9d403324a6b5b6c6b37e8687c7895'}},
 {'Key': 'Credit.csv',
  'LastModified': datetime.datetime(2021, 6, 2, 15, 5, 40, tzinfo=tzutc()),
  'ETag': '"3a181cf743e2753fac1dd85fd5f5b042-2"',
  'Size': 12214040,
  'StorageClass': 'STANDARD',
  'Owner': {'DisplayName': 'awslabsc0w2127751t1616889910',
   'ID': '787d08c84da3b2a4683006801a9d870fccb9d403324a6b5b6c6b37e8687c789