# Data scraping and storing
This notebook is used to scrape data from Reddit, put it in the right format, and save it in our database.

In [1]:
# imports
import pandas as pd
import requests
import time
import os # to check for file existance

## Functions
---


In [2]:
def extract_listings(res, nskip = 0):
    """
    Turn the response of a reddit listing request into a dataframe with one row per listing.
    Args:
        res (request model response): response object whose .json() payload holds the listings under ['data']['children']
        nskip (int): accepted for compatibility with callers; it is not used by this function
    Return:
        dataframe: one row per listing with the columns ['text', 'title', 'listingid', 'created', 'url', 'media']
    """
    # fields read from each listing's 'data' dict, in the same order as the output columns below
    source_keys = ['selftext', 'title', 'id', 'created', 'url', 'media']
    column_names = ['text', 'title', 'listingid', 'created', 'url', 'media']

    # keep only the fields we need from each listing
    rows = [
        [child['data'][key] for key in source_keys]
        for child in res.json()['data']['children']
    ]

    return pd.DataFrame(rows, columns=column_names)
    

## Data scraping scripts
---

In [3]:
# read personal information (reddit API credentials) from a local text file
# presumably each line looks like `name=value`; only the part after '=' is kept - verify against my_data.txt
# `with` guarantees the file is closed even if reading fails
with open('./../personal/my_data.txt', 'r') as file1:
    Lines = file1.readlines()

# Keep everything after the FIRST '=' so that values which themselves contain '='
# are not truncated, and drop the newline character.
# Lines without '=' (e.g. blank lines) are skipped instead of raising IndexError.
personal = [line.partition('=')[2].replace('\n', '') for line in Lines if '=' in line]

# values are positional: the order of the lines in the file must match the order below
client_id = personal[0] #alphanumeric string provided under "personal use script"
client_secret = personal[1] #alphanumeric string provided as "secret"
user_agent = personal[2] #the name of your application
username =  personal[3] #your reddit username
password =  personal[4] #your reddit password

In [4]:
# HTTP basic authentication built from the application's client id and secret
auth = requests.auth.HTTPBasicAuth(client_id, client_secret)

# form fields sent along with the token request: password grant with the user's login
data = dict(grant_type='password', username=username, password=password)

In [5]:
#create an informative header for your application
# NOTE(review): `user_agent` read from my_data.txt is not used here - confirm this literal is intentional
headers = {'User-Agent': 'massiproj3/0.0.1'}

# ask reddit for an access token using the credentials prepared in `auth` and `data`
res = requests.post(
    'https://www.reddit.com/api/v1/access_token',
    auth=auth,
    data=data,
    headers=headers,
    timeout=30)  # seconds; without a timeout the call can wait forever on an unresponsive server

print(res)

<Response [200]>


In [6]:
#retrieve access token
token = res.json()['access_token']
# add access token to the header file
headers['Authorization'] = f'bearer {token}'

# sanity check: an authorised request for our own account should answer with status 200
# timeout (seconds) keeps the cell from hanging forever on an unresponsive server
requests.get('https://oauth.reddit.com/api/v1/me', headers=headers, timeout=30).status_code == 200

True

In this block of code, we take a list of the subreddits we are interested in. If no data has been collected from a subreddit yet, we create a new .csv file for it and start collecting data. If we already have some data from that subreddit, the new data is appended to what we already have. It is recommended to run this block every 2-3 days so that new posts are fetched and recorded.
Subreddits with similarities (based on users posting in subreddits, source: https://anvaka.github.io/sayit/?query=dating_advice):
- offmychest, askreddit, nostupidquestions, dating_advice, relationship_advice
- legaladvice, nostupidquestions
- dating_advice, relationship_advice 
- investing, wallstreetbets (just titles, no text as a lot of pics exist)
- politics

In [7]:


subreddits_to_check = ['offmychest', 'trueoffmychest', 'askreddit', 'nostupidquestions', 
                       'dating_advice', 'relationship_advice',
                       'legaladvice', 
                       'politics', 
                       'investing', 'wallstreetbets']
subredits_total_recs = []
req_per_day = 9 # number of requests for each subreddit every time we run this script
initial_requests = 10 # number of requests used to build the first .csv file of a subreddit (100 records each)
request_timeout = 30 # seconds to wait for reddit before giving up instead of hanging forever


def _fetch_listings(subreddit, n_requests):
    """
    Page through the /new listing of a subreddit and return everything that was read as one dataframe.
    Uses the module-level `headers` (with the access token) and `request_timeout`.
    Args:
        subreddit (str): name of the subreddit, used to build the request url
        n_requests (int): maximum number of requests to make (up to 100 records each); at least one request is always made
    Return:
        dataframe: the concatenated output of extract_listings() for every page that was read (repeats are NOT dropped here)
    """
    url = 'https://oauth.reddit.com/r/'+subreddit+'/new'
    pages = []
    params = {'limit': 100}   # the first request has no 'after', so it starts from the newest post
    for count in range(1, max(n_requests, 1) + 1):
        res = requests.get(url, 
                           headers=headers,
                           params=params,
                           timeout=request_timeout)
        print(f'request.get() status for the {count}"s call for subreddit {subreddit} is {res.status_code}')
        pages.append(extract_listings(res, nskip = 0))
        bottom_of_listing = res.json()['data']['after']
        if bottom_of_listing is None:
            # there is no next page. requests leaves out parameters whose value is None, so asking again
            # would only download the first page one more time
            break
        params = {'limit': 100, 
                  'after': bottom_of_listing   # this will read 100 records under the previous batch
                  }
    return pd.concat(pages, axis=0)


# This section collects the 1000 available records for a given subreddit
# Note that this block of code will run just one time to initially retrieve 1000 records 
# for each of the subreddits in the list and create a .csv file for it. 
# this does not run for subreddits that already have a .csv file associated to them. instead, the 
# next block of the code will run for those. 

for subreddit in subreddits_to_check:
    path = './../dataset/'+ subreddit+'.csv'
    if not os.path.isfile(path):    # continue only if the file does not exist
        print(f'++> extracting initial information for subreddit {subreddit}')
        df = _fetch_listings(subreddit, initial_requests)

        print(f'df shape before dropping the repeats {df.shape}')
        df = df.drop_duplicates(subset='listingid', keep='first')
        print(f'df shape after dropping the repeats {df.shape}')
        df.to_csv(path, index=False)
    else:
        print(f'++> no initial data extraction for subreddit {subreddit} as related .csv file exists in the folder')


# this section of the code is intended to run every 2-3 days to retrieve new information for the given list of 
# the subreddits of interest. the new information will be appended to the list of the .csv files we currently have 
# and will be saved into the appropriate .csv file

for subreddit in subreddits_to_check:   
    path = './../dataset/'+ subreddit+'.csv'
    df = pd.read_csv(path) # get the current information we have

    print(f'==> extracting daily information for subreddit {subreddit}')
    temp = _fetch_listings(subreddit, req_per_day)
    temp = temp.drop_duplicates(subset='listingid', keep='first')

    # stored records come first, so keep='first' keeps the stored copy of a listing that was fetched again
    old_size = df.shape[0]
    df = pd.concat([df, temp], axis=0) 
    df = df.drop_duplicates(subset='listingid', keep='first')

    print(f'a total of {df.shape[0]-old_size} new listings was added to {subreddit} subreddit')
    print(f'{subreddit}.csv file has {df.shape[0]} records now')
    subredits_total_recs.append(df.shape[0])

    df.to_csv(path, index=False)

print(f'=====> total number of records collected so far <=====')
# one (subreddit, number of records) pair per line
for subreddit_total in zip(subreddits_to_check, subredits_total_recs):
    print(subreddit_total)


++> no initial data extraction for subreddit offmychest as related .csv file exists in the folder
++> no initial data extraction for subreddit trueoffmychest as related .csv file exists in the folder
++> no initial data extraction for subreddit askreddit as related .csv file exists in the folder
++> no initial data extraction for subreddit nostupidquestions as related .csv file exists in the folder
++> no initial data extraction for subreddit dating_advice as related .csv file exists in the folder
++> no initial data extraction for subreddit relationship_advice as related .csv file exists in the folder
++> no initial data extraction for subreddit legaladvice as related .csv file exists in the folder
++> no initial data extraction for subreddit politics as related .csv file exists in the folder
++> no initial data extraction for subreddit investing as related .csv file exists in the folder
++> no initial data extraction for subreddit wallstreetbets as related .csv file exists in the fol