# Data Collection

This notebook steps through the process of gathering post data from Reddit.

### Headers

In [3]:
# Import required headers
import requests
import time
import pandas as pd

### Functions

`get_reddit` will get just over 1000 posts from the given sub, and return a list of the json data

`write_reddit` will step through the json, pull the relevant data out, store it in a dataframe, and save that dataframe to a CSV file.

In [5]:
# get the list of posts from the reddit subs
def get_reddit(url, n_requests=50, delay=2):
    """Page through a subreddit's JSON listing and collect the raw posts.

    Each request passes the ``after`` token returned by the previous page so
    that successive requests walk forward through the listing.

    Parameters
    ----------
    url : str
        Listing endpoint, e.g. ``https://www.reddit.com/r/soccer.json``.
    n_requests : int, optional
        Maximum number of pages to request (default 50).
    delay : int or float, optional
        Seconds to sleep between consecutive requests (default 2).

    Returns
    -------
    list
        The ``children`` entries of every page fetched, in request order.
        If a request fails or the listing runs out early, the posts gathered
        up to that point are returned.
    """
    # Accumulates the post records from every page
    posts = []

    # Pagination token; None means "start from the first page"
    after = None

    # Create a custom header
    headers = {'User-agent': 'Elliott 1.0'}

    for i in range(n_requests):
        # Report progress every fifth request
        if i % 5 == 0:
            print(f'{len(posts)} posts have been collected.')

        params = {} if after is None else {'after': after}

        res = requests.get(url, params=params, headers=headers)
        if res.status_code != 200:
            # Give up on the first failed request but keep what we have
            print(f'Error! Status code: {res.status_code}')
            break

        listing = res.json()['data']
        posts.extend(listing['children'])
        after = listing['after']

        # No token means there is no next page; another request without
        # 'after' would start over from the first page and add duplicates.
        if after is None:
            break

        # Pause between requests
        time.sleep(delay)

    return posts

In [6]:
# write the data into a dataframe and then into a csv file

def write_reddit(posts, csvfilename):
    """Flatten raw post records into a table and save it as a CSV file.

    Parameters
    ----------
    posts : list
        Post records as returned by ``get_reddit``; each item must expose
        ``['data']`` with the keys ``title``, ``selftext``, ``author`` and
        ``subreddit_name_prefixed``.
    csvfilename : str
        Path of the CSV file to write.

    Returns
    -------
    None
        The result is written to ``csvfilename``; the CSV has the columns
        ``title``, ``text``, ``author`` and ``sub`` plus the dataframe index.
    """
    # Build one row (document) per post. The corpus starts empty: a seeded
    # empty row would turn into an all-blank record in the dataframe and
    # would make construction fail outright when there are no posts.
    corpus = [
        [
            post['data']['title'],
            post['data']['selftext'],
            post['data']['author'],
            post['data']['subreddit_name_prefixed'],
        ]
        for post in posts
    ]

    # Convert to a dataframe to allow easy CSV writing
    df = pd.DataFrame(corpus, columns=['title', 'text', 'author', 'sub'])

    # Change the sub to something shorter.
    # Subs that are not listed here end up as NaN.
    df['sub'] = df['sub'].map(
              {'r/motorcycles':'mc',
               'r/MLS': 'mls',
               'r/soccer':'fb',
               'r/SoundersFC':'ssfc',
               'r/TalesFromRetail':'tfr',
               'r/TalesFromYourServer':'tfys',
               'r/bartenders':'bar'})

    # Drop all the duplicates
    df = df.drop_duplicates()

    # Save that file off to disk!
    df.to_csv(csvfilename)

    return None

## Main

Step through the list of subreddits I want to use. Grab the posts and then shove them into a dataframe.

In [5]:
# Short code for each sub -> (subreddit name used in the URL, CSV file to write).
# Every output file lives in the same ../datasets/ directory.
sub_targets = {
    'mc': ('motorcycles', '../datasets/motorcycles.csv'),
    'fb': ('soccer', '../datasets/soccer.csv'),
    'mls': ('mls', '../datasets/mls.csv'),
    'ssfc': ('SoundersFC', '../datasets/sounders.csv'),
    'tfr': ('TalesFromRetail', '../datasets/tfr.csv'),
    'tfys': ('TalesFromYourServer', '../datasets/tfys.csv'),
    'bar': ('bartenders', '../datasets/bar.csv'),
}

# A list of the subs I'm going to hit
sub_list = ['mc', 'fb', 'mls', 'ssfc', 'tfr', 'tfys', 'bar']

for sub in sub_list:
    # An unknown code raises KeyError here rather than silently being
    # treated as some other subreddit.
    subreddit, csvfilename = sub_targets[sub]
    url = f'https://www.reddit.com/r/{subreddit}.json'

    # Get the data from reddit
    posts = get_reddit(url)

    # Write the data to csv
    write_reddit(posts, csvfilename)