# Data Collection

This notebook collects the data for a binary classifier that separates posts from the Language Technology subreddit (r/LanguageTechnology) from posts in the Neuro-Linguistic Programming subreddit (r/NLP).


In [None]:
import time
import requests
import random
import numpy as np
import pandas as pd

Important fields: title, selftext and subreddit.

## Scraping Function

In [None]:
def crawl(url, subreddit, agent='Redditor 2.0', cycles=1):
    """Fetch up to `cycles` consecutive pages from a Reddit JSON listing.

    The pagination cursor is read from, and written back to, the module-level
    `after_store` dict under the key `subreddit`, so a later call with the
    same key continues where the previous call stopped.

    Parameters
    ----------
    url : str
        Listing endpoint. It may already contain a query string.
    subreddit : str
        Key into `after_store`; the key must already exist.
    agent : str
        Value sent as the User-agent request header.
    cycles : int
        Maximum number of pages to request in this call.

    Returns
    -------
    list of dict
        The `data` payload of every child in the pages fetched by this call.
        Fewer than `cycles` pages are returned if a request does not answer
        with status 200 or if the listing runs out of pages.
    """
    after = after_store[subreddit]
    posts = []

    for _ in range(cycles):
        if after is None:
            current_url = url
        else:
            # An endpoint such as '.../top.json?t=all&limit=25' already has a
            # query string; a second '?' would glue the cursor onto the last
            # parameter instead of passing it as its own `after` parameter.
            separator = '&' if '?' in url else '?'
            current_url = url + separator + 'after=' + after

        res = requests.get(current_url, headers={'User-agent': agent})
        if res.status_code != 200:
            print('Status error', res.status_code, current_url)
            break
        print('Succesful: ' + current_url)

        listing = res.json()['data']
        posts.extend(child['data'] for child in listing['children'])
        after = listing['after']
        after_store[subreddit] = after

        if after is None:
            # No further page: carrying on would request the first page again
            # and only add duplicates. The stored cursor is now None, so the
            # next call for this key starts over from the first page.
            print('Reached end of listing: ' + current_url)
            break

        # Random pause between requests.
        sleep_duration = random.randint(1, 30)
        print('sleep duration: ' + str(sleep_duration))
        time.sleep(sleep_duration)

    return posts

In [None]:
def check_posts(posts_list):
    """Print how many posts were collected and how many distinct `name` values they have."""
    total = len(posts_list)
    distinct_names = {post['name'] for post in posts_list}
    print(f'Currently at {total} posts.')
    print(f'{len(distinct_names)} unique posts by name.')

Create some agent names to cycle through.

In [None]:
# Pool of User-agent strings for the crawl calls to choose from.
# Redefine this list as needed before later crawls.
agents = [
    'Redditor 1.0',
    'Redditor 2.0',
    'Redditor 5.0',
]

### Storing After Keys

Use a dictionary to store the reddit 'after' keys so that the crawls can be broken into different sessions.

In [None]:
# One pagination cursor per crawl target. None means crawl() requests the
# first page; crawl() overwrites the entry after every page it fetches.
after_store = dict.fromkeys(('lt', 'nlp'))

## Scrape the LanguageTechnology Subreddit

In [None]:
# JSON listing endpoint for r/LanguageTechnology; passed to crawl() for the 'lt' cursor.
url = 'https://www.reddit.com/r/LanguageTechnology.json'

In [None]:
# Accumulates every post dict returned by the r/LanguageTechnology crawls.
lt_posts = list()

Get first 50 posts (25 posts per cycle)

In [None]:
# Request 2 pages for the 'lt' cursor with crawl's default agent and append them.
lt_posts += crawl(url, 'lt', cycles=2)

Get 250 more lt posts.

In [None]:
# Request 10 further pages for the 'lt' cursor with crawl's default agent and append them.
lt_posts += crawl(url, 'lt', cycles=10)

Get 500 more lt posts.

In [None]:
# Request 20 further pages for the 'lt' cursor, sending a randomly chosen agent string.
lt_posts += crawl(url, 'lt', cycles=20, agent=np.random.choice(agents))

Get 500 more lt posts.

In [None]:
# Another 20 pages for the 'lt' cursor, again with a randomly chosen agent string.
lt_posts += crawl(url, 'lt', cycles=20, agent=np.random.choice(agents))

In [None]:
check_posts(lt_posts)

Starting to get duplicates...

Try another 100 posts.

In [None]:
# Request 4 more pages for the 'lt' cursor with a randomly chosen agent string.
lt_posts += crawl(url, 'lt', cycles=4, agent=np.random.choice(agents))

In [None]:
check_posts(lt_posts)

Waiting one day does not increase the maximum number of unique posts.
Day 2: try a different endpoint type.

In [None]:
# Separate pagination cursor for the /top listing; None makes crawl() start at the first page.
after_store['lt_top'] = None

In [None]:
# /top listing of r/LanguageTechnology with t=all and limit=25.
# NOTE: this URL already carries a query string, so any pagination parameter
# has to be joined with '&' rather than '?'.
url_top = 'https://www.reddit.com/r/LanguageTechnology/top.json?t=all&limit=25'

In [None]:
# Accumulates the post dicts returned by the /top listing crawl.
lt_top_posts = list()

In [None]:
# Request 4 pages of the /top listing for the 'lt_top' cursor with a randomly chosen agent string.
lt_top_posts += crawl(url_top, 'lt_top', cycles=4, agent=np.random.choice(agents))

In [None]:
check_posts(lt_top_posts)

In [None]:
# Separate pagination cursor for the /controversial listing; None makes crawl() start at the first page.
after_store['lt_contr'] = None

In [None]:
# /controversial listing of r/LanguageTechnology with t=all and limit=25.
# NOTE: this URL already carries a query string, so any pagination parameter
# has to be joined with '&' rather than '?'.
url_contr = 'https://www.reddit.com/r/LanguageTechnology/controversial.json?t=all&limit=25'

In [None]:
# Accumulates the post dicts returned by the /controversial listing crawl.
lt_contr_posts = list()

In [None]:
# Request 4 pages of the /controversial listing for the 'lt_contr' cursor with a randomly chosen agent string.
lt_contr_posts += crawl(url_contr, 'lt_contr', cycles=4, agent=np.random.choice(agents))

In [None]:
check_posts(lt_contr_posts)

Tried different endpoints on different days. Different endpoints do not yield many more posts.

### Inspect Collected Data in Pandas

In [None]:
# One row per collected r/LanguageTechnology post dict.
df = pd.DataFrame(data=lt_posts)

In [None]:
df.head()

In [None]:
# Character count of each post body. The vectorised .str accessor yields NaN
# for a missing selftext instead of raising the way len() would.
df['textlen'] = df['selftext'].str.len()

Check for Dupes

In [None]:
df.shape

In [None]:
df[df['selftext']!=''].shape

In [None]:
df.drop_duplicates('name').shape

In [None]:
df.drop_duplicates('title').shape

In [None]:
# Export disabled: uncomment to write the LanguageTechnology frame to CSV.
# NOTE(review): unlike the other exports in this notebook, this call omits
# index=False, so the row index would be written as an extra column.
# df.to_csv('data/scraped/lt_data_1_160621.csv')

## Scrape the Neuro-Linguistic Programming Subreddit

In [None]:
# JSON listing endpoint for r/NLP; passed to crawl() for the 'nlp' cursor.
url2 = 'https://www.reddit.com/r/NLP.json'

In [None]:
after_store

In [None]:
# Accumulates every post dict returned by the r/NLP crawls.
nlp_posts = list()

Get first 100 NLP posts.

In [None]:
# Request 4 pages for the 'nlp' cursor with a randomly chosen agent string and append them.
nlp_posts += crawl(url2, 'nlp', cycles=4, agent=np.random.choice(agents))

Get next 500 NLP posts.

In [None]:
# Request 20 further pages for the 'nlp' cursor with a randomly chosen agent string.
nlp_posts += crawl(url2, 'nlp', cycles=20, agent=np.random.choice(agents))

Get next 500 NLP posts.

In [None]:
# Another 20 pages for the 'nlp' cursor, again with a randomly chosen agent string.
nlp_posts += crawl(url2, 'nlp', cycles=20, agent=np.random.choice(agents))

In [None]:
check_posts(nlp_posts)

Starting to face dupes, try for 200 more posts.

In [None]:
# Request 8 more pages for the 'nlp' cursor with a randomly chosen agent string.
nlp_posts += crawl(url2, 'nlp', cycles=8, agent=np.random.choice(agents))

### Inspect Collected NLP Data in Pandas

In [None]:
# One row per collected r/NLP post dict.
df2 = pd.DataFrame(data=nlp_posts)

In [None]:
df2.head()

In [None]:
df2.shape

In [None]:
df2[df2['selftext']!=''].shape

In [None]:
df2[df2['selftext']!=''][['selftext', 'title', 'url']]

In [None]:
df2.drop_duplicates('name').shape

In [None]:
df2.drop_duplicates('title').shape

In [None]:
# Export disabled: uncomment to write the r/NLP frame to CSV without the row index.
# df2.to_csv('data/scraped/ud_nl_data_1_160621.csv', index=False)

## Get More Reddit Posts 
For verifying the model with adjacent topics.

In [None]:
# Pagination cursor for the r/deeplearning crawl; None makes crawl() start at the first page.
after_store['dl'] = None

In [None]:
# JSON listing endpoint for r/deeplearning; passed to crawl() for the 'dl' cursor.
url_dl = 'https://www.reddit.com/r/deeplearning.json'

### Get DeepLearning Posts
Related topic to Language Technology.

In [None]:
# Accumulates every post dict returned by the r/deeplearning crawls.
dl_posts = list()

In [None]:
# Request 4 pages (about 100 posts) for the 'dl' cursor with a randomly chosen agent string.
dl_posts += crawl(url_dl, 'dl', cycles=4, agent=np.random.choice(agents))

In [None]:
# Request 4 further pages (about 100 posts) for the 'dl' cursor with a randomly chosen agent string.
dl_posts += crawl(url_dl, 'dl', cycles=4, agent=np.random.choice(agents))

In [None]:
# Request 12 further pages (about 300 posts) for the 'dl' cursor with a randomly chosen agent string.
dl_posts += crawl(url_dl, 'dl', cycles=12, agent=np.random.choice(agents))

Save to df.

In [None]:
# One row per collected r/deeplearning post dict.
dl_df = pd.DataFrame(data=dl_posts)

In [None]:
# Export disabled: uncomment to write the r/deeplearning frame to CSV without the row index.
# dl_df.to_csv('data/scraped/ud_dl_data_1_160621.csv', index=False)

### Get Hypnosis Posts 
Related topic to Neuro-Linguistic Programming.

In [None]:
# Pagination cursor for the r/hypnosis crawl; None makes crawl() start at the first page.
after_store['hy'] = None

In [None]:
# JSON listing endpoint for r/hypnosis; passed to crawl() for the 'hy' cursor.
url_hy = 'https://www.reddit.com/r/hypnosis.json'

In [None]:
# Accumulates every post dict returned by the r/hypnosis crawls.
hy_posts = list()

In [None]:
# Request 8 pages (about 200 posts) for the 'hy' cursor with a randomly chosen agent string.
hy_posts += crawl(url_hy, 'hy', cycles=8, agent=np.random.choice(agents))

In [None]:
check_posts(hy_posts)

In [None]:
# Second batch: 8 further pages (about 200 posts) for the 'hy' cursor, random agent string.
hy_posts += crawl(url_hy, 'hy', cycles=8, agent=np.random.choice(agents))

In [None]:
# Third batch: 8 further pages (about 200 posts) for the 'hy' cursor, random agent string.
hy_posts += crawl(url_hy, 'hy', cycles=8, agent=np.random.choice(agents))

In [None]:
# Fourth batch: 8 further pages (about 200 posts) for the 'hy' cursor, random agent string.
hy_posts += crawl(url_hy, 'hy', cycles=8, agent=np.random.choice(agents))

In [None]:
# Fifth batch: 8 further pages (about 200 posts) for the 'hy' cursor, random agent string.
hy_posts += crawl(url_hy, 'hy', cycles=8, agent=np.random.choice(agents))

In [None]:
check_posts(hy_posts)

Save to df.

In [None]:
# One row per collected r/hypnosis post dict.
hy_df = pd.DataFrame(data=hy_posts)

In [None]:
hy_df.head()

In [None]:
# Export disabled: uncomment to write the r/hypnosis frame to CSV without the row index.
# hy_df.to_csv('data/scraped/ud_hy_data_1_160621.csv', index=False)