- scrape data from https://socialgrep.com/ (how to search : https://socialgrep.com/search )
- the site returns older posts of a subreddit that the official Reddit API does not return
- e.g. https://socialgrep.com/search?query=%2Fr%2FLanguageTechnology%2Cbefore%3A2023-11-01&order_by=newest
- it means : posts of subreddit `LanguageTechnology` until `2023-11-01 23:59`
- since this site uses JavaScript to show the search result, `Selenium` is necessary to scrape data
- original subreddit URL is https://www.reddit.com/r/LanguageTechnology/

## import

In [1]:
from selenium import webdriver
from bs4 import BeautifulSoup
from datetime import timedelta
import pandas as pd
from tqdm import tqdm
from time import sleep
import json, os

## function to get the day before
def get_day_before(date:str) -> str: # 2020-11-01 -> 2020-10-31
    """Return the day before `date`, formatted like the date part of a pandas timestamp."""
    previous_day = pd.to_datetime(date) - timedelta(days=1)
    # the string form of the timestamp is "<date> <time>"; keep only what precedes the first space
    date_part, _, _ = str(previous_day).partition(' ')
    return date_part

## instantiate webdriver

- check the version of Google Chrome
- download the matching version of chromedriver and put it into the same directory

In [2]:
## instantiate the Selenium driver for Google Chrome
## (needs a chromedriver that matches the installed Google Chrome version)
driver = webdriver.Chrome()

In [3]:
## search URL that is opened, e.g.
##   https://socialgrep.com/search?query=%2Fr%2FNLP%2Cbefore%3A2023-11-01&order_by=newest
## the subreddit itself lives at https://www.reddit.com/r/NLP/

subreddit = 'NLP'  # subreddit to scrape -- choose by yourself
start_date = '2023-11-01'  # value of the `before:` filter -- choose by yourself
url = f'https://socialgrep.com/search?query=%2Fr%2F{subreddit}%2Cbefore%3A{start_date}&order_by=newest'

driver.get(url)
repeat_time = 3   # how many times the page is scrolled
waiting_time = 2  # seconds to pause after each scroll

## scroll to the bottom of the page `repeat_time` times, pausing after every scroll
for i in range(repeat_time):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    sleep(waiting_time)

## example of one post

In [4]:
## function to scrape one post
def get_content(post, subreddit):
    """Extract vote, title, text and date from one search-result card.

    Parameters
    ----------
    post : element for one `div.card-body`; it must offer `select_one(css)`
        and the `.a` shortcut (as BeautifulSoup tags do).
    subreddit : name of the scraped subreddit, used to recognise the card
        whose only content is the link `/r/<subreddit>`.

    Returns
    -------
    dict with the keys "vote", "title", "text" and "date", or None when the
    card is not a usable post (no link, no parsable date, or only the
    subreddit link without any text).
    """
    ## vote count: fall back to 0 when the element is missing or not a number
    vote_tag = post.select_one('span.text-info')
    try:
        vote = int(vote_tag.text)
    except (AttributeError, ValueError):
        vote = 0

    ## title: a card without a link is not a post
    link = post.a
    if link is None:
        return None
    title = link.text

    ## body text: a missing or empty body is stored as None
    body = post.select_one('div.post_content')
    text = None
    if body is not None:
        text = body.get_text(separator='\n').strip() or None

    ## date: second comma-separated field of the subtitle.
    ## A card without it is skipped instead of aborting the whole scrape,
    ## because the caller needs the date of every stored post.
    subtitle = post.select_one('h6.card-subtitle')
    try:
        date = subtitle.text.split(',')[1].strip()
    except (AttributeError, IndexError):
        return None

    ## a card that only links to the subreddit itself carries no post
    if text is None and title == f"/r/{subreddit.lower()}":
        return None

    return {
        "vote" : vote,
        "title" : title,
        "text" : text,
        "date" : date
    }

In [5]:
## parse the HTML that the browser currently shows
soup = BeautifulSoup(driver.page_source)
posts = soup.select('div.card-body') # every `div.card-body` is handed to get_content as one post
get_content(posts[1], subreddit) # show the second card as an example

{'vote': 2,
 'title': 'Fast phobia cure audio',
 'text': 'I am looking for a fast phobia cure audio for purchase. could someone point me in the right direction.\n\n\nI can see many demonstrations on YouTube but needed an audio for personal use.\n\n\nalso when looking to purchase on various it seems they deal with specific issue eg fear of spiders etc. I am looking for a generic fast phobia cure audio. \n\n\nthanks',
 'date': '2023-06-08'}

# for loop with datetime

In [6]:
## resume scraping from the last date stored in the json file
## if the file does not exist, create a new empty list : `scraped_data`
if os.path.exists(f'../data/{subreddit}.json'):
    ## the file is written as UTF-8 with ensure_ascii=False, so it must be read back
    ## as UTF-8 too (the platform default encoding is not always UTF-8)
    with open(f'../data/{subreddit}.json', 'r', encoding='utf8') as f: # read scraped json file
        scraped_data = json.load(f)
    ## an existing but empty list has no last date: keep the current `start_date` and `url`
    if scraped_data:
        start_date = get_day_before(scraped_data[-1]['date'])
        url = f'https://socialgrep.com/search?query=%2Fr%2F{subreddit}%2Cbefore%3A{start_date}&order_by=newest'
else:
    scraped_data = []

In [7]:
## scrape result pages one after another and append every post to `scraped_data`
## RUN THIS CELL AGAIN AND AGAIN until no older posts are returned
## (each page is requested with `before:` set to the day before the last stored post)

for _ in tqdm(range(100)): # upper bound on the number of pages fetched per run

    ## open the result page, then scroll to the bottom and wait
    driver.get(url)
    for i in range(3):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        sleep(1)

    ## get HTML
    soup = BeautifulSoup(driver.page_source)

    ## get posts and keep every card that get_content accepts
    posts = soup.select('div.card-body')
    new_posts = []
    for post in posts:
        one_post_dict = get_content(post, subreddit)
        if one_post_dict is not None:
            new_posts.append(one_post_dict)

    ## no new post on this page: the last stored date (and therefore the next URL)
    ## would not change, so stop instead of requesting the same page again.
    ## This also covers the case where nothing has been scraped yet.
    if not new_posts:
        break
    scraped_data.extend(new_posts)

    ## save scraped data to json (indent=0 : one item per line, no leading spaces)
    with open(f'../data/{subreddit}.json', 'w', encoding='utf8') as f:
        json.dump(scraped_data, f, indent=0, ensure_ascii=False)

    ## set new date
    start_date = get_day_before(scraped_data[-1]['date'])
    url = f'https://socialgrep.com/search?query=%2Fr%2F{subreddit}%2Cbefore%3A{start_date}&order_by=newest'


100%|██████████| 100/100 [07:16<00:00,  4.36s/it]


## to dataframe and drop duplicate

In [8]:
## load the scraped json file into a dataframe, then remove rows that are exact duplicates
df = pd.read_json(f'../data/{subreddit}.json')
df = df.drop_duplicates()
df

Unnamed: 0,vote,title,text,date
0,1,Modelling. Part #4 of the REAL history of NLP,,2023-10-23
1,2,Fast phobia cure audio,I am looking for a fast phobia cure audio for ...,2023-06-08
2,1,HashiCorp Vault + NLP (How To Approach?),[removed],2023-06-08
3,6,New trying to figure out if NLP will help me,I am somewhat familiar with NLP. I had a frien...,2023-06-06
4,1,Discord Server,Anyone know what happened to the NLP Discord s...,2023-06-06
...,...,...,...,...
1926,5,Let's argue: I think NLP is bullshit.,fists up,2010-12-30
1927,10,Neuro-Linguistic Programming: Recommended reading,Below follows a list of books I know have been...,2010-09-24
1928,1,Salad Ltd: A series of videos as a taster to NLP,,2010-09-21
1929,1,Timeline therapy works!,,2010-07-03


In [9]:
## number of missing values in each column (posts without body text have a missing `text`)
df.isna().sum()

vote       0
title      0
text     891
date       0
dtype: int64

In [10]:
## show the rows whose `text` is exactly the placeholder '[removed]' or '[deleted]'
df[df['text'].isin(['[removed]', '[deleted]'])]

Unnamed: 0,vote,title,text,date
2,1,HashiCorp Vault + NLP (How To Approach?),[removed],2023-06-08
5,1,[OC] 🍷 Wine grouped by tasting notes (using NL...,[deleted],2023-06-05
7,1,Marketplace for datasets,[removed],2023-05-31
8,1,Introducing Interactive Natural Language Proce...,[removed],2023-05-30
11,1,What are my reasons never wanting to use the f...,[removed],2023-05-27
...,...,...,...,...
1911,2,/r/nlp meetup in London anyone?,[deleted],2012-03-10
1912,1,Break up,[deleted],2011-12-12
1917,1,I'm doing a NLP Practitioner course next week....,[removed],2011-09-03
1923,2,Is there NLP in this interview?,[deleted],2011-04-24


In [11]:
## keep only the posts that have real body text and save them as `xxx_withtext.json`
has_text = df['text'].notna()
is_placeholder = df['text'].isin(['[removed]', '[deleted]'])
df_dropped = df[has_text & ~is_placeholder]
df_dropped.to_json(f'../data/{subreddit}_withtext.json', orient='records', indent=True)
df_dropped

Unnamed: 0,vote,title,text,date
1,2,Fast phobia cure audio,I am looking for a fast phobia cure audio for ...,2023-06-08
3,6,New trying to figure out if NLP will help me,I am somewhat familiar with NLP. I had a frien...,2023-06-06
4,1,Discord Server,Anyone know what happened to the NLP Discord s...,2023-06-06
6,5,Techniques for vicarious embarrassment,I am an NLP newbie - my experience is limited ...,2023-06-01
10,2,Is there anyone who is familiar with ACT-R mod...,If you’re familiar with ACT-r please direct me...,2023-05-28
...,...,...,...,...
1919,1,"self-made reframing technique - your critique,...","OK, this is for when you know someone has lied...",2011-08-03
1920,1,Should I subscribe? Too soon?,I have been recently introduced to the world o...,2011-07-12
1922,2,Using Python for Morphological Parsing?,My research project this summer is to attempt ...,2011-05-25
1926,5,Let's argue: I think NLP is bullshit.,fists up,2010-12-30
