## import

In [1]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
from time import sleep
import json, os


## instantiate webdriver

In [2]:
## instantiate driver: `webdriver.Chrome()` creates the Selenium Chrome driver used by the cells below
## check the version of Google Chrome and download the matching version of chromedriver
driver = webdriver.Chrome()

In [3]:
## open the "social grep" search page, which lists old posts of a subreddit
## original reddit url = 'https://www.reddit.com/r/xxxxxxxxx/'

subreddit = 'languagetechnology' # choose by yourself
start_date = '2010-01-01' # choose by yourself

## the query is "/r/<subreddit>,after:<start_date>" in percent-encoded form
## (%2F is "/", %2C is ",", %3A is ":"), ordered oldest first
url = (
    'https://socialgrep.com/search'
    f'?query=%2Fr%2F{subreddit}%2Cafter%3A{start_date}'
    '&order_by=oldest'
)
driver.get(url)

repeat_time = 4
waiting_time = 2

## scroll to the bottom `repeat_time` times, pausing `waiting_time` seconds after each scroll
scroll_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"
for i in range(repeat_time):
    driver.execute_script(scroll_to_bottom)
    sleep(waiting_time)

## example of one post

In [4]:
## function to scrape
def get_content(post, subreddit):
    """Extract vote, title, text and date from one scraped post card.

    Parameters
    ----------
    post : parsed HTML element
        One element matched by ``div.card-body``. It must support
        ``select_one`` and expose its first ``<a>`` tag as ``post.a``.
    subreddit : str
        Subreddit name; used (lower-cased) to recognise the card whose
        title is just ``/r/<subreddit>``.

    Returns
    -------
    dict or None
        ``{"vote", "title", "text", "date"}`` for a usable post.
        ``None`` when the card has no link (no title), when no date can be
        read from its subtitle, or when it has no text and its title is
        ``/r/<subreddit>``.
    """
    ## vote: fall back to 0 when the counter is missing or not a number
    vote_tag = post.select_one('span.text-info')
    try:
        vote = int(vote_tag.text)
    except (AttributeError, TypeError, ValueError):
        vote = 0

    ## title: a card without a link is not a post
    link = getattr(post, 'a', None)
    if link is None:
        return None
    title = link.text

    ## text: missing or blank body is stored as None
    body = post.select_one('div.post_content')
    text = body.get_text(separator='\n').strip() if body is not None else None
    if not text:
        text = None

    ## date: second comma-separated field of the subtitle.
    ## A card without a readable date is skipped instead of raising,
    ## so one odd card cannot abort a whole scraping run.
    subtitle = post.select_one('h6.card-subtitle')
    subtitle_parts = subtitle.text.split(',') if subtitle is not None else []
    if len(subtitle_parts) < 2:
        return None
    date = subtitle_parts[1].strip()
    if not date:
        return None

    ## skip the card that only names the subreddit itself
    if text is None and title == f"/r/{subreddit.lower()}":
        return None

    return {
        "vote" : vote,
        "title" : title,
        "text" : text,
        "date" : date
    }

In [5]:
## parse the HTML of the page currently loaded in the browser
soup = BeautifulSoup(driver.page_source)
posts = soup.select('div.card-body') # content is under here

## the second card (index 1) is used as the example
get_content(posts[1], subreddit) # show one example

{'vote': 1, 'title': 'Tokenization', 'text': None, 'date': '2010-03-10'}

## Scrape in a loop, restarting each search from the last scraped date

In [6]:
## Resume from a previous run when `<subreddit>.json` already holds posts;
## otherwise start with an empty list and keep the current `url`.
scraped_data = []
if os.path.exists(f'{subreddit}.json'):
    ## explicit UTF-8: the file is dumped with ensure_ascii=False, so it can
    ## contain non-ASCII text that the locale default encoding may not decode
    with open(f'{subreddit}.json', 'r', encoding='utf-8') as f:
        scraped_data = json.load(f)

if scraped_data:
    ## continue from the date of the last saved post
    ## (an existing but empty file has no last post, so `url` is left as it is)
    new_date = scraped_data[-1]['date']
    url = f'https://socialgrep.com/search?query=%2Fr%2F{subreddit}%2Cafter%3A{new_date}&order_by=oldest'

In [7]:
## scrape and append to `scraped_data`
## RUN THIS CELL AGAIN AND AGAIN until getting the latest post

## Fingerprints of the posts already stored. Every new search starts at the
## date of the last stored post, so consecutive pages overlap; the fingerprints
## keep the overlap from being appended a second time.
seen_posts = {json.dumps(p, sort_keys=True, ensure_ascii=False) for p in scraped_data}

for _ in tqdm(range(10)): # set repeat time

    ## load the search page, then scroll to the bottom and wait
    driver.get(url)
    for i in range(4):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        sleep(2)

    ## get HTML
    soup = BeautifulSoup(driver.page_source)
    posts = soup.select('div.card-body')

    ## iterate each post, keeping only posts that are not stored yet
    added = 0
    for post in posts:
        one_post_dict = get_content(post, subreddit)
        if one_post_dict is None:
            continue
        fingerprint = json.dumps(one_post_dict, sort_keys=True, ensure_ascii=False)
        if fingerprint in seen_posts:
            continue
        seen_posts.add(fingerprint)
        scraped_data.append(one_post_dict)
        added += 1

    ## Nothing new on this page: requesting the same url again would return the
    ## same posts, so stop instead of repeating the request. This also covers
    ## the case where nothing has been scraped at all (no last date to read).
    if added == 0:
        print('No new posts on this page - stopping.')
        break

    ## save to json (UTF-8 explicitly, because ensure_ascii=False writes raw non-ASCII characters)
    with open(f'{subreddit}.json', 'w', encoding='utf-8') as f:
        json.dump(scraped_data, f, indent=0, ensure_ascii=False)

    ## set new date
    new_date = scraped_data[-1]['date']
    url = f'https://socialgrep.com/search?query=%2Fr%2F{subreddit}%2Cafter%3A{new_date}&order_by=oldest'


100%|██████████| 10/10 [02:31<00:00, 15.14s/it]


## Load into a DataFrame and drop duplicates

In [8]:
## read the saved posts back from `<subreddit>.json`, then discard rows that are exact duplicates
posts_raw = pd.read_json(f'{subreddit}.json')
df = posts_raw.drop_duplicates()
df

Unnamed: 0,vote,title,text,date
0,1,Example based machine translation,,2010-03-10
1,1,Tokenization,,2010-03-10
2,1,Maximum Entropy,,2010-03-13
3,1,Parts-Of-Speech Tagging,,2010-03-13
4,1,Interlingual Machine Translation,,2010-10-07
...,...,...,...,...
6971,5,Sentiment analysis using the Sine Waves,"I have created a sentiment analysis, which all...",2023-04-04
6972,2,Integrate NLP into a web-application,My company has a SaaS-based web app. We intend...,2023-04-04
6973,6,Title: AI voice bot app: Impersonating Andrew ...,[removed],2023-04-04
6974,2,Mitigate the context size limit in LLMs with s...,The following idea seems like an obvious thing...,2023-04-04


In [9]:
## count missing values per column
## (`get_content` stores text as None when a post has no body)
df.isna().sum()

vote       0
title      0
text     735
date       0
dtype: int64

In [10]:
## rows whose text is exactly one of the placeholders [removed] / [deleted]
placeholder_texts = ['[removed]', '[deleted]']
has_placeholder = df['text'].isin(placeholder_texts)
df[has_placeholder]

Unnamed: 0,vote,title,text,date
9,1,Looking for good paraphrase corpus (not Micros...,[removed],2011-12-06
14,1,Looking for a comparison of probabilistic lang...,[removed],2012-02-14
16,1,If you own or know Jurafsky and Martin's book ...,[removed],2012-04-21
19,1,SemGraph - library for reading and visualising...,[removed],2012-06-15
26,2,Clustering Jeopardy! questions together with P...,[deleted],2012-11-12
...,...,...,...,...
6948,1,Starting a career in Speech AI,[deleted],2023-03-25
6958,0,Learning prompt engineering for someone non-te...,[deleted],2023-03-30
6966,13,"How does ""next word prediction"" sample the nex...",[deleted],2023-04-01
6970,1,LANGUAGE LEARNERS: How is it going!,[deleted],2023-04-03
