In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from tqdm import tqdm
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from datetime import datetime

In [None]:
class RateLimiter:
    """Wrapper around ``requests.get`` that spaces out successive requests.

    Each call to :meth:`get` waits until at least ``frequency`` seconds have
    passed since the previous response was received by this instance.
    """

    def __init__(self, frequency):
        """Create a limiter.

        Args:
            frequency: minimum number of seconds to leave between the end of
                one request and the start of the next (an interval, despite
                the name).
        """
        self.frequency = frequency
        # time.time() value recorded after the previous response; None until
        # the first request has been made.
        self.last_time = None

    def get(self, url:str) -> requests.models.Response:
        """Fetch ``url`` with ``requests.get``, sleeping first if needed.

        Args:
            url: address to request.

        Returns:
            The response object returned by ``requests.get``.
        """
        if self.last_time is not None:
            remaining = self.frequency - (time.time() - self.last_time)
            # time.sleep() raises ValueError for a negative duration, which
            # is what we would pass whenever the caller already spent longer
            # than the interval between two requests (e.g. parsing a page).
            if remaining > 0:
                time.sleep(remaining)
        response = requests.get(url)
        self.last_time = time.time()
        return response

In [None]:
def get_load_url(soup):
    """Return the ``src`` of the "more posts" loader element in ``soup``.

    Scans every ``faceplate-partial`` element and picks the first one whose
    ``id`` attribute contains ``partial-more-posts`` and which also carries a
    ``src`` attribute. Elements lacking either attribute are skipped.

    Raises:
        Exception: if no such element is present.
    """
    for partial in soup.find_all('faceplate-partial'):
        attributes = partial.attrs
        if 'id' not in attributes:
            continue
        if 'partial-more-posts' not in attributes['id']:
            continue
        if 'src' in attributes:
            return attributes['src']
    raise Exception('No loader found!')

def get_content(link:str, r:RateLimiter) -> str:
    """Download a post page and return its text body.

    Args:
        link: URL of the post page.
        r: rate limiter used to perform the request.

    Returns:
        The stripped text of the first ``div`` with class
        ``text-neutral-content``, or an empty string when the page has no
        such element.
    """
    page = r.get(link)
    soup = BeautifulSoup(page.content, 'html.parser')
    body = soup.find('div', class_='text-neutral-content')
    # find() returns None when nothing matches; without this guard a single
    # page lacking the div would abort the whole scrape with AttributeError.
    if body is None:
        return ''
    return body.text.strip()

def extract_data(entry, r:RateLimiter):
    """Build one record (a dict) from a post element.

    Metadata is read from the element's attributes; the post text is fetched
    from the post's ``content-href`` page via ``get_content``. Attribute
    values are stored as found, except ``created-timestamp`` which is parsed
    with ``datetime.fromisoformat``.

    Args:
        entry: element exposing an ``attrs`` mapping.
        r: rate limiter forwarded to ``get_content``.

    Returns:
        Dict with keys ``idx``, ``title``, ``link``, ``votes``, ``comments``,
        ``created`` and ``content``.
    """
    attributes = entry.attrs
    post_url = attributes['content-href']
    record = {
        'idx': attributes['feedindex'],
        'title': attributes['post-title'],
        'link': post_url,
    }
    record['votes'] = attributes['score']
    record['comments'] = attributes['comment-count']
    record['created'] = datetime.fromisoformat(attributes['created-timestamp'])
    # Fetched last so a missing attribute fails before any request is made.
    record['content'] = get_content(post_url, r)
    return record

def _scrape_page(url, r):
    """Fetch one listing page; return its soup and the records of its posts."""
    page = r.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    posts = soup.find_all('shreddit-post')
    records = [extract_data(post, r)
               for post in tqdm(posts, desc='extracting posts', leave=False, unit='post')]
    return soup, records


def get_posts(sub: str, calls: int = 1):
    """Scrape posts from a subreddit's listing into a DataFrame.

    The first listing page is always requested. Each further page is located
    through the "more posts" loader URL found on the previous page. All
    requests (listing pages and individual posts) share one ``RateLimiter``
    with a one-second interval.

    Args:
        sub: subreddit name, without the ``r/`` prefix.
        calls: total number of listing pages to request.

    Returns:
        ``pandas.DataFrame`` with one row per post, as produced by
        ``extract_data``. If a page has no loader for the next page, scraping
        stops there and the rows collected so far are returned.
    """
    url = f'https://www.reddit.com/r/{sub}/?feedViewType=compactView'
    r = RateLimiter(1)
    print(f'making request 1/{calls}')
    soup, out = _scrape_page(url, r)
    for i in range(1, calls):
        try:
            l = get_load_url(soup)
        except Exception as err:
            # get_load_url signals "no next page" with a bare Exception, so
            # nothing narrower can be caught. Keep what was already scraped
            # rather than discarding minutes of rate-limited requests.
            print(f'stopping after {i}/{calls} requests: {err}')
            break
        print(f'making request {i+1}/{calls}')
        soup, records = _scrape_page(f'https://www.reddit.com{l}', r)
        out.extend(records)
    return pd.DataFrame(out)

In [None]:
# Set RECOVER to True if you already have scraped data from here as CSV
# Set RECOVER to False if you want to scrape new data

RECOVER = True
CSV_PATH = 'r-autism.csv'

if not RECOVER:
    # Scraping is slow (one rate-limited request per post), so persist the
    # result immediately.
    get_posts('autism', 10).to_csv(CSV_PATH, index=False)

# Always load from the CSV, even right after scraping: scraped values are raw
# HTML attribute strings, whereas read_csv infers numeric columns. Going
# through the file gives `df` the same dtypes in both modes.
# keep_default_na=False stops empty or "NA"-like titles/bodies from being
# turned into NaN, which the text-processing cells cannot handle.
df = pd.read_csv(CSV_PATH, keep_default_na=False)
df

In [None]:
sia = SentimentIntensityAnalyzer()

# Add a `<column>_sentiment` column holding the 'compound' score for each
# text column.
for text_column in ('title', 'content'):
    # Text read back from CSV may be NaN (a float) for empty cells; score
    # those rows as empty text instead of failing inside polarity_scores.
    texts = df[text_column].fillna('').astype(str)
    df[f'{text_column}_sentiment'] = texts.map(
        lambda text: sia.polarity_scores(text)['compound'])
df

In [None]:
# Build one corpus from every post body. Missing bodies (NaN after a CSV
# round trip) are dropped because str.join() raises TypeError on non-strings.
corpus = ' '.join(df['content'].dropna().astype(str))
wc = WordCloud(stopwords=STOPWORDS).generate(corpus)

fig, ax = plt.subplots()
ax.imshow(wc, interpolation='bilinear')
ax.axis('off')
plt.show()

In [None]:
# NOTE(review): the first row is excluded from the plot; presumably it is a
# pinned/announcement post. Confirm this is still intended.
plotted = df.iloc[1:]

fig, ax = plt.subplots()
ax.scatter(plotted['comments'], plotted['title_sentiment'])
ax.set_xlabel('comments')
ax.set_ylabel('title sentiment (compound score)')
ax.set_title('Title sentiment vs. number of comments')
plt.show()