In [2]:
import os
import time

import pandas as pd
import requests

In [10]:
# Pushshift submission-search endpoint; get_posts and get_total_number send their requests here.
url = "https://api.pushshift.io/reddit/search/submission"

In [11]:
def get_posts(subreddit, last = None, size = 100, timeout = 30):
    """
    Fetch one page of posts (100 by default) from the given subreddit.

    If the last created_utc is given, it is sent as the ``before`` parameter,
    so the returned posts correspond to the timestamps before that.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to query.
    last : int, optional
        ``created_utc`` cutoff; only posts created before it are requested.
    size : int, default 100
        Number of posts to request.
    timeout : float, default 30
        Seconds to wait for the server before giving up on the request.

    Returns
    -------
    tuple
        ``(df, last)`` where ``df`` contains the fetched posts sorted from
        newest to oldest (columns: subreddit, title, selftext, created_utc,
        created_at, domain) and ``last`` is the ``created_utc`` of the oldest
        fetched post as a plain ``int``.
        ``(None, None)`` if the request fails, the status code is not 200,
        no posts are returned, or the payload cannot be processed.
    """

    # parameters given to requests
    params = {
        'subreddit': subreddit,
        'size': size,
    }

    if last is not None:
        params['before'] = last

    # A timeout keeps a stalled connection from blocking forever; network
    # errors follow the same (None, None) convention as every other failure.
    try:
        res = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as e:
        print(f'Request failed: {e}')
        return None, None

    if res.status_code != 200:
        print(f'Status Code: {res.status_code}')
        return None, None

    try:
        posts = res.json()['data']

        # An empty page means there is nothing (more) to fetch; report it
        # explicitly instead of failing on the missing 'created_utc' column.
        if not posts:
            print(f'No posts returned for subreddit "{subreddit}"')
            return None, None

        df = pd.DataFrame(posts).sort_values('created_utc', ascending=False)

        # convert the timestamps to datetime
        df['created_at'] = pd.to_datetime(df['created_utc'],
                                          unit='s',
                                          origin=pd.Timestamp('1970-01-01'),
                                          utc=True)

        columns = ['subreddit', 'title', 'selftext', 'created_utc', 'created_at', 'domain']

        # The frame is sorted newest-first, so the final row is the earliest
        # post. Return it as a scalar int rather than a one-element Series so
        # it can be passed straight back in as `last`.
        last = int(df['created_utc'].iloc[-1])

        return df[columns], last
    except Exception as e:
        # best effort: any malformed payload (bad JSON, missing keys/columns)
        # is reported and treated as a failed page
        print(e)

    return None, None


def get_total_number(subreddit, timeout=30):
    """
    Returns the total number of posts in a subreddit.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to query.
    timeout : float, default 30
        Seconds to wait for the server before giving up on the request.

    Returns
    -------
    The ``total_results`` value from the response metadata, or ``None`` when
    the request fails, the status code is not 200, or the metadata is missing.
    A message explaining the failure is printed in each of those cases.
    """

    # size=0 asks for no posts at all; only the metadata block is of interest
    params = {
        'subreddit': subreddit,
        'size': 0,
        'metadata': 'true'
    }

    try:
        res = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as e:
        print(f'Request failed: {e}')
        return None

    if res.status_code != 200:
        print(f'Status Code: {res.status_code}')
        return None

    try:
        n = res.json()['metadata']['total_results']
    except (KeyError, TypeError, ValueError) as e:
        print(f'Could not read total_results for subreddit "{subreddit}": {e!r}')
        return None

    print(f'Subreddit "{subreddit}" has {n} posts')
    return n


def get_all_posts(subreddit, nmax=None, base_path='.', to_write=True,
                  max_failures=10, pause=15, retry_pause=20):
    """
    Returns all the posts or nmax number of posts from the given subreddit and
    save it in the data folder.

    Posts are fetched page by page (at most 100 per request), walking backwards
    in time: each request asks for posts older than the oldest one seen so far.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to download.
    nmax : int, optional
        Upper bound on the number of posts to fetch. When omitted, the total
        reported by ``get_total_number`` is used.
    base_path : str, default '.'
        The CSV is written to ``{base_path}/data/original/{subreddit}.csv``.
    to_write : bool, default True
        Whether to write the result to disk.
    max_failures : int, default 10
        Number of consecutive failed pages after which the download stops and
        whatever was collected so far is returned.
    pause : float, default 15
        Seconds to wait after a successful page before requesting the next.
    retry_pause : float, default 20
        Seconds to wait after a failed page before retrying.

    Returns
    -------
    pandas.DataFrame
        The de-duplicated posts; an empty frame if nothing could be fetched.

    Raises
    ------
    RuntimeError
        If the total number of posts cannot be determined and ``nmax`` is not
        given, so there is no bound on how much to fetch.
    """

    path_to_write = f'{base_path}/data/original/{subreddit}.csv'
    last = None

    # gets the total number of posts in the subreddit
    n = get_total_number(subreddit)

    # pick maximum number of posts to fetch to be the minimum of the total possible or the given nmax
    if n is None:
        if nmax is None:
            raise RuntimeError(
                f'Could not determine the number of posts in "{subreddit}" and no nmax was given'
            )
        n = nmax
    elif nmax is not None:
        n = min(n, nmax)

    N = 0
    df_list = []
    failures = 0

    while N < n:
        size = min(100, n - N)
        try:
            df, oldest = get_posts(subreddit, last=last, size=size)
        except Exception as e:
            print(e, N)
            df, oldest = None, None

        if oldest is not None:
            df_list.append(df)
            N += df.shape[0]
            # Advance the cursor only on success. Overwriting it with None
            # after a failed page would restart from the newest posts and
            # download the same pages again.
            last = oldest
            failures = 0
            if N < n:
                time.sleep(pause)
        else:
            failures += 1
            if failures >= max_failures:
                # e.g. the API is down, or fewer posts are retrievable than
                # the reported total; stop instead of looping forever
                print(f'Giving up on {subreddit} after {failures} consecutive failed requests ({N} posts fetched)')
                break
            time.sleep(retry_pause)

    if not df_list:
        # pd.concat raises on an empty list; hand back an empty frame instead
        print(f'No posts fetched for {subreddit}')
        return pd.DataFrame()

    DF = pd.concat(df_list).drop_duplicates()
    if to_write:
        # make sure the target folder exists so a long download is not lost at write time
        os.makedirs(os.path.dirname(path_to_write), exist_ok=True)
        DF.to_csv(path_to_write, index=False)
        print(f'Data from {subreddit} subreddit is saved in {path_to_write}')
    return DF

```
# One-off scrape: fetch at most 4000 posts per subreddit. With get_all_posts' default
# to_write=True, each result is also saved as a CSV.
subreddits= ['Meditation', 'yoga']
for subreddit in subreddits:
    DF = get_all_posts(subreddit, nmax=4000)
    
    
# Output recorded from the run:
# Subreddit "Meditation" has 108725 posts
# Data from Meditation subreddit is saved in ./data/original/Meditation.csv
# Subreddit "yoga" has 66865 posts
# Data from yoga subreddit is saved in ./data/original/yoga.csv
```

In [44]:
subreddits = ['Meditation', 'Yoga']

# Read each subreddit's scraped CSV and stack them into one frame.
df_list = [pd.read_csv(f'../data/original/{name}.csv') for name in subreddits]
DF = pd.concat(df_list)
print('Original Shape: ', DF.shape)

# Drop posts whose body was removed, then posts that have no body at all.
DF = DF[DF['selftext'] != '[removed]']
print('Shape after dropping [removed] posts: ', DF.shape)
DF = DF[DF['selftext'].notnull()]
print('Shape after dropping null posts: ', DF.shape)

# Keep only rows whose domain starts with "self.".
is_self_post = DF['domain'].str.startswith('self.')
DF = DF[is_self_post]
print('Shape after dropping non-reddit posts: ', DF.shape)

DF = DF[['title', 'selftext', 'created_at', 'subreddit']]

# Rows per subreddit; the smallest count caps how many rows can be drawn from each.
counts_by_subreddit = DF.groupby('subreddit').count()
display(counts_by_subreddit)
min_count = counts_by_subreddit.min()['title']
print(f'maximum number of rows to select from both: {min_count}')

Original Shape:  (7998, 6)
Shape after dropping [removed] posts:  (6815, 6)
Shape after dropping null posts:  (4147, 6)
Shape after dropping non-reddit posts:  (4112, 6)


Unnamed: 0_level_0,title,selftext,created_at
subreddit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Meditation,2632,2632,2632
yoga,1480,1480,1480


maximum number of rows to select from both: 1480


In [93]:
subreddits = ['Meditation', 'Yoga']
# Fixed seed so the balanced sample, and therefore the CSV written below,
# is the same on every run.
RANDOM_STATE = 42
df_list = []

# NOTE(review): the recorded scrape output shows the file saved as `yoga.csv` (lowercase)
# under ./data/original, while this reads `Yoga.csv` under ../data/original. Confirm the
# file name and location; on a case-sensitive filesystem the read would fail.
for subreddit in subreddits:
    path = f'../data/original/{subreddit}.csv'
    df = pd.read_csv(path)
    # keep only posts with a usable body whose domain starts with "self."
    df = df[df['selftext'] != '[removed]']
    df = df[df['selftext'].notnull()]
    df = df[df['domain'].str.startswith('self.')]
    # `min_count` is computed earlier in the notebook (smallest per-subreddit row count);
    # drawing that many rows from each subreddit balances the classes
    df = df.sample(n=min_count, replace=False, random_state=RANDOM_STATE)
    df_list.append(df)

DF = pd.concat(df_list).drop_duplicates()
# title and body joined by a single space (vectorised instead of a row-wise apply)
DF['text'] = DF['title'] + ' ' + DF['selftext']
# '_' is a literal character here, not a pattern
DF['clean_text'] = DF['text'].str.replace('_', ' ', regex=False)
print('Shape: ', DF.shape)
# fresh 0..n-1 index; the old per-file index is not kept
DF = DF.reset_index(drop=True)


DF = DF[['title', 'selftext', 'text', 'clean_text', 'created_at', 'subreddit']]

# show the per-subreddit counts (a bare expression mid-cell is never rendered)
display(DF.groupby('subreddit').count())

path = f"../data/original/{'_'.join(subreddits)}.csv"
print(path)
DF.to_csv(path, index=False)

Shape:  (2960, 8)
../data/original/Meditation_Yoga.csv


In [94]:
# Preview the first rows of DF.
DF.head()

Unnamed: 0,title,selftext,text,clean_text,created_at,subreddit
0,Hey! Does anyone know any guided meditations t...,***1) Look***\n\nBecome aware of what you see:...,Hey! Does anyone know any guided meditations t...,Hey! Does anyone know any guided meditations t...,2021-09-03 20:43:02+00:00,Meditation
1,Large Tuning forks,Went over my buddy's house the other day and h...,Large Tuning forks Went over my buddy's house ...,Large Tuning forks Went over my buddy's house ...,2021-08-10 13:24:16+00:00,Meditation
2,Discomfort in early days,"Hello, beginner meditator here. I'm three days...","Discomfort in early days Hello, beginner medit...","Discomfort in early days Hello, beginner medit...",2021-07-28 21:54:56+00:00,Meditation
3,Just meditated for 10 minutes for the first time,"Today was the first time I meditated, I knew m...",Just meditated for 10 minutes for the first ti...,Just meditated for 10 minutes for the first ti...,2021-08-31 21:58:45+00:00,Meditation
4,Is there a science of visualisation of overcom...,I have two questions. \n\nSo a small bit of ba...,Is there a science of visualisation of overcom...,Is there a science of visualisation of overcom...,2021-08-08 10:08:16+00:00,Meditation


Reference on preserving hashtags and mentions when tokenizing with CountVectorizer: https://stackoverflow.com/questions/54761797/how-to-preserve-hashtag-and-mention-characterizers-from-countvectorizer-token

In [91]:
# Collect, per post, every "#" token together with the characters that follow it up to
# the next whitespace (or end of text), then show the rows ordered by their match lists.
# The recorded output shows this also picks up non-hashtag fragments such as `#x200B;`.
hashtag_matches = DF['text'].str.findall(r'#.*?(?=\s|$)')
hashtag_matches.sort_values()

0                                                      []
1941                                                   []
1942                                                   []
1943                                                   []
1945                                                   []
                              ...                        
756     [#x200B;, #x200B;, #x200B;, #x200B;, #x200B;, ...
2394                                              [#yoga]
1949                                        [#yoganewbie]
2209    [#yogateachertraining, #yoga, #yogateacher, #y...
2693    [#yoga’, #RigVeda](https://www.facebook.com/ha...
Name: text, Length: 2960, dtype: object