# Creating a Stress Detection Tool using Data From Mental Health Subreddits: Data Wrangling

#### Import necessary libraries

In [1]:
import pandas as pd
import numpy as np

#reddit crawler
import praw
from praw.models import MoreComments

#preprocessing
import string

#for saving
import pickle

#### Set up client

In [2]:
r = praw.Reddit(user_agent = '',
                client_id = '',
                client_secret = '',
                check_for_async=False)

#### Create the subreddit list that I want to get data from
* I have mostly chosen mental health  related subreddits
* I added in a few positivity-based subreddits to balance out as well

In [3]:
sr_list = ['adhd', 'affirmations', 'anger', 'anxiety', 'depression', 'mentalillness', 'mindfulness', 'socialanxiety']

#### Get URLs from top reddit posts from each subreddit
* I am creating the first for loop to get the top posts from each subreddit from the API
* The second for loop is to get the comments from each of those posts
* I am then appending it to the list, then converting to a dataframe

In [4]:
posts = []

for sr in sr_list:
    subreddit = r.subreddit(sr)
    
    for post in subreddit.hot(limit=100):
        
        post.comments.replace_more(limit=500)
        for comment in post.comments.list():
             posts.append([post.subreddit, comment.body])

In [5]:
posts = pd.DataFrame(posts)

#### Checking out the data
* .shape to make sure I have enough data
* .head and .tail to make sure everything looks right

In [6]:
posts.shape

(9546, 2)

In [7]:
posts.head(5)

Unnamed: 0,0,1
0,ADHD,"I finally cleaned my entire living space, and ..."
1,ADHD,I found this subreddit
2,ADHD,Made it through the first week of spring term!...
3,ADHD,I finally got my Adderall prescription yesterd...
4,ADHD,I finally got my info together to send to my a...


In [8]:
posts.tail(5)

Unnamed: 0,0,1
9541,socialanxiety,"true, how ever,, everyone is living in their o..."
9542,socialanxiety,"You actually don’t know it, you’re just using ..."
9543,socialanxiety,"lol i get this as well. most of the time, they..."
9544,socialanxiety,For me it's more like I'm scared I'll do somet...
9545,socialanxiety,Being able to use logic I think is a way of kn...


#### Fixing the column names

In [9]:
posts.columns = ['subreddit', 'text']

In [10]:
posts.head(5)

Unnamed: 0,subreddit,text
0,ADHD,"I finally cleaned my entire living space, and ..."
1,ADHD,I found this subreddit
2,ADHD,Made it through the first week of spring term!...
3,ADHD,I finally got my Adderall prescription yesterd...
4,ADHD,I finally got my info together to send to my a...


#### Some preprocessing
* I know I want two versions of my text data available, one with capitilization and punctuation and one without so I am adding the preprocessed text to a new column

In [11]:
def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

posts['text_preproc'] = posts['text'].apply(remove_punctuations)

In [12]:
posts['text_preproc'] = posts['text_preproc'].str.lower()

#### Checking things out one more time

In [13]:
posts.head(5)

Unnamed: 0,subreddit,text,text_preproc
0,ADHD,"I finally cleaned my entire living space, and ...",i finally cleaned my entire living space and d...
1,ADHD,I found this subreddit,i found this subreddit
2,ADHD,Made it through the first week of spring term!...,made it through the first week of spring term ...
3,ADHD,I finally got my Adderall prescription yesterd...,i finally got my adderall prescription yesterd...
4,ADHD,I finally got my info together to send to my a...,i finally got my info together to send to my a...


In [14]:
posts.tail(5)

Unnamed: 0,subreddit,text,text_preproc
9541,socialanxiety,"true, how ever,, everyone is living in their o...",true how ever everyone is living in their own ...
9542,socialanxiety,"You actually don’t know it, you’re just using ...",you actually don’t know it you’re just using l...
9543,socialanxiety,"lol i get this as well. most of the time, they...",lol i get this as well most of the time they a...
9544,socialanxiety,For me it's more like I'm scared I'll do somet...,for me its more like im scared ill do somethin...
9545,socialanxiety,Being able to use logic I think is a way of kn...,being able to use logic i think is a way of kn...


#### Rename 'posts' to 'df'

In [15]:
df = posts

#### Save to pickle

In [16]:
df.to_pickle('df.pickle')