In [1]:
import praw
from os import path
import wordcloud
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import time
import datetime
import csv
import requests
import json

In [2]:
# Column schemas shared by the CSV helpers in this notebook.

# Only the first entry ("author") is written as the header of the author-list files.
fieldsForAuthor = [
    "author",
    "created_utc",
]

# Keys looked up on every downloaded submission; also the header row of the posts CSVs.
fields = [
    "author",
    "brand_safe",
    "contest_mode",
    "created_utc",
    "full_link",
    "id",
    "is_self",
    "num_comments",
    "over_18",
    "retrieved_on",
    "score",
    "selftext",
    "subreddit",
    "subreddit_id",
    "title",
]

# Only the first entry ("author") is written as the header of the common-author files.
fieldsForCommonAuthor = [
    "author",
    "generalIssues_created_utc",
    "suicideWatch_created_utc",
]


In [3]:
def wordCloud(filename, maskFilename="mentalhealth.png"):
    """Draw a word cloud shaped and coloured by a mask image.

    Parameters
    ----------
    filename : str
        Path of the UTF-8 text file whose contents feed the word cloud.
    maskFilename : str, optional
        Path of the image used both as the shape mask and as the colour
        source. Defaults to "mentalhealth.png", the value that used to be
        hard-coded.

    Returns
    -------
    None
        The figure is displayed through matplotlib; progress is printed.
    """
    print("Creating Word Cloud for Mental Health")
    # Context managers close the text file and the image file even when
    # reading or decoding fails (both handles used to be left open).
    with open(filename, "r", encoding="utf-8") as textFile:
        mentalhealth = textFile.read()
    with Image.open(maskFilename) as maskFile:
        mentalhealthImage = np.array(maskFile)

    stopwords = set(wordcloud.STOPWORDS)
    # This adds only the literal token "english" to the stop list; it does
    # not load any extra word list beyond the STOPWORDS copied above.
    stopwords.add("english")
    wc = wordcloud.WordCloud(background_color="white", max_words=2000, mask=mentalhealthImage, max_font_size=40,
                             stopwords=stopwords, random_state=42)
    wc.generate(mentalhealth)
    print("Done generating words!")
    # Recolour the words with the colours of the mask image itself.
    mental_health_colors = wordcloud.ImageColorGenerator(mentalhealthImage)
    plt.imshow(wc.recolor(color_func=mental_health_colors), interpolation="bilinear")
    plt.axis("off")
    print("Created Word Cloud for Mental Health")
    plt.show()



In [4]:
def wordCloud2(filename, maskFilename="suicide.png"):
    """Draw the Suicide Watch word cloud, shaped and coloured by a mask image.

    Parameters
    ----------
    filename : str
        Path of the UTF-8 text file whose contents feed the word cloud.
    maskFilename : str, optional
        Path of the image used both as the shape mask and as the colour
        source. Defaults to "suicide.png", the value that used to be
        hard-coded.

    Returns
    -------
    None
        The figure is displayed through matplotlib; progress is printed.
    """
    print("Creating Word Cloud for Suicide Watch")
    # Context managers close the text file and the image file even when
    # reading or decoding fails (both handles used to be left open).
    with open(filename, "r", encoding="utf-8") as textFile:
        suicidewatch = textFile.read()
    with Image.open(maskFilename) as maskFile:
        suicidewatchImage = np.array(maskFile)

    stopwords = set(wordcloud.STOPWORDS)
    # This adds only the literal token "english" to the stop list; it does
    # not load any extra word list beyond the STOPWORDS copied above.
    stopwords.add("english")
    wc = wordcloud.WordCloud(background_color="white", max_words=2000, mask=suicidewatchImage, max_font_size=40,
                             stopwords=stopwords, random_state=42)
    wc.generate(suicidewatch)
    # Recolour the words with the colours of the mask image itself.
    suicide_colors = wordcloud.ImageColorGenerator(suicidewatchImage)
    plt.imshow(wc.recolor(color_func=suicide_colors), interpolation="bilinear")
    plt.axis("off")
    print("Created Word Cloud for Suicide Watch")
    plt.show()

In [19]:
def extractMentalHealthCSV(start, end, filename='generalIssuesTS2.csv', days=60, subreddits=None):
    """Append day-by-day Pushshift submission dumps to a CSV file.

    For each of ``days`` consecutive iterations the window is moved forward
    by one day *before* it is queried, so the first window downloaded is
    ``[start + 1 day, end + 1 day]``. Each returned submission becomes one
    row holding the values of the module-level ``fields`` keys (missing keys
    are written as empty cells).

    Parameters
    ----------
    start, end : datetime.date
        Bounds of the window preceding the first one to download.
    filename : str, optional
        CSV file to append to. Defaults to the previously hard-coded
        'generalIssuesTS2.csv'.
    days : int, optional
        Number of one-day windows to download (default 60, as before).
    subreddits : str, optional
        Comma-separated subreddit names for the ``subreddit`` query
        parameter. Defaults to the general-issues list used previously.

    Raises
    ------
    requests.HTTPError
        If Pushshift answers with an error status.

    Notes
    -----
    * The header row is written only when the file is new or empty, so
      repeated calls that append to the same file no longer insert header
      rows in the middle of the data.
    * Epochs come from ``time.mktime``, i.e. the dates are interpreted in
      the machine's local timezone.
    * ``size=1000`` is requested, but the recorded runs show 100 rows for
      every day, so results are presumably capped by the API; busy days may
      be truncated (TODO confirm and paginate if needed).
    """
    if subreddits is None:
        subreddits = ('mentalhealth,depression,traumatoolbox,bipolarreddit,BPD,ptsd,psychoticreddit,'
                      'EatingDisorders,StopSelfHarm,survivorsofabuse,rapecounseling,hardshipmates,'
                      'panicparty,socialanxiety')
    urlTemplate = ("https://api.pushshift.io/reddit/search/submission/"
                   "?after={0}&before={1}&size={2}&subreddit={3}")
    oneDay = datetime.timedelta(days=1)

    # Decide before opening: append mode creates the file if it is missing.
    needsHeader = not path.exists(filename) or path.getsize(filename) == 0

    # newline='' lets the csv module control line endings; without it,
    # platforms that translate '\n' produce blank rows between records.
    with open(filename, mode='a', encoding="utf-8", newline='') as fileObject:
        csvWriter = csv.writer(fileObject, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        if needsHeader:
            csvWriter.writerow(fields)
        for _ in range(days):
            start += oneDay
            end += oneDay
            epoch1 = int(time.mktime(start.timetuple()))
            epoch2 = int(time.mktime(end.timetuple()))

            response = requests.get(urlTemplate.format(epoch1, epoch2, '1000', subreddits), timeout=60)
            # Fail with the HTTP status instead of an opaque JSON/KeyError.
            response.raise_for_status()
            posts = response.json()["data"]
            for singlePost in posts:
                csvWriter.writerow([singlePost.get(field, None) for field in fields])
            print(start, end, len(posts))


In [6]:
def extractAuthorsWithTimestamp(fromFile, toFile):
    """Write the distinct authors of a posts CSV to a one-column CSV.

    Despite the name, no timestamp is written: only the first column of
    ``fromFile`` is read, and ``toFile`` gets a single "author" column.

    Parameters
    ----------
    fromFile : str
        UTF-8 CSV whose first row is a header and whose first column holds
        the author name.
    toFile : str
        Output CSV; overwritten. Contains the header ``fieldsForAuthor[0]``
        followed by one author per row, sorted so reruns give identical files.

    Rows that are empty, that repeat the header (which happens when several
    downloads were appended to one file), or whose author is "[deleted]"
    are ignored.
    """
    print("Extrating Authors from: ", fromFile, "to: ", toFile)
    authors = set()
    # newline='' is what the csv module expects; it keeps line breaks inside
    # quoted fields intact and avoids spurious blank rows.
    with open(fromFile, mode='r', encoding="utf-8", newline='') as fileReader:
        csvReader = csv.reader(fileReader, delimiter=',')
        header = next(csvReader, None)
        for row in csvReader:
            if not row or row == header:
                continue
            if row[0] != "[deleted]":
                authors.add(row[0])

    with open(toFile, mode='w', encoding="utf-8", newline='') as fileWriter:
        csvWriter = csv.writer(fileWriter, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        csvWriter.writerow([fieldsForAuthor[0]])
        for author in sorted(authors):
            csvWriter.writerow([author])
    print("Done Extracting!")

In [7]:
def _readAuthorSet(authorsFilename):
    """Return the set of first-column values of a CSV file, skipping its header row."""
    authors = set()
    with open(authorsFilename, mode='r', encoding='utf-8', newline='') as authorsFile:
        csvReader = csv.reader(authorsFile, delimiter=',')
        next(csvReader, None)  # header
        for row in csvReader:
            if row:
                authors.add(row[0])
    return authors


def extractMHandSWcommonAuthors(generalIssuesFilename, suicideWatchFilename, commonAuthorsFilename):
    """Write the authors that appear in both author lists to a new CSV.

    Parameters
    ----------
    generalIssuesFilename, suicideWatchFilename : str
        One-column author CSVs with a header row. They are read as UTF-8,
        matching how extractAuthorsWithTimestamp writes them (the platform
        default encoding was used before).
    commonAuthorsFilename : str
        Output CSV; overwritten. Contains the header
        ``fieldsForCommonAuthor[0]`` followed by one common author per row,
        sorted so reruns give identical files.
    """
    print("Extracting Common Authors between: ", generalIssuesFilename, "and: ", suicideWatchFilename)
    common = _readAuthorSet(suicideWatchFilename) & _readAuthorSet(generalIssuesFilename)

    # newline='' lets the csv module control line endings (no blank rows).
    with open(commonAuthorsFilename, mode='w', encoding='utf-8', newline='') as commonWriter:
        csvWriter = csv.writer(commonWriter, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        csvWriter.writerow([fieldsForCommonAuthor[0]])
        for author in sorted(common):
            csvWriter.writerow([author])
    print("Done Extracting")


In [8]:
def extractAllDataForCommonAuthors(postsFilename, commonAuthorsFilename, commonPostsFilename):
    """Copy the posts written by the listed authors into a new CSV.

    Parameters
    ----------
    postsFilename : str
        UTF-8 posts CSV with a header row; the author is the first column.
    commonAuthorsFilename : str
        UTF-8 one-column CSV (with header) of the authors to keep.
    commonPostsFilename : str
        Output CSV; overwritten. Starts with the module-level ``fields``
        header, followed by every kept row in its original order.

    Prints the number of authors read from ``commonAuthorsFilename``.
    Empty rows and rows that repeat the posts file's header are skipped.
    """
    commonAuthors = set()
    # newline='' is what the csv module expects for both reading and
    # writing; it preserves line breaks inside quoted fields.
    with open(commonAuthorsFilename, mode='r', encoding='utf-8', newline='') as authorsFile:
        authorsReader = csv.reader(authorsFile, delimiter=',')
        next(authorsReader, None)  # header
        for row in authorsReader:
            if row:
                commonAuthors.add(row[0])

    # Open the input first so a missing posts file fails before the output
    # file is truncated.
    with open(postsFilename, mode='r', encoding='utf-8', newline='') as postsFile, \
            open(commonPostsFilename, mode='w', encoding='utf-8', newline='') as commonFile:
        csvWriter = csv.writer(commonFile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        csvWriter.writerow(fields)
        postsReader = csv.reader(postsFile, delimiter=',')
        header = next(postsReader, None)
        for row in postsReader:
            if not row or row == header:
                continue
            if row[0] in commonAuthors:
                csvWriter.writerow(row)

    print(len(commonAuthors))


In [15]:
def main():
    """Download one block of general-issues submissions via extractMentalHealthCSV.

    The PRAW client that used to be created here was never used (the
    download goes through ``requests`` inside extractMentalHealthCSV), and
    it embedded a client secret and account password in the notebook. It has
    been removed; those credentials should be treated as leaked and rotated.
    If a Reddit client is needed later, load its credentials from environment
    variables or a praw.ini file instead of hard-coding them.
    """
    # Word clouds can be produced separately with:
    #   wordCloud('mentalHealth.txt')
    #   wordCloud2('suicidewatch.txt')

    # Earlier runs used these (start, end) pairs, each appended to the same
    # output file by one extractMentalHealthCSV call:
    #   datetime.date(2016, 2, 11), datetime.date(2016, 2, 12)
    #   datetime.date(2016, 4, 13), datetime.date(2016, 4, 14)
    #   datetime.date(2016, 6, 13), datetime.date(2016, 6, 14)
    # NOTE(review): these runs are labelled TS1, but the extractMentalHealthCSV
    # definition in this notebook writes to 'generalIssuesTS2.csv' -- confirm
    # the target file before rerunning.
    start = datetime.date(2016, 8, 13)
    end = datetime.date(2016, 8, 14)
    extractMentalHealthCSV(start, end)


if __name__ == '__main__':
    main()

2016-08-14 2016-08-15 100
2016-08-15 2016-08-16 100
2016-08-16 2016-08-17 100
2016-08-17 2016-08-18 100
2016-08-18 2016-08-19 100
2016-08-19 2016-08-20 100
2016-08-20 2016-08-21 100
2016-08-21 2016-08-22 100
2016-08-22 2016-08-23 100
2016-08-23 2016-08-24 100
2016-08-24 2016-08-25 100
2016-08-25 2016-08-26 100
2016-08-26 2016-08-27 100
2016-08-27 2016-08-28 100
2016-08-28 2016-08-29 100
2016-08-29 2016-08-30 100
2016-08-30 2016-08-31 100
2016-08-31 2016-09-01 100
2016-09-01 2016-09-02 100
2016-09-02 2016-09-03 100
2016-09-03 2016-09-04 100
2016-09-04 2016-09-05 100
2016-09-05 2016-09-06 100
2016-09-06 2016-09-07 100
2016-09-07 2016-09-08 100
2016-09-08 2016-09-09 100
2016-09-09 2016-09-10 100
2016-09-10 2016-09-11 100
2016-09-11 2016-09-12 100
2016-09-12 2016-09-13 100
2016-09-13 2016-09-14 100
2016-09-14 2016-09-15 100
2016-09-15 2016-09-16 100
2016-09-16 2016-09-17 100
2016-09-17 2016-09-18 100
2016-09-18 2016-09-19 100
2016-09-19 2016-09-20 100
2016-09-20 2016-09-21 100
2016-09-21 2

In [20]:
#TS2
# The PRAW client previously created in this cell was never used (the download
# goes through requests inside extractMentalHealthCSV) and it hard-coded a
# client secret and account password. It has been removed; treat those
# credentials as leaked and rotate them. Load credentials from environment
# variables or praw.ini if a Reddit client is ever needed.

# Earlier TS2 run used:
#   start = datetime.date(2016, 10, 12)
#   end = datetime.date(2016, 10, 13)
start = datetime.date(2016, 12, 12)
end = datetime.date(2016, 12, 13)
extractMentalHealthCSV(start, end)


2016-12-13 2016-12-14 100
2016-12-14 2016-12-15 100
2016-12-15 2016-12-16 100
2016-12-16 2016-12-17 100
2016-12-17 2016-12-18 100
2016-12-18 2016-12-19 100
2016-12-19 2016-12-20 100
2016-12-20 2016-12-21 100
2016-12-21 2016-12-22 100
2016-12-22 2016-12-23 100
2016-12-23 2016-12-24 100
2016-12-24 2016-12-25 100
2016-12-25 2016-12-26 100
2016-12-26 2016-12-27 100
2016-12-27 2016-12-28 100
2016-12-28 2016-12-29 100
2016-12-29 2016-12-30 100
2016-12-30 2016-12-31 100
2016-12-31 2017-01-01 100
2017-01-01 2017-01-02 100
2017-01-02 2017-01-03 100
2017-01-03 2017-01-04 100
2017-01-04 2017-01-05 100
2017-01-05 2017-01-06 100
2017-01-06 2017-01-07 100
2017-01-07 2017-01-08 100
2017-01-08 2017-01-09 100
2017-01-09 2017-01-10 100
2017-01-10 2017-01-11 100
2017-01-11 2017-01-12 100
2017-01-12 2017-01-13 100
2017-01-13 2017-01-14 100
2017-01-14 2017-01-15 100
2017-01-15 2017-01-16 100
2017-01-16 2017-01-17 100
2017-01-17 2017-01-18 100
2017-01-18 2017-01-19 100
2017-01-19 2017-01-20 100
2017-01-20 2

In [21]:
# Build the distinct-author lists for both SuicideWatch periods.
# Disabled variants for the r/mentalhealth-only dumps:
#   ('mentalHealthTS1.csv', 'mentalHealthAuthorsTS1.csv')
#   ('mentalHealthTS2.csv', 'mentalHealthAuthorsTS2.csv')
for postsCsv, authorsCsv in (
    ('suicideWatchTS1.csv', 'suicideWatchAuthorsTS1.csv'),
    ('suicideWatchTS2.csv', 'suicideWatchAuthorsTS2.csv'),
):
    extractAuthorsWithTimestamp(postsCsv, authorsCsv)


Extrating Authors from:  suicideWatchTS1.csv to:  suicideWatchAuthorsTS1.csv
Done Extracting!
Extrating Authors from:  suicideWatchTS2.csv to:  suicideWatchAuthorsTS2.csv
Done Extracting!


In [22]:
# Build the distinct-author lists for both general-issues periods.
for postsCsv, authorsCsv in (
    ('generalIssuesTS1.csv', 'generalIssuesAuthorsTS1.csv'),
    ('generalIssuesTS2.csv', 'generalIssuesAuthorsTS2.csv'),
):
    extractAuthorsWithTimestamp(postsCsv, authorsCsv)

Extrating Authors from:  generalIssuesTS1.csv to:  generalIssuesAuthorsTS1.csv
Done Extracting!
Extrating Authors from:  generalIssuesTS2.csv to:  generalIssuesAuthorsTS2.csv
Done Extracting!


In [23]:
# Intersect general-issues authors with SuicideWatch authors, period by period.
# Disabled variants (same call, different file triples):
#   ('mentalHealthAuthorsTS1.csv', 'suicideWatchAuthorsTS1.csv', 'mhTS1swTS1Authors.csv')
#   ('mentalHealthAuthorsTS2.csv', 'suicideWatchAuthorsTS2.csv', 'mhTS2swTS2Authors.csv')
#   ('generalIssuesAuthorsTS1.csv', 'suicideWatchAuthorsTS2.csv', 'giTS1swTS2Authors.csv')
#   ('generalIssuesAuthorsTS2.csv', 'suicideWatchAuthorsTS1.csv', 'giTS2swTS1Authors.csv')
for giAuthorsCsv, swAuthorsCsv, commonCsv in (
    ('generalIssuesAuthorsTS1.csv', 'suicideWatchAuthorsTS1.csv', 'giTS1swTS1Authors.csv'),
    ('generalIssuesAuthorsTS2.csv', 'suicideWatchAuthorsTS2.csv', 'giTS2swTS2Authors.csv'),
):
    extractMHandSWcommonAuthors(giAuthorsCsv, swAuthorsCsv, commonCsv)


Extracting Common Authors between:  generalIssuesAuthorsTS1.csv and:  suicideWatchAuthorsTS1.csv
Done Extracting
Extracting Common Authors between:  generalIssuesAuthorsTS2.csv and:  suicideWatchAuthorsTS2.csv
Done Extracting


In [24]:
# Keep only the general-issues posts written by the common authors of each period.
# Disabled variants (same call, different file triples):
#   ('generalIssuesTS1.csv', 'giTS1swTS2Authors.csv', 'generalIssuesCommonTS1toTS2.csv')
#   ('generalIssuesTS2.csv', 'giTS2swTS1Authors.csv', 'generalIssuesCommonTS2toTS1.csv')
#   ('generalIssuesTS1.csv', 'mhTS1swTS1Authors.csv', 'allCommonTS1.csv')
#   ('generalIssuesTS2.csv', 'mhTS2swTS2Authors.csv', 'allCommonTS2.csv')
for postsCsv, commonAuthorsCsv, commonPostsCsv in (
    ('generalIssuesTS1.csv', 'giTS1swTS1Authors.csv', 'generalIssuesCommonTS1.csv'),
    ('generalIssuesTS2.csv', 'giTS2swTS2Authors.csv', 'generalIssuesCommonTS2.csv'),
):
    extractAllDataForCommonAuthors(postsCsv, commonAuthorsCsv, commonPostsCsv)
    

869
385
