In [None]:
!pip install nltk
!pip install scikit-learn
!pip install pandas

In [None]:
#imports
import nltk
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.stats import ks_2samp


In [None]:
#useful methods

def get_all_users(diagnosed_filename, undiagnosed_filename):
    """Read the diagnosed and undiagnosed usernames from two text files.

    Each file is read line by line and every line is stripped of surrounding
    whitespace (including the trailing newline). Blank lines are kept as
    empty strings.

    Args:
        diagnosed_filename: path of the line-separated list of diagnosed users.
        undiagnosed_filename: path of the line-separated list of undiagnosed users.

    Returns:
        A ``(diagnosed_users, undiagnosed_users)`` tuple of lists, in file order.
    """
    # `with` guarantees both handles are closed, even if reading fails.
    with open(diagnosed_filename, 'r') as diagnosedfile:
        diagnosed_users = [user.strip() for user in diagnosedfile]

    with open(undiagnosed_filename, 'r') as undiagnosedfile:
        undiagnosed_users = [user.strip() for user in undiagnosedfile]

    return diagnosed_users, undiagnosed_users

def get_stemmed_data(diagnosed_filename, undiagnosed_filename, separator):
    """Load per-user stemmed text for diagnosed and undiagnosed users.

    Both files hold one ``Username<separator>Stemmed_data`` record per line,
    where the second column is the stemmed concatenation of all posts made by
    that user.

    Args:
        diagnosed_filename: path of the stemmed-data file for diagnosed users.
        undiagnosed_filename: path of the stemmed-data file for undiagnosed users.
        separator: string separating the username from the stemmed text.

    Returns:
        ``(ddict, udict)``: dictionaries mapping username to stripped stemmed
        text for diagnosed and undiagnosed users respectively.
    """
    def _load(filename):
        # The passed-in filename is used directly; the previous body built the
        # paths from an undefined `extension` name and ignored both arguments.
        stemmed = {}
        with open(filename, 'r') as datafile:
            for line in datafile:
                line_list = line.split(separator)
                if len(line_list) != 2:
                    # A malformed record aborts loading of this file; records
                    # read so far are kept.
                    print('error')
                    break
                stemmed[line_list[0]] = line_list[1].strip()
        return stemmed

    ddict = _load(diagnosed_filename)
    udict = _load(undiagnosed_filename)
    return ddict, udict

def prepare_dataframe(ddict, udict):
    """Build one labelled dataframe from the two classes of users.

    Args:
        ddict: mapping of diagnosed username -> text.
        udict: mapping of undiagnosed username -> text.

    Returns:
        A DataFrame indexed by username with column ``0`` holding the text and
        column ``1`` holding the class label (1 = diagnosed, 0 = undiagnosed).
        Diagnosed rows come first, followed by undiagnosed rows.
    """
    # A dict of scalars with index=[0] yields a single row whose columns are
    # the usernames; transposing turns that into one row per user.
    utframe1 = pd.DataFrame(udict, index=[0]).T
    utframe1[1] = [0] * len(utframe1)

    dtframe1 = pd.DataFrame(ddict, index=[0]).T
    dtframe1[1] = [1] * len(dtframe1)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # equivalent and keeps the same row order.
    totalframe = pd.concat([dtframe1, utframe1])
    return totalframe

def prepare_split_dataframes(ddict, udict):
    """Build separate labelled dataframes for diagnosed and undiagnosed users.

    Each frame is indexed by username, with the dictionary value in column
    ``0`` and the class label in column ``1`` (1 = diagnosed, 0 = undiagnosed).

    Returns:
        ``(diagnosed_frame, undiagnosed_frame)``.
    """
    def _labelled_frame(user_text, class_label):
        # One row per user after the transpose, then a constant label column.
        frame = pd.DataFrame(user_text, index=[0]).T
        frame[1] = [class_label] * len(frame)
        return frame

    undiagnosed_frame = _labelled_frame(udict, 0)
    diagnosed_frame = _labelled_frame(ddict, 1)
    return diagnosed_frame, undiagnosed_frame

def get_subreddit_lists():
    """Load the subreddit names of every category from their text files.

    Each category file holds one subreddit name per line; names are stripped
    of surrounding whitespace.

    Returns:
        A 7-tuple ``(all_list, exclusive_list, general_list, hobbies_list,
        topical_list, mental_list, physical_list)`` where ``all_list`` holds
        every subreddit and ``exclusive_list`` holds every subreddit except
        those in the 'Mental Health and Support' category.
    """
    category_files = (
        ('General', 'general_discussion.txt'),
        ('Hobbies', 'hobbies.txt'),
        ('Topical Discussion', 'topical_discussion.txt'),
        ('Mental Health and Support', 'mental_health_and_support.txt'),
        ('Physical Health and Wellness', 'physical_health_and_wellness.txt'),
    )
    finaldict = {}
    exclusive_list = []
    all_list = []

    for category, filename in category_files:
        # `with` closes the handle even if reading the file raises.
        with open(filename, 'r') as categoryfile:
            names = [line.strip() for line in categoryfile]

        finaldict[category] = names
        all_list.extend(names)
        if category != 'Mental Health and Support':
            exclusive_list.extend(names)

    return (
        all_list,
        exclusive_list,
        finaldict['General'],
        finaldict['Hobbies'],
        finaldict['Topical Discussion'],
        finaldict['Mental Health and Support'],
        finaldict['Physical Health and Wellness'],
    )

In [None]:
#fill data structures

# Category files and the display name of each category, in matching order.
filelist = ['general_discussion.txt','hobbies.txt','topical_discussion.txt','mental_health_and_support.txt','physical_health_and_wellness.txt']
keylist = ['General', 'Hobbies', 'Topical Discussion', 'Mental Health and Support', 'Physical Health and Wellness']
finaldict = {}      # category name -> list of subreddit names
exclusive_list = []  # every subreddit outside 'Mental Health and Support'
all_list = []        # every subreddit across all categories

for index, key in enumerate(keylist):
    finaldict[key] = []
    tempfile = open(filelist[index],'r')

    for line in tempfile:
        subreddit_name = line.strip()
        finaldict[key].append(subreddit_name)
        all_list.append(subreddit_name)
        if key != 'Mental Health and Support':
            exclusive_list.append(subreddit_name)
    tempfile.close()

# Unpack the per-category lists in the same order as keylist.
general_list, hobbies_list, topical_list, mental_list, physical_list = (
    finaldict[name] for name in keylist
)



In [None]:
# Report how many subreddits were loaded for each category list.
print('\n'.join([
    f'General: {len(general_list)}',
    f'Hobbies: {len(hobbies_list)}',
    f'Topical: {len(topical_list)}',
    f'Mental: {len(mental_list)}',
    f'Physical: {len(physical_list)}',
    f'Exclusive: {len(exclusive_list)}',
]))




In [None]:
def _count_lines_in_subreddits(filename, field_count, subreddit_index, include):
    """Count the lines of `filename` that have exactly `field_count` ':;'-separated
    fields and whose field at `subreddit_index` is a member of `include`."""
    total = 0
    with open(filename, 'r') as postfile:
        for line in postfile:
            linelist = line.split(':;')
            if len(linelist) != field_count:
                continue
            if linelist[subreddit_index] in include:
                total += 1
    return total


def get_totals(disorder, include_list, option):
    """Count posts made in the given subreddits by diagnosed and undiagnosed users.

    Args:
        disorder: 'anxiety' or 'depression' to select that cohort; any other
            value selects the cohort returned by ``get_all_users``.
        include_list: subreddit names to count; posts elsewhere are ignored.
        option: 'comments' to return comment counts, 'submissions' to return
            submission counts, anything else to return their sum.

    Returns:
        ``(diagnosed_count, undiagnosed_count)`` for the selected option.
    """
    # Only load the cohort that is actually needed.
    if disorder == 'anxiety':
        anxiety_members, unanxiety_members = get_anxiety_users()
    elif disorder == 'depression':
        anxiety_members, unanxiety_members = get_depression_users()
    else:
        # NOTE(review): called with no arguments, as in the original; the
        # get_all_users defined earlier in this notebook requires two
        # filenames, so confirm which definition is live when this runs.
        anxiety_members, unanxiety_members = get_all_users()

    # Set membership keeps the per-line subreddit check O(1).
    include = set(include_list)

    def _cohort_totals(directory, members):
        # Returns (submission_count, comment_count) summed over `members`.
        submissions = 0
        comments = 0
        for user in members:
            prefix = directory + user.strip()
            # Submission rows have 15 fields with the subreddit at index 10;
            # comment rows have 12 fields with the subreddit at index 9.
            submissions += _count_lines_in_subreddits(
                prefix + '_submissions_stripped.txt', 15, 10, include)
            comments += _count_lines_in_subreddits(
                prefix + '_comments_stripped.txt', 12, 9, include)
        return submissions, comments

    dsubmissionnums, dcommentnums = _cohort_totals('final_diagnosed/', anxiety_members)
    usubmissionnums, ucommentnums = _cohort_totals('final_undiagnosed/', unanxiety_members)

    if option == 'comments':
        return dcommentnums, ucommentnums
    elif option == 'submissions':
        return dsubmissionnums, usubmissionnums
    else:
        return dsubmissionnums + dcommentnums, usubmissionnums + ucommentnums

def total_helper():
    """Print post, comment and submission totals over all_list for every cohort.

    Totals are printed for the full cohort, then the depression cohort, then
    the anxiety cohort; each line pair shows the diagnosed-side count followed
    by the undiagnosed-side count.
    """
    # (disorder passed to get_totals, diagnosed label, undiagnosed label).
    # The anxiety section previously printed 'Anxiety' for both counts, which
    # made the two lines indistinguishable.
    cohorts = (
        ('', 'Diagnosed', 'Undiagnosed'),
        ('depression', 'Depressed', 'Undepressed'),
        ('anxiety', 'Anxiety', 'Non-anxiety'),
    )
    # (option passed to get_totals, noun used in the printed label)
    options = (
        ('', 'posts'),
        ('comments', 'comments'),
        ('submissions', 'submissions'),
    )
    for disorder, dlabel, ulabel in cohorts:
        for option, noun in options:
            dcount, ucount = get_totals(disorder, all_list, option)
            print(f'{dlabel} {noun}: {dcount}\n{ulabel} {noun}: {ucount}')
    
def total_helper_posts(pop):
    """Print diagnosed/undiagnosed post totals for `pop`, one subreddit group at a time.

    `pop` is forwarded to get_totals as the disorder selector; the empty
    option string requests combined submission + comment counts.
    """
    groups = (
        ('Exclusive', exclusive_list),
        ('Mental', mental_list),
        ('Physical', physical_list),
        ('Hobbies', hobbies_list),
        ('General', general_list),
        ('Topical', topical_list),
    )
    for label, subreddits in groups:
        dcount, ucount = get_totals(pop, subreddits, '')
        print(f'{label} Diagnosed posts: {dcount}\n{label} Undiagnosed posts: {ucount}')
    
def total_helper_comments(pop):
    """Print diagnosed/undiagnosed comment totals for `pop`, one subreddit group at a time.

    `pop` is forwarded to get_totals as the disorder selector.
    """
    groups = (
        ('Exclusive', exclusive_list),
        ('Mental', mental_list),
        ('Physical', physical_list),
        ('Hobbies', hobbies_list),
        ('General', general_list),
        ('Topical', topical_list),
    )
    for label, subreddits in groups:
        dcount, ucount = get_totals(pop, subreddits, 'comments')
        # These are comment counts; the labels previously said "posts".
        print(f'{label} Diagnosed comments: {dcount}\n{label} Undiagnosed comments: {ucount}')
    
def total_helper_subsmissions(pop):
    """Print diagnosed/undiagnosed submission totals for `pop`, one subreddit group at a time.

    `pop` is forwarded to get_totals as the disorder selector.
    """
    groups = (
        ('Exclusive', exclusive_list),
        ('Mental', mental_list),
        ('Physical', physical_list),
        ('Hobbies', hobbies_list),
        ('General', general_list),
        ('Topical', topical_list),
    )
    for label, subreddits in groups:
        dcount, ucount = get_totals(pop, subreddits, 'submissions')
        # These are submission counts; the labels previously said "posts".
        print(f'{label} Diagnosed submissions: {dcount}\n{label} Undiagnosed submissions: {ucount}')
    

In [None]:
# Combined submission + comment totals over every categorised subreddit, for
# the anxiety cohort, the depression cohort and then the full cohort.
for disorder in ('anxiety', 'depression', ''):
    danxietyposts, uanxietyposts = get_totals(disorder, all_list, '')
    print(danxietyposts)
    print(uanxietyposts)

In [None]:
# Print the per-group submission totals using the 'depression' cohort selector.
total_helper_subsmissions('depression')

In [None]:
def get_top_n_words(corpus, n=None):
    """
    List the top n words in a vocabulary according to occurrence in a text corpus.

    English stop words are excluded by the vectorizer (``stop_words='english'``),
    so words such as "is" and "the" never appear in the result.

    get_top_n_words(["I love Python", "Python is a language programming", "Hello world", "I love the world"])
    returns the pairs ('love', 2), ('python', 2), ('world', 2), ('language', 1),
    ('programming', 1), ('hello', 1), highest count first. Words with equal
    counts keep the order in which the vectorizer's vocabulary lists them.

    Args:
        corpus: iterable of documents (strings).
        n: number of (word, count) pairs to return; None returns all of them.

    Returns:
        A list of (word, count) tuples sorted by descending count.
    """
    vec = CountVectorizer(stop_words = 'english')
    # fit_transform walks the corpus once, so one-shot iterables (e.g. a
    # generator) work as well as lists and dict views.
    bag_of_words = vec.fit_transform(corpus)
    # Column sums give the total occurrences of each vocabulary term.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key = lambda pair: pair[1], reverse=True)
    return words_freq[:n]


In [None]:
# Alternative loader for the full cohort, kept for reference:
#ddict1, udict1 = get_stemmed_data('all')
ddict1, udict1 = get_stemmed_data_depression('all')

# 20 most frequent words in each cohort's stemmed text.
dtop20 = get_top_n_words(ddict1.values(), 20)
utop20 = get_top_n_words(udict1.values(), 20)

for heading, ranking in (('Diagnosed top words', dtop20), ('Undiagnosed top words', utop20)):
    print(heading)
    print(ranking)

#dframe1, uframe1 = prepare_split_dataframes(ddict1, udict1)


In [None]:
def subs_per_category(filename):
    """Group subreddits by category from a ``subreddit;categories`` file.

    Each usable line has exactly one ';': the subreddit name before it and one
    or more whitespace-separated category names after it. Lines that do not
    split into exactly two fields are ignored.

    Args:
        filename: path of the category file.

    Returns:
        Dictionary mapping each category name to the list of subreddits
        assigned to it, in file order.
    """
    subreddit_categories = {}
    with open(filename, 'r') as catfile:
        for line in catfile:
            linelist = line.split(';')
            if len(linelist) != 2:
                continue
            subreddit, category = linelist
            # split() without an argument drops the empty tokens that
            # split(' ') produced for trailing or doubled spaces, which used
            # to create a bogus '' category.
            for cat in category.split():
                subreddit_categories.setdefault(cat, []).append(subreddit)
    return subreddit_categories

def print_subs_in_categories(subcats, filename='subs_by_category.txt'):
    """Write every category and its subreddits to a text file.

    For each category the stripped category name is written, followed by a
    blank line, one subreddit per line, and a closing blank line.

    Args:
        subcats: dictionary mapping category name -> list of subreddit names.
        filename: output path; defaults to 'subs_by_category.txt'.
    """
    with open(filename, 'w') as outfile:
        for cat, sublist in subcats.items():
            outfile.write(cat.strip())
            outfile.write('\n\n')
            outfile.writelines(sub + '\n' for sub in sublist)
            outfile.write('\n')

In [None]:
# Group the subreddits listed in 'dcategories.txt' by category, then write the
# grouping out with print_subs_in_categories (to 'subs_by_category.txt').
sub_categories = subs_per_category('dcategories.txt')
print_subs_in_categories(sub_categories)

In [None]:
#retrieves all uncategorized subreddits and writes them to a file

# An uncategorized subreddit is only reported when it has more posts than this.
MIN_UNCATEGORIZED_POSTS = 500


def _count_uncategorized(filename, field_count, subreddit_index, known_subreddits, counts):
    """Add to `counts` one hit per line of `filename` whose subreddit is not known.

    Only lines with exactly `field_count` ':;'-separated fields are considered;
    the subreddit is the stripped field at `subreddit_index`.
    """
    with open(filename, 'r') as postfile:
        for line in postfile:
            linelist = line.split(':;')
            if len(linelist) != field_count:
                continue
            subreddit = linelist[subreddit_index].strip()
            if subreddit not in known_subreddits:
                counts[subreddit] = counts.get(subreddit, 0) + 1


# Subreddits that already have a category: first ';'-separated field of each line.
sub_categories = []
with open('dcategories.txt', 'r') as catfile:
    for line in catfile:
        sub_categories.append(line.split(';')[0])
print(len(sub_categories))
if len(sub_categories) > 1:
    # Sanity-check print; guarded so a short category file does not crash the cell.
    print(sub_categories[1])

# Diagnosed users and the paths of their submission/comment files.
anxiety_files = []
anxiety_members = []
with open('proven_all_diagnosed.txt','r') as anxietyfile:
    for user in anxietyfile:
        username = user.strip()
        anxiety_members.append(username)
        anxiety_files.append('demoji_diagnosed/' + username + '_submissions_stripped.txt')
        anxiety_files.append('demoji_diagnosed/' + username + '_comments_stripped.txt')

# Undiagnosed users and the paths of their submission/comment files.
unanxiety_members = []
unanxiety_files = []
with open('proven_all_undiagnosed.txt', 'r') as unanxietyfile:
    for user in unanxietyfile:
        username = user.strip()
        unanxiety_members.append(username)
        unanxiety_files.append('demoji_undiagnosed/' + username + '_submissions_stripped.txt')
        unanxiety_files.append('demoji_undiagnosed/' + username + '_comments_stripped.txt')

# Tally posts per uncategorized subreddit. A set makes the per-line
# "already categorized?" check O(1) instead of scanning the list.
known_subreddits = set(sub_categories)
sub_counts = {}
for directory, members in (('demoji_diagnosed/', anxiety_members),
                           ('demoji_undiagnosed/', unanxiety_members)):
    for user in members:
        prefix = directory + user.strip()
        # Submission rows: 15 fields, subreddit at index 10.
        _count_uncategorized(prefix + '_submissions_stripped.txt', 15, 10,
                             known_subreddits, sub_counts)
        # Comment rows: 12 fields, subreddit at index 9.
        _count_uncategorized(prefix + '_comments_stripped.txt', 12, 9,
                             known_subreddits, sub_counts)

# Ascending by post count.
sub_counts = dict(sorted(sub_counts.items(), key = lambda x:x[1]))

# The output file is opened only once the counts are complete, so a failure
# while reading user files no longer leaves it truncated and open.
with open('uncategorizedsubs.txt','w') as uncatfile:
    for subreddit, post_count in sub_counts.items():
        if post_count > MIN_UNCATEGORIZED_POSTS:
            uncatfile.write(subreddit)
            uncatfile.write('\n')

In [None]:
from datetime import datetime, date, timedelta

def _scan_post_times(filename, field_count, subreddit_index, time_index, sublist):
    """Scan one per-user post file and return ``(count, earliest, latest)``.

    Only lines with exactly `field_count` ':;'-separated fields whose field at
    `subreddit_index` is in `sublist` are counted; the timestamp is the float
    at `time_index`. With no matching line, earliest stays ``inf`` and latest
    stays ``0.0``.
    """
    count = 0
    earliest = float('inf')
    latest = 0.0
    with open(filename, 'r') as postfile:
        for line in postfile:
            linelist = line.split(':;')
            if len(linelist) != field_count:
                continue
            if linelist[subreddit_index] not in sublist:
                continue
            dtime = float(linelist[time_index])
            count += 1
            if dtime < earliest:
                earliest = dtime
            if dtime > latest:
                latest = dtime
    return count, earliest, latest


def _activity_summary(count, earliest, latest):
    """Return ``(earliest, latest, days, average)`` for one kind of activity.

    `days` is the number of calendar days (local time) between the earliest
    and latest timestamps, counted inclusively when any activity was found.
    With no activity, earliest is reported as 0.0 and the average as 0.0.
    """
    found = earliest != float('inf')
    if not found:
        earliest = 0.0
    first_day = datetime.fromtimestamp(earliest).date()
    last_day = datetime.fromtimestamp(int(float(latest))).date()
    days = (last_day - first_day).days
    average = 0.0
    if found:
        # Inclusive span: activity confined to a single day counts as 1 day.
        days += 1
        average = float(count) / float(days)
    return earliest, latest, days, average


def _day_record(user, directory, sublist):
    """Build the ':;'-separated day-file line (newline included) for one user."""
    prefix = directory + user.strip()
    # Submission rows: 15 fields, subreddit at index 10, timestamp at index 2.
    sub_count, sub_first, sub_last = _scan_post_times(
        prefix + '_submissions_stripped.txt', 15, 10, 2, sublist)
    # Comment rows: 12 fields, subreddit at index 9, timestamp at index 3.
    com_count, com_first, com_last = _scan_post_times(
        prefix + '_comments_stripped.txt', 12, 9, 3, sublist)

    # "Posts" are submissions and comments taken together.
    post = _activity_summary(sub_count + com_count,
                             min(com_first, sub_first),
                             max(com_last, sub_last))
    comment = _activity_summary(com_count, com_first, com_last)
    submission = _activity_summary(sub_count, sub_first, sub_last)

    # The stripped username is written so that stray whitespace or a newline
    # in `user` cannot break the one-record-per-line format.
    fields = [user.strip()]
    fields.extend(post[:3])
    fields.extend(comment[:3])
    fields.extend(submission[:3])
    fields.extend([post[3], comment[3], submission[3]])
    return ':;'.join(str(field) for field in fields) + '\n'


def write_days(anxiety_members, unanxiety_members, extension, sublist):
    """Write per-user activity-span records to the three day files.

    For every user, submissions and comments made in the subreddits of
    `sublist` are scanned for their earliest and latest timestamps. Each
    record has 13 ':;'-separated fields:

        user, earliestpost, latestpost, postdays,
        earliestcomment, latestcomment, commentdays,
        earliestsubmission, latestsubmission, submissiondays,
        postavg, comavg, subavg

    Args:
        anxiety_members: usernames whose files live under 'demoji_diagnosed/'.
        unanxiety_members: usernames whose files live under 'demoji_undiagnosed/'.
        extension: tag inserted into the output file names.
        sublist: subreddit names to include.

    Writes:
        'dayfiles/dayfile_<extension>_all.txt' (both cohorts, diagnosed first),
        'dayfiles/dayfile_<extension>_diagnosed.txt' and
        'dayfiles/dayfile_<extension>_undiagnosed.txt'.
    """
    prefix = 'dayfiles/dayfile_' + extension
    # `with` closes all three outputs even if a user file is missing or malformed.
    with open(prefix + '_all.txt', 'w') as outfile, \
            open(prefix + '_diagnosed.txt', 'w') as doutfile, \
            open(prefix + '_undiagnosed.txt', 'w') as uoutfile:
        for user in anxiety_members:
            record = _day_record(user, 'demoji_diagnosed/', sublist)
            outfile.write(record)
            doutfile.write(record)

        for user in unanxiety_members:
            record = _day_record(user, 'demoji_undiagnosed/', sublist)
            outfile.write(record)
            uoutfile.write(record)
    
#Format
#0     1             2           3         4                5              6            7                   8                
#user:;earliestpost:;latestpost:;postdays:;earliestcomment:;latestcomment:;commentdays:;earliestsubmission:;latestsubmission:;

#9               10       11      12
#submissiondays:;postavg:;comavg:;subavg

In [None]:
# Legacy zero-argument get_post_days(), disabled. Only its `def` line had been
# commented out, which left an indented body at top level and made this cell
# fail with an IndentationError. The whole body is commented out here; the
# get_post_days(extension) defined below is the live definition.
#
# def get_post_days():
#     ddict = {}
#     udict = {}
#
#     ddayfile = open('dayfiles/dayfile_diagnosed.txt','r')
#     for line in ddayfile:
#         linelist = line.split(':;')
#         user = linelist[0]
#         ddict[user] = int(linelist[3])
#     ddayfile.close()
#
#     udayfile = open('dayfiles/dayfile_undiagnosed.txt', 'r')
#     for line in udayfile:
#         linelist = line.split(':;')
#         user = linelist[0]
#         udict[user] = int(linelist[3])
#     udayfile.close()
#
#     return ddict, udict

def get_post_days(extension=None):
    """Read the post-days value (field 3) of every user from the day files.

    Args:
        extension: tag in the day-file names, i.e.
            'dayfiles/dayfile_<extension>_diagnosed.txt' and
            'dayfiles/dayfile_<extension>_undiagnosed.txt'. When omitted, the
            untagged names 'dayfiles/dayfile_diagnosed.txt' and
            'dayfiles/dayfile_undiagnosed.txt' are read instead.

    Returns:
        ``(ddict, udict)``: username -> post days (int) for diagnosed and
        undiagnosed users respectively.
    """
    infix = '' if extension is None else extension + '_'

    def _read_days(filename):
        days = {}
        # `with` closes the handle even if a malformed row raises.
        with open(filename, 'r') as dayfile:
            for line in dayfile:
                linelist = line.split(':;')
                days[linelist[0]] = int(linelist[3])
        return days

    ddict = _read_days('dayfiles/dayfile_' + infix + 'diagnosed.txt')
    udict = _read_days('dayfiles/dayfile_' + infix + 'undiagnosed.txt')
    return ddict, udict

def get_anxiety_post_days(extension):
    """Read the post-days value (field 3) of every user from the anxiety day files.

    Reads 'dayfiles/dayfile_<extension>_anxiety_diagnosed.txt' and
    'dayfiles/dayfile_<extension>_anxiety_undiagnosed.txt'.

    Returns:
        ``(ddict, udict)``: username -> post days (int) for each file.
    """
    def _read_days(filename):
        days = {}
        # `with` closes the handle even if a malformed row raises.
        with open(filename, 'r') as dayfile:
            for line in dayfile:
                linelist = line.split(':;')
                days[linelist[0]] = int(linelist[3])
        return days

    base = 'dayfiles/dayfile_' + extension + '_anxiety_'
    return _read_days(base + 'diagnosed.txt'), _read_days(base + 'undiagnosed.txt')

def get_depression_post_days(extension):
    """Read the post-days value (field 3) of every user from the depression day files.

    Reads 'dayfiles/dayfile_<extension>_depression_diagnosed.txt' and
    'dayfiles/dayfile_<extension>_depression_undiagnosed.txt'.

    Returns:
        ``(ddict, udict)``: username -> post days (int) for each file.
    """
    def _read_days(filename):
        days = {}
        # `with` closes the handle even if a malformed row raises.
        with open(filename, 'r') as dayfile:
            for line in dayfile:
                linelist = line.split(':;')
                days[linelist[0]] = int(linelist[3])
        return days

    base = 'dayfiles/dayfile_' + extension + '_depression_'
    return _read_days(base + 'diagnosed.txt'), _read_days(base + 'undiagnosed.txt')

def get_submission_days(extension=None):
    """Load the per-user integer stored in field 9 of the day files.

    This name used to be defined twice in a row (once without parameters, once
    with ``extension``); the second definition replaced the first, so calling
    it without arguments raised ``TypeError``. Both variants are now served by
    one function.

    With ``extension=None`` the files ``dayfiles/dayfile_diagnosed.txt`` and
    ``dayfiles/dayfile_undiagnosed.txt`` are read; otherwise
    ``dayfiles/dayfile_<extension>_diagnosed.txt`` and
    ``dayfiles/dayfile_<extension>_undiagnosed.txt``. Every non-blank line is
    split on ``':;'``; field 0 is the key, field 9 is parsed with ``int``.

    Args:
        extension: Optional grouping tag embedded in the file names.

    Returns:
        ``(ddict, udict)``: user -> int for the diagnosed and undiagnosed file.

    Raises:
        OSError: if a day file cannot be opened.
        IndexError, ValueError: if a non-blank line is malformed.
    """
    prefix = 'dayfiles/dayfile_'
    if extension is not None:
        prefix = prefix + extension + '_'

    def _load(path):
        values = {}
        # ``with`` closes the handle even when a line fails to parse.
        with open(path, 'r') as dayfile:
            for line in dayfile:
                if not line.strip():
                    continue  # tolerate blank / trailing empty lines
                fields = line.split(':;')
                values[fields[0]] = int(fields[9])
        return values

    ddict = _load(prefix + 'diagnosed.txt')
    udict = _load(prefix + 'undiagnosed.txt')
    return ddict, udict

def get_anxiety_submission_days(extension):
    """Load the per-user integer stored in field 9 of the anxiety day files.

    Reads ``dayfiles/dayfile_<extension>_anxiety_diagnosed.txt`` and
    ``dayfiles/dayfile_<extension>_anxiety_undiagnosed.txt``. Every non-blank
    line is split on ``':;'``; field 0 is the key, field 9 is parsed with ``int``.

    Args:
        extension: Grouping tag embedded in the file names (e.g. ``'all'``).

    Returns:
        ``(ddict, udict)``: user -> int for the diagnosed and undiagnosed file.

    Raises:
        OSError: if a day file cannot be opened.
        IndexError, ValueError: if a non-blank line is malformed.
    """
    def _load(path):
        values = {}
        # ``with`` closes the handle even when a line fails to parse.
        with open(path, 'r') as dayfile:
            for line in dayfile:
                if not line.strip():
                    continue  # tolerate blank / trailing empty lines
                fields = line.split(':;')
                values[fields[0]] = int(fields[9])
        return values

    ddict = _load('dayfiles/dayfile_' + extension + '_anxiety_diagnosed.txt')
    udict = _load('dayfiles/dayfile_' + extension + '_anxiety_undiagnosed.txt')
    return ddict, udict

def get_depression_submission_days(extension):
    """Load the per-user integer stored in field 9 of the depression day files.

    Reads ``dayfiles/dayfile_<extension>_depression_diagnosed.txt`` and
    ``dayfiles/dayfile_<extension>_depression_undiagnosed.txt``. Every
    non-blank line is split on ``':;'``; field 0 is the key, field 9 is parsed
    with ``int``.

    Args:
        extension: Grouping tag embedded in the file names (e.g. ``'all'``).

    Returns:
        ``(ddict, udict)``: user -> int for the diagnosed and undiagnosed file.

    Raises:
        OSError: if a day file cannot be opened.
        IndexError, ValueError: if a non-blank line is malformed.
    """
    def _load(path):
        values = {}
        # ``with`` closes the handle even when a line fails to parse.
        with open(path, 'r') as dayfile:
            for line in dayfile:
                if not line.strip():
                    continue  # tolerate blank / trailing empty lines
                fields = line.split(':;')
                values[fields[0]] = int(fields[9])
        return values

    ddict = _load('dayfiles/dayfile_' + extension + '_depression_diagnosed.txt')
    udict = _load('dayfiles/dayfile_' + extension + '_depression_undiagnosed.txt')
    return ddict, udict

def get_comment_days(extension=None):
    """Load the per-user integer stored in field 6 of the day files.

    This name used to be defined twice in a row (once without parameters, once
    with ``extension``); the second definition replaced the first, so calling
    it without arguments raised ``TypeError``. Both variants are now served by
    one function.

    With ``extension=None`` the files ``dayfiles/dayfile_diagnosed.txt`` and
    ``dayfiles/dayfile_undiagnosed.txt`` are read; otherwise
    ``dayfiles/dayfile_<extension>_diagnosed.txt`` and
    ``dayfiles/dayfile_<extension>_undiagnosed.txt``. Every non-blank line is
    split on ``':;'``; field 0 is the key, field 6 is parsed with ``int``.

    Args:
        extension: Optional grouping tag embedded in the file names.

    Returns:
        ``(ddict, udict)``: user -> int for the diagnosed and undiagnosed file.

    Raises:
        OSError: if a day file cannot be opened.
        IndexError, ValueError: if a non-blank line is malformed.
    """
    prefix = 'dayfiles/dayfile_'
    if extension is not None:
        prefix = prefix + extension + '_'

    def _load(path):
        values = {}
        # ``with`` closes the handle even when a line fails to parse.
        with open(path, 'r') as dayfile:
            for line in dayfile:
                if not line.strip():
                    continue  # tolerate blank / trailing empty lines
                fields = line.split(':;')
                values[fields[0]] = int(fields[6])
        return values

    ddict = _load(prefix + 'diagnosed.txt')
    udict = _load(prefix + 'undiagnosed.txt')
    return ddict, udict

def get_anxiety_comment_days(extension):
    """Load the per-user integer stored in field 6 of the anxiety day files.

    Reads ``dayfiles/dayfile_<extension>_anxiety_diagnosed.txt`` and
    ``dayfiles/dayfile_<extension>_anxiety_undiagnosed.txt``. Every non-blank
    line is split on ``':;'``; field 0 is the key, field 6 is parsed with ``int``.

    Args:
        extension: Grouping tag embedded in the file names (e.g. ``'all'``).

    Returns:
        ``(ddict, udict)``: user -> int for the diagnosed and undiagnosed file.

    Raises:
        OSError: if a day file cannot be opened.
        IndexError, ValueError: if a non-blank line is malformed.
    """
    def _load(path):
        values = {}
        # ``with`` closes the handle even when a line fails to parse.
        with open(path, 'r') as dayfile:
            for line in dayfile:
                if not line.strip():
                    continue  # tolerate blank / trailing empty lines
                fields = line.split(':;')
                values[fields[0]] = int(fields[6])
        return values

    ddict = _load('dayfiles/dayfile_' + extension + '_anxiety_diagnosed.txt')
    udict = _load('dayfiles/dayfile_' + extension + '_anxiety_undiagnosed.txt')
    return ddict, udict

def get_depression_comment_days(extension):
    """Load the per-user integer stored in field 6 of the depression day files.

    Reads ``dayfiles/dayfile_<extension>_depression_diagnosed.txt`` and
    ``dayfiles/dayfile_<extension>_depression_undiagnosed.txt``. Every
    non-blank line is split on ``':;'``; field 0 is the key, field 6 is parsed
    with ``int``.

    Args:
        extension: Grouping tag embedded in the file names (e.g. ``'all'``).

    Returns:
        ``(ddict, udict)``: user -> int for the diagnosed and undiagnosed file.

    Raises:
        OSError: if a day file cannot be opened.
        IndexError, ValueError: if a non-blank line is malformed.
    """
    def _load(path):
        values = {}
        # ``with`` closes the handle even when a line fails to parse.
        with open(path, 'r') as dayfile:
            for line in dayfile:
                if not line.strip():
                    continue  # tolerate blank / trailing empty lines
                fields = line.split(':;')
                values[fields[0]] = int(fields[6])
        return values

    ddict = _load('dayfiles/dayfile_' + extension + '_depression_diagnosed.txt')
    udict = _load('dayfiles/dayfile_' + extension + '_depression_undiagnosed.txt')
    return ddict, udict

def get_post_avgs(extension=None):
    """Load the per-user float stored in field 10 of the day files.

    This name used to be defined twice in a row (once without parameters, once
    with ``extension``); the second definition replaced the first, so calling
    it without arguments raised ``TypeError``. Both variants are now served by
    one function.

    With ``extension=None`` the files ``dayfiles/dayfile_diagnosed.txt`` and
    ``dayfiles/dayfile_undiagnosed.txt`` are read; otherwise
    ``dayfiles/dayfile_<extension>_diagnosed.txt`` and
    ``dayfiles/dayfile_<extension>_undiagnosed.txt``. Every non-blank line is
    split on ``':;'``; field 0 is the key, field 10 is parsed with ``float``.

    Args:
        extension: Optional grouping tag embedded in the file names.

    Returns:
        ``(ddict, udict)``: user -> float for the diagnosed and undiagnosed file.

    Raises:
        OSError: if a day file cannot be opened.
        IndexError, ValueError: if a non-blank line is malformed.
    """
    prefix = 'dayfiles/dayfile_'
    if extension is not None:
        prefix = prefix + extension + '_'

    def _load(path):
        values = {}
        # ``with`` closes the handle even when a line fails to parse.
        with open(path, 'r') as dayfile:
            for line in dayfile:
                if not line.strip():
                    continue  # tolerate blank / trailing empty lines
                fields = line.split(':;')
                values[fields[0]] = float(fields[10])
        return values

    ddict = _load(prefix + 'diagnosed.txt')
    udict = _load(prefix + 'undiagnosed.txt')
    return ddict, udict

def get_anxiety_post_avgs(extension):
    """Load the per-user float stored in field 10 of the anxiety day files.

    Reads ``dayfiles/dayfile_<extension>_anxiety_diagnosed.txt`` and
    ``dayfiles/dayfile_<extension>_anxiety_undiagnosed.txt``. Every non-blank
    line is split on ``':;'``; field 0 is the key, field 10 is parsed with
    ``float``.

    Args:
        extension: Grouping tag embedded in the file names (e.g. ``'all'``).

    Returns:
        ``(ddict, udict)``: user -> float for the diagnosed and undiagnosed file.

    Raises:
        OSError: if a day file cannot be opened.
        IndexError, ValueError: if a non-blank line is malformed.
    """
    def _load(path):
        values = {}
        # ``with`` closes the handle even when a line fails to parse.
        with open(path, 'r') as dayfile:
            for line in dayfile:
                if not line.strip():
                    continue  # tolerate blank / trailing empty lines
                fields = line.split(':;')
                values[fields[0]] = float(fields[10])
        return values

    ddict = _load('dayfiles/dayfile_' + extension + '_anxiety_diagnosed.txt')
    udict = _load('dayfiles/dayfile_' + extension + '_anxiety_undiagnosed.txt')
    return ddict, udict

def get_depression_post_avgs(extension):
    """Load the per-user float stored in field 10 of the depression day files.

    Reads ``dayfiles/dayfile_<extension>_depression_diagnosed.txt`` and
    ``dayfiles/dayfile_<extension>_depression_undiagnosed.txt``. Every
    non-blank line is split on ``':;'``; field 0 is the key, field 10 is parsed
    with ``float``.

    Args:
        extension: Grouping tag embedded in the file names (e.g. ``'all'``).

    Returns:
        ``(ddict, udict)``: user -> float for the diagnosed and undiagnosed file.

    Raises:
        OSError: if a day file cannot be opened.
        IndexError, ValueError: if a non-blank line is malformed.
    """
    def _load(path):
        values = {}
        # ``with`` closes the handle even when a line fails to parse.
        with open(path, 'r') as dayfile:
            for line in dayfile:
                if not line.strip():
                    continue  # tolerate blank / trailing empty lines
                fields = line.split(':;')
                values[fields[0]] = float(fields[10])
        return values

    ddict = _load('dayfiles/dayfile_' + extension + '_depression_diagnosed.txt')
    udict = _load('dayfiles/dayfile_' + extension + '_depression_undiagnosed.txt')
    return ddict, udict

def get_comment_avgs(extension=None):
    """Load the per-user float stored in field 11 of the day files.

    This name used to be defined twice in a row (once without parameters, once
    with ``extension``); the second definition replaced the first, so calling
    it without arguments raised ``TypeError``. Both variants are now served by
    one function.

    With ``extension=None`` the files ``dayfiles/dayfile_diagnosed.txt`` and
    ``dayfiles/dayfile_undiagnosed.txt`` are read; otherwise
    ``dayfiles/dayfile_<extension>_diagnosed.txt`` and
    ``dayfiles/dayfile_<extension>_undiagnosed.txt``. Every non-blank line is
    split on ``':;'``; field 0 is the key, field 11 is parsed with ``float``.

    Args:
        extension: Optional grouping tag embedded in the file names.

    Returns:
        ``(ddict, udict)``: user -> float for the diagnosed and undiagnosed file.

    Raises:
        OSError: if a day file cannot be opened.
        IndexError, ValueError: if a non-blank line is malformed.
    """
    prefix = 'dayfiles/dayfile_'
    if extension is not None:
        prefix = prefix + extension + '_'

    def _load(path):
        values = {}
        # ``with`` closes the handle even when a line fails to parse.
        with open(path, 'r') as dayfile:
            for line in dayfile:
                if not line.strip():
                    continue  # tolerate blank / trailing empty lines
                fields = line.split(':;')
                values[fields[0]] = float(fields[11])
        return values

    ddict = _load(prefix + 'diagnosed.txt')
    udict = _load(prefix + 'undiagnosed.txt')
    return ddict, udict

def get_anxiety_comment_avgs(extension):
    """Load the per-user float stored in field 11 of the anxiety day files.

    Reads ``dayfiles/dayfile_<extension>_anxiety_diagnosed.txt`` and
    ``dayfiles/dayfile_<extension>_anxiety_undiagnosed.txt``. Every non-blank
    line is split on ``':;'``; field 0 is the key, field 11 is parsed with
    ``float``.

    Args:
        extension: Grouping tag embedded in the file names (e.g. ``'all'``).

    Returns:
        ``(ddict, udict)``: user -> float for the diagnosed and undiagnosed file.

    Raises:
        OSError: if a day file cannot be opened.
        IndexError, ValueError: if a non-blank line is malformed.
    """
    def _load(path):
        values = {}
        # ``with`` closes the handle even when a line fails to parse.
        with open(path, 'r') as dayfile:
            for line in dayfile:
                if not line.strip():
                    continue  # tolerate blank / trailing empty lines
                fields = line.split(':;')
                values[fields[0]] = float(fields[11])
        return values

    ddict = _load('dayfiles/dayfile_' + extension + '_anxiety_diagnosed.txt')
    udict = _load('dayfiles/dayfile_' + extension + '_anxiety_undiagnosed.txt')
    return ddict, udict

def get_depression_comment_avgs(extension):
    """Load the per-user float stored in field 11 of the depression day files.

    Reads ``dayfiles/dayfile_<extension>_depression_diagnosed.txt`` and
    ``dayfiles/dayfile_<extension>_depression_undiagnosed.txt``. Every
    non-blank line is split on ``':;'``; field 0 is the key, field 11 is parsed
    with ``float``.

    Args:
        extension: Grouping tag embedded in the file names (e.g. ``'all'``).

    Returns:
        ``(ddict, udict)``: user -> float for the diagnosed and undiagnosed file.

    Raises:
        OSError: if a day file cannot be opened.
        IndexError, ValueError: if a non-blank line is malformed.
    """
    def _load(path):
        values = {}
        # ``with`` closes the handle even when a line fails to parse.
        with open(path, 'r') as dayfile:
            for line in dayfile:
                if not line.strip():
                    continue  # tolerate blank / trailing empty lines
                fields = line.split(':;')
                values[fields[0]] = float(fields[11])
        return values

    ddict = _load('dayfiles/dayfile_' + extension + '_depression_diagnosed.txt')
    udict = _load('dayfiles/dayfile_' + extension + '_depression_undiagnosed.txt')
    return ddict, udict

def get_submission_avgs(extension=None):
    """Load the per-user float stored in field 12 of the day files.

    This name used to be defined twice in a row (once without parameters, once
    with ``extension``); the second definition replaced the first, so calling
    it without arguments raised ``TypeError``. Both variants are now served by
    one function.

    With ``extension=None`` the files ``dayfiles/dayfile_diagnosed.txt`` and
    ``dayfiles/dayfile_undiagnosed.txt`` are read; otherwise
    ``dayfiles/dayfile_<extension>_diagnosed.txt`` and
    ``dayfiles/dayfile_<extension>_undiagnosed.txt``. Every non-blank line is
    split on ``':;'``; field 0 is the key, field 12 is parsed with ``float``.

    Args:
        extension: Optional grouping tag embedded in the file names.

    Returns:
        ``(ddict, udict)``: user -> float for the diagnosed and undiagnosed file.

    Raises:
        OSError: if a day file cannot be opened.
        IndexError, ValueError: if a non-blank line is malformed.
    """
    prefix = 'dayfiles/dayfile_'
    if extension is not None:
        prefix = prefix + extension + '_'

    def _load(path):
        values = {}
        # ``with`` closes the handle even when a line fails to parse.
        with open(path, 'r') as dayfile:
            for line in dayfile:
                if not line.strip():
                    continue  # tolerate blank / trailing empty lines
                fields = line.split(':;')
                values[fields[0]] = float(fields[12])
        return values

    ddict = _load(prefix + 'diagnosed.txt')
    udict = _load(prefix + 'undiagnosed.txt')
    return ddict, udict

def get_anxiety_submission_avgs(extension):
    """Load the per-user float stored in field 12 of the anxiety day files.

    Reads ``dayfiles/dayfile_<extension>_anxiety_diagnosed.txt`` and
    ``dayfiles/dayfile_<extension>_anxiety_undiagnosed.txt``. Every non-blank
    line is split on ``':;'``; field 0 is the key, field 12 is parsed with
    ``float``.

    Args:
        extension: Grouping tag embedded in the file names (e.g. ``'all'``).

    Returns:
        ``(ddict, udict)``: user -> float for the diagnosed and undiagnosed file.

    Raises:
        OSError: if a day file cannot be opened.
        IndexError, ValueError: if a non-blank line is malformed.
    """
    def _load(path):
        values = {}
        # ``with`` closes the handle even when a line fails to parse.
        with open(path, 'r') as dayfile:
            for line in dayfile:
                if not line.strip():
                    continue  # tolerate blank / trailing empty lines
                fields = line.split(':;')
                values[fields[0]] = float(fields[12])
        return values

    ddict = _load('dayfiles/dayfile_' + extension + '_anxiety_diagnosed.txt')
    udict = _load('dayfiles/dayfile_' + extension + '_anxiety_undiagnosed.txt')
    return ddict, udict
    
def get_depression_submission_avgs(extension):
    """Load the per-user float stored in field 12 of the depression day files.

    Reads ``dayfiles/dayfile_<extension>_depression_diagnosed.txt`` and
    ``dayfiles/dayfile_<extension>_depression_undiagnosed.txt``. Every
    non-blank line is split on ``':;'``; field 0 is the key, field 12 is parsed
    with ``float``.

    Args:
        extension: Grouping tag embedded in the file names (e.g. ``'all'``).

    Returns:
        ``(ddict, udict)``: user -> float for the diagnosed and undiagnosed file.

    Raises:
        OSError: if a day file cannot be opened.
        IndexError, ValueError: if a non-blank line is malformed.
    """
    def _load(path):
        values = {}
        # ``with`` closes the handle even when a line fails to parse.
        with open(path, 'r') as dayfile:
            for line in dayfile:
                if not line.strip():
                    continue  # tolerate blank / trailing empty lines
                fields = line.split(':;')
                values[fields[0]] = float(fields[12])
        return values

    ddict = _load('dayfiles/dayfile_' + extension + '_depression_diagnosed.txt')
    udict = _load('dayfiles/dayfile_' + extension + '_depression_undiagnosed.txt')
    return ddict, udict

    

In [None]:
def daywrite(word, sublist):
    """Run ``write_days`` for the full, depression and anxiety user groups.

    For each group the diagnosed/undiagnosed user lists are fetched and handed
    to ``write_days`` together with a tag: ``word`` itself for the full group,
    ``word + '_depression'`` and ``word + '_anxiety'`` for the other two.

    Args:
        word: Base tag (string) forwarded to ``write_days``.
        sublist: Forwarded unchanged to ``write_days``.
    """
    # NOTE(review): get_all_users is called without arguments here, but the
    # definition near the top of the notebook takes two file names -- confirm
    # that a zero-argument definition is in scope when this runs.
    diagnosed, undiagnosed = get_all_users()
    write_days(diagnosed, undiagnosed, word, sublist)

    diagnosed, undiagnosed = get_depression_users()
    write_days(diagnosed, undiagnosed, word + '_depression', sublist)

    diagnosed, undiagnosed = get_anxiety_users()
    write_days(diagnosed, undiagnosed, word + '_anxiety', sublist)
    
def _percentile_summary(sorted_values):
    """Return ``min:;p50:;p75:;p90:;p95:;max`` for an ascending array, rounded to 2 decimals.

    Percentiles are taken at index ``int(len * fraction)`` (no interpolation).
    """
    count = len(sorted_values)
    picks = [sorted_values[0]]
    picks.extend(sorted_values[int(count * fraction)] for fraction in (0.5, 0.75, 0.90, 0.95))
    picks.append(sorted_values[-1])
    return ':;'.join(str(round(value, 2)) for value in picks)


def _percentile_row(label, diagnosed, undiagnosed):
    """Build one line of percentiles.txt for a diagnosed/undiagnosed pair of samples."""
    return (label
            + 'Diagnosed:;' + _percentile_summary(np.sort(diagnosed))
            + ':;'
            + 'Undiagnosed:;' + _percentile_summary(np.sort(undiagnosed))
            + '\n')


def graphing_stuff():
    """Plot CDFs and write KS p-values and percentile summaries for every grouping.

    For each extension and each metric (post, submission and comment averages)
    the diagnosed/undiagnosed samples are loaded for three cohorts (all users,
    anxiety, depression), passed to ``CDF3`` (defined elsewhere in this
    notebook; its result must expose ``.pvalue``) and summarised in two files:

    * ``analysis_results/ksvalues.txt``: one line per cohort,
      ``<extension>_<Metric>_<cohort>:;<pvalue>``.
    * ``analysis_results/percentiles.txt``: one line per cohort,
      ``<extension>_<Metric>_<cohort>:;Diagnosed:;<min:;p50:;p75:;p90:;p95:;max>:;Undiagnosed:;<same six values>``.

    Figures are requested as ``analysis_results/<Metric>_<cohort>_<extension>.pdf``.

    Raises:
        OSError: if an output file cannot be opened or a day file is missing.
        IndexError: if any sample is empty (no minimum/percentile exists).
    """
    extension_list = ['all','exclusive','mental','physical','hobbies','general','topical']
    # (metric name used in file names/labels, x-axis label, loaders for all/anxiety/depression)
    metrics = [
        ('PostAvgs', 'Posts per Day',
         get_post_avgs, get_anxiety_post_avgs, get_depression_post_avgs),
        ('SubmissionAvgs', 'Submissions per Day',
         get_submission_avgs, get_anxiety_submission_avgs, get_depression_submission_avgs),
        ('CommentAvgs', 'Comments per Day',
         get_comment_avgs, get_anxiety_comment_avgs, get_depression_comment_avgs),
    ]
    cohorts = ['all', 'anxiety', 'depression']
    # Last argument handed to CDF3 for each cohort (empty string for the full cohort).
    graph_tags = ['', 'anxiety', 'depression']

    # ``with`` guarantees both result files are flushed and closed even if
    # loading or plotting fails part-way through.
    with open('analysis_results/ksvalues.txt', 'w') as ksfile, \
            open('analysis_results/percentiles.txt', 'w') as percentilefile:
        for extension in extension_list:
            for metric, axis_label, *loaders in metrics:
                # Load all three cohorts before plotting.
                samples = []
                for loader in loaders:
                    ddict, udict = loader(extension)
                    samples.append((list(ddict.values()), list(udict.values())))

                results = []
                for cohort, tag, (diagnosed, undiagnosed) in zip(cohorts, graph_tags, samples):
                    filename = 'analysis_results/' + metric + '_' + cohort + '_' + extension + '.pdf'
                    results.append(CDF3(diagnosed, undiagnosed, axis_label, filename, tag))

                labels = [extension + '_' + metric + '_' + cohort + ':;' for cohort in cohorts]

                for label, result in zip(labels, results):
                    ksfile.write(label + str(result.pvalue) + '\n')

                for label, (diagnosed, undiagnosed) in zip(labels, samples):
                    percentilefile.write(_percentile_row(label, diagnosed, undiagnosed))

In [None]:
# Produce the CDF figures and the ksvalues.txt / percentiles.txt summaries under analysis_results/.
# NOTE(review): this reads files under dayfiles/; the daywrite(...) calls in the next cell
# presumably create them -- confirm the cell order works on a fresh checkout (Restart & Run All).
graphing_stuff()

In [None]:
# Run daywrite once per grouping; the *_list variables are defined elsewhere in the notebook.
# The first argument is the tag passed through to write_days.
daywrite('all', all_list)#all
daywrite('exclusive', exclusive_list)#exclusive
daywrite('mental', mental_list)#mental
daywrite('physical', physical_list)#physical
daywrite('hobbies', hobbies_list)#hobbies
daywrite('general', general_list)#general
daywrite('topical', topical_list)#topical

In [None]:
# Load the anxiety day-file values for the 'all' grouping (user -> int) and
# print how many users each file contains (diagnosed first, then undiagnosed).
# dpostdays / upostdays are reused by the following cells.
dpostdays, upostdays = get_anxiety_post_days('all')
print(len(dpostdays))
print(len(upostdays))


In [None]:
#upostsorted = np.sort(upostdays)

# Collect every key of upostdays whose value exceeds 39.9.
# NOTE(review): the list is called over10 but the cutoff is 39.9 - confirm
# which threshold is intended.
over10 = []

for user in upostdays:
    if upostdays[user] > 39.9:
        over10.append(user)
print(over10)

In [None]:
# Print the smallest and largest value stored in dpostdays ...
postlist = list(dpostdays.values())
postlist = np.sort(postlist)
print(f'Min value = {postlist[0]}')
print(f'Max value = {postlist[-1]}')

# ... and then the same for upostdays (postlist is reused and overwritten).
postlist = list(upostdays.values())
postlist = np.sort(postlist)
print(f'Min value = {postlist[0]}')
print(f'Max value = {postlist[-1]}')


In [None]:
#Format:
#0     1            2               3         4          5                   6                      7
#user:;numcomments:;numsubmissions:;numposts:;wordcount:;avgwordspercomment:;avgwordspersubmission:;avgwordsperpost

def _count_valid_records(filename, field_count):
    """Count the lines of `filename` that split on ':;' into exactly `field_count` fields."""
    with open(filename, 'r') as recordfile:
        return sum(1 for line in recordfile if len(line.split(':;')) == field_count)


def _tally_user_posts(folder, user):
    """Return [comments, submissions, posts] for one user's files under `folder`."""
    prefix = folder + user.strip()
    # Submissions are read before comments, matching the original open order.
    submissions = _count_valid_records(prefix + '_submissions_stripped.txt', 15)
    comments = _count_valid_records(prefix + '_comments_stripped.txt', 12)
    return [comments, submissions, comments + submissions]


def _format_count_row(user, counts, wordcount):
    """Build one ':;'-separated output row (including the trailing newline)."""
    fields = [user, str(counts[0]), str(counts[1]), str(counts[2]), str(wordcount)]
    # Columns 5-7: total word count divided by comments, submissions and posts
    # respectively; a literal '0' is written when the divisor is zero.
    for count in counts:
        if int(count) == 0:
            fields.append('0')
        else:
            fields.append(str(round(float(wordcount) / float(count), 2)))
    return ':;'.join(fields) + '\n'


def write_counts2(anxiety_members, unanxiety_members):
    """Write per-user post and word counts to three ':;'-separated text files.

    For every user the number of well-formed submission lines (15 fields) and
    comment lines (12 fields) is counted from
    ``demoji_diagnosed/<user>_*_stripped.txt`` or
    ``demoji_undiagnosed/<user>_*_stripped.txt``. Word counts are the number of
    whitespace-separated tokens in the stemmed text returned by
    ``get_stemmed_data('all')``.

    Output files (overwritten on every call):
      * ``diagnosed_counts.txt``   - one row per user in `anxiety_members`
      * ``undiagnosed_counts.txt`` - one row per user in `unanxiety_members`
      * ``total_counts.txt``       - diagnosed rows followed by undiagnosed rows

    Row format:
      user:;numcomments:;numsubmissions:;numposts:;wordcount:;
      avgwordspercomment:;avgwordspersubmission:;avgwordsperpost
    Note that all three averages divide the user's *total* word count by the
    respective count.

    NOTE(review): the get_stemmed_data defined at the top of the notebook takes
    three arguments; this one-argument call relies on a different definition
    being in scope - confirm.

    Parameters:
        anxiety_members: iterable of diagnosed usernames.
        unanxiety_members: iterable of undiagnosed usernames.

    Returns:
        list of users (diagnosed first, then undiagnosed) with more than
        10000 posts.

    Raises:
        OSError: if a user's input file or an output file cannot be opened.
        KeyError: if a user has no entry in the stemmed data.
    """
    over10k = []  # store all users with more than 10k posts

    dstemmed, ustemmed = get_stemmed_data('all')
    dwordcount = {user: len(text.split()) for user, text in dstemmed.items()}
    uwordcount = {user: len(text.split()) for user, text in ustemmed.items()}

    d_post_counts = {}
    u_post_counts = {}
    for members, folder, post_counts in (
        (anxiety_members, 'demoji_diagnosed/', d_post_counts),
        (unanxiety_members, 'demoji_undiagnosed/', u_post_counts),
    ):
        for user in members:
            counts = _tally_user_posts(folder, user)
            if counts[2] > 10000:
                over10k.append(user)
            post_counts[user] = counts

    # Context managers guarantee the outputs are flushed and closed even when a
    # user is missing from the word-count dictionaries.
    with open('total_counts.txt', 'w') as tfile:
        with open('diagnosed_counts.txt', 'w') as dfile:
            for user in anxiety_members:
                row = _format_count_row(user, d_post_counts[user], dwordcount[user])
                dfile.write(row)
                tfile.write(row)

        with open('undiagnosed_counts.txt', 'w') as ufile:
            for user in unanxiety_members:
                row = _format_count_row(user, u_post_counts[user], uwordcount[user])
                ufile.write(row)
                tfile.write(row)

    return over10k

#Format:
#0     1            2               3         4          5                   6                      7
#user:;numcomments:;numsubmissions:;numposts:;wordcount:;avgwordspercomment:;avgwordspersubmission:;avgwordsperpost

def write_counts3(anxiety_members, unanxiety_members):
    """Write per-user post and word counts; identical to write_counts2.

    This function was a line-for-line copy of write_counts2 (same input
    folders, same output files, same row format), so it now simply delegates
    to it instead of maintaining a second copy of the logic.

    Row format of the files written:
      user:;numcomments:;numsubmissions:;numposts:;wordcount:;
      avgwordspercomment:;avgwordspersubmission:;avgwordsperpost

    Parameters:
        anxiety_members: iterable of diagnosed usernames.
        unanxiety_members: iterable of undiagnosed usernames.

    Returns:
        list of users with more than 10000 posts, as returned by write_counts2.
    """
    return write_counts2(anxiety_members, unanxiety_members)

In [None]:
#Format:
#0     1            2               3         4          5                   6                      7
#user:;numcomments:;numsubmissions:;numposts:;wordcount:;avgwordspercomment:;avgwordspersubmission:;avgwordsperpost
def get_counts2(extension):
    """Read '<extension>_counts.txt' and return its seven numeric columns, each sorted.

    Each line must split on ':;' into exactly 8 fields:
      user, numcomments, numsubmissions, numposts, wordcount,
      avgwordspercomment, avgwordspersubmission, avgwordsperpost
    On the first line with a different field count a 'read error' message is
    printed and reading stops; the rows parsed so far are still returned.

    Parameters:
        extension: filename prefix placed in front of '_counts.txt'.

    Returns:
        Tuple of seven independently sorted numpy arrays, in this order:
        comment counts, submission counts, post counts, word counts,
        average words per post, average words per comment,
        average words per submission.
        Because each array is sorted on its own, position i in one array does
        not refer to the same user as position i in another.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a field cannot be parsed as a number.
    """
    countfilename = '' + extension + '_counts.txt'
    commentcounts = []
    subcounts = []
    postcounts = []
    wordcounts = []
    avgperpost = []
    avgpercomment = []
    avgpersubmission = []

    # `with` closes the file even when int()/float() raises on a bad field.
    with open(countfilename, 'r') as countfile:
        for line in countfile:
            linelist = line.split(':;')
            if len(linelist) != 8:
                print(f'read error: length = {len(linelist)}')
                break
            commentcounts.append(int(linelist[1]))
            subcounts.append(int(linelist[2]))
            postcounts.append(int(linelist[3]))
            wordcounts.append(int(linelist[4]))
            avgpercomment.append(float(linelist[5]))
            avgpersubmission.append(float(linelist[6]))
            avgperpost.append(float(linelist[7]))

    return (
        np.sort(commentcounts),
        np.sort(subcounts),
        np.sort(postcounts),
        np.sort(wordcounts),
        np.sort(avgperpost),
        np.sort(avgpercomment),
        np.sort(avgpersubmission),
    )

    

In [None]:
# Load both user lists, write the three *_counts.txt files and show the users
# that write_counts2 reports as having more than 10000 posts.
# NOTE(review): the get_all_users defined at the top of the notebook requires
# two filename arguments; this zero-argument call relies on a different
# definition being in scope - confirm.
dlist, ulist = get_all_users()
over10 = write_counts2(dlist,ulist)
print(over10)

In [None]:
# Sorted count/average arrays read from total_counts.txt.
ccounts, scounts, pcounts, wcounts, postavg, comavg, subavg = get_counts2('total')

In [None]:
# Same seven sorted arrays, read separately from diagnosed_counts.txt (d*)
# and undiagnosed_counts.txt (u*).
dccounts, dscounts, dpcounts, dwcounts, dpostavg, dcomavg, dsubavg = get_counts2('diagnosed')
uccounts, uscounts, upcounts, uwcounts, upostavg, ucomavg, usubavg = get_counts2('undiagnosed')

In [None]:
# Sanity check: print the length of every array loaded from the counts files.
# First group: arrays read from the 'total' file.
print(len(ccounts))
print(len(scounts))
print(len(pcounts))
print(len(wcounts))
print(len(postavg))
print(len(comavg))
print(len(subavg))

# Second group: arrays read from the 'diagnosed' file.
print(len(dccounts))
print(len(dscounts))
print(len(dpcounts))
print(len(dwcounts))
print(len(dpostavg))
print(len(dcomavg))
print(len(dsubavg))

# Third group: arrays read from the 'undiagnosed' file.
print(len(uccounts))
print(len(uscounts))
print(len(upcounts))
print(len(uwcounts))
print(len(upostavg))
print(len(ucomavg))
print(len(usubavg))

In [None]:
# Find outliers and decide what to do with them.
# dpcounts is sorted ascending by get_counts2, so its last element is the
# largest diagnosed post count.
maxvalue = dpcounts[-1]



In [None]:
### CDF part 2
import numpy as np
import matplotlib.pyplot as plt

# Bin the data into 10 equal-width bins.
# NOTE(review): within this part of the notebook total_post_counts only exists
# as a local inside write_counts2/write_counts3; this cell needs a module-level
# variable of that name - confirm where it is defined.
count, bins_count = np.histogram(total_post_counts, bins=10)
  
# Normalise the bin counts so they sum to 1 (empirical PDF over the bins).
pdf = count / sum(count)
  
# The running sum of the PDF gives the CDF.
# The same result could be obtained by looping over the PDF values and adding.
cdf = np.cumsum(pdf)
  
# Plot PDF and CDF against the right-hand edge of each bin.
plt.plot(bins_count[1:], pdf, color="red", label="PDF")
plt.plot(bins_count[1:], cdf, label="CDF")
plt.legend()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import ks_2samp

def CDF(toplot, xaxis, title, filename):
    """Plot the empirical CDF of `toplot` on the current figure and save it.

    The value at index int(0.95 * N) of the sorted data is printed before
    plotting. X ticks are placed every 1000 units from 0 up to (not including)
    the integer part of the largest value.

    Parameters:
        toplot: sequence of numeric samples.
        xaxis: label for the x axis.
        title: plot title.
        filename: path passed to plt.savefig.
    """
    sample_count = len(toplot)
    ordered = np.sort(toplot)

    # i / N for the i-th smallest sample (starts at 0, never reaches 1).
    cumulative = np.arange(sample_count) / float(sample_count)

    print(ordered[int(sample_count * 0.95)])

    plt.xlabel(xaxis)
    plt.ylabel('CDF')
    tick_positions = np.arange(0, int(ordered[-1]), 1000)
    plt.xticks(tick_positions, rotation=90)
    plt.title(title)

    plt.plot(ordered, cumulative)
    plt.savefig(filename, facecolor='white')
    
def CDF2(toplot1, toplot2, xaxis, title, filename):
    """Plot the empirical CDFs of two samples together and compare them.

    The first sample is labelled 'Diagnosed' and the second 'Undiagnosed'.
    Diagnostic output (array shapes, the value at index int(0.95 * N) of each
    sorted sample, and both sample sizes) is printed along the way.

    Parameters:
        toplot1: sequence of numeric samples for the first curve.
        toplot2: sequence of numeric samples for the second curve.
        xaxis: label for the x axis.
        title: plot title.
        filename: path passed to plt.savefig.

    Returns:
        The result of scipy.stats.ks_2samp on the two sorted samples.
    """
    N1 = len(toplot1)
    x1 = np.sort(toplot1)
    N2 = len(toplot2)
    x2 = np.sort(toplot2)

    # i / N for the i-th smallest sample of each group.
    y1 = np.arange(N1) / float(N1)
    y2 = np.arange(N2) / float(N2)
    print(np.shape(x1))
    print(np.shape(y1))
    percentile95_1 = int(N1 * 0.95)
    print(f'{x1[percentile95_1]}')
    percentile95_2 = int(N2 * 0.95)
    print(f'{x2[percentile95_2]}')

    # The unused `end` local was dropped; it only fed a commented-out xticks call.
    plt.xlabel(xaxis)
    plt.ylabel('CDF')
    plt.title(title)

    plt.plot(x1, y1, label='Diagnosed')
    plt.plot(x2, y2, label='Undiagnosed')
    plt.legend()
    plt.savefig(filename, facecolor='white')
    print(f'x1 length = {len(x1)}')
    print(f'x2 length = {len(x2)}')
    return ks_2samp(x1, x2)
    
def CDF3(toplot1, toplot2, xaxis, filename, status):
    """Clear the current figure, plot two empirical CDFs, save, and compare them.

    Parameters:
        toplot1: iterable of numeric samples for the first curve
            (e.g. dict.values()).
        toplot2: iterable of numeric samples for the second curve.
        xaxis: label for the x axis.
        filename: path passed to plt.savefig.
        status: selects the legend labels - 'depression' gives
            'Depressed' / 'Not Depressed', 'anxiety' gives
            'Anxious' / 'Not Anxious', anything else gives
            'Diagnosed' / 'Undiagnosed'.

    Returns:
        The result of scipy.stats.ks_2samp on the two sorted samples.
    """
    # The original sliced each array with [:int(N * 1)], which keeps every
    # element, and computed 95th-percentile indexes that were never used;
    # both have been removed.
    x1 = np.sort(list(toplot1))
    x2 = np.sort(list(toplot2))
    N1 = len(x1)
    N2 = len(x2)

    # i / N for the i-th smallest sample of each group.
    y1 = np.arange(N1) / float(N1)
    y2 = np.arange(N2) / float(N2)

    plt.clf()  # start from an empty figure so earlier curves are not redrawn
    plt.xlabel(xaxis)
    plt.ylabel('CDF')

    label1 = 'Diagnosed'
    label2 = 'Undiagnosed'
    if status == 'depression':
        label1 = 'Depressed'
        label2 = 'Not Depressed'
    elif status == 'anxiety':
        label1 = 'Anxious'
        label2 = 'Not Anxious'

    plt.plot(x1, y1, label=label1)
    plt.plot(x2, y2, label=label2)
    plt.legend()
    plt.savefig(filename, facecolor='white')
    return ks_2samp(x1, x2)

In [None]:
def print_percentiles(count_list):
    """Print the min, 25/50/75/95/99th-percentile and max entries of `count_list`.

    The sequence is indexed positionally (index int(len * fraction)), so the
    caller is expected to pass it already sorted in ascending order.

    Parameters:
        count_list: non-empty indexable sequence of values.

    Raises:
        IndexError: if `count_list` is empty (nothing is printed in that case).
    """
    size = len(count_list)

    # Look every value up first so that nothing is printed if an index fails.
    summary = [
        ('Minimum Value', count_list[0]),
        ('25th Percentile Value', count_list[int(size * 0.25)]),
        ('Median', count_list[int(size * 0.50)]),
        ('75th Percentile Value', count_list[int(size * 0.75)]),
        ('95th Percentile Value', count_list[int(size * 0.95)]),
        ('99th Percentile Value', count_list[int(size * 0.99)]),
        ('Maximum Value', count_list[-1]),
    ]

    for label, value in summary:
        print(f'{label}: {value}')

In [None]:
# Percentile summary of the diagnosed post counts, followed by the
# second-largest value (dpcounts is sorted ascending by get_counts2).
print_percentiles(dpcounts)
print(dpcounts[-2])

In [None]:
# Box plot of the diagnosed post counts with the top 5% of entries dropped
# (dpcounts is sorted ascending, so the slice removes the largest values).
plt.boxplot(dpcounts[:int(len(dpcounts)*0.95)])

In [None]:
from datetime import datetime

def _count_included_records(filename, field_count, subreddit_index, include_list):
    """Count well-formed lines of `filename` whose subreddit field is in `include_list`."""
    matched = 0
    with open(filename, 'r') as recordfile:
        for line in recordfile:
            fields = line.split(':;')
            if len(fields) != field_count:
                continue  # skip lines that do not have the expected field count
            if fields[subreddit_index] in include_list:
                matched += 1
    return matched


def postnum_stuff(disorder, include_list, option):
    """Count each user's posts in the subreddits listed in `include_list`.

    Submissions are read from ``<folder>/<user>_submissions_stripped.txt``
    (15 ':;'-separated fields, subreddit at index 10) and comments from
    ``<folder>/<user>_comments_stripped.txt`` (12 fields, subreddit at
    index 9), where <folder> is ``final_diagnosed`` for the first user group
    and ``final_undiagnosed`` for the second.

    Parameters:
        disorder: 'anxiety' uses get_anxiety_users(), 'depression' uses
            get_depression_users(), anything else uses get_all_users().
        include_list: container of subreddit names to count.
        option: 'comments' or 'submissions' selects that post type; any other
            value selects comments and submissions combined.

    Returns:
        Tuple (all_users, first_group, second_group) of dicts mapping each
        user to the selected count. A user present in both groups ends up in
        the combined dict with the second group's count.

    Raises:
        OSError: if one of a user's files cannot be opened.
    """
    # Only load the user lists that are actually needed.
    if disorder == 'anxiety':
        anxiety_members, unanxiety_members = get_anxiety_users()
    elif disorder == 'depression':
        anxiety_members, unanxiety_members = get_depression_users()
    else:
        anxiety_members, unanxiety_members = get_all_users()

    dpostdict, dcommentdict, dsubmissiondict = {}, {}, {}
    upostdict, ucommentdict, usubmissiondict = {}, {}, {}
    allpostdict, allcommentdict, allsubmissiondict = {}, {}, {}

    groups = (
        (anxiety_members, 'final_diagnosed/', dpostdict, dcommentdict, dsubmissiondict),
        (unanxiety_members, 'final_undiagnosed/', upostdict, ucommentdict, usubmissiondict),
    )
    for members, folder, postdict, commentdict, submissiondict in groups:
        for user in members:
            prefix = folder + user.strip()
            submissions = _count_included_records(
                prefix + '_submissions_stripped.txt', 15, 10, include_list)
            comments = _count_included_records(
                prefix + '_comments_stripped.txt', 12, 9, include_list)

            submissiondict[user] = submissions
            commentdict[user] = comments
            postdict[user] = submissions + comments
            allsubmissiondict[user] = submissions
            allcommentdict[user] = comments
            allpostdict[user] = submissions + comments

    if option == 'comments':
        return allcommentdict, dcommentdict, ucommentdict
    elif option == 'submissions':
        return allsubmissiondict, dsubmissiondict, usubmissiondict
    else:
        return allpostdict, dpostdict, upostdict
    
    
def _measure_included_records(filename, field_count, subreddit_index, text_indices, include_list):
    """Return (record_count, word_count) over the matching lines of `filename`.

    A line matches when it splits on ':;' into exactly `field_count` fields and
    its subreddit field is in `include_list`; its words are the
    whitespace-separated tokens of the fields at `text_indices`.
    """
    records = 0
    words = 0
    with open(filename, 'r') as recordfile:
        for line in recordfile:
            fields = line.split(':;')
            if len(fields) != field_count:
                continue  # skip lines that do not have the expected field count
            if fields[subreddit_index] not in include_list:
                continue
            records += 1
            for index in text_indices:
                words += len(fields[index].split())
    return records, words


def _mean_words(total_words, record_count):
    """Average words per record, or 0.0 when there are no records."""
    if record_count == 0:
        return 0.0
    return float(total_words) / float(record_count)


def postlen_stuff(disorder, include_list, option):
    """Compute each user's average word count per post in the given subreddits.

    Submissions are read from ``<folder>/<user>_submissions_stripped.txt``
    (15 ':;'-separated fields, subreddit at index 10, words taken from fields
    9 and 14) and comments from ``<folder>/<user>_comments_stripped.txt``
    (12 fields, subreddit at index 9, words taken from field 2), where
    <folder> is ``final_diagnosed`` for the first user group and
    ``final_undiagnosed`` for the second.

    Parameters:
        disorder: 'anxiety' uses get_anxiety_users(), 'depression' uses
            get_depression_users(), anything else uses get_all_users().
        include_list: container of subreddit names to include.
        option: 'comments' or 'submissions' selects that post type; any other
            value selects comments and submissions combined.

    Returns:
        Tuple (first_group, second_group) of dicts mapping each user to the
        selected average (0.0 when the user has no matching posts).

    Raises:
        OSError: if one of a user's files cannot be opened.
    """
    # Only load the user lists that are actually needed.
    if disorder == 'anxiety':
        anxiety_members, unanxiety_members = get_anxiety_users()
    elif disorder == 'depression':
        anxiety_members, unanxiety_members = get_depression_users()
    else:
        anxiety_members, unanxiety_members = get_all_users()

    dposts, dcomments, dsubmissions = {}, {}, {}
    uposts, ucomments, usubmissions = {}, {}, {}

    groups = (
        (anxiety_members, 'final_diagnosed/', dposts, dcomments, dsubmissions),
        (unanxiety_members, 'final_undiagnosed/', uposts, ucomments, usubmissions),
    )
    for members, folder, posts, comments, submissions in groups:
        for user in members:
            prefix = folder + user.strip()
            sub_count, sub_words = _measure_included_records(
                prefix + '_submissions_stripped.txt', 15, 10, (9, 14), include_list)
            com_count, com_words = _measure_included_records(
                prefix + '_comments_stripped.txt', 12, 9, (2,), include_list)

            submissions[user] = _mean_words(sub_words, sub_count)
            comments[user] = _mean_words(com_words, com_count)
            posts[user] = _mean_words(sub_words + com_words, sub_count + com_count)

    if option == 'comments':
        return dcomments, ucomments
    elif option == 'submissions':
        return dsubmissions, usubmissions
    else:
        return dposts, uposts
    
def postlen_plain(include, extension):
    """Plot average-wordcount CDFs for all users and return the three KS results.

    Three figures are written under 'postlens/', each prefixed with
    `extension`: posts, comments and submissions.

    Parameters:
        include: subreddit container forwarded to postlen_stuff.
        extension: filename prefix for the generated PDFs.

    Returns:
        Tuple (posts, comments, submissions) of CDF3 results.
    """
    status = ''
    # (postlen_stuff option, output filename suffix, x-axis label)
    variants = (
        ('', 'PostLen.pdf', 'Average Wordcount per Post'),
        ('comments', 'CommentsLen.pdf', 'Average Wordcount per Comment'),
        ('submissions', 'SubmissionsLen.pdf', 'Average Wordcount per Submission'),
    )

    results = []
    for option, suffix, axis_label in variants:
        diagnosed, undiagnosed = postlen_stuff('', include, option)
        target = 'postlens/' + extension + suffix
        results.append(
            CDF3(diagnosed.values(), undiagnosed.values(), axis_label, target, status)
        )

    return tuple(results)

def postlen_anxiety(include, extension):
    """Plot average-wordcount CDFs for the anxiety groups and return the KS results.

    Three figures are written under 'postlens/', each prefixed with
    `extension` and suffixed with '_Anxiety': posts, comments and submissions.

    Parameters:
        include: subreddit container forwarded to postlen_stuff.
        extension: filename prefix for the generated PDFs.

    Returns:
        Tuple (posts, comments, submissions) of CDF3 results.
    """
    status = 'anxiety'
    # (postlen_stuff option, output filename suffix, x-axis label)
    variants = (
        ('', 'PostLen_Anxiety.pdf', 'Average Wordcount per Post'),
        ('comments', 'CommentsLen_Anxiety.pdf', 'Average Wordcount per Comment'),
        ('submissions', 'SubmissionsLen_Anxiety.pdf', 'Average Wordcount per Submission'),
    )

    results = []
    for option, suffix, axis_label in variants:
        diagnosed, undiagnosed = postlen_stuff('anxiety', include, option)
        target = 'postlens/' + extension + suffix
        results.append(
            CDF3(diagnosed.values(), undiagnosed.values(), axis_label, target, status)
        )

    return tuple(results)

def postlen_depression(include, extension):
    """Plot average-wordcount CDFs for the depression groups and return the KS results.

    Three figures are written under 'postlens/', each prefixed with
    `extension` and suffixed with '_Depression': posts, comments and
    submissions.

    Parameters:
        include: subreddit container forwarded to postlen_stuff.
        extension: filename prefix for the generated PDFs.

    Returns:
        Tuple (posts, comments, submissions) of CDF3 results.
    """
    status = 'depression'
    # (postlen_stuff option, output filename suffix, x-axis label)
    variants = (
        ('', 'PostLen_Depression.pdf', 'Average Wordcount per Post'),
        ('comments', 'CommentsLen_Depression.pdf', 'Average Wordcount per Comment'),
        ('submissions', 'SubmissionsLen_Depression.pdf', 'Average Wordcount per Submission'),
    )

    results = []
    for option, suffix, axis_label in variants:
        diagnosed, undiagnosed = postlen_stuff('depression', include, option)
        target = 'postlens/' + extension + suffix
        results.append(
            CDF3(diagnosed.values(), undiagnosed.values(), axis_label, target, status)
        )

    return tuple(results)

def postnum_plain(include, extension):
    """Plot posts-per-user CDFs for the diagnosed/undiagnosed split.

    Runs postnum_stuff('', include, kind) for all posts, comments and
    submissions (in that order). The first of its three return values is
    ignored; the values of the second and third are passed to CDF3 with an
    axis label, an output path under 'postnum/' and an empty status string.

    include   -- forwarded unchanged to postnum_stuff.
    extension -- string prefix for the three PDF filenames.

    Returns a 3-tuple of CDF3 results (posts, comments, submissions).
    """
    group = ''

    # (kind passed to postnum_stuff, axis label, filename tail)
    plot_specs = (
        ('', 'Posts per User', 'PostPerUser.pdf'),
        ('comments', 'Comments per User', 'CommentsPerUser.pdf'),
        ('submissions', 'Submissions per User', 'SubmissionsPerUser.pdf'),
    )

    ks_results = []
    for post_kind, axis_label, file_tail in plot_specs:
        _combined, diagnosed_counts, undiagnosed_counts = postnum_stuff(group, include, post_kind)
        out_path = 'postnum/' + extension + file_tail
        ks_results.append(
            CDF3(diagnosed_counts.values(), undiagnosed_counts.values(), axis_label, out_path, group)
        )

    return tuple(ks_results)
    
def postnum_anxiety(include, extension):
    """Plot posts-per-user CDFs for the anxiety split.

    Runs postnum_stuff('anxiety', include, kind) for all posts, comments and
    submissions (in that order). The first of its three return values is
    ignored; the values of the second and third are passed to CDF3 with an
    axis label, an output path under 'postnum/' and the status 'anxiety'.

    include   -- forwarded unchanged to postnum_stuff.
    extension -- string prefix for the three PDF filenames.

    Returns a 3-tuple of CDF3 results (posts, comments, submissions).
    """
    disorder = 'anxiety'

    # (kind passed to postnum_stuff, axis label, filename tail)
    plot_specs = (
        ('', 'Posts per User', 'PostPerUser_anxiety.pdf'),
        ('comments', 'Comments per User', 'CommentsPerUser_anxiety.pdf'),
        ('submissions', 'Submissions per User', 'SubmissionsPerUser_anxiety.pdf'),
    )

    ks_results = []
    for post_kind, axis_label, file_tail in plot_specs:
        _combined, with_counts, without_counts = postnum_stuff(disorder, include, post_kind)
        out_path = 'postnum/' + extension + file_tail
        ks_results.append(
            CDF3(with_counts.values(), without_counts.values(), axis_label, out_path, disorder)
        )

    return tuple(ks_results)
    

def postnum_depression(include, extension):
    """Plot posts-per-user CDFs for the depression split.

    Runs postnum_stuff('depression', include, kind) for all posts, comments
    and submissions (in that order). The first of its three return values is
    ignored; the values of the second and third are passed to CDF3 with an
    axis label, an output path under 'postnum/' and the status 'depression'.

    include   -- forwarded unchanged to postnum_stuff.
    extension -- string prefix for the three PDF filenames.

    Returns a 3-tuple of CDF3 results (posts, comments, submissions).
    """
    disorder = 'depression'

    # (kind passed to postnum_stuff, axis label, filename tail)
    plot_specs = (
        ('', 'Posts per User', 'PostPerUser_depression.pdf'),
        ('comments', 'Comments per User', 'CommentsPerUser_depression.pdf'),
        ('submissions', 'Submissions per User', 'SubmissionsPerUser_depression.pdf'),
    )

    ks_results = []
    for post_kind, axis_label, file_tail in plot_specs:
        _combined, with_counts, without_counts = postnum_stuff(disorder, include, post_kind)
        out_path = 'postnum/' + extension + file_tail
        ks_results.append(
            CDF3(with_counts.values(), without_counts.values(), axis_label, out_path, disorder)
        )

    return tuple(ks_results)
    
def postnum_helper():
    """Run every posts-per-user analysis and record the KS p-values.

    For each of the seven subreddit groups returned by get_lists(), calls
    postnum_plain, postnum_anxiety and postnum_depression and writes one line
    per group to 'postnum/ksplain.txt', 'postnum/ksanxiety.txt' and
    'postnum/ksdepression.txt' respectively. Each line has the form

        <Group>:;<p posts>:;<p comments>:;<p submissions>

    The three files are opened (and truncated) up front, as before, but are
    now managed by a ``with`` block so they are closed even if one of the
    analyses raises part-way through.
    """
    alist, elist, mlist, plist, hlist, glist, tlist = get_lists()

    # (label written to the KS file, filename prefix for the plots, subreddit list)
    groups = (
        ('All', 'all', alist),
        ('Exclusive', 'exclusive', elist),
        ('Mental', 'mental', mlist),
        ('Physical', 'physical', plist),
        ('Hobbies', 'hobbies', hlist),
        ('General', 'general', glist),
        ('Topical', 'topical', tlist),
    )

    with open('postnum/ksplain.txt', 'w') as ksfileplain, \
            open('postnum/ksanxiety.txt', 'w') as ksfileanxiety, \
            open('postnum/ksdepression.txt', 'w') as ksfiledepression:
        # Same execution order as the original: all plain runs, then anxiety, then depression.
        analyses = (
            (postnum_plain, ksfileplain),
            (postnum_anxiety, ksfileanxiety),
            (postnum_depression, ksfiledepression),
        )
        for analyse, ksfile in analyses:
            for label, extension, include in groups:
                k1, k2, k3 = analyse(include, extension)
                fields = [label, str(k1.pvalue), str(k2.pvalue), str(k3.pvalue)]
                ksfile.write(':;'.join(fields) + '\n')

def postlen_helper():
    """Run every post-length analysis and record the KS p-values.

    For each of the seven subreddit groups returned by get_lists(), calls
    postlen_plain, postlen_anxiety and postlen_depression and writes one line
    per group to 'postlens/ksplain.txt', 'postlens/ksanxiety.txt' and
    'postlens/ksdepression.txt' respectively. Each line has the form

        <Group>:;<p posts>:;<p comments>:;<p submissions>

    The three files are opened (and truncated) up front, as before, but are
    now managed by a ``with`` block so they are closed even if one of the
    analyses raises part-way through.
    """
    alist, elist, mlist, plist, hlist, glist, tlist = get_lists()

    # (label written to the KS file, filename prefix for the plots, subreddit list)
    groups = (
        ('All', 'all', alist),
        ('Exclusive', 'exclusive', elist),
        ('Mental', 'mental', mlist),
        ('Physical', 'physical', plist),
        ('Hobbies', 'hobbies', hlist),
        ('General', 'general', glist),
        ('Topical', 'topical', tlist),
    )

    with open('postlens/ksplain.txt', 'w') as ksfileplain, \
            open('postlens/ksanxiety.txt', 'w') as ksfileanxiety, \
            open('postlens/ksdepression.txt', 'w') as ksfiledepression:
        # Same execution order as the original: all plain runs, then anxiety, then depression.
        analyses = (
            (postlen_plain, ksfileplain),
            (postlen_anxiety, ksfileanxiety),
            (postlen_depression, ksfiledepression),
        )
        for analyse, ksfile in analyses:
            for label, extension, include in groups:
                k1, k2, k3 = analyse(include, extension)
                fields = [label, str(k1.pvalue), str(k2.pvalue), str(k3.pvalue)]
                ksfile.write(':;'.join(fields) + '\n')

def posts_timeseries(disorder):
    """Collect the raw timestamp field of every post for both user groups.

    disorder -- 'anxiety' or 'depression' selects the users returned by
                get_anxiety_users() / get_depression_users(); any other value
                keeps the split returned by get_all_users().

    For each user the files '<dir>/<user>_submissions_stripped.txt' and
    '<dir>/<user>_comments_stripped.txt' are read, with <dir> being
    'final_diagnosed' for the first group and 'final_undiagnosed' for the
    second. Lines are split on ':;'; submission lines must have exactly 15
    fields (timestamp at index 2) and comment lines exactly 12 (timestamp at
    index 3). Lines with any other field count are skipped.

    Returns (dposttimes, uposttimes): two lists of timestamp strings exactly
    as they appear in the files, submissions before comments for each user.
    """
    # NOTE(review): get_all_users is called without arguments here, while the
    # definition at the top of the notebook takes two filenames -- presumably a
    # later redefinition is in effect; confirm.
    diagnosed_members, undiagnosed_members = get_all_users()

    if disorder == 'anxiety':
        diagnosed_members, undiagnosed_members = get_anxiety_users()
    elif disorder == 'depression':
        diagnosed_members, undiagnosed_members = get_depression_users()

    # (filename suffix, required number of ':;' fields, index of the timestamp field)
    layouts = (
        ('_submissions_stripped.txt', 15, 2),
        ('_comments_stripped.txt', 12, 3),
    )

    def _collect(members, directory):
        """Return the timestamp fields of all well-formed lines for `members`."""
        times = []
        for user in members:
            prefix = directory + user.strip()
            for suffix, field_count, time_index in layouts:
                # `with` closes the file even if reading fails part-way through.
                with open(prefix + suffix, 'r') as handle:
                    for line in handle:
                        fields = line.split(':;')
                        if len(fields) == field_count:
                            times.append(fields[time_index])
        return times

    dposttimes = _collect(diagnosed_members, 'final_diagnosed/')
    uposttimes = _collect(undiagnosed_members, 'final_undiagnosed/')
    return dposttimes, uposttimes
    
def posttimestuff3(disorder, include_list):
    """Count each user's posts per hour of day, restricted to `include_list`.

    disorder     -- 'anxiety' or 'depression' selects the users returned by
                    get_anxiety_users() / get_depression_users(); any other
                    value keeps the split returned by get_all_users().
    include_list -- container of accepted values for the field this code
                    reads as the subreddit (index 10 of a submission line,
                    index 9 of a comment line).

    Submission lines must split on ':;' into exactly 15 fields (timestamp at
    index 2) and comment lines into exactly 12 (timestamp at index 3); other
    lines are skipped. Timestamps are converted with
    datetime.fromtimestamp(int(float(...))), i.e. in the machine's local time.

    Returns (dusertimes, uusertimes): dicts mapping each user name (as
    returned by the user-list helper, unstripped) to a dict {hour 0..23: count}.
    """
    # Imported locally (as sorted_days does) so the function does not depend on
    # a later notebook cell having imported `datetime` first.
    from datetime import datetime

    diagnosed_members, undiagnosed_members = get_all_users()

    if disorder == 'anxiety':
        diagnosed_members, undiagnosed_members = get_anxiety_users()
    elif disorder == 'depression':
        diagnosed_members, undiagnosed_members = get_depression_users()

    # (filename suffix, required field count, timestamp index, subreddit index)
    layouts = (
        ('_submissions_stripped.txt', 15, 2, 10),
        ('_comments_stripped.txt', 12, 3, 9),
    )

    def _hour_histogram(path_prefix):
        """Return {hour: post count} over one user's submissions and comments."""
        hours = dict.fromkeys(range(24), 0)
        for suffix, field_count, time_index, subreddit_index in layouts:
            # `with` closes the file even if a malformed timestamp raises.
            with open(path_prefix + suffix, 'r') as handle:
                for line in handle:
                    fields = line.split(':;')
                    if len(fields) != field_count:
                        continue
                    if fields[subreddit_index] not in include_list:
                        continue
                    posted_at = datetime.fromtimestamp(int(float(fields[time_index])))
                    hours[posted_at.hour] += 1
        return hours

    dusertimes = {}
    for user in diagnosed_members:
        dusertimes[user] = _hour_histogram('final_diagnosed/' + user.strip())

    uusertimes = {}
    for user in undiagnosed_members:
        uusertimes[user] = _hour_histogram('final_undiagnosed/' + user.strip())

    return dusertimes, uusertimes

def posttime_norming(dusertimes, uusertimes):
    """Average per-user hourly posting proportions for two user groups.

    dusertimes, uusertimes -- dicts mapping user -> {hour 0..23: post count}.

    Each user's 24 counts are divided by that user's total, and the resulting
    proportions are averaged hour by hour over the users of the group.

    Users with a total of zero posts cannot be normalised; they are reported
    with a printed message and skipped. (Previously the first such user
    aborted the loop, silently dropping every user after it.) A group with no
    usable users yields all-zero averages instead of raising ZeroDivisionError.

    Returns (diagnosed_totals, undiagnosed_totals), each a dict
    {hour 0..23: mean proportion} in hour order.
    """
    def _mean_hourly_proportions(usertimes):
        """Return {hour: mean of per-user proportions} for one group."""
        totals = dict.fromkeys(range(24), 0.0)
        counted_users = 0
        for user, hour_counts in usertimes.items():
            user_total = sum(hour_counts.values())
            if user_total == 0:
                print(f'{user}: no posts, skipped in hourly normalisation')
                continue
            for hour in range(24):
                totals[hour] += float(hour_counts[hour]) / float(user_total)
            counted_users += 1

        if counted_users:
            for hour in range(24):
                totals[hour] = totals[hour] / float(counted_users)
        return totals

    diagnosed_totals = _mean_hourly_proportions(dusertimes)
    undiagnosed_totals = _mean_hourly_proportions(uusertimes)
    return diagnosed_totals, undiagnosed_totals



    

def diurnal_plot(toplot1, toplot2, disorder, filename):
    """Plot two 24-point hourly curves and KS-test them against each other.

    toplot1, toplot2 -- iterables of 24 numbers (lists, arrays or dict views
                        such as ``d.values()``), one value per hour 0..23.
    disorder         -- 'anxiety' or 'depression' selects the legend labels;
                        any other value uses 'Diagnosed' / 'Undiagnosed'.
    filename         -- path the figure is saved to.

    The current pyplot figure is cleared and reused.

    Returns the result of scipy.stats.ks_2samp on the two sets of values.
    """
    # Dict views are not sequences for NumPy (np.asarray wraps them in a 0-d
    # object array), so materialise the inputs before plotting and testing.
    series1 = np.asarray(list(toplot1), dtype=float)
    series2 = np.asarray(list(toplot2), dtype=float)

    legend_labels = {
        'anxiety': ('Anxious', 'Not Anxious'),
        'depression': ('Depressed', 'Not Depressed'),
    }
    dlabel, ulabel = legend_labels.get(disorder, ('Diagnosed', 'Undiagnosed'))

    hours = np.arange(24)
    plt.clf()
    plt.xlabel('Hour')
    plt.ylabel('Post Proportion')
    plt.plot(hours, series1, label=dlabel)
    plt.plot(hours, series2, label=ulabel)
    plt.legend()
    plt.savefig(filename, facecolor='white')
    return ks_2samp(series1, series2)

def diurnal_three(toplot1, toplot2, toplot3, filename):
    """Plot three 24-point hourly curves and KS-test each pair.

    toplot1, toplot2, toplot3 -- iterables of 24 numbers (lists, arrays or
                                 dict views), drawn with the legend labels
                                 'Diagnosed', 'Anxious' and 'Depressed'.
    filename                  -- path the figure is saved to.

    The current pyplot figure is cleared and reused.

    Returns (k1, k2, k3) = ks_2samp(1, 2), ks_2samp(3, 2), ks_2samp(1, 3).
    """
    # Dict views are not sequences for NumPy (np.sort / np.asarray see a 0-d
    # object array), so materialise the inputs first.
    diagnosed = np.asarray(list(toplot1), dtype=float)
    anxious = np.asarray(list(toplot2), dtype=float)
    depressed = np.asarray(list(toplot3), dtype=float)

    hours = np.arange(24)
    plt.clf()
    plt.xlabel('Hour')
    plt.ylabel('Post Proportion')
    plt.plot(hours, diagnosed, label='Diagnosed')
    plt.plot(hours, anxious, label='Anxious')
    plt.plot(hours, depressed, label='Depressed')
    plt.legend()
    plt.savefig(filename, facecolor='white')

    # ks_2samp compares empirical distributions, so the order of the values
    # does not matter and no explicit pre-sort is required.
    k1 = ks_2samp(diagnosed, anxious)
    k2 = ks_2samp(depressed, anxious)
    k3 = ks_2samp(diagnosed, depressed)
    return k1, k2, k3
    
#calls the other methods in this block
def posttime_helper(all_list):
    """Produce the diurnal (hour-of-day) plots and print their KS results.

    all_list -- forwarded to posttimestuff3 as its include_list.

    For the plain, anxiety and depression splits, per-user hourly counts are
    gathered with posttimestuff3 and averaged with posttime_norming. One
    three-way figure compares the first group of each split, then one figure
    per split compares its two groups. Figures are written under 'diurnal/'
    and every KS result is printed.
    """
    def _hourly_profiles(disorder):
        """Return (first-group, second-group) mean hourly proportions as lists."""
        dcounts, ucounts = posttimestuff3(disorder, all_list)
        dmean, umean = posttime_norming(dcounts, ucounts)
        # Hand plain lists to the plotting helpers: a dict view is not a
        # sequence for NumPy / scipy.
        return list(dmean.values()), list(umean.values())

    dnorm1, unorm1 = _hourly_profiles('')
    dnorm2, unorm2 = _hourly_profiles('anxiety')
    dnorm3, unorm3 = _hourly_profiles('depression')

    k1, k2, k3 = diurnal_three(dnorm1, dnorm2, dnorm3, 'diurnal/threeway_all.pdf')
    print(k1)
    print(k2)
    print(k3)

    pairwise_plots = (
        (dnorm1, unorm1, '', 'diurnal/diagnosed_all.pdf'),
        (dnorm2, unorm2, 'anxiety', 'diurnal/anxiety_all.pdf'),
        (dnorm3, unorm3, 'depression', 'diurnal/depression_all.pdf'),
    )
    for first_group, second_group, disorder, out_path in pairwise_plots:
        kval = diurnal_plot(first_group, second_group, disorder, out_path)
        print(kval)


    
def sorted_days(dposttimes2, uposttimes2):
    """Count posts per calendar day for two collections of timestamps.

    dposttimes2, uposttimes2 -- iterables of Unix timestamps, given as numbers
                                or numeric strings (e.g. '1632497205.0').

    Timestamps are converted with datetime.fromtimestamp, i.e. to dates in the
    machine's local time zone.

    Returns (dpostdates, upostdates): dicts mapping datetime.date -> number of
    posts, with keys inserted in chronological order. The timestamps are
    sorted numerically; sorting the raw strings is only chronological when all
    of them have the same number of digits.
    """
    from datetime import datetime

    def _posts_per_day(timestamps):
        """Return {date: count} with dates in ascending order."""
        per_day = {}
        for seconds in sorted(int(float(stamp)) for stamp in timestamps):
            day = datetime.fromtimestamp(seconds).date()
            per_day[day] = per_day.get(day, 0) + 1
        return per_day

    dpostdates = _posts_per_day(dposttimes2)
    upostdates = _posts_per_day(uposttimes2)
    return dpostdates, upostdates
            
def plot_dates(date_dict,filename):
    """Plot a {date: post count} mapping as a line chart and save it.

    date_dict -- mapping turned into a pandas Series (keys become the x axis).
    filename  -- path the figure is saved to.
    """
    counts = pd.Series(date_dict)

    # Use the notebook's `plt` alias (as plot_datesv2 does); the visible
    # imports do not bind the bare name `pyplot`.
    fig, ax = plt.subplots()
    fig.autofmt_xdate()
    ax.plot(counts)
    ax.set_xlabel('Date')
    ax.set_ylabel('Number of Posts')
    fig.savefig(filename, facecolor='white')
    
def plot_dates2(date_dict1, date_dict2, filename):
    """Plot two {date: post count} mappings on one line chart and save it.

    date_dict1, date_dict2 -- mappings turned into pandas Series.
    filename               -- path the figure is saved to.
    """
    # The first series previously read the undefined name `date_dict`
    # instead of the `date_dict1` parameter.
    first_counts = pd.Series(date_dict1)
    second_counts = pd.Series(date_dict2)

    # Use the notebook's `plt` alias (as plot_datesv2 does); the visible
    # imports do not bind the bare name `pyplot`.
    fig, ax = plt.subplots()
    fig.autofmt_xdate()
    ax.plot(first_counts)
    ax.plot(second_counts)
    ax.set_xlabel('Date')
    ax.set_ylabel('Number of Posts')
    fig.savefig(filename, facecolor='white')

    
def plot_datesv2(date_dict1, date_dict2, filename):
    """Plot two {date: post count} mappings with a legend and KS-test them.

    date_dict1 is drawn as 'Diagnosed' and date_dict2 as 'Undiagnosed'.
    The figure is saved to `filename`.

    Returns scipy.stats.ks_2samp applied to the two pandas Series of counts.
    """
    diagnosed_counts = pd.Series(date_dict1)
    undiagnosed_counts = pd.Series(date_dict2)

    fig, ax = plt.subplots()
    fig.autofmt_xdate()

    # `ax` is the only (and current) axes of the new figure, so drawing through
    # it is the same as going through the pyplot state machine.
    ax.plot(diagnosed_counts, label='Diagnosed')
    ax.plot(undiagnosed_counts, label='Undiagnosed')
    ax.legend(loc='best')
    ax.set_xlabel('Date')
    ax.set_ylabel('Number of Posts')

    fig.savefig(filename, facecolor='white')
    return ks_2samp(diagnosed_counts, undiagnosed_counts)



In [None]:
# Generate the diurnal plots under 'diurnal/' and print their KS results.
# `all_list` is not defined in this part of the notebook -- presumably the
# full subreddit list built in an earlier cell; confirm before running.
posttime_helper(all_list)

In [None]:

# Posts-per-day time series for each split: gather the raw timestamps,
# bucket them by calendar day, then plot both groups and keep the KS result.
# Note: the PDFs are written into the 'diurnal/' directory.

# Diagnosed vs. undiagnosed users.
dplain, uplain = posts_timeseries('')
dplainsorted, uplainsorted = sorted_days(dplain, uplain)
ks1 = plot_datesv2(dplainsorted, uplainsorted, 'diurnal/timeseries_plain.pdf')

# Anxiety split.
danxiety, uanxiety= posts_timeseries('anxiety')
danxietysorted, uanxietysorted = sorted_days(danxiety, uanxiety)
ks2= plot_datesv2(danxietysorted, uanxietysorted, 'diurnal/timeseries_anxiety.pdf')

# Depression split.
ddepression, udepression = posts_timeseries('depression')
ddepressionsorted, udepressionsorted = sorted_days(ddepression, udepression)
ks3 = plot_datesv2(ddepressionsorted, udepressionsorted, 'diurnal/timeseries_depression.pdf')

# KS results in the order: plain, anxiety, depression.
print(ks1)
print(ks2)
print(ks3)

In [None]:
# Run all post-length analyses; plots and KS p-value files go to 'postlens/'.
postlen_helper()

In [None]:
# NOTE(review): this repeats the posttime_helper(all_list) call made a few
# cells earlier and regenerates the same 'diurnal/' files -- likely redundant.
posttime_helper(all_list)

In [None]:
# Run all posts-per-user analyses; plots and KS p-value files go to 'postnum/'.
postnum_helper()

In [None]:
#post time, total number of post, time series stuff

from datetime import datetime

# Tally every submission and comment of every user, without any subreddit
# filter: total counts, raw timestamp strings, and per-user posts per hour.
# Files are read inside `with` blocks so they are closed even if a malformed
# timestamp raises part-way through a file.
posts = 0   # all posts seen
dposts = 0  # posts by users of the first (diagnosed) group
uposts = 0  # posts by users of the second (undiagnosed) group


anxiety_members, unanxiety_members = get_all_users()
earliest = 0
latest = 0
posttimes = []   # raw timestamp strings, both groups
dposttimes = []  # raw timestamp strings, diagnosed group
uposttimes = []  # raw timestamp strings, undiagnosed group
dusertimes = {}  # user -> {hour 0..23: post count}, diagnosed group
uusertimes = {}  # user -> {hour 0..23: post count}, undiagnosed group


for user in anxiety_members:
    subfilename = 'final_diagnosed/' + user.strip() + '_submissions_stripped.txt'
    comfilename = 'final_diagnosed/' + user.strip() + '_comments_stripped.txt'

    post_num = 0
    dusertimes[user] = dict.fromkeys(range(24), 0)

    # Submissions: exactly 15 ':;'-separated fields, timestamp at index 2.
    with open(subfilename, 'r') as subfile:
        for line in subfile:
            linelist = line.split(':;')
            if len(linelist) != 15:
                continue
            posts += 1
            dposts += 1
            posttime = linelist[2]
            dposttimes.append(posttime)
            posttimes.append(posttime)
            dtime = datetime.fromtimestamp(int(float(posttime)))
            dhour = dtime.hour
            dusertimes[user][dhour] += 1

    # Comments: exactly 12 ':;'-separated fields, timestamp at index 3.
    with open(comfilename, 'r') as comfile:
        for line in comfile:
            linelist = line.split(':;')
            if len(linelist) != 12:
                continue
            posts += 1
            dposts += 1
            posttime = linelist[3]
            dposttimes.append(posttime)
            posttimes.append(posttime)
            dtime = datetime.fromtimestamp(int(float(posttime)))
            dhour = dtime.hour
            dusertimes[user][dhour] += 1


for user in unanxiety_members:
    subfilename = 'final_undiagnosed/' + user.strip() + '_submissions_stripped.txt'
    post_num = 0
    uusertimes[user] = dict.fromkeys(range(24), 0)

    with open(subfilename, 'r') as subfile:
        for line in subfile:
            linelist = line.split(':;')
            if len(linelist) != 15:
                continue
            posttime = linelist[2]
            posts += 1
            uposts += 1
            uposttimes.append(posttime)
            posttimes.append(posttime)
            utime = datetime.fromtimestamp(int(float(posttime)))
            uhour = utime.hour
            uusertimes[user][uhour] += 1

    comfilename = 'final_undiagnosed/' + user.strip() + '_comments_stripped.txt'
    with open(comfilename, 'r') as comfile:
        for line in comfile:
            linelist = line.split(':;')
            if len(linelist) != 12:
                continue
            posttime = linelist[3]
            posts += 1
            uposts += 1
            uposttimes.append(posttime)
            posttimes.append(posttime)
            utime = datetime.fromtimestamp(int(float(posttime)))
            uhour = utime.hour
            uusertimes[user][uhour] += 1

In [None]:
# Per-user hourly posting proportions: each user's 24 hourly counts divided by
# that user's total. Users without any posts cannot be normalised; their name
# is printed and they are skipped. (Previously a zero-post diagnosed user
# raised ZeroDivisionError and a zero-post undiagnosed user ended the loop,
# dropping every user after it.)
diagnosed_norms = []
undiagnosed_norms = []

for user in dusertimes:
    dsum = sum(dusertimes[user].values())
    if dsum == 0:
        print(user)
        continue
    dnorm = [float(dusertimes[user][i])/float(dsum) for i in range(24)]
    diagnosed_norms.append(dnorm)
for user in uusertimes:
    usum = sum(uusertimes[user].values())
    if usum == 0:
        print(user)
        continue
    unorm = [float(uusertimes[user][i])/float(usum) for i in range(24)]
    undiagnosed_norms.append(unorm)


In [None]:
# Sort all collected timestamps and take the first and last as the time span.
# NOTE(review): the timestamps appended to `posttimes` in the tally cell are
# strings, so this sort is lexicographic -- it matches numeric order only when
# every timestamp has the same number of digits; confirm.
posttimes = np.sort(posttimes)
earliest = posttimes[0]
latest = posttimes[-1]

# Raw Unix timestamps; they are not yet converted to calendar dates here.
print(earliest)
print(latest)

In [None]:
# Post totals from the tally cell: everyone, diagnosed group, undiagnosed group.
print(posts)
print(dposts)
print(uposts)

In [None]:
from datetime import datetime
# Sanity check of datetime.fromtimestamp on one sample Unix timestamp
# (the result is expressed in the machine's local time zone).
# The hour / minute / second lines were all labelled "Test time:" before.
timestamp = 1632497205.0
dt = datetime.fromtimestamp(timestamp)
print("Test year:", dt.year)
print("Test month:", dt.month)
print("Test day:", dt.day)
print("Test hour:", dt.hour)
print("Test minute:", dt.minute)
print("Test second:", dt.second)
print(dt.date())

In [None]:
from datetime import datetime
# Hour of day (local time) of every post, per group.
# Each loop now converts its own timestamp `t`; previously both loops
# converted the unrelated sample constant `timestamp`, so every entry was the
# same hour.
dhours = [] #hours of posts for all diagnosed users at once
uhours = [] #hours of posts for all undiagnosed users at once
for t in dposttimes:
    dt = datetime.fromtimestamp(int(float(t)))
    hr = dt.hour
    dhours.append(hr)
for t in uposttimes:
    dt = datetime.fromtimestamp(int(float(t)))
    hr = dt.hour
    uhours.append(hr)

In [None]:
from datetime import datetime


# Posts per calendar day (local time): overall, diagnosed and undiagnosed.
postdates = {}
dpostdates = {}
upostdates = {}

# Sorting first means days are inserted into the dicts in sorted-timestamp order.
posttimes = np.sort(posttimes)
dposttimes = np.sort(dposttimes)
uposttimes = np.sort(uposttimes)

for p in posttimes:
    dt = datetime.fromtimestamp(int(float(p))).date()
    postdates[dt] = postdates.get(dt, 0) + 1

for p in dposttimes:
    dt = datetime.fromtimestamp(int(float(p))).date()
    dpostdates[dt] = dpostdates.get(dt, 0) + 1

for p in uposttimes:
    dt = datetime.fromtimestamp(int(float(p))).date()
    upostdates[dt] = upostdates.get(dt, 0) + 1
    

In [None]:
# Number of distinct days with at least one post.
print(len(postdates))
# postdates is keyed by datetime.date, so postdates[-1] raised KeyError;
# show the most recently inserted (day, count) pair instead.
print(list(postdates.items())[-1])

In [None]:
from datetime import datetime

def _collect_user_post_stats(directory, user):
    """Gather posting statistics for one user from their stripped post files.

    Reads ``<directory>/<user>_submissions_stripped.txt`` and then
    ``<directory>/<user>_comments_stripped.txt``. Every line is split on
    ':;'. Submission rows must have exactly 15 fields (timestamp at index 2)
    and comment rows exactly 12 fields (timestamp at index 3); rows with any
    other field count are skipped.

    Returns:
        (hour_counts, num_posts, avg_per_day) where hour_counts maps each
        hour 0-23 (local time) to the number of posts made in that hour,
        num_posts is the number of accepted rows, and avg_per_day is
        num_posts divided by the whole days between the earliest and latest
        post. A span shorter than one day counts as one day, and a user with
        no posts gets an average of 0.0.

    Raises:
        OSError: if either file cannot be opened.
        ValueError: if a timestamp field of an accepted row is not numeric.
    """
    file_layouts = (
        # (filename suffix, required field count, index of the timestamp)
        ('_submissions_stripped.txt', 15, 2),
        ('_comments_stripped.txt', 12, 3),
    )
    hour_counts = dict.fromkeys(range(24), 0)
    timestamps = []
    base = directory + '/' + user.strip()

    for suffix, field_count, time_index in file_layouts:
        # 'with' closes the file even when a row fails to parse.
        with open(base + suffix, 'r') as postfile:
            for line in postfile:
                fields = line.split(':;')
                if len(fields) != field_count:
                    continue
                # Convert to a number immediately: the raw field is a string,
                # and ordering strings against floats raises TypeError.
                posttime = float(fields[time_index])
                timestamps.append(posttime)
                hour_counts[datetime.fromtimestamp(int(posttime)).hour] += 1

    num_posts = len(timestamps)
    if num_posts == 0:
        # No posts means there is no time span to average over.
        return hour_counts, 0, 0.0

    first_post = datetime.fromtimestamp(int(min(timestamps)))
    last_post = datetime.fromtimestamp(int(max(timestamps)))
    # Clamp to one day so activity confined to a single day does not divide by zero.
    active_days = max((last_post - first_post).days, 1)
    return hour_counts, num_posts, float(num_posts) / active_days


def get_posttimes(anxiety_members, unanxiety_members):
    """Compute hourly posting histograms, post totals and daily averages.

    Args:
        anxiety_members: iterable of usernames whose files live under
            'demoji_diagnosed/'.
        unanxiety_members: iterable of usernames whose files live under
            'demoji_undiagnosed/'.

    Returns:
        Six dicts, each keyed by the username exactly as passed in:
        (dusertimes, uusertimes, dpostnums, upostnums, davgs, uavgs).
        *usertimes map user -> {hour 0-23: post count}, *postnums map
        user -> total number of posts, and *avgs map user -> average posts
        per day between the user's first and last post.

    Raises:
        OSError: if a user's submissions or comments file cannot be opened.
        ValueError: if a timestamp field of an accepted row is not numeric.
    """
    dusertimes, dpostnums, davgs = {}, {}, {}
    uusertimes, upostnums, uavgs = {}, {}, {}

    for user in anxiety_members:
        dusertimes[user], dpostnums[user], davgs[user] = \
            _collect_user_post_stats('demoji_diagnosed', user)

    for user in unanxiety_members:
        uusertimes[user], upostnums[user], uavgs[user] = \
            _collect_user_post_stats('demoji_undiagnosed', user)

    return dusertimes, uusertimes, dpostnums, upostnums, davgs, uavgs

In [None]:
# NOTE(review): get_all_users is called with no arguments, while the
# definition near the top of the notebook takes (diagnosed_filename,
# undiagnosed_filename) -- confirm a zero-argument version is in scope.
dlist, ulist = get_all_users()
dtimesdict, utimesdict, dpostnumdict, upostnumdict, davgdict, uavgdict = get_posttimes(dlist, ulist)

# Order users by their average posts per day, lowest first.
sorted_davg = dict(sorted(davgdict.items(), key = lambda x:x[1]))
sorted_uavg = dict(sorted(uavgdict.items(), key = lambda x:x[1]))

# The dicts are keyed by username, so sorted_davg[0] would raise KeyError.
# Show the two lowest-average diagnosed users as (username, average) pairs.
for entry in list(sorted_davg.items())[:2]:
    print(entry)