In [None]:
#installs necessary libraries

!pip install nltk
!pip install scikit-learn
!pip install pandas!pip install pyspellchecker

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
import os
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.pipeline import Pipeline


In [None]:
#useful preprocessing methods

from spellchecker import SpellChecker
from nltk.stem import PorterStemmer
import string
import re

#replaces all white space with a single space between words
def remove_whitespaces(line):
    """Collapse every run of whitespace in ``line`` into a single space.

    Leading and trailing whitespace disappears because the text is split
    into tokens and re-joined.
    """
    tokens = line.split()
    return " ".join(tokens)

#replaces misspelled words with their most likely correct spellings
def spell_correction(text):
    """Return ``text`` with each word replaced by its most likely spelling.

    The text is split on whitespace, every word is passed through
    ``SpellChecker.correction`` and the results are re-joined with single
    spaces.

    Args:
        text: the string to correct.

    Returns:
        The corrected string.
    """
    # The class imported by this notebook is SpellChecker (not SpellCheck).
    checker = SpellChecker()
    corrected_wordlist = []
    for word in text.split():
        suggestion = checker.correction(word)
        # correction() may return None when it has no candidate (recent
        # pyspellchecker releases); keep the original word so join() cannot fail.
        corrected_wordlist.append(word if suggestion is None else suggestion)
    return " ".join(corrected_wordlist)

#Returns version of text with all words converted to stemmed versions
def stemming(text):
    """Return ``text`` with every whitespace-separated word replaced by its Porter stem.

    The stemmed words are re-joined with single spaces.
    """
    tokens = text.split()
    stemmer = PorterStemmer()
    return " ".join(stemmer.stem(token) for token in tokens)

#removes URLs from text
def remove_urls(text):
    """Delete every ``http(s)://...`` or ``www. ...`` token from ``text``.

    A URL extends up to (not including) the next whitespace character.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

#detects if URLs are present in text (can filter out spam posts)
def find_urls(text):
    """Report whether ``text`` contains a URL anywhere.

    Args:
        text: the string to inspect.

    Returns:
        True if an ``http(s)://...`` or ``www. ...`` token occurs at any
        position in ``text``, otherwise False.
    """
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    # search() scans the whole string; match() would only notice a URL that
    # starts at position 0 and miss links in the middle of a post.
    return url_pattern.search(text) is not None

#removes punctuations from text
def remove_punct(text):
    """Strip every character listed in ``string.punctuation`` from ``text``."""
    deletion_table = str.maketrans('', '', string.punctuation)
    return text.translate(deletion_table)

#returns preprocessed version of text (without stemming)
def preprocessv1(text):
    """Clean ``text`` without stemming.

    Steps, in order: remove URLs, remove punctuation, collapse whitespace,
    lowercase.
    """
    # URLs are removed first, while the '://' and '.' their pattern relies on
    # are still present.
    without_punct = remove_punct(remove_urls(text))
    collapsed = remove_whitespaces(without_punct)
    return collapsed.lower()

#returns preprocessed version of text (WITH stemming)
def preprocessv2(text):
    """Clean ``text`` and stem every word.

    Steps, in order: remove URLs, remove punctuation, collapse whitespace,
    stem each word, lowercase.
    """
    cleaned = remove_whitespaces(remove_punct(remove_urls(text)))
    stemmed = stemming(cleaned)
    return stemmed.lower()

In [None]:
def get_stemmed_data(extension):
    """Load the cached ``key:;text`` files for both groups.

    Reads ``dstemmed_<extension>.txt`` and ``ustemmed_<extension>.txt`` from
    the working directory. Every line must split into exactly two parts on
    ``:;``.

    Args:
        extension: suffix inserted into both file names.

    Returns:
        ``(ddict, udict)``: for each file, a dict mapping the part before
        ``:;`` to the whitespace-stripped part after it.

    Raises:
        OSError: if either file cannot be opened.
    """
    def _load(filename):
        """Parse one ``key:;text`` file into a dict."""
        records = {}
        # The context manager closes the handle even if reading raises.
        with open(filename, 'r') as handle:
            for line in handle:
                line_list = line.split(':;')
                if len(line_list) != 2:
                    # A malformed line stops the read of this file; entries
                    # collected so far are kept.
                    print('error')
                    break
                records[line_list[0]] = line_list[1].strip()
        return records

    ddict = _load('dstemmed_' + extension + '.txt')
    udict = _load('ustemmed_' + extension + '.txt')
    return ddict, udict

def get_stemmed_data_depression(extension):
    """Restrict the cached stemmed data to the depression-only user lists.

    Args:
        extension: suffix passed through to ``get_stemmed_data``.

    Returns:
        ``(ddict2, udict2)``: the entries of the two dicts returned by
        ``get_stemmed_data`` whose keys appear in the first and second list
        returned by ``get_depression_users`` respectively. Listed users that
        have no cached entry are skipped.
    """
    # Each source is read exactly once; reading the same files twice only
    # repeats the disk I/O without changing the result.
    ddict, udict = get_stemmed_data(extension)
    dusers, uusers = get_depression_users()

    ddict2 = {user: ddict[user] for user in dusers if user in ddict}
    udict2 = {user: udict[user] for user in uusers if user in udict}
    return ddict2, udict2

def get_stemmed_data_anxiety(extension):
    """Restrict the cached stemmed data to the anxiety-only user lists.

    Returns two dicts holding the entries of ``get_stemmed_data(extension)``
    whose keys appear in the first and second list from
    ``get_anxiety_users()``; listed users without a cached entry are skipped.
    """
    stemmed_diagnosed, stemmed_undiagnosed = get_stemmed_data(extension)
    diagnosed_names, undiagnosed_names = get_anxiety_users()

    kept_diagnosed = {
        name: stemmed_diagnosed[name]
        for name in diagnosed_names
        if name in stemmed_diagnosed
    }
    kept_undiagnosed = {
        name: stemmed_undiagnosed[name]
        for name in undiagnosed_names
        if name in stemmed_undiagnosed
    }
    return kept_diagnosed, kept_undiagnosed

In [None]:
from datetime import datetime, timedelta

def get_posttimes(anxiety_members, unanxiety_members):
    """Collect posting-time statistics for every user in two groups.

    For each user the ``<user>_submissions_stripped.txt`` and
    ``<user>_comments_stripped.txt`` files are read from ``demoji_diagnosed/``
    (first group) or ``demoji_undiagnosed/`` (second group). Lines are split
    on ``:;``. A submission line must have 15 fields with the timestamp at
    index 2; a comment line must have 12 fields with the timestamp at index 3.
    Lines with any other field count are ignored.

    Args:
        anxiety_members: user names whose files live in ``demoji_diagnosed/``.
        unanxiety_members: user names whose files live in ``demoji_undiagnosed/``.

    Returns:
        ``(dusertimes, uusertimes, dpostnums, upostnums, davgs, uavgs)``:

        * ``*usertimes``: user -> {hour 0..23: number of posts in that hour};
          hours come from ``datetime.fromtimestamp``, i.e. the machine's
          local time zone.
        * ``*postnums``: user -> number of valid posts.
        * ``*avgs``: user -> posts per whole day between first and last post.
          Users with no valid posts, or whose posts span less than one day,
          are reported with ``print`` and left out of this dict.

    Raises:
        OSError: if one of a user's files cannot be opened.
        ValueError: if a timestamp field is not numeric.
    """
    def _scan_file(path, field_count, time_index, hour_counts, timestamps):
        """Record every valid timestamp of ``path`` in the two accumulators."""
        with open(path, 'r') as handle:
            for line in handle:
                fields = line.split(':;')
                if len(fields) != field_count:
                    continue
                posttime = float(fields[time_index])
                timestamps.append(posttime)
                hour_counts[datetime.fromtimestamp(int(posttime)).hour] += 1

    def _scan_group(members, directory):
        """Compute the three per-user dicts for one group of users."""
        usertimes, postnums, avgs = {}, {}, {}
        for user in members:
            prefix = directory + user.strip()
            hour_counts = dict.fromkeys(range(24), 0)
            usertimes[user] = hour_counts
            timestamps = []
            _scan_file(prefix + '_submissions_stripped.txt', 15, 2, hour_counts, timestamps)
            _scan_file(prefix + '_comments_stripped.txt', 12, 3, hour_counts, timestamps)

            num_posts = len(timestamps)
            postnums[user] = num_posts
            if num_posts == 0:
                # Without this guard the "earliest" sentinel (infinity) would
                # reach int() and raise OverflowError.
                print(f'{user} with {num_posts} has 0 ')
                continue

            early_date = datetime.fromtimestamp(int(min(timestamps)))
            latest_date = datetime.fromtimestamp(int(max(timestamps)))
            denom = float((latest_date - early_date).days)
            if int(denom) == 0:
                # All posts fall within one day: no meaningful daily average.
                print(f'{user} with {num_posts} has 0 ')
                continue
            avgs[user] = float(num_posts) / denom
        return usertimes, postnums, avgs

    dusertimes, dpostnums, davgs = _scan_group(anxiety_members, 'demoji_diagnosed/')
    uusertimes, upostnums, uavgs = _scan_group(unanxiety_members, 'demoji_undiagnosed/')
    return dusertimes, uusertimes, dpostnums, upostnums, davgs, uavgs

In [None]:
def get_anxiety_users():
    """Read the two anxiety user lists from the working directory.

    Returns:
        ``(anxiety_users, unanxiety_users)``: the stripped lines of
        ``only_anxiety_proven.txt`` and ``only_anxiety_undiagnosed.txt``,
        in file order.

    Raises:
        OSError: if either file cannot be opened.
    """
    # Context managers close the handles; plain open() calls leave them open.
    with open('only_anxiety_proven.txt', 'r') as anxietyfile:
        anxiety_users = [user.strip() for user in anxietyfile]

    with open('only_anxiety_undiagnosed.txt', 'r') as unanxietyfile:
        unanxiety_users = [user.strip() for user in unanxietyfile]

    return anxiety_users, unanxiety_users

def get_anxiety_files():
    """Build the post-file paths for every user in the two anxiety lists.

    Returns:
        ``(only_anxiety_dfiles, only_anxiety_ufiles)``: for each user listed in
        ``only_anxiety_proven.txt`` / ``only_anxiety_undiagnosed.txt``, the
        submissions path followed by the comments path, under
        ``demoji_diagnosed/`` and ``demoji_undiagnosed/`` respectively.

    Raises:
        OSError: if either list file cannot be opened.
    """
    only_anxiety_dfiles = []
    only_anxiety_ufiles = []

    # Context managers close the list files once they have been read.
    with open('only_anxiety_proven.txt', 'r') as anxietyfile:
        for user in anxietyfile:
            prefix = 'demoji_diagnosed/' + user.strip()
            only_anxiety_dfiles.append(prefix + '_submissions_stripped.txt')
            only_anxiety_dfiles.append(prefix + '_comments_stripped.txt')

    with open('only_anxiety_undiagnosed.txt', 'r') as anxietyfile2:
        for user in anxietyfile2:
            prefix = 'demoji_undiagnosed/' + user.strip()
            only_anxiety_ufiles.append(prefix + '_submissions_stripped.txt')
            only_anxiety_ufiles.append(prefix + '_comments_stripped.txt')

    return only_anxiety_dfiles, only_anxiety_ufiles

def get_depression_files():
    """Build the post-file paths for every user in the two depression lists.

    Returns:
        ``(only_depressed_dfiles, only_depressed_ufiles)``: for each user
        listed in ``only_depressed_proven.txt`` /
        ``only_depressed_undiagnosed.txt``, the submissions path followed by
        the comments path, under ``demoji_diagnosed/`` and
        ``demoji_undiagnosed/`` respectively.

    Raises:
        OSError: if either list file cannot be opened.
    """
    only_depressed_dfiles = []
    only_depressed_ufiles = []

    # Context managers close the list files once they have been read.
    with open('only_depressed_proven.txt', 'r') as provenfile:
        for user in provenfile:
            prefix = 'demoji_diagnosed/' + user.strip()
            only_depressed_dfiles.append(prefix + '_submissions_stripped.txt')
            only_depressed_dfiles.append(prefix + '_comments_stripped.txt')

    with open('only_depressed_undiagnosed.txt', 'r') as undiagnosedfile:
        for user in undiagnosedfile:
            prefix = 'demoji_undiagnosed/' + user.strip()
            only_depressed_ufiles.append(prefix + '_submissions_stripped.txt')
            only_depressed_ufiles.append(prefix + '_comments_stripped.txt')

    return only_depressed_dfiles, only_depressed_ufiles

def get_depression_users():
    """Read the two depression user lists from the working directory.

    Returns:
        ``(depressed_users, undepressed_users)``: the stripped lines of
        ``only_depressed_proven.txt`` and ``only_depressed_undiagnosed.txt``,
        in file order.

    Raises:
        OSError: if either file cannot be opened.
    """
    # Context managers close the handles; plain open() calls leave them open.
    with open('only_depressed_proven.txt', 'r') as provenfile:
        depressed_users = [user.strip() for user in provenfile]

    with open('only_depressed_undiagnosed.txt', 'r') as undiagnosedfile:
        undepressed_users = [user.strip() for user in undiagnosedfile]

    return depressed_users, undepressed_users
       
def get_all_users():
    """Read the complete diagnosed and undiagnosed user lists.

    Returns:
        ``(all_dusers, all_uusers)``: the stripped lines of
        ``proven_all_diagnosed.txt`` and ``proven_all_undiagnosed.txt``,
        in file order.

    Raises:
        OSError: if either file cannot be opened.
    """
    # Context managers close the handles; plain open() calls leave them open.
    with open('proven_all_diagnosed.txt', 'r') as diagnosedfile:
        all_dusers = [user.strip() for user in diagnosedfile]

    with open('proven_all_undiagnosed.txt', 'r') as undiagnosedfile:
        all_uusers = [user.strip() for user in undiagnosedfile]

    return all_dusers, all_uusers
       
def get_all_files():
    """Build the post-file paths for every user in the complete user lists.

    Returns:
        ``(diagnosed_files, undiagnosed_files)``: for each user listed in
        ``proven_all_diagnosed.txt`` / ``proven_all_undiagnosed.txt``, the
        submissions path followed by the comments path, under
        ``demoji_diagnosed/`` and ``demoji_undiagnosed/`` respectively.

    Raises:
        OSError: if either list file cannot be opened.
    """
    diagnosed_files = []
    undiagnosed_files = []

    # Context managers close the list files once they have been read.
    with open('proven_all_diagnosed.txt', 'r') as diagnosedfile:
        for user in diagnosedfile:
            prefix = 'demoji_diagnosed/' + user.strip()
            diagnosed_files.append(prefix + '_submissions_stripped.txt')
            diagnosed_files.append(prefix + '_comments_stripped.txt')

    with open('proven_all_undiagnosed.txt', 'r') as undiagnosedfile:
        for user in undiagnosedfile:
            prefix = 'demoji_undiagnosed/' + user.strip()
            undiagnosed_files.append(prefix + '_submissions_stripped.txt')
            undiagnosed_files.append(prefix + '_comments_stripped.txt')

    return diagnosed_files, undiagnosed_files

In [None]:
# Load every diagnosed / undiagnosed user and gather their posting statistics.
dlist, ulist = get_all_users()
(
    dtimesdict,
    utimesdict,
    dpostnumdict,
    upostnumdict,
    davgdict,
    uavgdict,
) = get_posttimes(dlist, ulist)

# Rank users of each group by their average posts per day, ascending.
sorted_davg = sorted(davgdict.items(), key=lambda entry: entry[1])
sorted_uavg = sorted(uavgdict.items(), key=lambda entry: entry[1])

# Lowest and highest average among the diagnosed users.
print(sorted_davg[0])
print(sorted_davg[-1])

In [None]:
from nltk.stem import PorterStemmer

def stemming(text):
    """Return ``text`` with every whitespace-separated word replaced by its Porter stem.

    The stemmed words are re-joined with single spaces.
    """
    tokens = text.split()
    stemmer = PorterStemmer()
    return " ".join(stemmer.stem(token) for token in tokens)

def prepare_datasets2(dusers, unusers, stembool):
    """Build one concatenated text string per user from the ``final_*`` files.

    For every user the ``<user>_submissions_stripped.txt`` and
    ``<user>_comments_stripped.txt`` files are read from ``final_diagnosed/``
    (``dusers``) or ``final_undiagnosed/`` (``unusers``). Lines are split on
    ``:;``. Submission lines need 15 fields and contribute field 9, a space
    and field 14; comment lines need 12 fields and contribute field 2. Lines
    with any other field count are skipped. (Field 9 is called ``selftext``
    and field 2 ``body`` in the code; the meaning of field 14 is not visible
    here -- confirm against the file schema.)

    Args:
        dusers: user names read from ``final_diagnosed/``.
        unusers: user names read from ``final_undiagnosed/``.
        stembool: when true, each piece of text is passed through ``stemming``.

    Returns:
        ``(diagnosed_dict, undiagnosed_dict)``: user -> all pieces of text in
        file order (submissions first), each followed by a single space.

    Raises:
        OSError: if one of a user's files cannot be opened.
    """
    def _user_text(directory, user):
        """Concatenate the text of one user's submissions and comments."""
        prefix = directory + user.strip()
        pieces = []
        # Context managers close the files even if parsing raises.
        with open(prefix + '_submissions_stripped.txt', 'r') as subfile:
            for line in subfile:
                linelist = line.split(':;')
                if len(linelist) != 15:
                    continue
                selftext = linelist[9] + ' ' + linelist[14]
                pieces.append(stemming(selftext) if stembool else selftext)

        with open(prefix + '_comments_stripped.txt', 'r') as comfile:
            for line in comfile:
                linelist = line.split(':;')
                if len(linelist) != 12:
                    continue
                body = linelist[2]
                pieces.append(stemming(body) if stembool else body)

        # One join instead of repeated string concatenation; every piece,
        # including the last, is followed by a space.
        return ''.join(piece + ' ' for piece in pieces)

    diagnosed_dict = {}
    for user in dusers:
        diagnosed_dict[user] = _user_text('final_diagnosed/', user)

    undiagnosed_dict = {}
    for user in unusers:
        undiagnosed_dict[user] = _user_text('final_undiagnosed/', user)

    return diagnosed_dict, undiagnosed_dict

def prepare_datasets2_excluding(dusers, unusers, stembool, excludinglist):
    """Build per-user text from the ``final_*`` files, skipping listed subreddits.

    Works like ``prepare_datasets2`` but drops every submission whose field 10
    and every comment whose field 9 (both named ``subreddit`` in the code) is
    contained in ``excludinglist``. Lines are split on ``:;``; submissions
    need 15 fields and contribute field 9, a space and field 14; comments need
    12 fields and contribute field 2.

    Args:
        dusers: user names read from ``final_diagnosed/``.
        unusers: user names read from ``final_undiagnosed/``.
        stembool: when true, each piece of text is passed through ``stemming``.
        excludinglist: container of subreddit values to leave out.

    Returns:
        ``(diagnosed_dict, undiagnosed_dict)``: user -> all kept pieces of
        text in file order (submissions first), each followed by a space.

    Raises:
        OSError: if one of a user's files cannot be opened.
    """
    def _user_text(directory, user):
        """Concatenate one user's posts that are not in an excluded subreddit."""
        prefix = directory + user.strip()
        pieces = []
        # Context managers close the files even if parsing raises.
        with open(prefix + '_submissions_stripped.txt', 'r') as subfile:
            for line in subfile:
                linelist = line.split(':;')
                # The length check runs first so index 10 is always valid.
                if len(linelist) != 15 or linelist[10] in excludinglist:
                    continue
                selftext = linelist[9] + ' ' + linelist[14]
                pieces.append(stemming(selftext) if stembool else selftext)

        with open(prefix + '_comments_stripped.txt', 'r') as comfile:
            for line in comfile:
                linelist = line.split(':;')
                if len(linelist) != 12 or linelist[9] in excludinglist:
                    continue
                body = linelist[2]
                pieces.append(stemming(body) if stembool else body)

        # Every piece, including the last, is followed by a space.
        return ''.join(piece + ' ' for piece in pieces)

    diagnosed_dict = {}
    for user in dusers:
        diagnosed_dict[user] = _user_text('final_diagnosed/', user)

    undiagnosed_dict = {}
    for user in unusers:
        undiagnosed_dict[user] = _user_text('final_undiagnosed/', user)

    return diagnosed_dict, undiagnosed_dict

def prepare_datasets2_including(dusers, unusers, stembool, includinglist):
    """Build per-user text from the ``final_*`` files, keeping only listed subreddits.

    Works like ``prepare_datasets2`` but keeps only submissions whose field 10
    and comments whose field 9 (both named ``subreddit`` in the code) is
    contained in ``includinglist``. Lines are split on ``:;``; submissions
    need 15 fields and contribute field 9, a space and field 14; comments need
    12 fields and contribute field 2.

    Args:
        dusers: user names read from ``final_diagnosed/``.
        unusers: user names read from ``final_undiagnosed/``.
        stembool: when true, each piece of text is passed through ``stemming``.
        includinglist: container of subreddit values to keep.

    Returns:
        ``(diagnosed_dict, undiagnosed_dict)``: user -> all kept pieces of
        text in file order (submissions first), each followed by a space.

    Raises:
        OSError: if one of a user's files cannot be opened.
    """
    def _user_text(directory, user):
        """Concatenate one user's posts that belong to an included subreddit."""
        prefix = directory + user.strip()
        pieces = []
        # Context managers close the files even if parsing raises.
        with open(prefix + '_submissions_stripped.txt', 'r') as subfile:
            for line in subfile:
                linelist = line.split(':;')
                # The length check runs first so index 10 is always valid.
                if len(linelist) != 15 or linelist[10] not in includinglist:
                    continue
                selftext = linelist[9] + ' ' + linelist[14]
                pieces.append(stemming(selftext) if stembool else selftext)

        with open(prefix + '_comments_stripped.txt', 'r') as comfile:
            for line in comfile:
                linelist = line.split(':;')
                if len(linelist) != 12 or linelist[9] not in includinglist:
                    continue
                body = linelist[2]
                pieces.append(stemming(body) if stembool else body)

        # Every piece, including the last, is followed by a space.
        return ''.join(piece + ' ' for piece in pieces)

    diagnosed_dict = {}
    for user in dusers:
        diagnosed_dict[user] = _user_text('final_diagnosed/', user)

    undiagnosed_dict = {}
    for user in unusers:
        undiagnosed_dict[user] = _user_text('final_undiagnosed/', user)

    return diagnosed_dict, undiagnosed_dict

In [None]:
def prepare_datasets(dusers, unusers, stembool):
    """Build one preprocessed text string per user from the ``demoji_*`` files.

    For every user the ``<user>_submissions_stripped.txt`` and
    ``<user>_comments_stripped.txt`` files are read from ``demoji_diagnosed/``
    (``dusers``) or ``demoji_undiagnosed/`` (``unusers``). Lines are split on
    ``:;``. Submission lines need 15 fields and contribute field 9, a space
    and field 14; comment lines need 12 fields and contribute field 2. Each
    piece is cleaned with ``preprocessv2`` (``stembool`` true) or
    ``preprocessv1`` (``stembool`` false).

    As a diagnostic, the number and names of diagnosed users that had at
    least one line with an unexpected field count are printed.

    Args:
        dusers: user names read from ``demoji_diagnosed/``.
        unusers: user names read from ``demoji_undiagnosed/``.
        stembool: selects ``preprocessv2`` (with stemming) over ``preprocessv1``.

    Returns:
        ``(diagnosed_dict, undiagnosed_dict)``: user -> all cleaned pieces in
        file order (submissions first), each followed by a single space.

    Raises:
        OSError: if one of a user's files cannot be opened.
    """
    preprocess = preprocessv2 if stembool else preprocessv1

    def _user_text(directory, user):
        """Return (concatenated text, whether any line was malformed)."""
        prefix = directory + user.strip()
        pieces = []
        malformed = False
        # Both files are opened up front and closed by the context manager,
        # even if preprocessing raises.
        with open(prefix + '_submissions_stripped.txt', 'r') as subfile, \
                open(prefix + '_comments_stripped.txt', 'r') as comfile:
            for line in subfile:
                linelist = line.split(':;')
                if len(linelist) != 15:
                    malformed = True
                    continue
                pieces.append(preprocess(linelist[9] + ' ' + linelist[14]))

            for line in comfile:
                linelist = line.split(':;')
                if len(linelist) != 12:
                    malformed = True
                    continue
                pieces.append(preprocess(linelist[2]))

        # Every piece, including the last, is followed by a space.
        return ''.join(piece + ' ' for piece in pieces), malformed

    diagnosed_dict = {}
    malformed_users = []
    for user in dusers:
        text, malformed = _user_text('demoji_diagnosed/', user)
        diagnosed_dict[user] = text
        if malformed and user not in malformed_users:
            malformed_users.append(user)

    undiagnosed_dict = {}
    for user in unusers:
        # Malformed lines of undiagnosed users are skipped without being reported.
        undiagnosed_dict[user], _ = _user_text('demoji_undiagnosed/', user)

    print(len(malformed_users))
    print(malformed_users)
    return diagnosed_dict, undiagnosed_dict

def prepare_datasets_exclude(dusers, unusers, stembool, excluding):
    """Build per-user preprocessed text, skipping posts from listed subreddits.

    For every user the ``<user>_submissions_stripped.txt`` and
    ``<user>_comments_stripped.txt`` files are read from ``demoji_diagnosed/``
    (``dusers``) or ``demoji_undiagnosed/`` (``unusers``). Lines are split on
    ``:;``. Submission lines need 15 fields and contribute field 9, a space
    and field 14; comment lines need 12 fields and contribute field 2.
    Submissions whose field 10 and comments whose field 9 (both named
    ``subreddit`` in the code) is in ``excluding`` are dropped. Each kept
    piece is cleaned with ``preprocessv2`` (``stembool`` true) or
    ``preprocessv1``.

    The filter is applied to BOTH groups, matching
    ``prepare_datasets2_excluding``; filtering only the diagnosed users would
    leave the excluded subreddits in one class only.

    Args:
        dusers: user names read from ``demoji_diagnosed/``.
        unusers: user names read from ``demoji_undiagnosed/``.
        stembool: selects ``preprocessv2`` (with stemming) over ``preprocessv1``.
        excluding: container of subreddit values to leave out.

    Returns:
        ``(diagnosed_dict, undiagnosed_dict)``: user -> all kept, cleaned
        pieces in file order (submissions first), each followed by a space.

    Raises:
        OSError: if one of a user's files cannot be opened.
    """
    preprocess = preprocessv2 if stembool else preprocessv1

    def _user_text(directory, user):
        """Concatenate one user's cleaned posts outside the excluded subreddits."""
        prefix = directory + user.strip()
        pieces = []
        # Both files are closed by the context manager, even on errors.
        with open(prefix + '_submissions_stripped.txt', 'r') as subfile, \
                open(prefix + '_comments_stripped.txt', 'r') as comfile:
            for line in subfile:
                linelist = line.split(':;')
                # The length check runs first so index 10 is always valid.
                if len(linelist) != 15 or linelist[10] in excluding:
                    continue
                pieces.append(preprocess(linelist[9] + ' ' + linelist[14]))

            for line in comfile:
                linelist = line.split(':;')
                if len(linelist) != 12 or linelist[9] in excluding:
                    continue
                pieces.append(preprocess(linelist[2]))

        # Every piece, including the last, is followed by a space.
        return ''.join(piece + ' ' for piece in pieces)

    diagnosed_dict = {}
    for user in dusers:
        diagnosed_dict[user] = _user_text('demoji_diagnosed/', user)

    undiagnosed_dict = {}
    for user in unusers:
        undiagnosed_dict[user] = _user_text('demoji_undiagnosed/', user)

    return diagnosed_dict, undiagnosed_dict
    
def prepare_datasets_include(dusers, uunsers, stembool, including):
    """Build per-user preprocessed text, keeping only posts from listed subreddits.

    For every user the ``<user>_submissions_stripped.txt`` and
    ``<user>_comments_stripped.txt`` files are read from ``demoji_diagnosed/``
    (``dusers``) or ``demoji_undiagnosed/`` (``uunsers``). Lines are split on
    ``:;``. Submission lines need 15 fields and contribute field 9, a space
    and field 14; comment lines need 12 fields and contribute field 2. Only
    submissions whose field 10 and comments whose field 9 (both named
    ``subreddit`` in the code) is in ``including`` are kept. Each kept piece
    is cleaned with ``preprocessv2`` (``stembool`` true) or ``preprocessv1``.

    The filter is applied to BOTH groups, matching
    ``prepare_datasets2_including``.

    Args:
        dusers: user names read from ``demoji_diagnosed/``.
        uunsers: user names read from ``demoji_undiagnosed/``. The spelling of
            this parameter is kept so existing keyword callers keep working.
        stembool: selects ``preprocessv2`` (with stemming) over ``preprocessv1``.
        including: container of subreddit values to keep.

    Returns:
        ``(diagnosed_dict, undiagnosed_dict)``: user -> all kept, cleaned
        pieces in file order (submissions first), each followed by a space.

    Raises:
        OSError: if one of a user's files cannot be opened.
    """
    preprocess = preprocessv2 if stembool else preprocessv1

    def _user_text(directory, user):
        """Concatenate one user's cleaned posts from the included subreddits."""
        prefix = directory + user.strip()
        pieces = []
        # Both files are closed by the context manager, even on errors.
        with open(prefix + '_submissions_stripped.txt', 'r') as subfile, \
                open(prefix + '_comments_stripped.txt', 'r') as comfile:
            for line in subfile:
                linelist = line.split(':;')
                # The length check runs first so index 10 is always valid.
                if len(linelist) != 15 or linelist[10] not in including:
                    continue
                pieces.append(preprocess(linelist[9] + ' ' + linelist[14]))

            for line in comfile:
                linelist = line.split(':;')
                if len(linelist) != 12 or linelist[9] not in including:
                    continue
                pieces.append(preprocess(linelist[2]))

        # Every piece, including the last, is followed by a space.
        return ''.join(piece + ' ' for piece in pieces)

    diagnosed_dict = {}
    for user in dusers:
        diagnosed_dict[user] = _user_text('demoji_diagnosed/', user)

    undiagnosed_dict = {}
    # Iterate the actual parameter; the name `unusers` does not exist in this
    # function and would raise NameError (or silently pick up a global).
    for user in uunsers:
        undiagnosed_dict[user] = _user_text('demoji_undiagnosed/', user)

    return diagnosed_dict, undiagnosed_dict

def prepare_dataframe(ddict, udict):
    """Combine the two user -> text dicts into one labelled DataFrame.

    Args:
        ddict: user -> text for the diagnosed group (labelled 1).
        udict: user -> text for the undiagnosed group (labelled 0).

    Returns:
        A DataFrame indexed by user with column ``0`` holding the text and
        column ``1`` holding the class label; diagnosed rows come first.
        The element counts (``DataFrame.size``) of the two parts and of the
        combined frame are printed.
    """
    def _labelled_frame(text_by_user, label):
        """One row per user: column 0 is the text, column 1 the class label."""
        frame = pd.DataFrame(text_by_user, index=[0]).T
        frame[1] = [label] * len(frame)
        return frame

    utframe1 = _labelled_frame(udict, 0)
    dtframe1 = _labelled_frame(ddict, 1)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # equivalent and yields the same row order.
    totalframe = pd.concat([dtframe1, utframe1])
    print(f'dframe size = {dtframe1.size}')
    print(f'utframe size = {utframe1.size}')
    print(f'totalframe size = {totalframe.size}')
    return totalframe

def get_stemmed_data(extension):
    """Load the cached ``key:;text`` files for both groups.

    Reads ``dstemmed_<extension>.txt`` and ``ustemmed_<extension>.txt`` from
    the working directory. Every line must split into exactly two parts on
    ``:;``.

    Args:
        extension: suffix inserted into both file names.

    Returns:
        ``(ddict, udict)``: for each file, a dict mapping the part before
        ``:;`` to the whitespace-stripped part after it.

    Raises:
        OSError: if either file cannot be opened.
    """
    def _load(filename):
        """Parse one ``key:;text`` file into a dict."""
        records = {}
        # The context manager closes the handle even if reading raises.
        with open(filename, 'r') as handle:
            for line in handle:
                line_list = line.split(':;')
                if len(line_list) != 2:
                    # A malformed line stops the read of this file; entries
                    # collected so far are kept.
                    print('error')
                    break
                records[line_list[0]] = line_list[1].strip()
        return records

    ddict = _load('dstemmed_' + extension + '.txt')
    udict = _load('ustemmed_' + extension + '.txt')
    return ddict, udict

In [None]:
#block with model methods

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

def MB_hypertune(text_clf, xtrain, ytrain):
    """Grid-search n-gram range, IDF usage and ``alpha`` for a text pipeline.

    ``text_clf`` must expose steps named ``vect``, ``tfidf`` and ``clf``.
    Candidates are scored with F1 on all available cores. The best score and
    the best parameter set are printed and returned as a tuple.
    """
    param_grid = {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'tfidf__use_idf': (True, False),
        'clf__alpha': (1, 1e-1, 1e-2, 1e-3),
    }
    search = GridSearchCV(text_clf, param_grid, n_jobs=-1, scoring='f1')
    search = search.fit(xtrain, ytrain)
    best_score, best_params = search.best_score_, search.best_params_
    print(best_score)
    print(best_params)
    return best_score, best_params

def MB_hypertune2(text_clf, xtrain, ytrain):
    """Grid-search n-gram range and ``alpha`` for a text pipeline.

    ``text_clf`` must expose steps named ``vect`` and ``clf``. Candidates are
    scored with F1 on all available cores. The best score and the best
    parameter set are printed and returned as a tuple.
    """
    param_grid = {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'clf__alpha': (1, 1e-1, 1e-2, 1e-3),
    }
    search = GridSearchCV(text_clf, param_grid, n_jobs=-1, scoring='f1')
    search = search.fit(xtrain, ytrain)
    best_score, best_params = search.best_score_, search.best_params_
    print(best_score)
    print(best_params)
    return best_score, best_params

def SGD_hypertune(text_clf, xtrain, ytrain):
    """Grid-search a ``vect`` / ``tfidf`` / ``clf`` pipeline, scored with ``'f1'``.

    The grid varies the vectorizer n-gram range, whether IDF weighting is
    used, and the classifier's ``alpha`` and ``loss``. The best score and
    parameters are printed and returned as ``(best_score, best_params)``.
    """
    param_grid = {
        'vect__ngram_range': [(1,1), (1,2)],
        'tfidf__use_idf': (True, False),
        'clf__alpha': (1, 1e-1, 1e-2, 1e-3),
        'clf__loss': ('hinge', 'modified_huber', 'squared_hinge', 'perceptron'),
    }
    search = GridSearchCV(text_clf, param_grid, n_jobs=-1, scoring='f1').fit(xtrain, ytrain)
    top_score, top_params = search.best_score_, search.best_params_
    print(top_score)
    print(top_params)
    return top_score, top_params

def SGD_hypertune2(text_clf, xtrain, ytrain):
    """Grid-search a ``vect`` / ``clf`` pipeline (no tf-idf step), scored with ``'f1'``.

    The grid varies the vectorizer n-gram range and the classifier's
    ``alpha`` and ``loss``. The best score and parameters are printed and
    returned as ``(best_score, best_params)``.
    """
    param_grid = {
        'vect__ngram_range': [(1,1), (1,2)],
        'clf__alpha': (1, 1e-1, 1e-2, 1e-3),
        'clf__loss': ('hinge', 'modified_huber', 'squared_hinge', 'perceptron'),
    }
    search = GridSearchCV(text_clf, param_grid, n_jobs=-1, scoring='f1').fit(xtrain, ytrain)
    top_score, top_params = search.best_score_, search.best_params_
    print(top_score)
    print(top_params)
    return top_score, top_params

def MNB_custom(dframe, vectorizer):
    """Train and report on a MultinomialNB model using a caller-supplied vectorizer.

    Splits ``dframe[0]`` (inputs) / ``dframe[1]`` (labels) 66/34 with a fixed
    seed, prints the label counts of both splits, fits a MultinomialNB on the
    vectorized training inputs, runs ``MB_hypertune`` on a default
    count -> tf-idf -> NB pipeline, and prints a 10-fold cross-validation
    accuracy, the prediction counts, a confusion matrix and a classification
    report. Returns nothing.

    Args:
        dframe: Indexable by ``0`` (inputs) and ``1`` (labels).
        vectorizer: Object exposing ``fit_transform`` and ``transform``.
    """
    xtrain, xtest, ytrain, ytest = train_test_split(
        dframe[0], dframe[1], test_size=0.34, random_state=41, shuffle=True
    )

    # Label distribution of each split.
    train_counts = {}
    for label in ytrain:
        train_counts[label] = train_counts.get(label, 0) + 1
    test_counts = {}
    for label in ytest:
        test_counts[label] = test_counts.get(label, 0) + 1
    print('y train split')
    print(train_counts)
    print('y test split')
    print(test_counts)

    # Stand-alone model built on the caller's vectorizer; this is the one
    # used for the predictions reported below.
    train_matrix = vectorizer.fit_transform(xtrain)
    test_matrix = vectorizer.transform(xtest)
    nb_model = MultinomialNB()
    nb_model.fit(train_matrix, ytrain)

    # Default pipeline, used for the hyper-parameter search and the CV score.
    nb_pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ])
    nb_pipeline.fit(xtrain, ytrain)
    MB_hypertune(nb_pipeline, xtrain, ytrain)

    # NOTE(review): the cross-validation below runs on the held-out split
    # (xtest / ytest), not on the training data -- confirm this is intended.
    cv_scores = cross_val_score(nb_pipeline, xtest, ytest, cv=10)
    print("10-fold Cross Validation Accuracy: %0.2f percent" % (cv_scores.mean() * 100))

    predictions = nb_model.predict(test_matrix)
    prediction_counts = {}
    for label in predictions:
        prediction_counts[label] = prediction_counts.get(label, 0) + 1
    print('predicto')
    print(prediction_counts)

    print("\nConfusion matrix")
    print(confusion_matrix(ytest, predictions))

    print("\nClassification report")
    print(classification_report(ytest, predictions))
   
    
def MNB_count(dframe):
    """Run ``MNB_custom`` on ``dframe`` with a term-count vectorizer."""
    return MNB_custom(
        dframe,
        CountVectorizer(max_df=0.95, min_df=2, max_features=200000, stop_words='english'),
    )
def MNB_tfidf(dframe):
    """Run ``MNB_custom`` on ``dframe`` with a tf-idf vectorizer."""
    return MNB_custom(
        dframe,
        TfidfVectorizer(max_df=0.95, min_df=2, max_features=200000, stop_words='english'),
    )

def SGD_custom(dframe, vectorizer):
    """Train and report on an SGDClassifier using a caller-supplied vectorizer.

    Splits ``dframe[0]`` / ``dframe[1]`` with ``train_test_split`` (shuffled,
    no fixed seed), fits the classifier on the vectorized training inputs and
    prints a 10-fold cross-validation accuracy, a confusion matrix and a
    classification report. Returns nothing.

    Args:
        dframe: Indexable by ``0`` (inputs) and ``1`` (labels).
        vectorizer: Object exposing ``fit_transform`` and ``transform``.
    """
    xtrain, xtest, ytrain, ytest = train_test_split(dframe[0], dframe[1], shuffle=True)
    train_matrix = vectorizer.fit_transform(xtrain)
    test_matrix = vectorizer.transform(xtest)

    classifier = SGDClassifier(max_iter=1000, tol=1e-3)
    classifier.fit(train_matrix, ytrain)

    # NOTE(review): the cross-validation below runs on the held-out split
    # (test_matrix / ytest), not on the training data -- confirm this is intended.
    cv_scores = cross_val_score(classifier, test_matrix, ytest, cv=10)
    print("10-fold Cross Validation Accuracy: %0.2f percent" % (cv_scores.mean() * 100))

    predictions = classifier.predict(test_matrix)
    print("\nConfusion matrix")
    print(confusion_matrix(ytest, predictions))

    print("\nClassification report")
    print(classification_report(ytest, predictions))

def SGD_count(dframe):
    """Run ``SGD_custom`` on ``dframe`` with a term-count vectorizer."""
    return SGD_custom(
        dframe,
        CountVectorizer(max_df=0.95, min_df=2, max_features=200000, stop_words='english'),
    )
def SGD_tfidf(dframe):
    """Run ``SGD_custom`` on ``dframe`` with a tf-idf vectorizer."""
    return SGD_custom(
        dframe,
        TfidfVectorizer(max_df=0.95, min_df=2, max_features=200000, stop_words='english'),
    )

def MNB(dframe, tfbool, ngrams, alpha_value):
    """Fit a MultinomialNB text pipeline with explicit settings and print a report.

    Args:
        dframe: Indexable by ``0`` (inputs) and ``1`` (labels).
        tfbool: When truthy, a ``TfidfTransformer`` step is placed between the
            vectorizer and the classifier.
        ngrams: ``ngram_range`` passed to ``CountVectorizer``.
        alpha_value: ``alpha`` passed to ``MultinomialNB``.

    Returns:
        The pipeline fitted on the training split.
    """
    steps = [('vect', CountVectorizer(ngram_range=ngrams))]
    if tfbool:
        steps.append(('tfidf', TfidfTransformer()))
    steps.append(('clf', MultinomialNB(alpha=alpha_value)))
    text_MLB = Pipeline(steps)

    xtrain, xtest, ytrain, ytest = train_test_split(
        dframe[0], dframe[1], test_size=0.34, random_state=41, shuffle=True
    )

    # Label distribution of each split.
    train_counts = {}
    for label in ytrain:
        train_counts[label] = train_counts.get(label, 0) + 1
    test_counts = {}
    for label in ytest:
        test_counts[label] = test_counts.get(label, 0) + 1
    print('y train split')
    print(train_counts)
    print('y test split')
    print(test_counts)

    text_MLB.fit(xtrain, ytrain)

    # NOTE(review): the cross-validation below runs on the held-out split
    # (xtest / ytest), not on the training data -- confirm this is intended.
    cv_scores = cross_val_score(text_MLB, xtest, ytest, cv=10)
    print("10-fold Cross Validation Accuracy: %0.2f percent" % (cv_scores.mean() * 100))

    predictions = text_MLB.predict(xtest)

    print("\nConfusion matrix")
    print(confusion_matrix(ytest, predictions))

    print("\nClassification report")
    print(classification_report(ytest, predictions))
    return text_MLB
def SGD(dframe, tfbool, ngrams, alpha_value, loss_func):
    """Fit an SGDClassifier text pipeline with explicit settings and print a report.

    Args:
        dframe: Indexable by ``0`` (inputs) and ``1`` (labels).
        tfbool: When truthy, a ``TfidfTransformer`` step is placed between the
            vectorizer and the classifier.
        ngrams: ``ngram_range`` passed to ``CountVectorizer``.
        alpha_value: ``alpha`` passed to ``SGDClassifier``.
        loss_func: ``loss`` passed to ``SGDClassifier``.

    Returns:
        The pipeline fitted on the training split.
    """
    steps = [('vect', CountVectorizer(ngram_range=ngrams))]
    if tfbool:
        steps.append(('tfidf', TfidfTransformer()))
    steps.append(('clf', SGDClassifier(alpha=alpha_value, loss=loss_func, max_iter=1000)))
    text_SGD = Pipeline(steps)

    xtrain, xtest, ytrain, ytest = train_test_split(
        dframe[0], dframe[1], test_size=0.34, random_state=41, shuffle=True
    )

    # Label distribution of each split.
    train_counts = {}
    for label in ytrain:
        train_counts[label] = train_counts.get(label, 0) + 1
    test_counts = {}
    for label in ytest:
        test_counts[label] = test_counts.get(label, 0) + 1
    print('y train split')
    print(train_counts)
    print('y test split')
    print(test_counts)

    text_SGD.fit(xtrain, ytrain)

    # NOTE(review): the cross-validation below runs on the held-out split
    # (xtest / ytest), not on the training data -- confirm this is intended.
    cv_scores = cross_val_score(text_SGD, xtest, ytest, cv=10)
    print("10-fold Cross Validation Accuracy: %0.2f percent" % (cv_scores.mean() * 100))

    predictions = text_SGD.predict(xtest)

    print("\nConfusion matrix")
    print(confusion_matrix(ytest, predictions))

    print("\nClassification report")
    print(classification_report(ytest, predictions))
    return text_SGD
    
def SGD_tune(dframe):
    """Grid-search the count -> tf-idf -> SGDClassifier pipeline on ``dframe``.

    The data is split 66/34 with a fixed seed and only the training part is
    passed to ``SGD_hypertune``, which prints the best score and parameters.

    Args:
        dframe: Indexable by ``0`` (inputs) and ``1`` (labels).

    Returns:
        The ``(best_score, best_params)`` tuple produced by ``SGD_hypertune``.
    """
    text_SGD = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(max_iter=1000)),
    ])
    xtrain, _xtest, ytrain, _ytest = train_test_split(
        dframe[0], dframe[1], test_size=0.34, random_state=41, shuffle=True
    )
    # No explicit fit here: GridSearchCV fits its own clones of the pipeline,
    # so fitting it beforehand only repeated work.
    return SGD_hypertune(text_SGD, xtrain, ytrain)
    
def SGD_tune2(dframe):
    """Grid-search the count -> SGDClassifier pipeline (no tf-idf) on ``dframe``.

    The data is split 66/34 with a fixed seed and only the training part is
    passed to ``SGD_hypertune2``, which prints the best score and parameters.

    Args:
        dframe: Indexable by ``0`` (inputs) and ``1`` (labels).

    Returns:
        The ``(best_score, best_params)`` tuple produced by ``SGD_hypertune2``.
    """
    text_SGD = Pipeline([
        ('vect', CountVectorizer()),
        ('clf', SGDClassifier(max_iter=1000)),
    ])
    xtrain, _xtest, ytrain, _ytest = train_test_split(
        dframe[0], dframe[1], test_size=0.34, random_state=41, shuffle=True
    )
    # No explicit fit here: GridSearchCV fits its own clones of the pipeline,
    # so fitting it beforehand only repeated work.
    return SGD_hypertune2(text_SGD, xtrain, ytrain)
    
def MNB_tune(dframe):
    """Grid-search the count -> tf-idf -> MultinomialNB pipeline on ``dframe``.

    The data is split 66/34 with a fixed seed and only the training part is
    passed to ``MB_hypertune``, which prints the best score and parameters.

    Args:
        dframe: Indexable by ``0`` (inputs) and ``1`` (labels).

    Returns:
        The ``(best_score, best_params)`` tuple produced by ``MB_hypertune``.
    """
    text_MLB = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ])
    xtrain, _xtest, ytrain, _ytest = train_test_split(
        dframe[0], dframe[1], test_size=0.34, random_state=41, shuffle=True
    )
    # No explicit fit here: GridSearchCV fits its own clones of the pipeline,
    # so fitting it beforehand only repeated work.
    return MB_hypertune(text_MLB, xtrain, ytrain)
    
def MNB_tune2(dframe):
    """Grid-search the count -> MultinomialNB pipeline (no tf-idf) on ``dframe``.

    The data is split 66/34 with a fixed seed and only the training part is
    passed to ``MB_hypertune2``, which prints the best score and parameters.

    Args:
        dframe: Indexable by ``0`` (inputs) and ``1`` (labels).

    Returns:
        The ``(best_score, best_params)`` tuple produced by ``MB_hypertune2``.
    """
    text_MLB = Pipeline([
        ('vect', CountVectorizer()),
        ('clf', MultinomialNB()),
    ])
    xtrain, _xtest, ytrain, _ytest = train_test_split(
        dframe[0], dframe[1], test_size=0.34, random_state=41, shuffle=True
    )
    # No explicit fit here: GridSearchCV fits its own clones of the pipeline,
    # so fitting it beforehand only repeated work.
    return MB_hypertune2(text_MLB, xtrain, ytrain)
    
#choice 1 is generic, 2 is excluding list1, 3 is including list1
def tune_all(choice, list1):
    """Run ``MNB_tune`` on the stemmed datasets selected by ``choice``.

    For each selected dataset the ``dstemmed_`` / ``ustemmed_`` files are
    loaded with ``get_stemmed_data``, turned into a frame with
    ``prepare_dataframe`` and passed to ``MNB_tune``. Only the datasets that
    are actually tuned are read from disk.

    Args:
        choice: ``1`` tunes only the ``'depressed'`` dataset; ``2`` and ``3``
            tune ``'all'``, ``'anxiety'`` and ``'depressed'`` in that order.
            Any other value does nothing.
        list1: Currently unused.
            TODO: the exclude (choice 2) / include (choice 3) filtering named
            in the comment above is not implemented; both choices behave
            the same.
    """
    if choice == 1:
        # 'all' and 'anxiety' were disabled (commented out) for this choice.
        extensions = ('depressed',)
    elif choice == 2 or choice == 3:
        extensions = ('all', 'anxiety', 'depressed')
    else:
        extensions = ()

    for extension in extensions:
        ddict, udict = get_stemmed_data(extension)
        MNB_tune(prepare_dataframe(ddict, udict))
    

In [None]:
# Hyper-parameter search (SGD_tune) on the 'topical' data returned by get_stemmed_data_anxiety.
# NOTE(review): get_stemmed_data_anxiety is not defined in this part of the notebook --
# confirm it is defined in an earlier cell before running.
allddict, alludict = get_stemmed_data_anxiety('topical')
ddframe = prepare_dataframe(allddict, alludict)
SGD_tune(ddframe)

In [None]:
# Fit and report the SGD pipeline on the 'topical' data returned by get_stemmed_data_depression,
# with tf-idf enabled, n-gram range (1, 2), alpha 0.01 and the 'perceptron' loss.
# NOTE(review): get_stemmed_data_depression is not defined in this part of the notebook --
# confirm it is defined in an earlier cell before running.
allddict, alludict = get_stemmed_data_depression('topical')
ddframe = prepare_dataframe(allddict, alludict)
SGD(ddframe, True, (1,2), 0.01, 'perceptron')

In [None]:
# Hyper-parameter search without tf-idf (MNB_tune2) on the 'topical' data returned by
# get_stemmed_data_anxiety.
# NOTE(review): get_stemmed_data_anxiety is not defined in this part of the notebook --
# confirm it is defined in an earlier cell before running.
allddict, alludict = get_stemmed_data_anxiety('topical')
ddframe = prepare_dataframe(allddict, alludict)
MNB_tune2(ddframe)

In [None]:
# Fit and report the MultinomialNB pipeline on the 'topical' data returned by
# get_stemmed_data_anxiety, without tf-idf, unigrams only, alpha 0.1.
# NOTE(review): get_stemmed_data_anxiety is not defined in this part of the notebook --
# confirm it is defined in an earlier cell before running.
allddict, alludict = get_stemmed_data_anxiety('topical')
ddframe = prepare_dataframe(allddict, alludict)
MNB(ddframe, False, (1,1), 0.1)

In [None]:
def _write_stemmed_group(prefix, texts, anxiety_users, depressed_users):
    """Write the ``<prefix>stemmed_{all,anxiety,depressed}.txt`` files for one group.

    Every user in ``texts`` gets a ``user:;text`` line in the ``all`` file;
    users found in ``anxiety_users`` / ``depressed_users`` also get that line
    in the matching file. The files are closed even if a write fails.
    """
    with open(prefix + 'stemmed_all.txt', 'w') as all_file, \
            open(prefix + 'stemmed_anxiety.txt', 'w') as anxiety_file, \
            open(prefix + 'stemmed_depressed.txt', 'w') as depressed_file:
        for user in texts:
            record = user + ':;' + texts[user] + '\n'
            all_file.write(record)
            if user in anxiety_users:
                anxiety_file.write(record)
            if user in depressed_users:
                depressed_file.write(record)


def write_stemmed_data():
    """Prepare the datasets and write them to the stemmed text files.

    Uses ``get_all_users``, ``get_anxiety_users`` and ``get_depression_users``
    to obtain the user collections, builds the per-user texts with
    ``prepare_datasets(dlist, ulist, True)``, then writes
    ``dstemmed_{all,anxiety,depressed}.txt`` followed by
    ``ustemmed_{all,anxiety,depressed}.txt`` in the current working directory.
    Each line has the form ``user:;text``. Returns nothing.
    """
    dlist, ulist = get_all_users()
    danxiety, uanxiety = get_anxiety_users()
    ddepressed, udepressed = get_depression_users()

    ddict, udict = prepare_datasets(dlist, ulist, True)

    _write_stemmed_group('d', ddict, danxiety, ddepressed)
    _write_stemmed_group('u', udict, uanxiety, udepressed)
    
def write_stemmed_data2(extension, includinglist):
    """Prepare datasets via ``prepare_datasets2_including`` and write them to disk.

    Writes ``dstemmed_<extension>.txt`` and then ``ustemmed_<extension>.txt``
    in the current working directory, one ``user:;text`` line per user. Each
    file is closed even if a write fails. Returns nothing.

    Args:
        extension: Suffix used in the two output file names.
        includinglist: Passed through unchanged as the last argument of
            ``prepare_datasets2_including``.
    """
    dlist, ulist = get_all_users()
    ddict, udict = prepare_datasets2_including(dlist, ulist, True, includinglist)

    for prefix, texts in (('dstemmed_', ddict), ('ustemmed_', udict)):
        with open(prefix + extension + '.txt', 'w') as out_file:
            for user in texts:
                out_file.write(user + ':;' + texts[user] + '\n')



In [None]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.downloader.download('vader_lexicon')

def sentiment_dataset(ddict, udict):
    """Unfinished stub: scores one entry of ``ddict`` with VADER and discards it.

    A ``SentimentIntensityAnalyzer`` is created, ``polarity_scores`` is called
    on the value of the first key yielded by ``ddict``, and iteration over
    ``udict`` stops at its first element without doing anything. Nothing is
    stored or returned.
    """
    analyzer = SentimentIntensityAnalyzer()

    # Only the first key is processed; the scores are not kept.
    for first_user in ddict:
        analyzer.polarity_scores(ddict[first_user])
        break

    # Placeholder loop: exits immediately.
    for _ in udict:
        break
    
    
# NOTE(review): anxiety_files / unanxiety_files are not defined in this part of the
# notebook -- presumably built in an earlier cell; verify before running.
sentiment_dataset(anxiety_files, unanxiety_files)

In [None]:
#fill data structures

filelist = ['general_discussion.txt','hobbies.txt','topical_discussion.txt','mental_health_and_support.txt','physical_health_and_wellness.txt']
keylist = ['General', 'Hobbies', 'Topical Discussion', 'Mental Health and Support', 'Physical Health and Wellness']
finaldict = {}
exclusive_list = []

# filelist and keylist are parallel: the i-th file holds the entries for the i-th key.
for filename, key in zip(filelist, keylist):
    # The context manager closes the file even if reading fails.
    with open(filename, 'r') as category_file:
        entries = [line.strip() for line in category_file]
    finaldict[key] = entries
    # exclusive_list collects every entry except those of 'Mental Health and Support'.
    if key != 'Mental Health and Support':
        exclusive_list.extend(entries)
general_list = finaldict['General']
hobbies_list = finaldict['Hobbies']
topical_list = finaldict['Topical Discussion']
mental_list = finaldict['Mental Health and Support']
physical_list = finaldict['Physical Health and Wellness']

print(len(general_list))
print(len(hobbies_list))
print(len(topical_list))
print(len(mental_list))
print(len(physical_list))
print(len(exclusive_list))
#print(exclusive_list)

In [None]:
# Writes dstemmed_exclusive.txt and ustemmed_exclusive.txt, passing exclusive_list
# (every entry except those of 'Mental Health and Support') as the including-list.
write_stemmed_data2('exclusive', exclusive_list)

In [None]:
def _user_file_paths(list_path, directory):
    """Return the submission and comment file paths for every user in ``list_path``.

    ``list_path`` is read line by line; each line, stripped of surrounding
    whitespace, is taken as a user name. For each user two paths are appended,
    in this order: ``<directory><user>_submissions_stripped.txt`` and
    ``<directory><user>_comments_stripped.txt``. The list file is closed once
    it has been read.
    """
    paths = []
    with open(list_path, 'r') as user_file:
        for user in user_file:
            name = user.strip()
            paths.append(directory + name + '_submissions_stripped.txt')
            paths.append(directory + name + '_comments_stripped.txt')
    return paths


# "d" lists point into demoji_diagnosed/, "u" lists into demoji_undiagnosed/.
only_anxiety_dfiles = _user_file_paths('only_anxiety_proven.txt', 'demoji_diagnosed/')
only_depressed_dfiles = _user_file_paths('only_depressed_proven.txt', 'demoji_diagnosed/')
only_anxiety_ufiles = _user_file_paths('only_anxiety_undiagnosed.txt', 'demoji_undiagnosed/')
only_depressed_ufiles = _user_file_paths('only_depressed_undiagnosed.txt', 'demoji_undiagnosed/')

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms

# Set device
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define the CNN model
class CNN(nn.Module):
    """Single-convolution classifier: conv -> ReLU -> global average pool -> linear -> softmax.

    Expects a batch of 3-channel 2-D inputs and returns a ``(batch, 50)``
    tensor whose rows sum to 1.
    """

    def __init__(self):
        super().__init__()
        # Sub-modules are registered in this order; the attribute names are
        # also the keys used in the module's state dict and printed summary.
        self.conv1 = nn.Conv2d(3, 25, kernel_size=3)
        self.relu = nn.ReLU()
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(25, 50)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return the softmax over the 50 outputs of the linear layer."""
        # Convolve, rectify, then average each of the 25 feature maps to 1x1.
        pooled = self.pool(self.relu(self.conv1(x)))
        # (batch, 25, 1, 1) -> (batch, 25)
        flat = torch.flatten(pooled, 1)
        return self.softmax(self.fc(flat))

# Define the loss function
# NOTE(review): CNN.forward already ends in a Softmax layer, while the PyTorch docs
# describe CrossEntropyLoss as expecting unnormalized logits -- confirm whether the
# model should return raw scores instead.
loss_function = nn.CrossEntropyLoss()

# Set the parameters
# NOTE(review): CNN hard-codes its own layer sizes and does not read these names;
# in the visible code only `size` is used again (for the example input below).
size = 3
filters = 25
pool_length = "all"
dense_layers = 1
dense_dim = 50
dropout = 0.0
class_balance = "sampled"

# Create an instance of the CNN model
# The device transfer is commented out, so the model is not moved to any device here.
model = CNN()#.to(device)

# Define the optimizer
# Adam is created with its default settings; no learning rate is passed.
optimizer = optim.Adam(model.parameters())

# Print the model summary
print(model)

learning_rate = 1e-3
epochs = 50

# The optimizer was created before `learning_rate` was defined, so apply the
# value to its parameter groups to make the setting effective.
for param_group in optimizer.param_groups:
    param_group['lr'] = learning_rate

# NOTE(review): `batch` is not defined anywhere in this cell. It is assumed to
# be an (inputs, targets) pair -- TODO confirm, or replace with a loop over a
# DataLoader.
for epoch in range(epochs):
    inputs, targets = batch
    # Clear the gradients left over from the previous step; backward() accumulates.
    optimizer.zero_grad()
    # CrossEntropyLoss takes (predictions, targets), so run the model first.
    loss = loss_function(model(inputs), targets)
    loss.backward()
    optimizer.step()

# Example usage of the model
# `device` exists only as a commented-out line in this cell, so place the input
# on whatever device the model's parameters already live on.
model_device = next(model.parameters()).device
input_data = torch.randn(1, 3, size, size).to(model_device)
output = model(input_data)
print("Output shape:", output.shape)

In [None]:
#fasttext classifier stuff
def fast_boi(extension):
    """Train a fastText supervised classifier for ``extension`` and print its test result.

    Trains on ``fasttext_<extension>_train.txt`` and evaluates with
    ``classifier.test`` on ``fasttext_<extension>_test.txt``.

    NOTE(review): ``fasttext`` is not imported in the visible part of the
    notebook -- confirm it is imported in an earlier cell.
    """
    trainpath = 'fasttext_' + extension + '_train.txt'
    testpath =  'fasttext_' + extension + '_test.txt'
    
    # Train the classifier
    classifier = fasttext.train_supervised(
        input=trainpath,
        lr=0.1,
        epoch=25,
        wordNgrams=2,
        bucket=200000,
        dim=50,
        loss='softmax'
    )
    
    # Evaluate on the test file and print whatever classifier.test returns.
    result = classifier.test(testpath)
    print(f'Results: {result}')