# Load and Clean the Data 

## The first part below is where we clean the posts and comments. In this instance, we treat each post's title and text as a single comment of its own and add it to the other comments in one data frame.

In [87]:
#import necessary libraries, modules, and functions 
import numpy as np
import pandas as pd
import re
import nltk
import matplotlib.pyplot as plt
from seaborn import set_style
set_style("whitegrid")
import seaborn as sns
import contractions

In [88]:
#a function that removes emojis from strings 
def remove_emoji(string):
    """Return ``string`` with every character inside the ranges below deleted.

    Each entry is an inclusive Unicode code-point interval; together they form
    one regex character class, and every run of matching characters is
    replaced with the empty string.
    """
    code_point_ranges = (
        (0x1F600, 0x1F64F),  # emoticons
        (0x1F300, 0x1F5FF),  # symbols & pictographs
        (0x1F680, 0x1F6FF),  # transport & map symbols
        (0x1F1E0, 0x1F1FF),  # flags (iOS)
        (0x2702, 0x27B0),
        # NOTE(review): this interval is very wide and also covers many
        # non-emoji characters (e.g. U+4E2D lies inside it) - confirm intended.
        (0x24C2, 0x1F251),
    )
    character_class = "".join(
        "{}-{}".format(chr(first), chr(last)) for first, last in code_point_ranges
    )
    emoji_pattern = re.compile("[" + character_class + "]+", flags=re.UNICODE)
    return emoji_pattern.sub("", string)

In [89]:
#define a function that will clean our text: put all words in lowercase, expand contractions, remove emojis and special characters, and collapse whitespace
def clean_text(text):
    """Normalise a raw post/comment string.

    Steps, in order: lowercase, expand contractions with ``contractions.fix``,
    drop emoji via ``remove_emoji``, delete every character that is neither a
    word character nor whitespace, then collapse each whitespace run (spaces,
    tabs, newlines) to a single space and trim both ends.

    ``text`` must be a ``str`` (``.lower()`` is called on it directly).
    Returns the cleaned string.

    The earlier version additionally replaced lone newlines, replaced ``\\W``
    with a space and stripped leading/trailing whitespace with separate regex
    passes, and bound an unused ``res`` variable.  Those passes could never
    change the result: newlines are already handled by the whitespace
    collapse, and after it only word characters and single interior spaces
    remain.  They are therefore omitted here.
    """
    #lowercase text
    text = text.lower()
    #expand contractions into full words
    text = contractions.fix(text)
    #remove emojis
    text = remove_emoji(text)
    #remove punctuation (anything that is not a word character or whitespace)
    text = re.sub(r'[^\w\s]', '', text)
    #collapse newlines, tabs and repeated spaces to one space and trim the ends
    text = re.sub(r'\s+', " ", text).strip()
    return text

In [90]:
#load in csv file of comments (path is relative to the notebook's working directory)
BPD_medicine_read = pd.read_csv("validation_file.csv")
#drop the authors with no username
#(inplace=True mutates BPD_medicine_read itself, so re-running only the lines below will not restore dropped rows)
BPD_medicine_read.dropna(subset=['Author'], inplace=True)
#clean the comments 
#(clean_text calls .lower() on each value, so this assumes every remaining 'Comment' is a string - TODO confirm there are no missing comments)
BPD_medicine_read["Comment"] = BPD_medicine_read["Comment"].apply(clean_text)
#--- the post-loading steps below are currently disabled; only the comment file is used ---
#load in csv file of posts 
#BPD_medicine_posts = pd.read_csv("Desktop/Erdos/Data-science-project-Mental-Health/Curls_Data_Scraping/rBPD_Data/medicine_search.csv")
#clean the titles of the posts
#BPD_medicine_posts["Title"] = BPD_medicine_posts["Title"].apply(clean_text)
#clean the post text 
#cleaned_post_text = [clean_text(str(BPD_medicine_posts["Post Text"][i])) for i in range(0,len(BPD_medicine_posts))]
#BPD_medicine_posts["Post Text"] = cleaned_post_text
#combine the title and post text into a new column called 'Title and Text'
#BPD_medicine_posts['Title and Text'] = BPD_medicine_posts['Title'].astype(str) + BPD_medicine_posts['Post Text']
#result_df is a second name for the same DataFrame object (no copy is made)
result_df = BPD_medicine_read

In [91]:
#define a function with arguments of a data frame of reddit posts, a column of author flair IDs within the data frame, and a specific value k
#def flair_check(df, flair_column,k): 
    #check that within the data frame in the author flair ID column in a certain row the entry is "user has bpd" 
#    if df[flair_column][k] == 'user has bpd':
    #if this is the case we set the variable flair_statement as ' i have bpd '
#        flair_statement = ' i have bpd '
    #if this is not the case
#    else:
        #the variable flair_statement is set to be an empty string/space
#        flair_statement = ' '
        #the function returns the variable flair_statement 
#    return flair_statement
    
#create a new column called 'Flair Text' which contains the flair_statement variable string for each author 
#BPD_medicine_posts['Flair Text'] = [flair_check(BPD_medicine_posts, 'Author Flair',i) for i in range(0,len(BPD_medicine_posts))]
#create a new column called 'Title and Text and Flair' which contains the title, text, and flair of the author as a concatenated string
#BPD_medicine_posts['Title and Text and Flair'] = BPD_medicine_posts['Title and Text'].astype(str) + BPD_medicine_posts['Flair Text']

In [92]:
#create a new data frame from the post data with the same columns as the comment data 
#posts_to_comments_df = pd.DataFrame.from_dict({'Unnamed: 0': BPD_medicine_posts['Unnamed: 0'], 'Comment': BPD_medicine_posts['Title and Text and Flair'], 'Author': BPD_medicine_posts['Post Author'], 'Post': BPD_medicine_posts['ID']})
#create a list of data frames
#frames = [posts_to_comments_df,BPD_medicine_read]
#concatenate the list of data frames into one new data frame 
#result_df = pd.concat(frames)
#remove any author without usernames 
#result_df.dropna(subset=['Author'], inplace=True)
#collapse the frame to one row per author:
#  'Unnamed: 0' values are summed, while the 'Comment' and 'Post' strings of
#  an author are concatenated with ', ' as the separator
aggregation_functions = {'Unnamed: 0': 'sum'}
aggregation_functions.update(
    {text_column: (lambda values: ', '.join(values)) for text_column in ('Comment', 'Post')}
)
df_new = result_df.groupby(result_df['Author']).agg(aggregation_functions)
#display the aggregated data frame
df_new

Unnamed: 0_level_0,Unnamed: 0,Comment,Post
Author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-Massive-Feeling-,2,what medications work best for when you have s...,qv78g6
-Rosewiththorns-,3,lamictal lamotrigine what combination okay rig...,1agxmz3
0w_l,4,new medication increase and addition hey guys ...,mtba08
0ystersbutnopearls,5,lithium and lamictal i have been on lithium 90...,12xrl1n
100260,6,it seems like no combination of meds will ever...,"172cqod, 15tt6zt"
...,...,...,...
PTSDemi,548,lamictal experiencesside effects i have been o...,"14htbbs, 14htbbs, 14htbbs, 14htbbs"
Pappa_frankuuu,549,lamictal online tya to anyone that has advice ...,1b0qxii
ParkerFree,550,paranoia is getting worse hi guys i have recen...,"172cqod, 172cqod"
ParkingError7236,551,tw have i took enough to cause damage today i ...,1co30o9


## In this part, we will create several groups (classes, even though they're not neatly arranged as such) of functions and associated lists which will allow us to assign numerical values to three variables that we will eventually train models on: 'comment is highly relevant/very relevant/relevant/minimally relevant/not', 'user seeks treatment/not', 'user recommends their treatment/not'. 

In [93]:
#import necessary libraries, modules, and functions 
from nrclex import NRCLex
from drug_named_entity_recognition import find_drugs
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

In [94]:
#We create four lists of key phrases which we will check the comments for in order to conclude that the user is self-identifying their diagnosis.
#the two data frames contain phrase fragments that would suggest the author has the illness:
#fragments from part 1 are placed BEFORE a condition name, fragments from part 2 are placed AFTER it
bpd_diag_phrases_df1 = pd.read_csv("Diagnosis_part1.csv")
bpd_diag_phrases_df2 = pd.read_csv("Diagnosis_part2.csv")
#we change the data frames into lists 
diagnosis_list_part1 = bpd_diag_phrases_df1["Diagnosis Text1"].tolist()
diagnosis_list_part2 = bpd_diag_phrases_df2["Diagnosis Text2"].tolist()


def _build_diagnosis_phrases(condition_names, lead_ins, follow_ups):
    """Return every '<lead_in> <name>' and '<name> <follow_up>' phrase.

    For each name in ``condition_names`` (in order) the result contains first
    all phrases built from ``lead_ins`` and then all phrases built from
    ``follow_ups``, which is the same ordering the hand-written per-variant
    lists (bpd_pa1, bpd_pa2, ...) used to be concatenated in.

    Every phrase is passed through ``clean_text`` because the comments it is
    matched against were cleaned the same way: punctuation is stripped there,
    so a phrase that kept its hyphen or apostrophe (for example
    'post-traumatic stress disorder') could never be found in a comment.
    ``clean_text`` also lowercases, which replaces the former ``.lower()``.
    """
    phrases = []
    for name in condition_names:
        phrases.extend(clean_text(lead_in + " " + name) for lead_in in lead_ins)
        phrases.extend(clean_text(name + " " + follow_up) for follow_up in follow_ups)
    return phrases


#This list is our final list of key phrases for bpd diagnosis 
Diagnosis_of_bpd_list = _build_diagnosis_phrases(
    ("bpd", "borderline personality disorder"),
    diagnosis_list_part1,
    diagnosis_list_part2,
)
#This list is our final list of key phrases for depression diagnosis 
Diagnosis_of_depression_list = _build_diagnosis_phrases(
    (
        "major depressive disorder",
        "clinical depression",
        "depression",
        "mdd",
        "major depression",
    ),
    diagnosis_list_part1,
    diagnosis_list_part2,
)
#This list is our final list of key phrases for ptsd diagnosis 
Diagnosis_of_ptsd_list = _build_diagnosis_phrases(
    (
        "post-traumatic stress disorder",
        "post traumatic stress disorder",
        "ptsd",
        "post traumatic",
        "post traumatic stress",
    ),
    diagnosis_list_part1,
    diagnosis_list_part2,
)
#This list is our final list of key phrases for bipolar disorder diagnosis 
Diagnosis_of_bipolar_list = _build_diagnosis_phrases(
    (
        "bipolar",
        "bipolar disorder",
        "bipolar I",
        "bipolar II",
        "cyclothymic disorder",
        "bipolar 2",
        "bipolar 1",
        "bipolar one",
        "bipolar two",
    ),
    diagnosis_list_part1,
    diagnosis_list_part2,
)

In [95]:
#This collection of functions categorizes diagnosis

#For simplicity, we only check for a diagnosis of bpd and possible comorbid diagnoses of bipolar disorder, depression, and PTSD.
#define a function with arguments being a string, "text," and four lists of key phrases which will enable us to conclude the user is self-identifying their diagnosis.
def get_diagnosis_number(text,bpd_list,dep_list,ptsd_list,bip_list):
    """Count how many diagnosis phrases of each kind occur in ``text``.

    Parameters
    ----------
    text : str
        The (cleaned) comment text to search; matching is by substring.
    bpd_list, dep_list, ptsd_list, bip_list : list of str
        Key phrases for BPD, depression, PTSD and bipolar disorder.

    Returns
    -------
    tuple
        ``(bpd_score, dep_score, ptsd_score, bip_score)``.  Each score is the
        number of phrases from the corresponding list found in ``text``.
        Comorbid conditions are only considered when at least one BPD phrase
        is present; otherwise ``(0, 0, 0, 0)`` is returned.

    The comorbidity loops used to sit inside the BPD loop, so their counts
    were multiplied by the number of matching BPD phrases.  Each list is now
    scanned exactly once.
    """
    def count_matches(phrases):
        #number of phrases from the list that appear somewhere in the text
        return sum(1 for phrase in phrases if phrase in text)

    bpd_score = count_matches(bpd_list)
    #without a BPD self-identification the other diagnoses are not counted
    if bpd_score == 0:
        return (0, 0, 0, 0)
    #we return a quadruple which counts up the phrases found for each diagnosis
    return (
        bpd_score,
        count_matches(dep_list),
        count_matches(ptsd_list),
        count_matches(bip_list),
    )

#create a function that takes in the the counts of phrases from each of the diagnosis lists 
def get_diagnosis_score(bpd_score,dep_score,ptsd_score,bip_score):
    """Return the total of the four phrase counts, each converted with ``int``."""
    per_condition_counts = (bpd_score, dep_score, ptsd_score, bip_score)
    return sum(int(count) for count in per_condition_counts)

#create another function that takes in the the counts of phrases from each of the diagnosis lists 
def get_diagnosis_name(bpd_score,dep_score,ptsd_score,bip_score):
    """Translate the four phrase counts into a diagnosis label.

    Only whether each count is non-zero matters.  Without any BPD phrase the
    label is "indeterminate"; otherwise the label names BPD plus whichever of
    depression, PTSD and bipolar disorder have a non-zero count.
    """
    if not (bpd_score != 0):
        return "indeterminate"

    #key: (depression present, ptsd present, bipolar present)
    labels_by_comorbidity = {
        (False, False, False): "borderline personality disorder",
        (True, False, False): "borderline personality disorder and depression",
        (False, True, False): "borderline personality disorder and post-traumatic stress disorder",
        (False, False, True): "borderline personality disorder and bipolar disorder",
        (True, True, False): "borderline personality disorder, depression, and post-traumatic stress disorder",
        (False, True, True): "borderline personality disorder, post-traumatic stress disorder, and bipolar disorder",
        (True, False, True): "borderline personality disorder, depression, and bipolar disorder",
        (True, True, True): "borderline personality disorder, depression, post-traumatic stress disorder, and bipolar disorder",
    }
    comorbidity_key = (
        bool(dep_score != 0),
        bool(ptsd_score != 0),
        bool(bip_score != 0),
    )
    return labels_by_comorbidity[comorbidity_key]

In [96]:
#We create a list of therapies which will we will check the comments for in order to conclude that the user is identifying a part of their treatment plan.
#import a csv file that lists therapies as a data frame
therapy_df = pd.read_csv("Therapy_types.csv")
#the 'Therapy' column as a plain Python list, spelled exactly as in the file
therapy_check_list = therapy_df["Therapy"].tolist()
#lower-cased copy of the same names
formated_therapy_check_list = [therapy_name.lower() for therapy_name in therapy_check_list]

In [97]:
#This collection of functions categorizes therapies

#create a function that takes in a string 'text' and a list of therapy phrases 'therapy_list' as defined previously
def get_therapy_number(text,formated_therapy_check_list):
    """Return how many entries of ``formated_therapy_check_list`` occur in ``text``.

    An entry counts once if it appears anywhere in ``text`` as a substring;
    repeated occurrences of the same entry are not counted again.
    """
    return sum(1 for therapy_name in formated_therapy_check_list if therapy_name in text)

#create a function that takes in a string 'text' and a list of therapies 'formated_therapy_check_list' as defined previously
def get_therapy_list(text,formated_therapy_check_list):
    """Return the therapies from ``formated_therapy_check_list`` mentioned in ``text``.

    Parameters
    ----------
    text : str
        The comment text to search (substring matching).
    formated_therapy_check_list : list of str
        Therapy names to look for.

    Returns
    -------
    list of str
        The names found, in the order they appear in the supplied list.

    The function previously ignored its second argument and iterated over the
    module-level ``therapy_check_list`` instead; it now uses the list it is
    given, in line with ``get_therapy_number``.
    """
    return [therapy_name for therapy_name in formated_therapy_check_list if therapy_name in text]

In [98]:
#We create a list of medications which will we will check the comments for in order to conclude that the user is identifying a part of their treatment plan.
#load in a csv file of medication names as a data frame
meds_file = pd.read_csv("list_of_meds.csv")
#the 'medication' column as a plain Python list of names
medications_list = meds_file.loc[:, 'medication'].tolist()

In [99]:
#This collection of functions categorizes drugs
#Note that we define two different functions that check for drug names slightly differently. One function utilizes a NER and the other utilizes a csv file of medication names.

#create a function that takes in some text as a string 'corpus'
def list_of_drugs(corpus):
    """Return the drug names that ``find_drugs`` recognises in ``corpus``.

    ``corpus`` is split on single spaces and handed to ``find_drugs`` with
    ``is_ignore_case=True``.  Each result is unpacked as a triple whose first
    element is a dictionary; the value under its 'name' key is collected.
    An empty list is returned when nothing is recognised.
    """
    matches = find_drugs(corpus.split(" "),is_ignore_case=True)
    #nothing recognised: no names to report
    if matches == []:
        return []
    #separate the triples; only the dictionaries (first position) are needed
    drug_records, _, _ = zip(*matches)
    return [record['name'] for record in drug_records]

#create a function that takes in some text as a string 'corpus' and a list of medication names
def list_of_meds(corpus,medications_list):
    """Return the entries of ``medications_list`` that occur as whole tokens in ``corpus``.

    ``corpus`` is split on single spaces and every medication name is compared
    against the resulting tokens, so a name only matches if it equals a token
    exactly.  Results keep the order of ``medications_list``.
    """
    tokens = corpus.split(" ")
    return [medication for medication in medications_list if medication in tokens]

#create a function that in takes in two lists of drugs that appear in a text one from an NER and another from a csv file 
def get_drug_score(list_of_drugs_in_corpus,list_of_meds_in_corpus):
    """Return the combined number of entries in the two drug lists.

    When both lists are empty the score is 0; otherwise it is the length of
    the two lists concatenated (duplicates are counted).
    """
    nothing_found = len(list_of_drugs_in_corpus) == 0 and len(list_of_meds_in_corpus) == 0
    if nothing_found:
        return 0
    return len(list_of_drugs_in_corpus + list_of_meds_in_corpus)

In [100]:
#we combine the the two previous collections of functions to check whether the user has mentioned some kind of treatment 

#we create a function that takes in the count of therapy phrases and the count of drug mentions 
def is_receiving_treatment_score(therapy_score, drug_score):
    """Return 1 if the user mentions any treatment, else 0.

    A treatment is considered mentioned when the therapy count or the drug
    count (or both) is non-zero.
    """
    mentions_treatment = (therapy_score != 0) or (drug_score != 0)
    return 1 if mentions_treatment else 0

#we create a function that takes in the treatment_score from the previous function 
def is_receiving_treatment(treatment_score):
    """Return a sentence stating whether the user is receiving treatment.

    A ``treatment_score`` equal to 1 means treatment; any other value means
    no treatment.
    """
    return (
        "This user is receiving treatment"
        if treatment_score == 1
        else "This user is not receiving treatment"
    )

In [101]:
#This collection of functions categorizes a general user feeling about their treatment plan and a reccomendation 

#we create a function that takes text in the form of a string as the argument
#single shared analyzer, created on first use by sentiment_score
_VADER_ANALYZER = None

def sentiment_score(text):
    """Return the VADER 'compound' polarity score of ``text``.

    ``polarity_scores`` returns a dictionary of scores; the negative, neutral
    and positive entries are ignored and only 'compound' is used as the
    overall sentiment of the comment.  Per the notebook's original notes the
    compound value lies in [-1, 1], where -1 is highly negative and 1 is
    highly positive.

    The analyzer used to be constructed on every call.  This function runs
    for every row (and again inside ``recommend_therapy_score``), so the
    instance is now built once and reused; the scores are unaffected.
    """
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        #we initialize the sentiment analyzer the first time it is needed
        _VADER_ANALYZER = SIA()
    pol_score = _VADER_ANALYZER.polarity_scores(text)
    return pol_score['compound']

#we create a function that takes in text in the form of a string and a treatment score as given by a previous function, this variable is binary as the arguments
def recommend_therapy_score(text, is_receiving_treatment_score, threshold=0.5):
    """Return 1 if the user is taken to recommend their treatment, else 0.

    Parameters
    ----------
    text : str
        The comment text whose sentiment is measured with ``sentiment_score``.
    is_receiving_treatment_score : int
        Binary treatment flag; only the value 1 counts as "in treatment".
    threshold : float, optional
        Minimum compound sentiment regarded as a positive attitude toward the
        treatment.  Defaults to 0.5, the cut-off that was previously
        hard-coded.

    Returns
    -------
    int
        1 when the user is in treatment and the sentiment is at least
        ``threshold``; 0 otherwise.
    """
    #a user who is not in treatment cannot recommend one, so the sentiment
    #analysis is skipped for them
    if is_receiving_treatment_score != 1:
        return 0
    return 1 if sentiment_score(text) >= threshold else 0

#we create a function that takes in the rec_score from the previous function 
def would_recommend_therapy(rec_score):
    """Return a sentence stating whether the user recommends their treatment plan.

    A ``rec_score`` equal to 1 means a recommendation; any other value means
    no recommendation.
    """
    recommends = rec_score == 1
    return (
        "This user recommends their treatment plan."
        if recommends
        else "This user does not recommend their treatment plan."
    )

In [102]:
#we load in a csv file as a data frame that contains keywords related to diagnosis and treatment of bpd
#NOTE(review): unlike the other csv files, which are read by bare file name, this
#one is read from a path under Desktop/... - confirm it resolves from the
#directory the notebook is run in
keyword_file = pd.read_csv("Desktop/Erdos/Data-science-project-Mental-Health/Curls_Data_Scraping/keywords.csv")
#the keywords are stored in the column named 'therapy'; take them as a plain list
keywords = keyword_file.loc[:, 'therapy'].tolist()

In [103]:
#This collection of functions categorizes whether a comment is relevant to discussions of diagnosis and treatment of bpd

#we create a function that takes text in the form of a string and the keywords list as previously defined
def get_keyword_score(text,keywords):
    """Return how many entries of ``keywords`` occur in ``text`` as substrings.

    Each keyword contributes at most 1, however often it appears.
    """
    return sum(1 for keyword in keywords if keyword in text)

#we define a function that intakes the previouly defined diagnosis, keyword, drug, and therapy scores 
def is_relevant_score(diagnosis_score,keyword_score,drug_score,therapy_score):
    """Return how many of the four scores are non-zero (0 to 4).

    Parameters
    ----------
    diagnosis_score, keyword_score, drug_score, therapy_score : number
        The diagnosis, keyword, drug and therapy counts for one text.

    Returns
    -------
    int
        4 when all four scores are non-zero, 3 for any three, 2 for any two,
        1 for exactly one and 0 when all are zero.

    The earlier if/elif ladder listed the combination "keyword and therapy
    only" with the wrong diagnosis condition, duplicating a three-score
    branch, so that combination fell through to 0 instead of 2.  Counting the
    non-zero scores covers every combination.
    """
    all_scores = (diagnosis_score, keyword_score, drug_score, therapy_score)
    return sum(1 for score in all_scores if score != 0)

#create a function that intakes the previous relevancy_score as the argument 
def is_relevant(relevancy_score):
    """Return the sentence describing a relevancy score.

    Scores 4, 3, 2 and 1 map to "highly relevant", "very relevant",
    "relevant" and "minimally relevant"; every other value is reported as
    not relevant.
    """
    graded_answers = (
        (4, "This information is highly relevant."),
        (3, "This information is very relevant."),
        (2, "This information is relevant."),
        (1, "This information is minimally relevant."),
    )
    for level, answer in graded_answers:
        if relevancy_score == level:
            return answer
    return "This information is not relevant."

## In this final part, we use the previous functions to add new columns with data to the previous data frame 

In [104]:
#we add most of the outputs from the previous functions as their own column in the data frame.
#The comment texts are pulled out of the frame once and every intermediate result is kept in a
#plain list, so each helper (in particular the two drug look-ups) runs once per row rather than
#once for every column that needs its result. The columns and their order are unchanged.
comment_texts = df_new['Comment'].tolist()

#diagnosis: the quadruple of phrase counts and the label derived from it
diagnosis_counts = [
    get_diagnosis_number(comment, Diagnosis_of_bpd_list, Diagnosis_of_depression_list, Diagnosis_of_ptsd_list, Diagnosis_of_bipolar_list)
    for comment in comment_texts
]
df_new['Diagnosis_Number'] = diagnosis_counts
df_new['Diagnosis'] = [get_diagnosis_name(*counts) for counts in diagnosis_counts]

#therapies: how many are mentioned and which ones
therapy_numbers = [get_therapy_number(comment, formated_therapy_check_list) for comment in comment_texts]
df_new['Therapy_Number'] = therapy_numbers
df_new['Therapies'] = [get_therapy_list(comment, formated_therapy_check_list) for comment in comment_texts]

#drugs: names from the csv list followed by names from the NER, and their combined count
csv_medications = [list_of_meds(comment, medications_list) for comment in comment_texts]
ner_drugs = [list_of_drugs(comment) for comment in comment_texts]
df_new['Drugs'] = [meds + drugs for meds, drugs in zip(csv_medications, ner_drugs)]
drug_numbers = [get_drug_score(drugs, meds) for meds, drugs in zip(csv_medications, ner_drugs)]
df_new['Drug_Number'] = drug_numbers

#treatment flag and its description
treatment_numbers = [
    is_receiving_treatment_score(therapy_number, drug_number)
    for therapy_number, drug_number in zip(therapy_numbers, drug_numbers)
]
df_new['Treatment_Number'] = treatment_numbers
df_new['Treatment_Confirmation'] = [is_receiving_treatment(flag) for flag in treatment_numbers]

#sentiment and the recommendation derived from it
df_new['Sentiment_of_Treatment'] = [sentiment_score(comment) for comment in comment_texts]
recommendation_scores = [
    recommend_therapy_score(comment, flag)
    for comment, flag in zip(comment_texts, treatment_numbers)
]
df_new['Recommendation_Score'] = recommendation_scores
df_new['Recommendation'] = [would_recommend_therapy(score) for score in recommendation_scores]

#keyword count and overall relevancy
keyword_scores = [get_keyword_score(comment, keywords) for comment in comment_texts]
df_new['Keyword_Score'] = keyword_scores
relevancy_scores = [
    is_relevant_score(get_diagnosis_score(*counts), keyword_score, drug_number, therapy_number)
    for counts, keyword_score, drug_number, therapy_number
    in zip(diagnosis_counts, keyword_scores, drug_numbers, therapy_numbers)
]
df_new['Relevancy_Score'] = relevancy_scores
df_new['Relevancy'] = [is_relevant(score) for score in relevancy_scores]
df_new

Unnamed: 0_level_0,Unnamed: 0,Comment,Post,Diagnosis_Number,Diagnosis,Therapy_Number,Therapies,Drugs,Drug_Number,Treatment_Number,Treatment_Confirmation,Sentiment_of_Treatment,Recommendation_Score,Recommendation,Keyword_Score,Relevancy_Score,Relevancy
Author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
-Massive-Feeling-,2,what medications work best for when you have s...,qv78g6,"(0, 0, 0, 0)",indeterminate,0,[],"[zoloft, prozac, seroquel, lamictal, lamotrigi...",34,1,This user is receiving treatment,-0.9477,0,This user does not recommend their treatment p...,17,2,This information is relevant.
-Rosewiththorns-,3,lamictal lamotrigine what combination okay rig...,1agxmz3,"(0, 0, 0, 0)",indeterminate,0,[],"[ssri, lamictal, lamotrigine, abilify, prometh...",18,1,This user is receiving treatment,-0.9603,0,This user does not recommend their treatment p...,6,2,This information is relevant.
0w_l,4,new medication increase and addition hey guys ...,mtba08,"(0, 0, 0, 0)",indeterminate,0,[],"[seroquel, quetiapine, lamictal, lamotrigine, ...",23,1,This user is receiving treatment,0.8320,1,This user recommends their treatment plan.,10,2,This information is relevant.
0ystersbutnopearls,5,lithium and lamictal i have been on lithium 90...,12xrl1n,"(0, 0, 0, 0)",indeterminate,0,[],"[lamictal, lithium, stabilizer, Lamotrigine, L...",9,1,This user is receiving treatment,-0.9740,0,This user does not recommend their treatment p...,3,2,This information is relevant.
100260,6,it seems like no combination of meds will ever...,"172cqod, 15tt6zt","(0, 0, 0, 0)",indeterminate,0,[],"[prozac, lamictal, wellbutrin, xanax, abilify,...",21,1,This user is receiving treatment,0.8707,1,This user recommends their treatment plan.,9,2,This information is relevant.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PTSDemi,548,lamictal experiencesside effects i have been o...,"14htbbs, 14htbbs, 14htbbs, 14htbbs","(0, 0, 0, 0)",indeterminate,0,[],"[lamictal, Lamotrigine, Lamotrigine]",3,1,This user is receiving treatment,0.7934,1,This user recommends their treatment plan.,3,2,This information is relevant.
Pappa_frankuuu,549,lamictal online tya to anyone that has advice ...,1b0qxii,"(0, 0, 0, 0)",indeterminate,0,[],"[lamictal, Lamotrigine, Lamotrigine]",3,1,This user is receiving treatment,-0.9575,0,This user does not recommend their treatment p...,2,2,This information is relevant.
ParkerFree,550,paranoia is getting worse hi guys i have recen...,"172cqod, 172cqod","(0, 0, 0, 0)",indeterminate,0,[],"[lamotrigine, Lamotrigine, Lamotrigine]",3,1,This user is receiving treatment,-0.4860,0,This user does not recommend their treatment p...,1,2,This information is relevant.
ParkingError7236,551,tw have i took enough to cause damage today i ...,1co30o9,"(0, 0, 0, 0)",indeterminate,0,[],"[lamotrigine, propranolol, Lamotrigine, Buspir...",5,1,This user is receiving treatment,-0.2732,0,This user does not recommend their treatment p...,1,2,This information is relevant.


In [106]:
#we write the previous data frame (including its 'Author' index) to a csv file.
#to_csv is given the file name and handles the file itself, so no separate
#open()/close() is needed; the handle opened here before was never written to.
df_new.to_csv('enumerated_test_comments.csv', index=True)