# Descriptive Statistics of the Rare Disease Reddit Data

The purpose of this notebook is to gather descriptive statistics on the rare disease textual data from Reddit. The output of this notebook will be a dictionary of the form shown below,
which will then be converted to a pandas DataFrame and output as a TSV.

```
{subreddit:
     {"num_unique_posts_comments":int,
      "GARD_IDs_mentioned":list[str],
      "num_GARD_IDs_mentioned":int(len(GARD_IDs_mentioned)),
      "unique_GARD_IDs_mentioned":set[str],
      "num_unique_GARD_IDs_mentioned":int(len(unique_GARD_IDs_mentioned)),
      "cosine_similarity_of_GARD_IDs":[Unknown, so None],
      "num_EOM_models":int,
      "num_Leaf_models":int,
      "num_all-MiniLM-L6-v2_models":int,
      "num_universal-sentence-encoder_models":int,
      "num_universal-sentence-encoder-multilingual_models":int,
      "num_paraphrase-multilingual-MiniLM-L12-v2_models":int,
      "num_distiluse-base-multilingual-cased_models":int,
     }
 }
```

To do this we will need to 
1. Implement GARD ID searching
2. Load the data for iteration
3. Get statistics and add new data post-hoc in pandas

and iteratively test along the way.

## GARD ID Searching

This is improved from the [NCATS/epi4gard repo](https://github.com/ncats/epi4GARD/blob/e6aeca9450b259c9ad0b1fcdc62f4e3de777f600/api/epi_pipeline.py#L298)

We are going to fetch a list of common words of five letters or fewer to add to the STOPWORDS list, since our algorithms use some exact matching.

In [1]:
import requests
# Fetch the 1000 most common English words and keep only those that are five
# characters long or shorter.
r = requests.get(
    'https://raw.githubusercontent.com/powerlanguage/word-lists/master/1000-most-common-words.txt'
)
five_and_less = set(filter(lambda word: len(word) <= 5, r.text.split('\n')))

In [2]:
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

In [3]:
STOPWORDS.update(five_and_less)

In [4]:
len(five_and_less)

716

In [5]:
len(five_and_less)

716

In [6]:
'als' in five_and_less

False

In [7]:
'cf' in five_and_less

False

In [8]:
'go' in five_and_less

True

In [9]:
## Section: GARD SEARCH
# can identify rare diseases in text using the GARD dictionary from neo4j
# and map a GARD ID, name, or synonym to all of the related synonyms for searching APIs
from typing import List, Dict, Union, Optional, Set, Tuple
import requests, string, random
from nltk import tokenize as nltk_tokenize
class GARD_Search:
    """Dictionary-based lookup of rare diseases from the GARD name/synonym list.

    Attributes:
        id_list: every GARD ID present in the source JSON.
        name_dict: {lower-cased disease name or synonym: GARD ID}.
        id_dict: {GARD ID: [names and synonyms, longest first]}.
        max_length: largest word count of any name kept while building
            name_dict (-1 when nothing was kept); bounds the phrase length
            tried by get_diseases.
    """

    def __init__(self):
        """Load the GARD JSON (local file first, then GitHub) and build the lookup tables."""
        import json, codecs
        #These are opened locally so that garbage collection removes them from memory
        try:
            with codecs.open('gard-id-name-synonyms.json', 'r', 'utf-8-sig') as f:
                diseases = json.load(f)
        except (OSError, ValueError):
            # Local copy missing or unreadable: fall back to the published file.
            r = requests.get('https://raw.githubusercontent.com/ncats/epi4GARD/master/EpiExtract4GARD/gard-id-name-synonyms.json')
            r.raise_for_status()
            diseases = json.loads(r.content)

        from nltk.corpus import stopwords
        try:
            STOPWORDS = set(stopwords.words('english'))
        except LookupError:
            # The NLTK stopword corpus has not been downloaded yet.
            import nltk
            nltk.download('stopwords')
            STOPWORDS = set(stopwords.words('english'))

        # Short, very common words are also excluded because matching is exact
        # and short synonyms would otherwise produce false positives.
        #'https://raw.githubusercontent.com/sindresorhus/word-list/main/words.txt'
        r = requests.get('https://raw.githubusercontent.com/powerlanguage/word-lists/master/1000-most-common-words.txt')
        r.raise_for_status()
        STOPWORDS.update(word for word in r.text.split() if len(word) <= 5)

        #This should be a list of all GARD IDs for purposes like random choice for testing
        GARD_id_list = [entry['gard_id'] for entry in diseases]
        #keys are going to be disease names, values are going to be the GARD ID, set up this way bc dictionaries are faster lookup than lists
        GARD_dict = {}
        #Find out what the length of the longest disease name sequence is, of all names and synonyms. This is used by get_diseases
        max_length = -1
        for entry in diseases:
            # (text, minimum length it must exceed): names need more than 1
            # character, synonyms more than 3.
            candidates = [(entry['name'], 1)]
            if entry['synonyms']:
                candidates.extend((synonym, 3) for synonym in entry['synonyms'])
            for raw, min_len in candidates:
                s = raw.lower().strip()
                # The duplicate check uses the normalised key, so the first
                # disease that claims a name keeps it.
                if s in GARD_dict or s in STOPWORDS or len(s) <= min_len:
                    continue
                GARD_dict[s] = entry['gard_id']
                max_length = max(max_length, len(s.split()))

        # Manual overrides: two common abbreviations are added and three
        # terms are dropped. pop() has a default so a data update that no
        # longer contains one of them does not break construction.
        GARD_dict['cf'] = 'GARD:0006233'
        GARD_dict['als'] = 'GARD:0005786'
        for term in ('dyspraxia', 'fava', 'arms'):
            GARD_dict.pop(term, None)

        # Group names by GARD ID in one pass over the dictionary.
        # (A one-line dict comprehension calling .sort() would store None,
        # because list.sort() sorts in place and returns None.)
        id_dict = {gard_id: [] for gard_id in GARD_id_list}
        for name, gard_id in GARD_dict.items():
            id_dict.setdefault(gard_id, []).append(name)
        for names in id_dict.values():
            names.sort(reverse=True, key=len)

        self.id_list = GARD_id_list
        # Dictionary in form of {"name or synonym":"GARD_ID"}
        self.name_dict = GARD_dict
        # Dictionary in form of {"GARD_ID":["Longest Disease Name/Synonym","2nd Longest Name/Synonym","Synonym",...]}
        self.id_dict = id_dict
        self.max_length = max_length

    def __str__(self) -> str:
        """Return a short usage summary of the object."""
        return str(
'''
Instantiation: rd_identify = GARD_Search() \n
Calling: diseases, ids = rd_identify(text) \n
Search GARD ID or any name and get a list of all disease names: \ndz_name_list = rd_identify.autosearch(searchterm) \n
GARD ID List: rd_identify.id_list \n ["GARD:0000001", "GARD:0000002"] \n
Name Dictionary: rd_identify.name_dict \n {"Name":"GARD_ID"} \n
GARD ID Dictionary: rd_identify.id_dict \n {"GARD_ID":["Longest Disease Name/Synonym", "2nd Longest Name/Synonym", ...]}
''')

    def __call__(self, sentence:str) -> Tuple[List[str], List[str]]:
        """Shorthand for get_diseases(sentence)."""
        return self.get_diseases(sentence)

    #Works much faster if broken down into sentences.
    #compares every phrase in a sentence to see if it matches anything in the GARD dictionary of diseases.
    def get_diseases(self, sentence:str) -> Tuple[List[str], List[str]]:
        """Find GARD names/synonyms in a sentence.

        Returns:
            (diseases, ids): the matched phrases and, position by position,
            the GARD ID each phrase maps to. Matches do not overlap; at each
            position the longest matching phrase wins.
        """
        raw_tokens = [s.lower().strip() for s in nltk_tokenize.word_tokenize(sentence) if s not in string.punctuation]

        #Combine 's with the previous word (a leading 's has nothing to attach to and is dropped)
        tokens = []
        for token in raw_tokens:
            if token != "'s":
                tokens.append(token)
            elif tokens:
                tokens[-1] += "'s"

        diseases = []
        ids = []
        i = 0
        while i < len(tokens):
            # Only phrases of at most max_length words can be dictionary keys,
            # so the number of candidates per position is bounded.
            longest = min(len(tokens) - i, self.max_length)
            #Compares longest sequences first and goes down until there is a match
            for span in range(longest, 0, -1):
                s = ' '.join(tokens[i:i+span])
                if s in self.name_dict:
                    diseases.append(s)
                    ids.append(self.name_dict[s])
                    #Skip the remaining tokens of the matched phrase
                    i += span - 1
                    break
            i += 1
        return diseases, ids

    #Can search by 7-digit GARD_ID, 12-digit "GARD:{GARD_ID}", matched search term, or arbitrary search term
    #Returns list of terms to search by
    def autosearch(self, searchterm:Union[str,int], matching=2) -> List[str]:
        """Expand a search term into every known name of the same disease.

        Args:
            searchterm: a GARD ID ('GARD:0000777', 777, '00777') or a disease
                name/synonym.
            matching: number of lookup attempts; after each failed attempt
                hyphens become spaces and "'s" is removed before retrying.

        Returns:
            All names for the matched GARD ID, longest first (a new list), or
            a one-element list holding the lower-cased, normalised search
            term when nothing matches.

        Raises:
            ValueError: if searchterm consists only of digits and has more
                than 7 of them.
        """
        #comparisons below only handle strings, allows int input; lower-cased for the disease names to match
        term = str(searchterm).lower()

        attempts_left = matching
        while attempts_left >= 1:
            if 'gard:' in term and len(term) == 12:
                #search in form of 'GARD:0000001'
                gard_id = term.replace('gard:', 'GARD:')
                fallback = term
            elif term.isdigit():
                #search in form of 777 or '777' or '00777' or '0000777'
                if len(term) > 7:
                    raise ValueError('GARD ID IS NOT VALID. RE-ENTER SEARCH TERM')
                gard_id = 'GARD:' + term.zfill(7)
                fallback = gard_id.lower()
            elif term in self.name_dict:
                #the GARD ID acts as the lemma: map the name to its ID, then the ID to all related names
                gard_id = self.name_dict[term]
                fallback = term
            else:
                #This can be replaced with some other common error in user input that is easily fixed
                term = term.replace('-', ' ').replace("'s", "")
                attempts_left -= 1
                continue

            # sorted() returns a new list, so the stored list is neither
            # mutated nor handed out to the caller.
            l = sorted(self.id_dict.get(gard_id, ()), reverse=True, key=len)
            if l:
                print("SEARCH TERM MATCHED TO GARD DICTIONARY. SEARCHING FOR: ",l)
                return l
            # Recognised form but no names on record: rewriting the term
            # cannot help, so stop trying.
            term = fallback
            break

        print("SEARCH TERM DID NOT MATCH TO GARD DICTIONARY. SEARCHING BY USER INPUT")
        return [term]

    # Useful for testing
    # Return a random GARD_ID Search Term list
    def random_disease_list(self) -> List[str]:
        """Return the name list of a randomly chosen GARD ID (may be empty)."""
        return random.choice(list(self.id_dict.values()))

    # Return a random disease term
    def random_disease(self) -> str:
        """Return a random name, chosen from a random GARD ID that has at least one name."""
        non_empty = [names for names in self.id_dict.values() if names]
        return random.choice(random.choice(non_empty))

    # Return a random GARD_ID
    def random_id(self) -> str:
        """Return a random GARD ID from id_list."""
        return random.choice(self.id_list)

In [10]:
gard_rd_data = GARD_Search()

In [11]:
print(gard_rd_data)


Instantiation: rd_identify = GARD_Search() 

Calling: diseases, ids = rd_identify(text) 

Search GARD ID or any name and get a list of all disease names: 
dz_name_list = rd_identify.autosearch(searchterm) 

GARD ID List: rd_identify.id_list 
 ["GARD:0000001", "GARD:0000002"] 

Name Dictionary: rd_identify.name_dict 
 {"Name":"GARD_ID"} 

GARD ID Dictionary: rd_identify.ID_dict 
 {"GARD_ID":["Longest Disease Name/Synonym", "2nd Longest Name/Synonym", ...]}



Testing changes made to the GARD_Search class while writing this notebook

- Improved `get_diseases`
- Added all random generation functions for easy testing of other functions
- Added `id_dict`, which replaced using the functionality in `autosearch`
- Renamed most of the object data attributes and updated the `__str__` output to be more descriptive

In [12]:
gard_rd_data.id_list[:5]

['GARD:0000001',
 'GARD:0000003',
 'GARD:0000004',
 'GARD:0000005',
 'GARD:0000006']

In [13]:
gard_rd_data.random_disease_list()

["masson's vegetant intravascular hemangio-endothelioma",
 'intravascular papillary endothelial hyperplasia',
 "masson's pseudoangiosarcoma",
 "masson's tumor"]

In [14]:
gard_rd_data.random_disease()

'dens in dente and palatal invaginations'

In [15]:
gard_rd_data.random_id()

'GARD:0009266'

Testing changes made to GARD_Search class to improve `get_diseases`

In [16]:
'cf' in gard_rd_data.name_dict.keys()

True

In [17]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /Users/wzk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [18]:
test = "hansen's disease, and amyotrophic lateral sclerosis"
sentences = sent_tokenize(test)
# get_diseases returns (names, ids); only the IDs are collected here.
gard_list = [
    gard_id
    for sent in sentences
    for gard_id in gard_rd_data.get_diseases(sent)[1]
]
print(gard_list)

['GARD:0006886', 'GARD:0005786']


In [19]:
test2 = "Without a doubt " + gard_rd_data.random_disease()+" is a very serious disease.\n as is "+ gard_rd_data.random_disease() +" and charcot marie tooth. why not also CF, and ALS, or cystic fibrosis, and Feldman's syndrome, and Langerhan's disorder? \nDefinitely don't go to town without a good test"
print(test2)

Without a doubt familial hyperaldosteronism type 3 is a very serious disease.
 as is idiopathic dilatation of the pulmonary artery and charcot marie tooth. why not also CF, and ALS, or cystic fibrosis, and Feldman's syndrome, and Langerhan's disorder? 
Definitely don't go to town without a good test


In [20]:
'als' in gard_rd_data.name_dict.keys()

True

In [21]:
sentences = sent_tokenize(test2)
# Collect the GARD IDs found in each sentence, in order of appearance.
gard_list = [
    gard_id
    for sent in sentences
    for gard_id in gard_rd_data.get_diseases(sent)[1]
]

# Show the longest known name for every ID that was found.
for gard_id in gard_list:
    longest_name = gard_rd_data.id_dict[gard_id][0]
    print(longest_name)

familial hyperaldosteronism type iii
idiopathic dilatation of the pulmonary artery
cystic fibrosis
amyotrophic lateral sclerosis type 1
cystic fibrosis


In [22]:
gard_list

['GARD:0012362',
 'GARD:0006757',
 'GARD:0006233',
 'GARD:0005786',
 'GARD:0006233']

Testing to see if these entries have synonyms with high likelihood of showing false positives

In [23]:
gard_rd_data.id_dict[gard_rd_data.name_dict['fibro-adipose vascular anomaly']]

['fibro-adipose vascular anomaly', 'fibro adipose vascular anomaly']

In [24]:
gard_rd_data.id_dict['GARD:0006961']

['malaria']

In [25]:
gard_rd_data.id_dict[gard_rd_data.name_dict['fibro-adipose vascular anomaly']]

['fibro-adipose vascular anomaly', 'fibro adipose vascular anomaly']

In [26]:
gard_rd_data.id_dict[gard_rd_data.name_dict['autoimmune encephalitis']]

['autoimmune encephalitis']

In [27]:
gard_rd_data.id_dict[gard_rd_data.name_dict['occult spinal dysraphism sequence']]

['occult spinal dysraphism sequence',
 'tethered spinal cord syndrome',
 'segmental vertebral anomalies',
 'occult spinal dysraphism',
 'tethered cord syndrome']

In [28]:
gard_rd_data.id_dict[gard_rd_data.name_dict['ledderhose disease']]

['ledderhose disease', 'lederhose disease']

In [29]:
gard_rd_data.id_dict[gard_rd_data.name_dict['gerodermia osteodysplastica']]

['gerodermia osteodysplastica',
 'geroderma osteodysplasticum',
 'geroderma osteodysplastica',
 'walt disney dwarfism']

## Load the data that will be investigated

In [30]:
import os
data_path = "/Users/wzk/Library/CloudStorage/OneDrive-UniversityofTexasSouthwestern/NCATS.Research/bradley's data for rare disease social media project"

results_path = os.path.join(data_path,"results")
posts_path = os.path.join(data_path,"posts")
comments_path = os.path.join(data_path,"comments")


def _listdir_without(path, ignored):
    """Return the entries of *path*, in os.listdir order, minus the names in *ignored*."""
    return [name for name in os.listdir(path) if name not in ignored]


# Remove directories and files that are not relevant to a numerical description of the data.
# Filtering (rather than list.remove) keeps this cell working when one of the
# ignored entries is absent, e.g. no .DS_Store outside macOS.
dz_list_w_results = _listdir_without(results_path, {".DS_Store", "max_coherence"})
dz_list_w_posts = _listdir_without(posts_path, {".DS_Store", "post_data.json"})
dz_list_w_comments = _listdir_without(comments_path, {".DS_Store"})

#dz_list_w_results = [e.lower() for e in dz_list_w_results]
#dz_list_w_posts = [e.lower() for e in dz_list_w_posts]
#dz_list_w_comments = [e.lower() for e in dz_list_w_comments]

In [31]:
import json

Look at Posts data structure

In [32]:
with open(os.path.join(posts_path,"AcousticNeuroma_posts.json")) as f:
    posts_data = json.load(f)

In [33]:
posts_data[1:4]

[{'all_awardings': [],
  'allow_live_comments': False,
  'author': 'Bookmom25',
  'author_flair_css_class': None,
  'author_flair_richtext': [],
  'author_flair_text': None,
  'author_flair_type': 'text',
  'author_fullname': 't2_1qm4igkj',
  'author_patreon_flair': False,
  'can_mod_post': False,
  'contest_mode': False,
  'created_utc': 1563379489,
  'domain': 'self.AcousticNeuroma',
  'full_link': 'https://www.reddit.com/r/AcousticNeuroma/comments/ceeu84/hi_im_new_here/',
  'gildings': {},
  'id': 'ceeu84',
  'is_crosspostable': True,
  'is_meta': False,
  'is_original_content': False,
  'is_reddit_media_domain': False,
  'is_robot_indexable': True,
  'is_self': True,
  'is_video': False,
  'link_flair_background_color': '',
  'link_flair_richtext': [],
  'link_flair_text_color': 'dark',
  'link_flair_type': 'text',
  'locked': False,
  'media_only': False,
  'no_follow': True,
  'num_comments': 10,
  'num_crossposts': 0,
  'over_18': False,
  'permalink': '/r/AcousticNeuroma/commen

Look at Comments data structure

In [34]:
with open(os.path.join(comments_path,"AcousticNeuroma_comments.json")) as f:
    comments_data = json.load(f)

In [35]:
comments_data[1:4]

[{'all_awardings': [],
  'allow_live_comments': False,
  'author': 'Bookmom25',
  'author_flair_css_class': None,
  'author_flair_richtext': [],
  'author_flair_text': None,
  'author_flair_type': 'text',
  'author_fullname': 't2_1qm4igkj',
  'author_patreon_flair': False,
  'can_mod_post': False,
  'contest_mode': False,
  'created_utc': 1563379489,
  'domain': 'self.AcousticNeuroma',
  'full_link': 'https://www.reddit.com/r/AcousticNeuroma/comments/ceeu84/hi_im_new_here/',
  'gildings': {},
  'id': 'ceeu84',
  'is_crosspostable': True,
  'is_meta': False,
  'is_original_content': False,
  'is_reddit_media_domain': False,
  'is_robot_indexable': True,
  'is_self': True,
  'is_video': False,
  'link_flair_background_color': '',
  'link_flair_richtext': [],
  'link_flair_text_color': 'dark',
  'link_flair_type': 'text',
  'locked': False,
  'media_only': False,
  'no_follow': True,
  'num_comments': 10,
  'num_crossposts': 0,
  'over_18': False,
  'permalink': '/r/AcousticNeuroma/commen

As we can see, `comment['selftext']` overlaps with `post['selftext']`; `comment['all_text']` repeats `comment['selftext']`; and the equivalent of `'all_text'` in posts is `post['title']+' '+post['selftext']`.

To ensure that there are no duplicate textual entries, we need to analyze every `comment['all_text']`, and use `post['title']+' '+post['selftext']` only when there are no comments on a post.

Look at Results data structure

In [36]:
# Inspect only the first results directory: show its contents and count the
# non-CSV entries whose name contains 'leaf'.
for dz in dz_list_w_results[:1]:
    dz_results_path = os.path.join(results_path, dz)
    models_created = os.listdir(dz_results_path)
    leaf_entries = [model for model in models_created
                    if 'csv' not in model and 'leaf' in model]
    print(models_created)
    print()
    print(len(leaf_entries))
#    with open(os.path.join(comments_path,"AcousticNeuroma_comments.json")) as f:

['visualsnow_doc2vec_leaf_1', 'visualsnow_all-MiniLM-L6-v2', 'visualsnow_doc2vec_leaf_0', 'visualsnow_doc2vec_leaf_1_topics.csv', 'visualsnow_universal-sentence-encoder', '.DS_Store', 'visualsnow_doc2vec_eom_2_topics.csv', 'visualsnow_doc2vec_leaf_4_topics.csv', 'visualsnow_doc2vec_eom_0', 'visualsnow_doc2vec_leaf_3_topics.csv', 'visualsnow_doc2vec_eom_1', 'visualsnow_doc2vec_eom_0_topics.csv', 'visualsnow_paraphrase-multilingual-MiniLM-L12-v2', 'visualsnow_distiluse-base-multilingual-cased', 'visualsnow_doc2vec_leaf_2', 'visualsnow_doc2vec_eom_3_topics.csv', 'visualsnow_doc2vec_leaf_3', 'visualsnow_doc2vec_leaf_4', 'visualsnow_doc2vec_leaf_0_topics.csv', 'visualsnow_doc2vec_eom_4', 'visualsnow_doc2vec_eom_1_topics.csv', 'visualsnow_doc2vec_eom_3', 'visualsnow_universal-sentence-encoder-multilingual', 'visualsnow_doc2vec_eom_2', 'visualsnow_doc2vec_eom_4_topics.csv', 'visualsnow_doc2vec_leaf_2_topics.csv']

5


We will filter by character count to exclude video/link/low character posts

In [37]:
def _ids_with_selftext(entries):
    """Return the ids of the entries whose 'selftext' is longer than 5 characters."""
    return {entry['id'] for entry in entries if len(entry['selftext']) > 5}


comment_ids = _ids_with_selftext(comments_data)
post_ids = _ids_with_selftext(posts_data)
unique_ids = comment_ids | post_ids

The difference below would be how much we would filter out if only using the 'selftext' id as the entry. However, we are using 'all_text' from comments and 'title' + 'selftext' from posts.

In [38]:
print(len(posts_data),len(comments_data),len(unique_ids))

127 127 110


## Putting it all together
Write a loop through all of the posts and comments data using the variables from above:

`dz_list_w_results`  
`dz_list_w_posts`  
`dz_list_w_comments`

In [39]:
print(len(dz_list_w_posts),len(dz_list_w_comments),len(dz_list_w_results))

170 89 77


Not all of the subreddits have post, comment, and model creation data, so we will have to iterate through all subreddits and only make changes wherever data exists. We also have to ensure that we are only working on unique post/comment entries, because entries were shown to be duplicated between posts and comments.

In [40]:
reddit_analysis_overview = dict()

Creating this list of subreddits to check

In [41]:
subreddits_w_posts = [post_json.replace("_posts.json","") for post_json in dz_list_w_posts]
subreddits_w_comments = [comments_json.replace("_comments.json","") for comments_json in dz_list_w_comments]
subreddits_w_results = dz_list_w_results

In [42]:
# Build each set once and combine them with set operators.
_posts_set = set(subreddits_w_posts)
_comments_set = set(subreddits_w_comments)
_results_set = set(subreddits_w_results)

subreddits_w_comments_posts = _posts_set & _comments_set
subreddits_w_comments_posts_results = _posts_set & _comments_set & _results_set
subreddits_posts_only = _posts_set - _comments_set
subreddits_comments_only = _comments_set - _posts_set

In [43]:
# Print the size of every subreddit grouping, one per line.
_groupings = (
    subreddits_w_posts,
    subreddits_w_comments,
    subreddits_w_results,
    subreddits_w_comments_posts,
    subreddits_w_comments_posts_results,
    subreddits_posts_only,
    subreddits_comments_only,
)
for e in _groupings:
    size = len(e)
    print(size)

170
89
77
89
77
81
0


In [44]:
#Verify that there are no result directories that we are going to search that do not have post/comment data
x = set(subreddits_w_results).difference(set(subreddits_w_posts).union(set(subreddits_w_comments)))
print(x,len(x))

set() 0


These are all of the subreddits that need to be parsed:

In [45]:
union_subreddits = set(subreddits_w_posts).union(set(subreddits_w_comments),set(subreddits_w_results))
len(union_subreddits)

170

Test opening some data

In [46]:
posts_file = os.path.join(posts_path, "poliosis_posts.json")
if os.path.isfile(posts_file):
    #Load posts
    with open(posts_file) as f:
        posts_data = json.load(f)

    # Keep the ids of posts that have both fields and more than 5 characters
    # of title + selftext.
    post_ids = {
        post['id']
        for post in posts_data
        if 'selftext' in post and 'title' in post
        and len(post['title'] + post['selftext']) > 5
    }

In [47]:
comments_file = os.path.join(comments_path, "poliosis_comments.json")
if os.path.isfile(comments_file):
    #Load comments
    with open(comments_file) as f:
        comments_data = json.load(f)

    # Keep the ids of comments that carry both 'all_text' and 'title' and
    # whose 'all_text' is longer than 5 characters.
    comment_ids = {
        comment['id']
        for comment in comments_data
        if 'all_text' in comment and 'title' in comment
        and len(comment['all_text']) > 5
    }

In [48]:
'8nj5ti' in post_ids

True

In [49]:
'8nj5ti' in comment_ids

False

### 2. Parse data

In [50]:
from tqdm import tqdm
from nltk.tokenize import sent_tokenize

# This iterates through all subreddits that have at least one of results,
# posts, or comments data and only performs operations on the data that exists.

# (substring, counter key) pairs, checked in this order. The multilingual
# universal-sentence-encoder must come before the plain one because the plain
# name is a substring of the multilingual name.
_MODEL_COUNTERS = (
    ('leaf', "num_Leaf_models"),
    ('eom', "num_EOM_models"),
    ('all-minilm-l6-v2', "num_all-MiniLM-L6-v2_models"),
    ('universal-sentence-encoder-multilingual', "num_universal-sentence-encoder-multilingual_models"),
    ('universal-sentence-encoder', "num_universal-sentence-encoder_models"),
    ('distiluse-base-multilingual-cased', "num_distiluse-base-multilingual-cased_models"),
    ('paraphrase-multilingual-minilm-l12-v2', "num_paraphrase-multilingual-MiniLM-L12-v2_models"),
)


def _load_entries(path):
    """Return the parsed JSON stored at *path*, or an empty list when the file does not exist."""
    if not os.path.isfile(path):
        return []
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def _gard_ids_in(text):
    """Return every GARD ID found in *text*, in order, searching sentence by sentence."""
    found = []
    for sent in sent_tokenize(text):
        _, gard_ids = gard_rd_data.get_diseases(sent)
        found += gard_ids
    return found


def _count_models(subreddit, stats):
    """Add the number of models of each type found in the subreddit's results directory to *stats*."""
    dz_results_path = os.path.join(results_path, subreddit)
    if not os.path.exists(dz_results_path):
        return
    prefix = subreddit.lower() + '_'
    for model in os.listdir(dz_results_path):
        if model == '.DS_Store':
            continue
        model_name = model.lower()
        # Classify on the part after the "<subreddit>_" prefix, so a subreddit
        # whose own name contains e.g. 'eom' or 'leaf' does not make every one
        # of its models count as that type.
        model_part = model_name[len(prefix):] if model_name.startswith(prefix) else model_name
        if 'csv' in model_part:
            continue
        for marker, counter_key in _MODEL_COUNTERS:
            if marker in model_part:
                stats[counter_key] += 1
                break
        else:
            print(f"Unaccounted model: {model_name}")


for subreddit in tqdm(union_subreddits):
    # Initialize base dictionary for each subreddit
    stats = {
      "num_unique_posts_comments":0,
      "GARD_IDs_mentioned":list(),
      "num_GARD_IDs_mentioned":0,
      "unique_GARD_IDs_mentioned":set(),
      "num_unique_GARD_IDs_mentioned":0,
      "cosine_similarity_of_GARD_IDs":None,
      "num_EOM_models":0,
      "num_Leaf_models":0,
      "num_all-MiniLM-L6-v2_models":0,
      "num_universal-sentence-encoder_models":0,
      "num_universal-sentence-encoder-multilingual_models":0,
      "num_paraphrase-multilingual-MiniLM-L12-v2_models":0,
      "num_distiluse-base-multilingual-cased_models":0,
     }
    reddit_analysis_overview[subreddit] = stats

    comments_data = _load_entries(os.path.join(comments_path, subreddit+"_comments.json"))
    posts_data = _load_entries(os.path.join(posts_path, subreddit+"_posts.json"))

    # COMMENTS
    # Comments go first because their 'all_text' is used in place of the
    # matching post text. Each id is analysed once, on its first usable entry.
    comment_ids = set()
    for entry in comments_data:
        # Not every entry carries the text field, so look it up defensively.
        text = entry.get('all_text')
        if text is None or len(text) <= 5 or text == '[deleted]':
            continue
        if entry['id'] in comment_ids:
            continue
        comment_ids.add(entry['id'])
        stats["GARD_IDs_mentioned"] += _gard_ids_in(text)

    # POSTS
    # Only posts whose id was not already covered by a comment entry.
    post_only_ids = set()
    for entry in posts_data:
        post_id = entry['id']
        if post_id in comment_ids or post_id in post_only_ids:
            continue
        if 'selftext' not in entry or 'title' not in entry:
            continue
        title, selftext = entry['title'], entry['selftext']
        if len(title+selftext) <= 5 or selftext == '[deleted]':
            continue
        post_only_ids.add(post_id)
        stats["GARD_IDs_mentioned"] += _gard_ids_in(title+' '+selftext)

    # The two id sets are disjoint by construction, so their sizes add up.
    stats["num_unique_posts_comments"] = len(comment_ids) + len(post_only_ids)
    stats["num_GARD_IDs_mentioned"] = len(stats["GARD_IDs_mentioned"])
    stats["unique_GARD_IDs_mentioned"] = set(stats["GARD_IDs_mentioned"])
    stats["num_unique_GARD_IDs_mentioned"] = len(stats["unique_GARD_IDs_mentioned"])

    # Results data (summing all of the models created)
    _count_models(subreddit, stats)

 70%|█████████████████████████████████████████████████▋                     | 119/170 [15:55<12:23, 14.59s/it]

Unaccounted model: cysticfibrosis_hp_alpha_atpe_c_v_60
Unaccounted model: cysticfibrosis_hp_rand_c_v_101
Unaccounted model: cysticfibrosis_hp_atpe_c_v_134
Unaccounted model: cysticfibrosis_hp_atpe_all_c_v_117
Unaccounted model: cysticfibrosis_hp_atpe_c_v_129


100%|███████████████████████████████████████████████████████████████████████| 170/170 [18:42<00:00,  6.60s/it]


### 3. Convert into pandas dataframe to pretty print and save data easily

In [51]:
import pandas as pd 
# Column order of the overview table; each name is a key of the per-subreddit
# statistics dictionaries stored in reddit_analysis_overview.
columns = [
    "GARD_IDs_mentioned",
    "num_GARD_IDs_mentioned",
    "num_unique_posts_comments",
    "unique_GARD_IDs_mentioned",
    "num_unique_GARD_IDs_mentioned",
    "cosine_similarity_of_GARD_IDs",
    "num_EOM_models",
    "num_Leaf_models",
    "num_all-MiniLM-L6-v2_models",
    "num_universal-sentence-encoder_models",
    "num_universal-sentence-encoder-multilingual_models",
    "num_paraphrase-multilingual-MiniLM-L12-v2_models",
    "num_distiluse-base-multilingual-cased_models",
]

# One row per subreddit. The values are pulled out explicitly in `columns`
# order so the table layout does not depend on the key order of the
# per-subreddit dictionaries.
df = pd.DataFrame.from_dict(
    {
        name: [stats[col] for col in columns]
        for name, stats in reddit_analysis_overview.items()
    },
    orient="index",
    columns=columns,
)

In [52]:
df

Unnamed: 0,GARD_IDs_mentioned,num_GARD_IDs_mentioned,num_unique_posts_comments,unique_GARD_IDs_mentioned,num_unique_GARD_IDs_mentioned,cosine_similarity_of_GARD_IDs,num_EOM_models,num_Leaf_models,num_all-MiniLM-L6-v2_models,num_universal-sentence-encoder_models,num_universal-sentence-encoder-multilingual_models,num_paraphrase-multilingual-MiniLM-L12-v2_models,num_distiluse-base-multilingual-cased_models
Cornea_Transplant,[],0,2,{},0,,0,0,0,0,0,0,0
MeaslesParty,"[GARD:0003434, GARD:0003434, GARD:0003434, GAR...",16,40,"{GARD:0003434, GARD:0006961}",2,,0,0,0,0,0,0,0
Angioedema,"[GARD:0005979, GARD:0005979, GARD:0005979, GAR...",7,32,"{GARD:0008605, GARD:0007137, GARD:0005979}",3,,0,0,0,0,0,0,0
KawasakiDisease,"[GARD:0012124, GARD:0006816, GARD:0006816, GAR...",25,34,"{GARD:0010927, GARD:0006816, GARD:0012124, GAR...",4,,0,0,0,0,0,0,0
soundsensitivity,"[GARD:0009655, GARD:0012058, GARD:0012058, GAR...",5,9,"{GARD:0009655, GARD:0012058}",2,,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
Myasthenia_Gravis,"[GARD:0007122, GARD:0007122, GARD:0007122, GAR...",74,104,"{GARD:0007922, GARD:0011902, GARD:0007122, GAR...",8,,2,3,0,1,0,0,0
EbsteinsAnomaly,"[GARD:0006313, GARD:0006313]",2,15,{GARD:0006313},1,,0,0,0,0,0,0,0
TMAU,"[GARD:0006447, GARD:0006447, GARD:0006447, GAR...",739,454,"{GARD:0011005, GARD:0006961, GARD:0008457, GAR...",10,,5,5,1,1,1,1,1
HDsupport,[GARD:0010927],1,26,{GARD:0010927},1,,0,0,0,0,0,0,0


In [53]:
# statistics.mode raises StatisticsError on an empty list, so wrap it to return None instead
from statistics import mode
def no_error_mode(lst):
    """Return the ``id_dict`` label for the most frequent item of ``lst``.

    A falsy ``lst`` (empty list, ``None``) yields ``None`` without calling
    ``statistics.mode``. Otherwise the modal value is looked up in
    ``gard_rd_data.id_dict`` and the first element of that entry is returned
    (presumably the primary disease name for the GARD ID -- confirm against
    ``gard_rd_data``).
    """
    if not lst:
        return None
    most_common_id = mode(lst)
    return gard_rd_data.id_dict[most_common_id][0]

In [54]:
from statistics import mode
# Add the label of the most frequently mentioned GARD ID as the first column
# (None for subreddits with no GARD mentions).
df.insert(0, "Mode_GARD_IDs", df["GARD_IDs_mentioned"].apply(no_error_mode))

In [55]:
df

Unnamed: 0,Mode_GARD_IDs,GARD_IDs_mentioned,num_GARD_IDs_mentioned,num_unique_posts_comments,unique_GARD_IDs_mentioned,num_unique_GARD_IDs_mentioned,cosine_similarity_of_GARD_IDs,num_EOM_models,num_Leaf_models,num_all-MiniLM-L6-v2_models,num_universal-sentence-encoder_models,num_universal-sentence-encoder-multilingual_models,num_paraphrase-multilingual-MiniLM-L12-v2_models,num_distiluse-base-multilingual-cased_models
Cornea_Transplant,,[],0,2,{},0,,0,0,0,0,0,0,0
MeaslesParty,measles,"[GARD:0003434, GARD:0003434, GARD:0003434, GAR...",16,40,"{GARD:0003434, GARD:0006961}",2,,0,0,0,0,0,0,0
Angioedema,deficiency of c1 esterase inhibitor,"[GARD:0005979, GARD:0005979, GARD:0005979, GAR...",7,32,"{GARD:0008605, GARD:0007137, GARD:0005979}",3,,0,0,0,0,0,0,0
KawasakiDisease,mucocutaneous lymph node syndrome,"[GARD:0012124, GARD:0006816, GARD:0006816, GAR...",25,34,"{GARD:0010927, GARD:0006816, GARD:0012124, GAR...",4,,0,0,0,0,0,0,0
soundsensitivity,selective sound sensitivity syndrome,"[GARD:0009655, GARD:0012058, GARD:0012058, GAR...",5,9,"{GARD:0009655, GARD:0012058}",2,,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Myasthenia_Gravis,myasthenia gravis,"[GARD:0007122, GARD:0007122, GARD:0007122, GAR...",74,104,"{GARD:0007922, GARD:0011902, GARD:0007122, GAR...",8,,2,3,0,1,0,0,0
EbsteinsAnomaly,ebstein's malformation,"[GARD:0006313, GARD:0006313]",2,15,{GARD:0006313},1,,0,0,0,0,0,0,0
TMAU,fish malodor syndrome,"[GARD:0006447, GARD:0006447, GARD:0006447, GAR...",739,454,"{GARD:0011005, GARD:0006961, GARD:0008457, GAR...",10,,5,5,1,1,1,1,1
HDsupport,cryopyrin-associated periodic syndrome,[GARD:0010927],1,26,{GARD:0010927},1,,0,0,0,0,0,0,0


In [56]:
df.tail()

Unnamed: 0,Mode_GARD_IDs,GARD_IDs_mentioned,num_GARD_IDs_mentioned,num_unique_posts_comments,unique_GARD_IDs_mentioned,num_unique_GARD_IDs_mentioned,cosine_similarity_of_GARD_IDs,num_EOM_models,num_Leaf_models,num_all-MiniLM-L6-v2_models,num_universal-sentence-encoder_models,num_universal-sentence-encoder-multilingual_models,num_paraphrase-multilingual-MiniLM-L12-v2_models,num_distiluse-base-multilingual-cased_models
Myasthenia_Gravis,myasthenia gravis,"[GARD:0007122, GARD:0007122, GARD:0007122, GAR...",74,104,"{GARD:0007922, GARD:0011902, GARD:0007122, GAR...",8,,2,3,0,1,0,0,0
EbsteinsAnomaly,ebstein's malformation,"[GARD:0006313, GARD:0006313]",2,15,{GARD:0006313},1,,0,0,0,0,0,0,0
TMAU,fish malodor syndrome,"[GARD:0006447, GARD:0006447, GARD:0006447, GAR...",739,454,"{GARD:0011005, GARD:0006961, GARD:0008457, GAR...",10,,5,5,1,1,1,1,1
HDsupport,cryopyrin-associated periodic syndrome,[GARD:0010927],1,26,{GARD:0010927},1,,0,0,0,0,0,0,0
AddisonsDisease,hypoadrenocorticism familial,"[GARD:0005740, GARD:0005740, GARD:0005740, GAR...",1045,1984,"{GARD:0007922, GARD:0001467, GARD:0005725, GAR...",96,,5,5,1,1,1,1,1


In [57]:
# Strip tab characters from string cells so they cannot clash with the
# tab separator used when saving. DataFrame.replace returns a new frame, so
# the result must be assigned back; regex=True makes the pattern match tabs
# anywhere inside a string rather than only cells that are exactly "\t".
df = df.replace("\t", " ", regex=True)
df

Unnamed: 0,Mode_GARD_IDs,GARD_IDs_mentioned,num_GARD_IDs_mentioned,num_unique_posts_comments,unique_GARD_IDs_mentioned,num_unique_GARD_IDs_mentioned,cosine_similarity_of_GARD_IDs,num_EOM_models,num_Leaf_models,num_all-MiniLM-L6-v2_models,num_universal-sentence-encoder_models,num_universal-sentence-encoder-multilingual_models,num_paraphrase-multilingual-MiniLM-L12-v2_models,num_distiluse-base-multilingual-cased_models
Cornea_Transplant,,[],0,2,{},0,,0,0,0,0,0,0,0
MeaslesParty,measles,"[GARD:0003434, GARD:0003434, GARD:0003434, GAR...",16,40,"{GARD:0003434, GARD:0006961}",2,,0,0,0,0,0,0,0
Angioedema,deficiency of c1 esterase inhibitor,"[GARD:0005979, GARD:0005979, GARD:0005979, GAR...",7,32,"{GARD:0008605, GARD:0007137, GARD:0005979}",3,,0,0,0,0,0,0,0
KawasakiDisease,mucocutaneous lymph node syndrome,"[GARD:0012124, GARD:0006816, GARD:0006816, GAR...",25,34,"{GARD:0010927, GARD:0006816, GARD:0012124, GAR...",4,,0,0,0,0,0,0,0
soundsensitivity,selective sound sensitivity syndrome,"[GARD:0009655, GARD:0012058, GARD:0012058, GAR...",5,9,"{GARD:0009655, GARD:0012058}",2,,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Myasthenia_Gravis,myasthenia gravis,"[GARD:0007122, GARD:0007122, GARD:0007122, GAR...",74,104,"{GARD:0007922, GARD:0011902, GARD:0007122, GAR...",8,,2,3,0,1,0,0,0
EbsteinsAnomaly,ebstein's malformation,"[GARD:0006313, GARD:0006313]",2,15,{GARD:0006313},1,,0,0,0,0,0,0,0
TMAU,fish malodor syndrome,"[GARD:0006447, GARD:0006447, GARD:0006447, GAR...",739,454,"{GARD:0011005, GARD:0006961, GARD:0008457, GAR...",10,,5,5,1,1,1,1,1
HDsupport,cryopyrin-associated periodic syndrome,[GARD:0010927],1,26,{GARD:0010927},1,,0,0,0,0,0,0,0


#### Save

In [58]:
# Write the overview table as a tab-separated file; missing values become empty fields.
df.to_csv("reddit_stats.tsv", sep="\t", na_rep="", header=True)

### Unused Code

This is the version from before realizing that the comments' 'all_text' field should be used instead of 'selftext'.

```
from tqdm import tqdm
from nltk.tokenize import sent_tokenize

# This is going to iterate through all subreddits that have at least one 
# results, posts, or comments data and only perform operations on the data that exists
for subreddit in tqdm(union_subreddits):
    # Initialize base dictionary for each subreddit 
    reddit_analysis_overview[subreddit]={
      "num_unique_posts_comments":0,
      "GARD_IDs_mentioned":list(),
      "num_GARD_IDs_mentioned":0,
      "unique_GARD_IDs_mentioned":set(),
      "num_unique_GARD_IDs_mentioned":0,
      "cosine_similarity_of_GARD_IDs":None,
      "num_EOM_models":0,
      "num_Leaf_models":0,
      "num_all-MiniLM-L6-v2_models":0,
      "num_universal-sentence-encoder_models":0,
      "num_universal-sentence-encoder-multilingual_models":0,
      "num_paraphrase-multilingual-MiniLM-L12-v2_models":0,
      "num_distiluse-base-multilingual-cased_models":0,
     }
    
    #print("subreddit",subreddit)
    
    #Initialize empty set and update depending on which files are available.
    unique_ids = set()
    
    # POSTS
    posts_file = os.path.join(posts_path,subreddit+"_posts.json")
    if os.path.isfile(posts_file):
        #Load posts
        with open(posts_file) as f:
            posts_data = json.load(f)
        #print("posts of {} loaded".format(subreddit))

        post_ids = set()
        for e in posts_data:
            if 'selftext' in e.keys():
                if len(e['selftext'])>5:
                    post_ids.add(e['id'])
        
        unique_ids.update(post_ids)
    
    # COMMENTS
    comments_file = os.path.join(comments_path,subreddit+"_comments.json")
    if os.path.isfile(comments_file):
        #Load comments
        with open(os.path.join(comments_path,subreddit+"_comments.json")) as f:
            comments_data = json.load(f)

        #print("comments of {} loaded".format(subreddit))

        #comment_ids = {e['id'] for e in comments_data if len(e['selftext'])>5}
        #post_ids = {e['id'] for e in posts_data if len(e['selftext'])>5}
        # Cannot do set constructions because not all entries have the 'selftext' attribute and an error throws 
        comment_ids = set()
        for e in comments_data:
            if 'selftext' in e.keys():
                if len(e['selftext'])>5:
                    comment_ids.add(e['id'])
        
        unique_ids.update(comment_ids)
    
    
    #print(f"Number of unique entries in {subreddit} is {len(unique_ids)}.")
    reddit_analysis_overview[subreddit]["num_unique_posts_comments"]=len(unique_ids)
    
    #Loop through all of the textual data
    if os.path.isfile(posts_file):
        #print("analyzing posts data")
        for entry in posts_data:
            if len(unique_ids)==0:
                break
            if entry['id'] in unique_ids:
                sentences = sent_tokenize(entry['selftext'])
                for sent in sentences:
                    _, gard_ids = gard_rd_data.get_diseases(sent)
                    if gard_ids:
                        reddit_analysis_overview[subreddit]["GARD_IDs_mentioned"]+=gard_ids
                unique_ids.remove(entry['id'])
    else:
        print(f"posts_file {posts_file} does not exist")

    if os.path.isfile(comments_file):   
        #print("analyzing comments data")
        for entry in comments_data:
            if len(unique_ids)==0:
                break
            if entry['id'] in unique_ids:
                sentences = sent_tokenize(entry['selftext'])
                Gard_id_list = list()
                for sent in sentences:
                    _, gard_ids = gard_rd_data.get_diseases(sent)
                    if gard_ids:
                        #print(sent)
                        reddit_analysis_overview[subreddit]["GARD_IDs_mentioned"]+=gard_ids
                unique_ids.remove(entry['id'])
    #else:
    #    print(f"comments_file {comments_file} does not exist")  

    if len(unique_ids)>0:
        print(f"Unique post/comment entries left: {unique_ids}")
            
    reddit_analysis_overview[subreddit]["num_GARD_IDs_mentioned"]=len(reddit_analysis_overview[subreddit]["GARD_IDs_mentioned"])
    reddit_analysis_overview[subreddit]["unique_GARD_IDs_mentioned"].update(set(reddit_analysis_overview[subreddit]["GARD_IDs_mentioned"]))
    reddit_analysis_overview[subreddit]["num_unique_GARD_IDs_mentioned"] = len(reddit_analysis_overview[subreddit]["unique_GARD_IDs_mentioned"])
    #print("len(reddit_analysis_overview[subreddit]['unique_GARD_IDs_mentioned'])",len(reddit_analysis_overview[subreddit]["unique_GARD_IDs_mentioned"]))
    
    # Results data (summing all of the models created)
    dz_results_path = os.path.join(results_path,subreddit)
    if os.path.exists(dz_results_path):
        #print("Analyzing dz_results_path",dz_results_path)
        models_created = os.listdir(dz_results_path)
        
        if '.DS_Store' in models_created:
            models_created.remove('.DS_Store')
        else:
            print(f"{models_created} did not have .DS_Store file.")
        for model in models_created:
            model_name = model.lower()
            if 'csv' not in model_name:
                if 'leaf' in model_name:
                    reddit_analysis_overview[subreddit]["num_Leaf_models"]+=1
                elif 'eom' in model_name:
                    reddit_analysis_overview[subreddit]["num_EOM_models"]+=1
                elif 'all-minilm-l6-v2' in model_name:
                    reddit_analysis_overview[subreddit]["num_all-MiniLM-L6-v2_models"]+=1
                elif 'universal-sentence-encoder-multilingual' in model_name:
                    reddit_analysis_overview[subreddit]["num_universal-sentence-encoder-multilingual_models"]+=1
                elif 'universal-sentence-encoder' in model_name:
                    reddit_analysis_overview[subreddit]["num_universal-sentence-encoder_models"]+=1
                elif 'distiluse-base-multilingual-cased' in model_name:
                    reddit_analysis_overview[subreddit]["num_distiluse-base-multilingual-cased_models"]+=1
                elif 'paraphrase-multilingual-minilm-l12-v2' in model_name:
                    reddit_analysis_overview[subreddit]["num_paraphrase-multilingual-MiniLM-L12-v2_models"]+=1
                else:
                    print(f"Unaccounted model: {model_name}")
    #else:
    #    print(f"dz_results_path {dz_results_path} does not exist") 
```

This is an even earlier version of the code:

```
from tqdm import tqdm
from nltk.tokenize import sent_tokenize
#Do all of the joint ones
for subreddit in tqdm(subreddits_w_comments_posts):
    # Start every subreddit with an empty list of GARD ID mentions.
    reddit_analysis_overview[subreddit] = {"GARD_IDs_mentioned": []}

    # Load posts
    with open(os.path.join(posts_path, subreddit + "_posts.json")) as f:
        posts_data = json.load(f)

    # Load comments
    with open(os.path.join(comments_path, subreddit + "_comments.json")) as f:
        comments_data = json.load(f)

    # Not every entry has a 'selftext' attribute, so test for the key before
    # measuring its length; only entries with more than 5 characters count.
    post_ids = {
        e['id'] for e in posts_data
        if 'selftext' in e and len(e['selftext']) > 5
    }
    comment_ids = {
        e['id'] for e in comments_data
        if 'selftext' in e and len(e['selftext']) > 5
    }

    unique_ids = comment_ids.union(post_ids)

    # Fixed: the original mixed an f-string with empty {} placeholders and
    # .format(), which is a SyntaxError.
    print(f"len(unique_ids) in {subreddit} is {len(unique_ids)}")

    reddit_analysis_overview[subreddit]["num_unique_posts_comments"] = len(unique_ids)

    # Loop through all of the textual data, posts first and then comments.
    # Each id is removed once handled, so an id is analysed at most once.
    for data in (posts_data, comments_data):
        for entry in data:
            if entry['id'] in unique_ids:
                for sent in sent_tokenize(entry['selftext']):
                    _, gard_ids = gard_rd_data.get_diseases(sent)
                    if gard_ids:
                        reddit_analysis_overview[subreddit]["GARD_IDs_mentioned"] += gard_ids
                unique_ids.remove(entry['id'])

    reddit_analysis_overview[subreddit]["num_GARD IDs mentioned"] = len(reddit_analysis_overview[subreddit]["GARD_IDs_mentioned"])
    reddit_analysis_overview[subreddit]["unique_GARD_IDs_mentioned"] = set(reddit_analysis_overview[subreddit]["GARD_IDs_mentioned"])
    reddit_analysis_overview[subreddit]["num_unique_GARD IDs mentioned"] = len(reddit_analysis_overview[subreddit]["unique_GARD_IDs_mentioned"])
```

```
#Do all of the subreddits that have posts only
for subreddit in subreddits_posts_only:
    if subreddit not in subreddits_w_comments_posts:
        with open(os.path.join(posts_path,subreddit+"_posts.json")) as f:
            posts_data = json.load(f)
        
        post_ids = set()
        
        for e in posts_data:
            if 'selftext' in e.keys():
                if len(e['selftext'])>5:
                    post_ids.add(e['id'])
        
        reddit_analysis_overview[subreddit]["num_unique_posts_comments"] =len(unique_ids)
    
    #Loop through all of the textual data
    while len(unique_ids)>0:
        for entry in posts_data:
            if entry['id'] in unique_ids:
                sentences = sent_tokenize(entry['selftext'])
                for sent in sentences:
                    _, gard_ids = gard_rd_data.get_diseases(sent)
                    if gard_ids:
                        print(sent)
                    reddit_analysis_overview[subreddit]["GARD_IDs_mentioned"]+=gard_ids
                
                unique_ids.remove(entry['id'])
        #Do not remove this break
        break
    reddit_analysis_overview[subreddit]["num_GARD IDs mentioned"]=len(reddit_analysis_overview[subreddit]["GARD_IDs_mentioned"])
    reddit_analysis_overview[subreddit]["unique_GARD_IDs_mentioned"] = set(reddit_analysis_overview[subreddit]["GARD_IDs_mentioned"])
    reddit_analysis_overview[subreddit]["num_unique_GARD IDs mentioned"]=len(reddit_analysis_overview[subreddit]["unique_GARD_IDs_mentioned"])
```