## Part 1

In [9]:
from lxml import html
import requests
import wget
import os
import json

data_page="https://dmice.ohsu.edu/bedricks/"
page=requests.get(data_page)
tree=html.fromstring(page.content)
z = tree.cssselect("ul li a")
size=0
articles=0
for i in range(len(z)):
    a= z[i].attrib["href"][2:]
    size += os.path.getsize(a)
    articles += sum(1 for line in open(a))

    
print("1. Total size of dataset is {} bytes or {}Gb".format(size, size/1000**3))
print("2. There are {} articles in the data set".format(articles))

1. Total size of dataset is 1746235959 bytes or 1.746235959Gb
2. There are 500630 articles in the data set


In [None]:
for i in range(len(z)):
    a= "https://dmice.ohsu.edu/bedricks/{}".format(z[i].attrib["href"][2:])
    wget.download(a)

## Part 2 



In [10]:
from collections import defaultdict
from tabulate import tabulate

def get_author_list(author_list, from_doc):
    for names in from_doc:
        a=""
        for key in names.keys():
            a = a + names[key] + " "
        author_list[a]+=1
    return author_list

def get_mesh_index(mesh_index, mesh_head):
    for mesh in j_content["mesh_headings"]:
        if mesh["major"] == True:
            mesh_index[mesh["descriptor_name"]] += 1  
    return mesh_index


years = defaultdict(int)
authors = defaultdict(int)
mesh_index = defaultdict(int)

done=0
count=0
abstract_len=0

for i in range(len(z)):
    a= z[i].attrib["href"][2:]
    with open(a) as f:
        for line in f:
            j_content = json.loads(line)
            years[j_content['year']] += 1
            authors = get_author_list(authors,j_content["authors"])
            if "mesh_headings" in j_content.keys():
                mesh_index = get_mesh_index(mesh_index, j_content)
            if "abstract" in j_content.keys():
                abstract_len += len(j_content["abstract"])
                count += 1
            if (done == 0 and 'doi' in j_content.keys()):
                doi_init = j_content['year']
                done = 1
                
years_number=max(years.items())[0] - min(years.items())[0]
largest = max(years.items(), key=lambda x: x[1])
most_articles = max(authors.items(), key=lambda x: x[1])
sorted_mesh = sorted(mesh_index.items(), key=lambda x: x[1],reverse=True)

print("1. {} years of data is represented in the dataset".format(years_number))
print("2. Oldest record in from {}, and the newest from {}".format(min(years.items())[0],max(years.items())[0]))
print("3. Year {} had the highest number of clinical trials({})".format(largest[0],largest[1]))
print("4. The first DOI was indexed in {}".format(doi_init)) 
print("5. {} is the most prolific author with {} articles".format(most_articles[0][:-1],most_articles[1]))
print("6. The ten most common MeSH index major headings associated with clinical trials are:")            
print(tabulate(sorted_mesh[:10], headers=['MeSH Heading', 'Articles']))
print("7. Average number of words in an abstract are {}".format(int(abstract_len/count)))


1. 54 years of data is represented in the dataset
2. Oldest record in from 1962, and the newest from 2016
3. Year 2001 had the highest number of clinical trials(30053)
4. The first DOI was indexed in 1975
5. H Tanaka H is the most prolific author with 235 articles
6. The ten most common MeSH index major headings associated with clinical trials are:
MeSH Heading                               Articles
---------------------------------------  ----------
Quality of Life                                2845
Coronary Artery Bypass                         1968
Stents                                         1953
Renal Dialysis                                 1779
Exercise                                       1757
Kidney Transplantation                         1721
Premedication                                  1639
Magnetic Resonance Imaging                     1536
Dietary Supplements                            1448
Hematopoietic Stem Cell Transplantation        1441
7. Average number of word

## Part 3

In [2]:
from article_pb2 import *

def json_to_proto(a,j_content):
    for key in j_content.keys():
        if key == 'title':
            a.title = j_content["title"]

        if key == 'pmid':
            a.pmid = j_content["pmid"]

        if key == 'abstract':
            a.abstract = j_content["abstract"]

        if key == 'year':
            a.year = j_content["year"]

        if key == 'doi':
            a.doi = j_content["doi"]

        if key == 'publication data':
            for key1 in j_content['publication_data'].keys():
                a.key1=j_content[key1]

        if key == 'authors':
            for i in range(len(j_content['authors'])):
                author = a.authors.add()
                for key2 in j_content['authors'][i].keys():
                    if key2 == 'forename':
                        author.forename = j_content['authors'][i][key2]
                    if key2 == 'initials':
                        author.initials = j_content['authors'][i][key2]
                    if key2 == 'lastname':
                        author.lastname = j_content['authors'][i][key2]

        if key == 'publication_types':
            for i in range(len(j_content['publication_types'])):
                pub_type = a.publication_types.add()
                for key3 in j_content['publication_types'][i].keys():
                    if key3 == 'name':
                        pub_type.name = j_content['publication_types'][i][key3]
                    if key3 == 'ui':
                        pub_type.ui = j_content['publication_types'][i][key3]

        if key == 'mesh_headings':
            for i in range(len(j_content['mesh_headings'])):
                mesh_head = a.mesh_headings.add()
                for key4 in j_content['mesh_headings'][i].keys():
                    if key4 == 'descriptor_name':
                        mesh_head.descriptor_name = j_content['mesh_headings'][i][key4]
                    if key4 == 'major':
                        mesh_head.major = j_content['mesh_headings'][i][key4]
                    if key4 == 'ui':
                        mesh_head.ui = j_content['mesh_headings'][i][key4]
                    if key4 == 'qualifiers':
                        for j in range(len(j_content['mesh_headings'][i][key4])):
                            qual = mesh_head.qualifiers.add()
                            for key5 in j_content['mesh_headings'][i][key4][j].keys():
                                if key5 == 'qualifier_name':
                                    qual.qualifier_name=j_content['mesh_headings'][i][key4][j][key5]
                                if key5 == 'major':
                                    qual.major=j_content['mesh_headings'][i][key4][j][key5]
                                if key5 == 'ui':
                                    qual.ui=j_content['mesh_headings'][i][key4][j][key5]

    return a



In [4]:
from article_pb2 import *

j=1
with open("article_list_new.pb","wb") as outfile:
    for i in range(len(z)):
        print(i)
        article_list_write =  Article_list()
        a= z[i].attrib["href"][2:]
        with open(a) as f:
            for line in f:
                j_content = json.loads(line)
                a = article_list_write.articles.add()
                a = json_to_proto(a, j_content)
                j+=1

        outfile.write(article_list_write.SerializeToString())
print("Total number of articles written into protocol buffer : {}",format(j))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
Total number of articles written into protocol buffer : {} 500631


In [None]:
infile = open("article_list.pb","rb")
some_article = Article_list()
some_article.ParseFromString(infile.read())
infile.close

i=1
for article in some_article.articles:
    i+=1
    
print("Total number of articles read from protocol buffer : {}",format(i))

In [7]:
size1 = os.path.getsize("article_list_new.pb")

In [12]:
print("Total size of dataset as json is {} bytes or {}Gb".format(size, size/1000**3))
print("Total size of dataset as protocal buffer is {} bytes or {}Gb".format(size1, size1/1000**3))
print("It saves {} Mb of space".format((size-size1)/1000**2))

Total size of dataset as json is 1746235959 bytes or 1.746235959Gb
Total size of dataset as protocal buffer is 1094383782 bytes or 1.094383782Gb
It saves 651.852177 Mb of space
