# Metadata Extraction
This notebook scraps the information from JGI html page of the project and returns a Python dict

In [1]:
import pandas as pd
from pathlib import Path
from bs4 import BeautifulSoup
import numpy as np
import pathlib

Load html result..

In [2]:
with open("../data/raw/Aspergillus whole-genus sequencing.html", 'r') as f:
    contents = f.read()
    
soup = BeautifulSoup(contents, 'html.parser')

Extracting metadata...

In [3]:
ctr = 0
metadata = {}
for child in soup.recursiveChildGenerator():
    if child.name == 'td':
        try:
            if child.attrs['data-title'] == "'Name'":
                name = child.find('div', {'dynamic' : 'result.homelink'})
                name = name.text.replace("Project:", "").strip()
                #print(name)
                metadata[ctr] = {'name' : name}
            if child.attrs['data-title'] == "'Resources'":
                resources = child.find('div', {'dynamic' : 'genomePortal'})
                resource_status = child.find('span', {'style' : 'font-weight: bold'}).text.strip()
                try:
                    resource_id = resources.text.strip()
                    resource_link = resources.find("a").attrs['href']
                except AttributeError:
                    resource_id = None
                    resource_link = None
                #print(resource_id, resource_status, resource_link)
                metadata[ctr]['resource_id'] = resource_id
                metadata[ctr]['resource_link'] = resource_link
            if child.attrs['data-title'] == "'JGI Info'":
                jgi_info = child.find('a', {'ng-bind-html' : 'result.jgiProjectId'})
                project_id = jgi_info.text.strip()
                project_link = jgi_info['href']
                folder_name = project_link.replace("https://genome.jgi.doe.gov/portal/pages/projectStatus.jsf?db=", "")
                related_projects = [i.text for i in child.find_all('div', {'dynamic' : 'project'}) if i.text != ""]
                #print(project_id, related_projects)
                metadata[ctr]['project_id'] = project_id
                metadata[ctr]['project_link'] = project_link
                metadata[ctr]['folder_name'] = folder_name
                metadata[ctr]['related_projects'] = related_projects
            if child.attrs['data-title'] == "'Status'":
                status = child.find("span", {"ng-bind-html" : "result.projectStatus"}).text.strip()
                date = child.find("span", {"ng-bind-html" : "result.projectStatusDate"}).text.strip()
                #print(status, date)
                metadata[ctr]['status'] = status
                metadata[ctr]['release_date'] = date
            if child.attrs['data-title'] == "'Contacts'":
                contacts = child.find('a', {'bo-href' : "'mailto:'+ stripHtmlTags(proposalPi.email)"})
                contact_name = contacts.text
                contact_mail = contacts['href'].replace("mailto:", "")
                metadata[ctr]['contact_name'] = contact_name
                metadata[ctr]['contact_mail'] = contact_mail
                ctr = ctr + 1
                #print("___")
        except KeyError:
            pass

## Cleaning

In [4]:
df = pd.DataFrame.from_dict(metadata).T

In [5]:
df.head(2)

Unnamed: 0,name,resource_id,resource_link,project_id,project_link,folder_name,related_projects,status,release_date,contact_name,contact_mail
0,Aspergillus acristatulus CBS 119.55 Annotated ...,Aspacri1,https://mycocosm.jgi.doe.gov/Aspacri1,1052020,https://genome.jgi.doe.gov/portal/pages/projec...,AspacrStandDraft_FD,"[SP 1052022, SP 1052021, AP 1052024, AP 105202...",Complete,2014-03-25,Scott E. Baker,scott.baker@pnnl.gov
1,Aspergillus aculeatinus CBS 121060 Annotated S...,Aspacu1,https://mycocosm.jgi.doe.gov/Aspacu1,1027254,https://genome.jgi.doe.gov/portal/pages/projec...,AspacuStandDraft_FD,"[SP 1027256, SP 1027255, AP 1027257]",Complete,2014-12-18,Scott E. Baker,scott.baker@pnnl.gov


In [6]:
# Path of the data
data_path = Path("../data/raw/").resolve()

# Checking if all assemblies are present
for i in df.index:
    folder_name = df.loc[i, "folder_name"]
    resource_id = df.loc[i, "resource_id"]
    masked_assembly_file = data_path / f"{folder_name}/Assembly/Mycocosm/Assembly/Assembled_scaffolds__masked_/{resource_id}_AssemblyScaffolds_Repeatmasked.fasta.gz"
    if masked_assembly_file.is_file():
        df.loc[i, "masked_assemblies"] = masked_assembly_file
        df.loc[i, "assembly_id_alias"] = resource_id
    else:
        print(f"Problem with {folder_name} : {resource_id}")
        test_path = data_path / f"{folder_name}/Assembly/Mycocosm/Assembly/Assembled_scaffolds__masked_/"
        assembly_files = [i for i in test_path.glob("*.fasta.gz")]
        if len(assembly_files) > 1:
            print("   multiple file detected")
            print([i.name for i in assembly_files])
            assembly_files = assembly_files
        else:
            try:
                correct_id = assembly_files[0].name.replace("_AssemblyScaffolds_Repeatmasked.fasta.gz", "")
                print(f"   Other id found: {resource_id} --> {correct_id}")
                df.loc[i, "assembly_id_alias"] = correct_id
            except:
                print("   WARNING: No assembly found")
                assembly_files = np.nan
        df.at[i, "masked_assemblies"] = assembly_files
        print("")

Problem with AspaurMinimDraft_3_FD : Aspaurful1_test
   Other id found: Aspaurful1_test --> Aspaurful1

Problem with AspcamStandDraft_FD : 2761201767
   Other id found: 2761201767 --> Aspcam1

Problem with AspcanStandDraft_FD : Aspcan1
   Other id found: Aspcan1 --> Aspcand1

Problem with AspeucStandDraft_FD : Aspeuc1

Problem with AsphetStandDraft_FD : Asphet1

Problem with AspheyMinimDraft_FD : Asphey1_P5
   Other id found: Asphey1_P5 --> Asphey1

Problem with AspintMinimDraft_FD : Aspint1
   multiple file detected
['14097814-Aspint1_AssemblyScaffolds_Repeatmasked.fasta.gz', '14010937-Aspint1_AssemblyScaffolds_Repeatmasked.fasta.gz']

Problem with AspisrMinimDraft_FD : Aspisr1_1
   Other id found: Aspisr1_1 --> Aspisr1

Problem with AsplacStandDraft_FD : Asplac1

Problem with AspmonMinimDraft_FD : Aspmon1
   multiple file detected
['14097818-Aspmon1_AssemblyScaffolds_Repeatmasked.fasta.gz', '13998657-Aspmon1_AssemblyScaffolds_Repeatmasked.fasta.gz', '14097839-Aspmon1_AssemblyScaffold

In [7]:
df.head(2)

Unnamed: 0,name,resource_id,resource_link,project_id,project_link,folder_name,related_projects,status,release_date,contact_name,contact_mail,masked_assemblies,assembly_id_alias
0,Aspergillus acristatulus CBS 119.55 Annotated ...,Aspacri1,https://mycocosm.jgi.doe.gov/Aspacri1,1052020,https://genome.jgi.doe.gov/portal/pages/projec...,AspacrStandDraft_FD,"[SP 1052022, SP 1052021, AP 1052024, AP 105202...",Complete,2014-03-25,Scott E. Baker,scott.baker@pnnl.gov,/datadrive/matin_other_projects/jgi_aspergillu...,Aspacri1
1,Aspergillus aculeatinus CBS 121060 Annotated S...,Aspacu1,https://mycocosm.jgi.doe.gov/Aspacu1,1027254,https://genome.jgi.doe.gov/portal/pages/projec...,AspacuStandDraft_FD,"[SP 1027256, SP 1027255, AP 1027257]",Complete,2014-12-18,Scott E. Baker,scott.baker@pnnl.gov,/datadrive/matin_other_projects/jgi_aspergillu...,Aspacu1


In [8]:
# Checking if all annotations are present
for i in df.index:
    folder_name = df.loc[i, "folder_name"]
    resource_id = df.loc[i, "resource_id"]
    gene_annotation_path = data_path / f"{folder_name}/Annotation/Mycocosm/Annotation/Filtered_Models___best__/Genes/"
    gene_annotations = [i for i in gene_annotation_path.glob("*.gff3.gz")]
    if len(gene_annotations) > 1:
        print(f"Problem with {folder_name} : {resource_id}")
        print("   multiple file detected")
        print("")
        gff3 = gene_annotations
    elif len(gene_annotations) == 1:
        gff3 = gene_annotations[0]
        correct_id = gff3.name.split("_")[0]
        if resource_id != correct_id:
            print(f"Problem with {folder_name} : {resource_id}")
            print(f"   Other id found: {resource_id} --> {correct_id}")
            print("")
            df.loc[i, "annotation_id_alias"] = correct_id
        else:
            df.loc[i, "annotation_id_alias"] = resource_id
    elif len(gene_annotations) == 0:
        print(f"Problem with {folder_name} : {resource_id}")
        print("   WARNING: No annotation found")
        print("")
        gff3 = np.nan
    df.at[i, "annotations"] = gff3

Problem with AspacuStandDraft_FD : Aspacu1

Problem with AspaurMinimDraft_3_FD : Aspaurful1_test
   Other id found: Aspaurful1_test --> Aspaurful1

Problem with AspbruStandDraft_FD : Aspbru1

Problem with AspcamStandDraft_FD : 2761201767
   Other id found: 2761201767 --> Aspcam1

Problem with AspcanStandDraft_FD : Aspcan1
   Other id found: Aspcan1 --> Aspcand1

Problem with AspcosStandDraft_FD : Aspcos1

Problem with AspellStandDraft_FD : Aspell1

Problem with AspeucStandDraft_FD : Aspeuc1

Problem with AspfijStandDraft_FD : Aspfij1

Problem with AsphetStandDraft_FD : Asphet1

Problem with AsphetMinimDraft_FD : Asphethal1
   multiple file detected

Problem with AspheyMinimDraft_FD : Asphey1_P5
   Other id found: Asphey1_P5 --> Asphey1

Problem with AsphomStandDraft_FD : Asphom1

Problem with AspibeStandDraft_FD : Aspibe1

Problem with AspimpStandDraft_FD : Aspimp1
   multiple file detected

Problem with AspindStandDraft_2_FD : Aspind2_1
   Other id found: Aspind2_1 --> Aspind2

Proble

In [9]:
df.head(2)

Unnamed: 0,name,resource_id,resource_link,project_id,project_link,folder_name,related_projects,status,release_date,contact_name,contact_mail,masked_assemblies,assembly_id_alias,annotation_id_alias,annotations
0,Aspergillus acristatulus CBS 119.55 Annotated ...,Aspacri1,https://mycocosm.jgi.doe.gov/Aspacri1,1052020,https://genome.jgi.doe.gov/portal/pages/projec...,AspacrStandDraft_FD,"[SP 1052022, SP 1052021, AP 1052024, AP 105202...",Complete,2014-03-25,Scott E. Baker,scott.baker@pnnl.gov,/datadrive/matin_other_projects/jgi_aspergillu...,Aspacri1,Aspacri1,/datadrive/matin_other_projects/jgi_aspergillu...
1,Aspergillus aculeatinus CBS 121060 Annotated S...,Aspacu1,https://mycocosm.jgi.doe.gov/Aspacu1,1027254,https://genome.jgi.doe.gov/portal/pages/projec...,AspacuStandDraft_FD,"[SP 1027256, SP 1027255, AP 1027257]",Complete,2014-12-18,Scott E. Baker,scott.baker@pnnl.gov,/datadrive/matin_other_projects/jgi_aspergillu...,Aspacu1,,


In [16]:
# How many samples in total?
len(df)

245

In [17]:
def evaluate_jgi(df, column_name):
    null = []
    multiple = []
    good = []
    for i in df.index:
        item = df.loc[i, column_name]
        if type(item) == pathlib.PosixPath:
            assert item.is_file()
            good.append(i)
        elif type(item) == list:
            for link in item:
                assert link.is_file()
            multiple.append(i)
        else:
            null.append(i)
    return good, multiple, null

# How many assemblies can we process?
good_assembly, multiple_assembly, null_assembly = evaluate_jgi(df, "masked_assemblies")
print(f"Total annotated samples: {len(good_assembly + multiple_assembly + null_assembly )}. There are {len(good_assembly)} good assemblies, {len(multiple_assembly)} samples with multiple assemblies, and {len(null_assembly)} samples with no assembly")

# How many annotations can we process?
good_annot, multiple_annot, null_annot = evaluate_jgi(df, "annotations")
print(f"Total annotated samples: {len(good_annot + multiple_annot + null_annot )}.There are {len(good_annot)} good annotation, {len(multiple_annot)} samples with multiple annotation, and {len(null_annot)} samples with no annotation")

Total annotated samples: 245. There are 220 good assemblies, 15 samples with multiple assemblies, and 10 samples with no assembly
Total annotated samples: 245.There are 210 good annotation, 13 samples with multiple annotation, and 22 samples with no annotation


In [30]:
# How many samples have good assembly and annotation?
good_intersect = set(good_annot).intersection(set(good_assembly))
print(f"We have {len(good_intersect)} samples with both single annotation and assembly")

We have 198 samples with both single annotation and assembly


In [31]:
# How many samples have good assembly and annotation?
not_null_intersect = set(good_annot + multiple_annot).intersection(set(good_assembly + multiple_assembly))
print(f"We have {len(not_null_intersect)} samples which is not null")

We have 223 samples which is not null


In [37]:
clean_df = df.loc[list(good_intersect), :]
for i in clean_df.index:
    row = df.loc[i, :]
    if row.resource_id == row.assembly_id_alias == row.annotation_id_alias:
        pass
    else:
        print(i, row.resource_id, row.assembly_id_alias, row.annotation_id_alias)

100 Aspind2_1 Aspind2_1 Aspind2
123 Aspmul1_1 Aspmul1_1 Aspmul1
127 Aspnav1_1 Aspnav1_1 Aspnav1
147 Aspoli1_1 Aspoli1_1 Aspoli1
178 Asprec1_1 Asprec1_1 Asprec1
193 Aspsim1_1 Aspsim1_1 Aspsim1
195 Aspspe1_1 Aspspe1_1 Aspspe1
198 Aspstel1_1 Aspstel1_1 Aspstel1
200 Aspstec1_1 Aspstec1_1 Aspstec1
226 Aspund1_1 Aspund1_1 Aspund1


In [42]:
len(clean_df.project_id.unique())

198

In [43]:
len(clean_df.annotation_id_alias.unique())

198

In [45]:
table_out = Path("../tables")
table_out.mkdir()
clean_df.to_csv("../tables/clean_metadata.csv")