# JSON Schema Mapping

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from rich import print

In [3]:
def generate_json_schema(data):
    if isinstance(data, dict):
        schema = {}
        for key, value in data.items():
            schema[key] = generate_json_schema(value)
        return schema
    elif isinstance(data, list):
        if len(data) > 0:
            return [generate_json_schema(data[0])]
        else:
            return []
    else:
        return type(data).__name__

def print_json_schema(schema, indent=0):
    if isinstance(schema, dict):
        for key, value in schema.items():
            print(' ' * indent + f"{key}: {type(value).__name__}")
            print_json_schema(value, indent + 4)
    elif isinstance(schema, list):
        if len(schema) > 0:
            print(' ' * indent + "[0]: list")
            print_json_schema(schema[0], indent + 4)
        else:
            print(' ' * indent + "[]: list (empty)")
    else:
        print(' ' * indent + schema)

# # Generate and print the JSON schema
# schema = generate_json_schema(sierra_data[0])
# print_json_schema(schema)


In [8]:
overall_json = {
"gene":{
"name":"PR",
"__typename":"Gene"
},
"text":"I13V",
"__typename":"Mutation"
},
schema = generate_json_schema(overall_json)
print_json_schema(schema)

# Downstream Analysis

In [None]:
import json
import pandas as pd

# Assuming `data` is your loaded JSON data and it's a list of dicts

# Lists to hold the extracted information
mutation_data = []
drug_resistance_data = []
gene_sequence_data = []

# Iterate over each entry in the JSON data
for entry in sierra_data:
    # Extract input sequence details
    header = entry['inputSequence']['header']
    sha512 = entry['inputSequence']['SHA512']
    
    # Extract aligned gene sequences and mutations
    for gene_info in entry['alignedGeneSequences']:
        gene_name = gene_info['gene']['name']
        first_aa = gene_info['firstAA']
        last_aa = gene_info['lastAA']
        gene_length = gene_info['gene']['length']
        
        # Store gene sequence data
        gene_sequence_data.append({
            'Header': header,
            'Gene Name': gene_name,
            'First AA': first_aa,
            'Last AA': last_aa,
            'Gene Length': gene_length
        })
        
        # Extract mutation data
        for mutation in gene_info['mutations']:
            mutation_data.append({
                'Header': header,
                'Gene Name': gene_name,
                'Position': mutation['position'],
                'Amino Acids': mutation['AAs'],
                'Is Unusual': mutation['isUnusual'],
                'Primary Type': mutation['primaryType'],
                'Text Description': mutation['text']
            })
    
    # Extract drug resistance information
    for resistance in entry['drugResistance']:
        version = resistance['version']['text']
        for score in resistance['drugScores']:
            drug_resistance_data.append({
                'Header': header,
                'Version': version,
                'Drug Class': score['drugClass']['name'],
                'Drug Name': score['drug']['name'],
                'Drug Abbreviation': score['drug']['displayAbbr'],
                'Resistance Score': score['score'],
                'Resistance Level': score['level'],
                'Text Description': score['text']
            })

# Convert lists to pandas DataFrames
mutation_df = pd.DataFrame(mutation_data)
drug_resistance_df = pd.DataFrame(drug_resistance_data)
gene_sequence_df = pd.DataFrame(gene_sequence_data)

In [None]:
mutation_df.describe()

Unnamed: 0,Position
count,6185.0
mean,250.419725
std,162.446368
min,1.0
25%,112.0
50%,245.0
75%,377.0
max,559.0


In [None]:
drug_resistance_df.describe()

Unnamed: 0,Resistance Score,Resistance Level
count,1975.0,1975.0
mean,0.437975,1.034937
std,4.091541,0.285223
min,-10.0,1.0
25%,0.0,1.0
50%,0.0,1.0
75%,0.0,1.0
max,90.0,5.0


In [None]:
gene_sequence_df.describe()

Unnamed: 0,First AA,Last AA,Gene Length
count,237.0,237.0,237.0
mean,1.0,315.666667,315.666667
std,0.0,189.616972,189.616972
min,1.0,99.0,99.0
25%,1.0,99.0,99.0
50%,1.0,288.0,288.0
75%,1.0,560.0,560.0
max,1.0,560.0,560.0


In [None]:
drug_resistance_df.head()

Unnamed: 0,Header,Version,Drug Class,Drug Name,Drug Abbreviation,Resistance Score,Resistance Level,Text Description
0,DQ168573.1 HIV-1 isolate 01NGPL0567 from Niger...,HIVDB_9.6,PI,ATV,ATV/r,0.0,1,Susceptible
1,DQ168573.1 HIV-1 isolate 01NGPL0567 from Niger...,HIVDB_9.6,PI,DRV,DRV/r,0.0,1,Susceptible
2,DQ168573.1 HIV-1 isolate 01NGPL0567 from Niger...,HIVDB_9.6,PI,FPV,FPV/r,0.0,1,Susceptible
3,DQ168573.1 HIV-1 isolate 01NGPL0567 from Niger...,HIVDB_9.6,PI,IDV,IDV/r,0.0,1,Susceptible
4,DQ168573.1 HIV-1 isolate 01NGPL0567 from Niger...,HIVDB_9.6,PI,LPV,LPV/r,0.0,1,Susceptible


In [None]:
drug_resistance_df['Text Description'].value_counts()

Text Description
Susceptible                       1934
Potential Low-Level Resistance      26
Low-Level Resistance                 8
High-Level Resistance                6
Intermediate Resistance              1
Name: count, dtype: int64

In [None]:
mutation_df.head()

Unnamed: 0,Header,Gene Name,Position,Amino Acids,Is Unusual,Primary Type,Text Description
0,DQ168573.1 HIV-1 isolate 01NGPL0567 from Niger...,PR,13,V,False,Other,I13V
1,DQ168573.1 HIV-1 isolate 01NGPL0567 from Niger...,PR,20,I,False,Other,K20I
2,DQ168573.1 HIV-1 isolate 01NGPL0567 from Niger...,PR,35,Q,False,Other,E35Q
3,DQ168573.1 HIV-1 isolate 01NGPL0567 from Niger...,PR,36,I,False,Other,M36I
4,DQ168573.1 HIV-1 isolate 01NGPL0567 from Niger...,PR,41,K,False,Other,R41K


In [None]:
mutation_df['Text Description'].value_counts()[:20]

Text Description
K14R     107
M36I      78
V201I     77
I293V     76
T206S     75
L89M      75
S283G     75
L234I     72
D177E     71
G359S     71
V35T      71
I13V      70
R356K     70
K20I      70
R41K      69
S519N     69
D471E     69
E291D     68
T200A     68
H69K      67
Name: count, dtype: int64

# 1. Validation Results

In [None]:
data = sierra_data
def extract_validation_results(data):
    validation_results = []
    for item in data:
        for result in item.get('validationResults', []):
            validation_results.append({
                'level': result.get('level'),
                'message': result.get('message')
            })
    return pd.DataFrame(validation_results)

# Usage
validation_df = extract_validation_results(data)
validation_df.head()


Unnamed: 0,level,message
0,NOTE,This following APOBEC mutation was present in ...
1,NOTE,There is one stop codon in RT: RT:W383*.
2,NOTE,There is one stop codon in RT: RT:W402*.
3,SEVERE_WARNING,The following 5 APOBEC mutations were present ...
4,WARNING,There is one APOBEC-associated mutation at a d...


# Subtype Information

In [None]:
def extract_subtype_info(data):
    subtype_info = []
    for item in data:
        subtype_info.append({
            'bestMatchingSubtype': item.get('bestMatchingSubtype', {}).get('display', 'Unknown'),
            'mixturePcnt': item.get('mixturePcnt', 'Unknown'),
        })
    return pd.DataFrame(subtype_info)

# Usage
subtype_df = extract_subtype_info(data)
subtype_df.head()

Unnamed: 0,bestMatchingSubtype,mixturePcnt
0,Unknown,Unknown
1,Unknown,Unknown
2,Unknown,Unknown
3,Unknown,Unknown
4,Unknown,Unknown


# Frameshifts

In [None]:
def extract_frame_shifts(data):
    frame_shifts = []
    for item in data:
        for gene in item.get('alignedGeneSequences', []):
            for shift in gene.get('frameShifts', []):
                frame_shifts.append({
                    'gene': gene.get('gene', {}).get('name', ''),
                    'position': shift.get('position'),
                    'isInsertion': shift.get('isInsertion'),
                    'isDeletion': shift.get('isDeletion'),
                    'size': shift.get('size'),
                    'NAs': shift.get('NAs'),
                    'text': shift.get('text')
                })
    return pd.DataFrame(frame_shifts)

# Usage
frame_shifts_df = extract_frame_shifts(data)
frame_shifts_df

# Treatment-Selected Mutation

In [None]:
def extract_tsm(data):
    tsm_info = []
    for item in data:
        for gene in item.get('alignedGeneSequences', []):
            for mutation in gene.get('treatmentSelectedMutations', []):
                tsm_info.append({
                    'gene': gene.get('gene', {}).get('name', ''),
                    'mutation_text': mutation.get('text')
                })
    return pd.DataFrame(tsm_info)

# Usage
tsm_df = extract_tsm(data)
print(tsm_df.head())e


# Unusual Mutations

In [None]:
def extract_unusual_mutations(data):
    unusual_info = []
    for item in data:
        for gene in item.get('alignedGeneSequences', []):
            for mutation in gene.get('unusualMutations', []):
                unusual_info.append({
                    'gene': gene.get('gene', {}).get('name', ''),
                    'text': mutation.get('text')
                })
    return pd.DataFrame(unusual_info)

# Usage
unusual_df = extract_unusual_mutations(data)
unusual_df.head()

# Drug resistance mutations by subtypes

In [None]:
def extract_drug_resistance(data):
    resistance_info = []
    for item in data:
        for resistance in item.get('drugResistance', []):
            gene = resistance.get('gene', {}).get('name', '')
            for mutation_type in resistance.get('mutationsByTypes', []):
                mutation_type_name = mutation_type.get('mutationType')
                for mutation in mutation_type.get('mutations', []):
                    resistance_info.append({
                        'gene': gene,
                        'mutation_type': mutation_type_name,
                        'reference': mutation.get('reference'),
                        'position': mutation.get('position'),
                        'AAs': mutation.get('AAs'),
                        'text': mutation.get('text')
                    })
    return pd.DataFrame(resistance_info)

# Usage
drug_resistance_df = extract_drug_resistance(data)
print(drug_resistance_df.head())


# Partial Drug Resistance scores

In [None]:
def extract_partial_drug_scores(data):
    partial_scores = []
    for item in data:
        for resistance in item.get('drugResistance', []):
            for drug_score in resistance.get('drugScores', []):
                drug = drug_score.get('drug', {}).get('displayAbbr', '')
                for partial in drug_score.get('partialScores', []):
                    partial_scores.append({
                        'drug': drug,
                        'mutation_text': partial.get('mutations', {}).get('text', ''),
                        'partial_score': partial.get('score')
                    })
    return pd.DataFrame(partial_scores)

# Usage
partial_scores_df = extract_partial_drug_scores(data)
print(partial_scores_df.head())


AttributeError: 'list' object has no attribute 'get'