In [8]:
! pip install civicpy




In [9]:
from civicpy import civic
civic.update_cache()



# Features (genes)

In [10]:
features = civic.get_all_features(include_status = None )
len(features)

1022

In [11]:
import pandas as pd

# Assume 'features' is your list of objects

# Create an empty list to store data for the dataframe
data = []

# Iterate through each object in 'features'
for feature in features:
    # Extract relevant attributes from each object
    try:
        row = {
            'feature_id': feature.id,
            'feature_name': feature.name,
            'aliases': feature.aliases,
            'description': feature.description,
            'entrez_id': feature.entrez_id,
            #'variants': feature.variants,
            'variants_ids': [variant.id for variant in feature.variants] if feature.variants else [],
            'variants_names': [variant.name for variant in feature.variants] if feature.variants else [],
            #'site_link': feature.site_link,
            'source_ids': feature.source_ids,
            'sources': feature.sources,
            'sources_name': [source.name for source in feature.sources],
            #'type': feature.type,
            #'update': feature.update,


        }
        data.append(row)
    except AttributeError:
        # Handle cases where an attribute is missing
        pass
        #print(f"Skipping object due to missing attribute: {feature}")

# Create the pandas DataFrame
features_df = pd.DataFrame(data)
features_df = features_df.sort_values(by='feature_id').reset_index(drop=True)

# Display the DataFrame
features_df.head()

Unnamed: 0,feature_id,feature_name,aliases,description,entrez_id,variants_ids,variants_names,source_ids,sources,sources_name
0,32,MEF2D,[MEF2D],,4209,[3148],[MEF2D Rearrangement],[],[],[]
1,33,CSF1R,"[BANDDOS, C-FMS, CD115, CSF-1R, CSF1R, CSFR, F...",,1436,[3014],[Expression],[],[],[]
2,34,MGMT,[MGMT],,4255,"[2966, 85, 338, 1255]","[Expression, Promoter Methylation, RS16906252,...",[],[],[]
3,36,NRAS,"[ALPS4, CMNS, KRAS, N-ras, NCMS, NRAS, NRAS1, ...",Mutations in the RAS family of proteins have f...,4893,"[2550, 92, 596, 897, 878, 898, 1175, 895, 93, ...","[Amplification, G12, G12/G13, G12C, G12D, G12S...",[],[],[]
4,37,PIK3CA,"[CCM4, CLAPO, CLOVE, CWS5, HMH, MCAP, MCM, MCM...",PIK3CA is the most recurrently mutated gene in...,5290,"[212, 931, 3337, 2789, 1653, 3338, 1399, 3293,...","[Amplification, C420R, C604R, C971R, D350G, D3...",[],[],[]


In [12]:
features_df.to_csv('civic_all_features.csv', index=False)

In [13]:
features_df_simple = features_df[['feature_id', 'feature_name','aliases','description']]
features_df_simple.head()

Unnamed: 0,feature_id,feature_name,aliases,description
0,32,MEF2D,[MEF2D],
1,33,CSF1R,"[BANDDOS, C-FMS, CD115, CSF-1R, CSF1R, CSFR, F...",
2,34,MGMT,[MGMT],
3,36,NRAS,"[ALPS4, CMNS, KRAS, N-ras, NCMS, NRAS, NRAS1, ...",Mutations in the RAS family of proteins have f...
4,37,PIK3CA,"[CCM4, CLAPO, CLOVE, CWS5, HMH, MCAP, MCM, MCM...",PIK3CA is the most recurrently mutated gene in...


In [14]:
features_df_simple.to_csv('civic_all_features_simple.csv', index=False)

# Variants

In [15]:
variants = civic.get_all_variants(include_status = None, allow_cached=True)
len(variants)
dir(variants[0])

['aliases',
 'allele_registry_id',
 'clinvar_entries',
 'coordinates',
 'csq',
 'csq_alt',
 'entrez_id',
 'entrez_name',
 'feature',
 'feature_id',
 'gene',
 'groups',
 'hgvs_c',
 'hgvs_expressions',
 'hgvs_p',
 'id',
 'is_deletion',
 'is_insertion',
 'is_valid_for_vcf',
 'molecular_profiles',
 'name',
 'sanitized_name',
 'single_variant_molecular_profile',
 'single_variant_molecular_profile_id',
 'site_link',
 'subtype',
 'type',
 'types',
 'update',
 'variant_aliases',
 'variant_groups',
 'variant_types',
 'vcf_coordinates']

In [16]:
import pandas as pd

# Assume 'variants' is your list of objects

# Create an empty list to store data for the dataframe
data = []

# Iterate through each object in 'variants'
for variant in variants:
    # Extract relevant attributes from each object
    try:
        row = {
            'variant_name': variant.name,
            'aliases': variant.aliases,
            'variant_id': variant.id,
            #'allele_registry_id': variant.allele_registry_id,
            #'clinvar_entries': variant.clinvar_entries,
            #'coordinates': variant.coordinates,
            #'csq': variant.csq,
            #'csq_alt': variant.csq_alt,
            'entrez_id': variant.entrez_id,
            'entrez_name': variant.entrez_name,
            #'feature': variant.feature,
            'feature_id': variant.feature_id,
            'gene_id': variant.gene.id,
            'molecular_profiles_ids': [profile.id for profile in variant.molecular_profiles] if variant.molecular_profiles else [],
            'molecular_profiles_names': [profile.name for profile in variant.molecular_profiles] if variant.molecular_profiles else [],

            #'groups': variant.groups,
            #'hgvs_c': variant.hgvs_c,
            #'hgvs_expressions': variant.hgvs_expressions,
            #'hgvs_p': variant.hgvs_p,

            'is_deletion': variant.is_deletion,
            'is_insertion': variant.is_insertion,
            #'is_valid_for_vcf': variant.is_valid_for_vcf,

            #'sanitized_name': variant.sanitized_name,
            #'single_variant_molecular_profile': variant.single_variant_molecular_profile,
            #'single_variant_molecular_profile_id': variant.single_variant_molecular_profile_id,
            #'site_link': variant.site_link,
            #subtype': variant.subtype,
            #'type': variant.type,
            #'types': variant.types,
            #'update': variant.update,
            'variant_aliases': variant.variant_aliases,
            #'variant_groups': variant.variant_groups,
            #'variant_types': variant.variant_types,
            #'vcf_coordinates': variant.vcf_coordinates,
        }
        data.append(row)
    except AttributeError:
        # Handle cases where an attribute is missing
        pass#print(f"Skipping object due to missing attribute: {variant}")

# Create the pandas DataFrame
variants_df = pd.DataFrame(data).sort_values(by='variant_id').reset_index(drop=True)

# Display the DataFrame
variants_df.head()

Unnamed: 0,variant_name,aliases,variant_id,entrez_id,entrez_name,feature_id,gene_id,molecular_profiles_ids,molecular_profiles_names,is_deletion,is_insertion,variant_aliases
0,T315I,"[THR334ILE, RS121913459]",2,25,ABL1,4,4,"[4373, 2, 5363, 5365, 5370, 5373, 5374, 5375]","[BCR::ABL1 Fusion AND ABL1 T315I, ABL1 T315I, ...",False,False,"[THR334ILE, RS121913459]"
1,E255K,"[E274K, RS121913448]",3,25,ABL1,4,4,"[4431, 3, 4729]","[BCR::ABL1 Fusion AND ABL1 E255K, ABL1 E255K, ...",False,False,"[E274K, RS121913448]"
2,E17K,"[GLU17LYS, RS34409589]",4,207,AKT1,2,2,[4],[AKT1 E17K],False,False,"[GLU17LYS, RS34409589]"
3,F1174L,"[PHE1174LEU, RS863225281]",8,238,ALK,1,1,[8],[ALK F1174L],False,False,"[PHE1174LEU, RS863225281]"
4,R1275Q,"[ARG1275GLN, RS113994087]",9,238,ALK,1,1,"[9, 4235]","[ALK R1275Q, EML4::ALK Fusion AND ALK R1275Q]",False,False,"[ARG1275GLN, RS113994087]"


In [17]:
variants_df.to_csv('civic_all_variants.csv', index=False)

# Molecular profiles

In [18]:
molecular_profiles = civic.get_all_molecular_profiles(include_status = None, allow_cached=True)
len(molecular_profiles)

5066

In [19]:
import pandas as pd

# Assume 'molecular_profiles' is your list of objects

# Create an empty list to store data for the dataframe
data = []

# Iterate through each object in 'molecular_profiles'
for profile in molecular_profiles:
    # Extract relevant attributes from each object
    try:
        row = {
            'molecular_profile_id': profile.id,
            'molecular_profile_name': profile.name,
            'molecular_profile_score': profile.molecular_profile_score,
            'description': profile.description,
            'summary': profile.summary,
            'aliases': profile.aliases,
            #'assertions': profile.assertions,
            #'evidence': profile.evidence,
            #'evidence_items': profile.evidence_items,
            'evidence_items': [evidence.id for evidence in profile.evidence_items] if profile.evidence_items else [],
            'evidence_sources': profile.evidence_sources,

            #'parsed_name': profile.parsed_name,
            #'sanitized_name': profile.sanitized_name,
            #'site_link': profile.site_link,
            'source_ids': profile.source_ids,
            'sources': profile.sources,

            #'type': profile.type,
            #'update': profile.update,
            'variant_ids': profile.variant_ids,
            #'variants': profile.variants,
        }
        data.append(row)
    except AttributeError:
        # Handle cases where an attribute is missing
        pass#print(f"Skipping object due to missing attribute: {profile}")

# Create the pandas DataFrame
molecular_profiles_df = pd.DataFrame(data)
molecular_profiles_df = molecular_profiles_df.sort_values(by='molecular_profile_id').reset_index(drop=True)

# Display the DataFrame
molecular_profiles_df.head()

Unnamed: 0,molecular_profile_id,molecular_profile_name,molecular_profile_score,description,summary,aliases,evidence_items,evidence_sources,source_ids,sources,variant_ids
0,1,BCR::ABL1 Fusion,353.5,"The BCR-ABL fusion protein, commonly referred ...","The BCR-ABL fusion protein, commonly referred ...","[T(9;22)(Q34;Q11), BCR-ABL1, BCR-ABL]","[259, 260, 344, 220, 261, 11223, 11224, 11235,...","{Réa et al., 2021 (PUBMED 34407542), Bloomfiel...","[156, 1747]","[An et al., 2010 (PUBMED 20537386), Poch Marte...",[1]
1,2,ABL1 T315I,0.0,While the efficacy of imatinib has revolutioni...,While the efficacy of imatinib has revolutioni...,"[THR334ILE, RS121913459]","[11225, 11226, 6195, 6284]","{Bradeen et al., 2006 (PUBMED 16772610), Corte...",[],[],[2]
2,3,ABL1 E255K,0.0,While the efficacy of imatinib has revolutioni...,While the efficacy of imatinib has revolutioni...,"[E274K, RS121913448]","[7032, 6283]","{Bradeen et al., 2006 (PUBMED 16772610), Guilh...",[],[],[3]
3,4,AKT1 E17K,33.5,AKT1 E17K is a recurrent mutation that has bee...,AKT1 E17K is a recurrent mutation that has bee...,"[GLU17LYS, RS34409589]","[12181, 3039, 9007, 709, 231, 707, 4029]","{Hyman et al., 2017 (PUBMED 28489509), Beaver ...",[],[],[4]
4,5,EML4::ALK Fusion,48.0,The EML4-ALK fusion variant 1 consisting of AL...,The EML4-ALK fusion variant 1 consisting of AL...,[EML4-ALK],"[262, 1207, 9228, 7482, 1121, 1188, 1203, 1206...","{Kwak et al., 2010 (PUBMED 20979469), Choi et ...",[],[],[5]


In [20]:
molecular_profiles_df.to_csv('civic_all_molecular_profiles.csv', index=False)

In [21]:
molecular_profiles_simple = molecular_profiles_df[['molecular_profile_id','molecular_profile_name', 'summary', 'molecular_profile_score']]
molecular_profiles_simple.head()

Unnamed: 0,molecular_profile_id,molecular_profile_name,summary,molecular_profile_score
0,1,BCR::ABL1 Fusion,"The BCR-ABL fusion protein, commonly referred ...",353.5
1,2,ABL1 T315I,While the efficacy of imatinib has revolutioni...,0.0
2,3,ABL1 E255K,While the efficacy of imatinib has revolutioni...,0.0
3,4,AKT1 E17K,AKT1 E17K is a recurrent mutation that has bee...,33.5
4,5,EML4::ALK Fusion,The EML4-ALK fusion variant 1 consisting of AL...,48.0


In [22]:
molecular_profiles_simple.to_csv('civic_all_molecular_profiles_simple.csv', index=False)

# Evidence

In [24]:
evidence_items = civic.get_all_evidence()
len(evidence_items)

11100

In [25]:


# Create an empty list to store data for the dataframe
data = []

# Iterate through each object in 'evidence_items'
for item in evidence_items:
    # Extract relevant attributes from each object
    try:
        row = {
            'evidence_id': item.id,
            'evidence_name': item.name,
            'statement': item.statement,
            'description': item.description,
            'therapies': [therapy.name for therapy in item.therapies] if item.therapies else [],
            'disease': item.disease,
             'disease_id': item.disease_id,
            #'assertion_ids': item.assertion_ids,
            #'assertions': item.assertions,
            'molecular_profile_name': item.molecular_profile.name,
            'molecular_profile_id': item.molecular_profile_id,


            'evidence_direction': item.evidence_direction,
            'evidence_level': item.evidence_level,
            'evidence_type': item.evidence_type,
            'rating': item.rating,
            'significance': item.significance,


            #'phenotype_ids': item.phenotype_ids,
            #'phenotypes': item.phenotypes,

            #'site_link': item.site_link,
            'source': item.source,
            'source_id': item.source_id,

            'status': item.status,

            'therapy_ids': item.therapy_ids,
            'therapy_interaction_type': item.therapy_interaction_type,
            #'type': item.type,
            #'update': item.update,
            'variant_origin': item.variant_origin,
        }
        data.append(row)
    except AttributeError:
        # Handle cases where an attribute is missing
        print(f"Skipping object due to missing attribute: {item}")

# Create the pandas DataFrame
evidence_items_df = pd.DataFrame(data).sort_values(by='evidence_id').reset_index(drop=True)


# Display the DataFrame
evidence_items_df.head()

Unnamed: 0,evidence_id,evidence_name,statement,description,therapies,disease,disease_id,molecular_profile_name,molecular_profile_id,evidence_direction,evidence_level,evidence_type,rating,significance,source,source_id,status,therapy_ids,therapy_interaction_type,variant_origin
0,1,EID1,JAK2 V617F is not associated with lymphoid leu...,JAK2 V617F is not associated with lymphoid leu...,[],Lymphoid Leukemia (DOID 1037),3034.0,JAK2 V617F,64,SUPPORTS,B,DIAGNOSTIC,4.0,NEGATIVE,"Levine et al., 2005 (PUBMED 16081687)",51,accepted,[],,SOMATIC
1,2,EID2,GIST tumors harboring PDGFRA D842V mutation ar...,GIST tumors harboring PDGFRA D842V mutation ar...,[],Gastrointestinal Stromal Tumor (DOID 9253),2.0,PDGFRA D842V,99,SUPPORTS,B,DIAGNOSTIC,3.0,NEGATIVE,"Lasota et al., 2004 (PUBMED 15146165)",52,accepted,[],,SOMATIC
2,3,EID3,DNMT3A R882 mutations occur most often in de n...,DNMT3A R882 mutations occur most often in de n...,[],Acute Myeloid Leukemia (DOID 9119),3.0,DNMT3A R882,32,SUPPORTS,B,DIAGNOSTIC,2.0,POSITIVE,"LaRochelle et al., 2011 (PUBMED 22081665)",53,accepted,[],,SOMATIC
3,4,EID4,Young AML patients (<60 years old) with DNMT3A...,Young AML patients (<60 years old) with DNMT3A...,[],Acute Myeloid Leukemia (DOID 9119),3.0,DNMT3A R882,32,SUPPORTS,B,DIAGNOSTIC,3.0,POSITIVE,"Ribeiro et al., 2012 (PUBMED 22490330)",54,accepted,[],,SOMATIC
4,5,EID5,JAK2 V617F is associated with myeloid malignan...,JAK2 V617F is associated with myeloid malignan...,[],Chronic Myeloid Leukemia (DOID 8552),4.0,JAK2 V617F,64,SUPPORTS,B,DIAGNOSTIC,4.0,POSITIVE,"Levine et al., 2005 (PUBMED 16081687)",51,accepted,[],,SOMATIC


In [26]:
evidence_items_df.to_csv('civic_all_evidence.csv', index=False)

In [27]:
evidence_items_simple = evidence_items_df[['evidence_id', 'statement', 'therapies',
                                           'disease', 'evidence_direction', 'evidence_level',
                                           'evidence_type', 'rating', 'significance','variant_origin']]
evidence_items_simple.head()

Unnamed: 0,evidence_id,statement,therapies,disease,evidence_direction,evidence_level,evidence_type,rating,significance,variant_origin
0,1,JAK2 V617F is not associated with lymphoid leu...,[],Lymphoid Leukemia (DOID 1037),SUPPORTS,B,DIAGNOSTIC,4.0,NEGATIVE,SOMATIC
1,2,GIST tumors harboring PDGFRA D842V mutation ar...,[],Gastrointestinal Stromal Tumor (DOID 9253),SUPPORTS,B,DIAGNOSTIC,3.0,NEGATIVE,SOMATIC
2,3,DNMT3A R882 mutations occur most often in de n...,[],Acute Myeloid Leukemia (DOID 9119),SUPPORTS,B,DIAGNOSTIC,2.0,POSITIVE,SOMATIC
3,4,Young AML patients (<60 years old) with DNMT3A...,[],Acute Myeloid Leukemia (DOID 9119),SUPPORTS,B,DIAGNOSTIC,3.0,POSITIVE,SOMATIC
4,5,JAK2 V617F is associated with myeloid malignan...,[],Chronic Myeloid Leukemia (DOID 8552),SUPPORTS,B,DIAGNOSTIC,4.0,POSITIVE,SOMATIC


In [28]:
evidence_items_simple.to_csv('civic_all_evidence_simple.csv', index=False)

# Map

In [29]:
variants_df_map = variants_df[[ 'gene_id','variant_id', 'molecular_profiles_ids']]
# explode molecular_profiles_ids
variants_df_map = variants_df_map.explode('molecular_profiles_ids').sort_values(by=['gene_id', 'variant_id', 'molecular_profiles_ids']).reset_index(drop=True)
# raneme molecular_profiles_ids
variants_df_map = variants_df_map.rename(columns={'molecular_profiles_ids': 'molecular_profile_id'})
variants_df_map.head()

Unnamed: 0,gene_id,variant_id,molecular_profile_id
0,1,8,8
1,1,9,9
2,1,9,4235
3,1,171,4236
4,1,171,4237


In [30]:
civic_map = variants_df_map.merge(evidence_items_df[['evidence_id','molecular_profile_id']], on='molecular_profile_id', how='outer').sort_values(by=['gene_id', 'variant_id', 'molecular_profile_id','evidence_id']).reset_index(drop=True)
civic_map.head()

Unnamed: 0,gene_id,variant_id,molecular_profile_id,evidence_id
0,1.0,8.0,8,32.0
1,1.0,8.0,8,33.0
2,1.0,8.0,8,37.0
3,1.0,8.0,8,38.0
4,1.0,8.0,8,125.0


In [31]:
civic_map = civic_map.dropna(subset=['gene_id'])
civic_map.head()

Unnamed: 0,gene_id,variant_id,molecular_profile_id,evidence_id
0,1.0,8.0,8,32.0
1,1.0,8.0,8,33.0
2,1.0,8.0,8,37.0
3,1.0,8.0,8,38.0
4,1.0,8.0,8,125.0


In [32]:
civic_map.to_csv('civic_map.csv', index=False)