# 2.2 Comparison analysis

In [18]:
# Import packages
import pandas as pd
import numpy as np
import cobra

In [19]:
# Read the orthology matrix from disk and use its first CSV column
# (exported by pandas as 'Unnamed: 0') as the row index.
ortho_matrix = pd.read_csv('../tables/ortho_matrix.csv').set_index('Unnamed: 0')

ortho_matrix

Unnamed: 0_level_0,CU651637,CP001855,CP002167,CP000468,CP000946,CP000819,CP001665,AM946981,CP001509,CP001396,...,CU928163,CP000243,CP001063,CP000036,CP000034,CP001383,AE014073,AE005674,CP000266,CP000038
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CIW80_00005,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CIW80_00010,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0
CIW80_00015,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0
CIW80_00020,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
CIW80_00025,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CIW80_25775,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
CIW80_25780,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
CIW80_25785,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CIW80_25790,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0


# 1. Common & unique genes

In [20]:
# Total number of genes (rows) in the orthology matrix
all_genes = len(ortho_matrix)

# Compare the number of genes shared by all strains and unique to EcN.
# Only real strain columns are counted: a leftover 'sum' helper column (e.g.
# from an earlier run of this cell) is ignored, so re-running the cell always
# gives the same result.
strain_columns = [col for col in ortho_matrix.columns if col != 'sum']
n_strains = len(strain_columns)  # NOTE(review): previously hard-coded as 55 - confirm the matrix has 55 strain columns

# Build the comparison table as a new frame instead of aliasing ortho_matrix,
# so the 'sum' column is not written back into the orthology matrix.
strain_presence = ortho_matrix[strain_columns]
comparison_genes = strain_presence.assign(sum=strain_presence.sum(axis=1))

# A gene is "common" when it is present in every strain and "unique" when it
# is present in none of them.
print('Number of common genes is:', len(comparison_genes[comparison_genes['sum'] == n_strains]))
print('Number of unique genes is:', len(comparison_genes[comparison_genes['sum'] == 0]))

Number of common genes is: 1783
Number of unique genes is: 196


# 2. Genomic similarity

In [21]:
# Column totals of the orthology matrix: per strain, the number of genes
# marked as present.
similarity = ortho_matrix.sum()

# Get the similarity between EcN and the reference strains, most similar first.
# The helper 'sum' column (if present) is not a strain, so it is dropped by
# label; comparing with `is not 'sum'` tested object identity, not equality.
strain_similarity = similarity.drop(labels='sum', errors='ignore').sort_values(ascending=False)

# Load the strain metadata, indexed by strain name
strain_info = pd.read_csv('../tables/strain_info.csv', usecols=['Strain', 'Pathotype', 'NCBI ID', 'Taxon ID','ORFs','Model Name','Genes', 'Reactions', 'Metabolites'])
strain_info.set_index('Strain', inplace=True)

# Map NCBI ID -> strain name, keeping the first strain listed for an ID
# (the same choice the previous `.tolist()[0]` lookup made).
strain_by_ncbi_id = pd.Series(strain_info.index.to_numpy(), index=strain_info['NCBI ID'].to_numpy())
strain_by_ncbi_id = strain_by_ncbi_id[~strain_by_ncbi_id.index.duplicated(keep='first')]

# Assemble the table in one step instead of growing it cell by cell.
# An NCBI ID that is missing from strain_info.csv gets NaN as strain name
# rather than raising an IndexError.
# Note: '% Similarity' holds a fraction (0-1) of all EcN genes, not a value in percent.
similarity_df = pd.DataFrame({
    'Strain': strain_by_ncbi_id.reindex(strain_similarity.index),
    'Similarity': strain_similarity,
    '% Similarity': strain_similarity / all_genes,
})
similarity_df.index.name = 'NCBI ID'

similarity_df

Unnamed: 0_level_0,Strain,Similarity,% Similarity
NCBI ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AE014075,Escherichia coli CFT073,4392.0,0.870565
CP001671,Escherichia coli ABU 83972,4378.0,0.86779
CP000243,Escherichia coli UTI89,4223.0,0.837066
CP000247,Escherichia coli 536,4199.0,0.832309
CU928161,Escherichia coli S88,4172.0,0.826957
CP002167,Escherichia coli UM146,4139.0,0.820416
CP001969,Escherichia coli IHE3034,4072.0,0.807136
AP009378,Escherichia coli SE15,4052.0,0.803171
CP002797,Escherichia coli NA114,4041.0,0.800991
CU928162,Escherichia coli ED1a,4005.0,0.793855


# 3. Number of reactions in common

In [25]:
### Get an overview of the strains

# Load EcN model
EcN_ID = 'CP022686.1'
EcN_model = cobra.io.load_json_model('../data/models/%s_cur_4.7.json' % EcN_ID)

# Load strain info to get the model for each strain.
# Columns are selected by name (these are the columns at positions 1, 3 and 6).
strain_info = pd.read_csv('../tables/strain_info.csv', usecols=['Strain', 'NCBI ID', 'Model Name'])

# Change the MG1655 model to iML1515.
# The row is selected by its NCBI ID (U00096, row 19 of the table) and written
# with a single .loc call: `strain_info.iloc[19]['Model Name'] = ...` is chained
# assignment, which may modify a temporary copy and leave strain_info unchanged.
strain_info.loc[strain_info['NCBI ID'] == 'U00096', 'Model Name'] = 'iML1515'
strain_info.head()

Unnamed: 0,Strain,NCBI ID,Model Name
0,Escherichia coli LF82,CU651637,iLF82_1304
1,Escherichia coli O83:H1 str. NRG 857C,CP001855,iNRG857_1313
2,Escherichia coli UM146,CP002167,iUMN146_1321
3,Escherichia coli APEC O1,CP000468,iAPECO1_1312
4,Escherichia coli ATCC 8739,CP000946,iEcolC_1368


In [54]:
### Check for every reaction the presence in the reference strains

# IDs of all EcN reactions; these become the rows of the presence/absence table
ecn_rxn_ids = [rxn.id for rxn in EcN_model.reactions]

# One presence/absence column per strain, keyed by NCBI ID. The columns are
# collected first and turned into a DataFrame once, instead of enlarging the
# frame one cell at a time with .loc.
presence_by_strain = {}

# Loop over the strains and add reactions
for CurrentStrain, CurrentModel in zip(strain_info['NCBI ID'], strain_info['Model Name']):

    # Load models
    current_model = cobra.io.load_json_model('../data/models/%s.json' % CurrentModel)
    print("Checking strain: ", CurrentStrain,', model:', current_model.id)

    # Add reactions: 1.0 if the strain model has a reaction with the same ID,
    # else 0.0. An explicit ID lookup replaces the bare `except:` so that
    # unrelated errors are no longer silently recorded as "absent".
    presence_by_strain[CurrentStrain] = [
        1.0 if current_model.reactions.has_id(rxn_id) else 0.0
        for rxn_id in ecn_rxn_ids
    ]

strain_rxn_df = pd.DataFrame(presence_by_strain, index=ecn_rxn_ids)

strain_rxn_df

Checking strain:  CU651637 , model: iLF82_1304
Checking strain:  CP001855 , model: iNRG857_1313
Checking strain:  CP002167 , model: iUMN146_1321
Checking strain:  CP000468 , model: iAPECO1_1312
Checking strain:  CP000946 , model: iEcolC_1368
Checking strain:  CP000819 , model: iECB_1328
Checking strain:  CP001665 , model: iECBD_1354
Checking strain:  AM946981 , model: iB21_1397
Checking strain:  CP001509 , model: iECD_1391
Checking strain:  CP001396 , model: iBWG_1329
Checking strain:  CP001637 , model: iEcDH1_1363
Checking strain:  AP012030 , model: iECDH1ME8569_1439
Checking strain:  CU928162 , model: iECED1_1282
Checking strain:  CP000802 , model: iEcHS_1320
Checking strain:  CU928160 , model: iECIAI1_1343
Checking strain:  CP002516 , model: iEKO11_1354
Checking strain:  AP009240 , model: iECSE_1348
Checking strain:  AP009378 , model: iECSF_1327
Checking strain:  CP000948 , model: iECDH10B_1368
Checking strain:  U00096 , model: iML1515
Checking strain:  AP009048 , model: iY75_1357
C

Unnamed: 0,CU651637,CP001855,CP002167,CP000468,CP000946,CP000819,CP001665,AM946981,CP001509,CP001396,...,CU928163,CP000243,CP001063,CP000036,CP000034,CP001383,AE014073,AE005674,CP000266,CP000038
ALATA_D2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
SHCHD2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CPPPGO,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
GTHOr,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
DHORD5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CLBTinter3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CLBTNtex,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
EX_clbtn_e,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MTRK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [55]:
# Per strain: how many EcN reactions are also present in that strain's model
strain_sum_df = strain_rxn_df.sum().rename('rxn_sum')

# Attach the counts to the strain overview (matched on NCBI ID) and rank the
# strains from most to fewest shared reactions
strain_df = (
    strain_info
    .merge(strain_sum_df, left_on='NCBI ID', right_index=True)
    .sort_values('rxn_sum', ascending=False)
)
strain_df

Unnamed: 0,Strain,NCBI ID,Model Name,rxn_sum
19,Escherichia coli str. K-12 substr. MG1655,U00096,iML1515,2611.0
22,Escherichia coli W,CP002967,iWFL_1372,2603.0
21,Escherichia coli W CP002185,CP002185,iECW_1372,2603.0
15,Escherichia coli KO11FL,CP002516,iEKO11_1354,2601.0
31,Escherichia coli O26:H11 str. 11368,AP010953,iECO26_1355,2598.0
37,Escherichia coli UMNK88,CP002729,iUMNK88_1353,2594.0
4,Escherichia coli ATCC 8739,CP000946,iEcolC_1368,2593.0
16,Escherichia coli SE11,AP009240,iECSE_1348,2590.0
14,Escherichia coli IAI1,CU928160,iECIAI1_1343,2589.0
40,Escherichia coli 536,CP000247,iECP_1309,2587.0


In [90]:
# Total number of reactions in the EcN model
len(EcN_model.reactions)

2869

### Comparison with iML1515

In [59]:
# Load the iML1515 model
CurrentStrain = 'iML1515'
current_model = cobra.io.load_json_model('../data/models/iML1515.json')

# Create a dataframe with the presence/absence of EcN reactions in the iML1515 model.
# Each EcN reaction ID is looked up explicitly (1.0 = present, 0.0 = absent)
# instead of relying on a bare `except:`, and the column is built in one step
# rather than row by row.
ecn_rxn_ids = [rxn.id for rxn in EcN_model.reactions]
iML_df = pd.DataFrame(
    {
        CurrentStrain: [
            1.0 if current_model.reactions.has_id(rxn_id) else 0.0
            for rxn_id in ecn_rxn_ids
        ]
    },
    index=ecn_rxn_ids,
)

print('Number of common reactions is', iML_df.sum().values,'which is the same as strain_df')

iML_df

Number of common reactions is [2611.] which is the same as strain_df


Unnamed: 0,iML1515
ALATA_D2,1.0
SHCHD2,1.0
CPPPGO,1.0
GTHOr,1.0
DHORD5,1.0
...,...
CLBTinter3,0.0
CLBTNtex,0.0
EX_clbtn_e,0.0
MTRK,0.0
