In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import fuzz  # For fuzzy string comparison



In [3]:
# Load the Bloom et al. spectra file. With orient='index' the top-level JSON
# keys become the row labels and each nested key becomes a column.
df_Bloom_etal = pd.read_json(
    '../viral_spectra/data/Bloom_etal/other_virus_spectra_Bloom_etal.json',
    orient='index',
)

In [4]:
# Expand each row's 'mutation_spectrum' mapping into its own set of columns
# (one column per key, one row per entry of df_Bloom_etal).
spectrum_wide = df_Bloom_etal['mutation_spectrum'].apply(pd.Series)

# Relabel the columns by replacing 'to' with '>'
# (presumably keys look like 'AtoC' -> 'A>C'; confirm against the JSON).
spectrum_wide.columns = spectrum_wide.columns.str.replace('to', '>')

# Divide every row by its own total so each spectrum is expressed as fractions.
row_totals = spectrum_wide.sum(axis=1)
dfms12Bloom_etal = spectrum_wide.div(row_totals, axis=0)

In [5]:
# Reorder the columns alphabetically so the layout is deterministic
# regardless of the key order in the source JSON.
sorted_columns = sorted(dfms12Bloom_etal.columns)
dfms12Bloom_etal = dfms12Bloom_etal.reindex(columns=sorted_columns)

In [6]:
# Human-readable virus names for the Bloom et al. table, one per row.
# These are attached to the table by position, so the order here matters.
Bloom_etal_taxname = [
    'Influenza A H3N2',
    'Influenza A H1N1',
    'Influenza B Victoria',
    'Influenza B Yamagata',
    'RSV-A',
    'RSV-B',
    'Enterovirus D68',
    'Enterovirus A71',
    'Dengue virus 1',
    'Dengue virus 2',
    'Dengue virus 3',
    'Dengue virus 4',
    'West Nile virus',
]

In [7]:
# Attach the display names and a source tag, then replace the original row
# labels with a plain 0..n-1 index.
# NOTE(review): names are assigned purely by position, so this assumes the rows
# loaded from the JSON are in the same order as Bloom_etal_taxname -- confirm.
dfms12Bloom_etal = (
    dfms12Bloom_etal
    .assign(taxname=Bloom_etal_taxname, df='Bloom_etal')
    .reset_index(drop=True)
)

In [8]:
# Preview the first five rows of the labelled Bloom et al. table.
dfms12Bloom_etal.head(n=5)

Unnamed: 0,A>C,A>G,A>T,C>A,C>G,C>T,G>A,G>C,G>T,T>A,T>C,T>G,taxname,df
0,0.013002,0.131938,0.018434,0.067506,0.002099,0.196083,0.335605,0.004628,0.039958,0.019876,0.157439,0.013431,Influenza A H3N2,Bloom_etal
1,0.01563,0.12262,0.022171,0.066162,0.004931,0.194899,0.325181,0.005561,0.045243,0.025369,0.156519,0.015716,Influenza A H1N1,Bloom_etal
2,0.010306,0.120362,0.011133,0.060726,0.002884,0.21102,0.379445,0.002609,0.040934,0.016327,0.133855,0.0104,Influenza B Victoria,Bloom_etal
3,0.007849,0.117671,0.010157,0.062121,0.003053,0.22391,0.366943,0.003405,0.036883,0.013412,0.144512,0.010085,Influenza B Yamagata,Bloom_etal
4,0.007703,0.070279,0.038923,0.053017,0.002418,0.290197,0.334728,0.000332,0.040141,0.033842,0.120245,0.008174,RSV-A,Bloom_etal


In [9]:
# Long-format nemu table; the columns used below are 'taxid', 'Mut' and 'MutSpec'.
df_nemu_ms12 = pd.read_csv('../viral_spectra/data/ms12syn_all_virus.csv')

# Reshape to wide format: one row per taxid, one column per 'Mut' value, cells
# filled from 'MutSpec'. pivot raises ValueError if a (taxid, Mut) pair repeats.
spectra_by_taxid = df_nemu_ms12.pivot(
    index='taxid',
    columns='Mut',
    values='MutSpec',
)

# Turn taxid back into an ordinary column.
dfms12 = spectra_by_taxid.reset_index()

In [10]:
# Load the taxid lookup table (its saved 'Unnamed: 0' column is used as the index)
# and derive 'taxname' as the part of 'species' that precedes the first '__'.
df_taxname = (
    pd.read_csv(
        '../viral_spectra/data/taxid_virus_type.csv',
        index_col='Unnamed: 0',
    )
    .assign(taxname=lambda frame: frame['species'].str.split('__').str.get(0))
)

In [11]:
# Attach the readable taxon name to every spectrum row.
# The join key is given explicitly: without `on=`, pandas joins on whichever
# columns the two frames happen to share. On a second run of this cell that
# would be 'taxname' (since 'taxid' has already been dropped), which silently
# re-merges on the wrong key. With on='taxid' a re-run fails loudly instead.
# how='left' keeps every spectrum row; taxids missing from df_taxname get
# taxname = NaN.
taxid_to_name = df_taxname[['taxid', 'taxname']]
dfms12 = dfms12.merge(taxid_to_name, how='left', on='taxid')

# Tag the source table so rows stay identifiable if combined with other tables.
dfms12['df'] = 'nemu'

# taxid was only needed as the join key.
dfms12 = dfms12.drop(columns=['taxid'])

In [12]:
# Preview the first five rows of the nemu spectra table.
dfms12.head(n=5)

Unnamed: 0,A>C,A>G,A>T,C>A,C>G,C>T,G>A,G>C,G>T,T>A,T>C,T>G,taxname,df
0,0.109127,0.117342,0.073464,0.060217,0.01169,0.14945,0.196651,0.026968,0.020779,0.036932,0.132424,0.064957,Hepatitis B virus,nemu
1,0.034784,0.086621,0.027108,0.044784,0.006846,0.339187,0.234664,0.0,0.102792,0.009995,0.095861,0.017356,Rice black streaked dwarf virus,nemu
2,0.012922,0.127692,0.009847,0.022595,0.001792,0.379747,0.123494,0.003866,0.03189,0.022814,0.249177,0.014165,West Nile virus,nemu
3,0.008852,0.143715,0.014274,0.079285,0.0,0.198377,0.334457,0.0,0.058034,0.031512,0.126587,0.004907,Influenza A virus,nemu
4,0.009244,0.183506,0.025201,0.014157,0.002805,0.243095,0.227613,0.012168,0.021287,0.031341,0.224274,0.005309,Norwalk virus,nemu


In [19]:
# Minimum fuzz.ratio score (0-100) for two taxon names to be treated as a
# candidate match between the two tables.
SIMILARITY_THRESHOLD = 73

# The 12 substitution-type columns whose values form the vectors being compared.
SPECTRUM_COLUMNS = ['A>C', 'A>G', 'A>T',
                    'C>A', 'C>G', 'C>T',
                    'G>A', 'G>C', 'G>T',
                    'T>A', 'T>C', 'T>G']

# The Bloom et al. names and vectors do not depend on the outer loop, so they
# are extracted once instead of once per nemu row.
bloom_entries = [
    (bloom_row['taxname'],
     bloom_row[SPECTRUM_COLUMNS].values.astype(float).reshape(1, -1))
    for _, bloom_row in dfms12Bloom_etal.iterrows()
]

# List of results: one dict per (nemu, Bloom et al.) name pair that passes the threshold
results = []

# Compare every nemu taxon name against every Bloom et al. name
for _, nemu_row in dfms12.iterrows():
    name1 = nemu_row['taxname']

    # taxname is NaN when a taxid had no entry in df_taxname (left merge);
    # fuzz.ratio expects strings, so such rows are skipped instead of crashing.
    if not isinstance(name1, str):
        continue

    # Built on the first match only: most nemu rows match nothing.
    vector1 = None

    for name2, vector2 in bloom_entries:
        # Calculating name similarity with fuzzywuzzy
        similarity_score = fuzz.ratio(name1, name2)

        # Names too different: not the same virus, skip the spectrum comparison
        if similarity_score < SIMILARITY_THRESHOLD:
            continue

        if vector1 is None:
            vector1 = nemu_row[SPECTRUM_COLUMNS].values.astype(float).reshape(1, -1)

        # Cosine similarity between the two 12-component vectors
        cosine_sim = cosine_similarity(vector1, vector2)[0][0]

        # Save result
        results.append({
            'taxname_nemu': name1,
            'taxname_Bloom_etal': name2,
            'fuzzy_similarity': similarity_score,
            'cosine_similarity': cosine_sim
        })

# Make dataframe
results_df = pd.DataFrame(results)

In [20]:
# Row labels, in the unfiltered match table, of the pairs to discard.
# NOTE(review): presumably these are spurious name matches rejected by eye --
# confirm. The labels are positional, so they point at different pairs if the
# input data or SIMILARITY_THRESHOLD changes; re-check before reusing.
REJECTED_MATCH_ROWS = [7, 9, 10, 12]

# Rebuild from the raw `results` list rather than filtering `results_df` in
# place. The previous form raised KeyError when the cell was run a second time,
# because the dropped labels no longer exist once the index has been reset.
results_df = (
    pd.DataFrame(results)
    .drop(index=REJECTED_MATCH_ROWS)
    .reset_index(drop=True)
)

# Persist the curated table, then display it as the cell output.
results_df.to_csv('./data/nemu_results_qc.csv', index=False)
results_df

Unnamed: 0,taxname_nemu,taxname_Bloom_etal,fuzzy_similarity,cosine_similarity
0,West Nile virus,West Nile virus,100,0.993924
1,Influenza A virus,Influenza A H3N2,73,0.995424
2,Influenza A virus,Influenza A H1N1,73,0.994819
3,Dengue virus,Dengue virus 1,92,0.999392
4,Dengue virus,Dengue virus 2,92,0.996908
5,Dengue virus,Dengue virus 3,92,0.997913
6,Dengue virus,Dengue virus 4,92,0.996073
7,Enterovirus A,Enterovirus A71,93,0.996215
8,Enterovirus D,Enterovirus D68,93,0.998397
