In [88]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

import plotly.express as px
import plotly.graph_objects as go


In [156]:
# Viruses whose per-virus feature tables are stacked into one matrix, in this order.
viruses = ['other_corona', 'HIV', 'sars_cov2', 'ebola']

# `indices` holds cumulative row offsets: rows indices[k]:indices[k+1] of the
# stacked matrix come from viruses[k].
blocks = []
indices = [0]
for v in viruses:
    df = pd.read_csv("features/%s_ppis_no3merfeats.csv" % v, header=0)
    blocks.append(np.asarray(df))
    indices.append(indices[-1] + len(df))

# Stack the per-virus arrays row-wise into a single feature matrix.
feats = np.vstack(blocks)
print(feats.shape)
print(indices)

(1312, 1526)
[0, 143, 761, 1091, 1312]


In [157]:
# Feature standardisation is currently switched off: the stacked feature matrix
# is handed to PCA unchanged. To enable it, replace the assignment below with
# `f_norm = StandardScaler().fit_transform(feats)`.
f_norm = feats

# random_state is fixed so the projection is repeatable across kernel restarts
# whenever scikit-learn selects a randomized SVD solver for this problem size.
pca = PCA(n_components=100, random_state=0)
pca.fit(f_norm)

# Share of variance captured by the ten leading components.
print(pca.explained_variance_ratio_[0:10])

# Project every row onto the 100 retained components.
f_pca = pca.transform(f_norm)
print(f_pca.shape)

[0.14151525 0.07578349 0.05215993 0.04190788 0.03493226 0.03061226
 0.02932095 0.02744374 0.02546536 0.02356344]
(1312, 100)


In [158]:
# random_state pins the otherwise random t-SNE optimisation so the embedding is
# the same on every run.
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` --
# confirm against the installed version before upgrading.
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300, random_state=0)

# Embed a freshly computed PCA projection instead of reading `f_pca`: this cell
# rebinds `f_pca` to the 2-D embedding, so reading it here would feed t-SNE its
# own output if the cell were executed a second time.
# The name `f_pca` is kept because later cells read the embedding from it.
f_pca = tsne.fit_transform(pca.transform(f_norm))
print(f_pca.shape)

[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 1312 samples in 0.002s...
[t-SNE] Computed neighbors for 1312 samples in 0.292s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1312
[t-SNE] Computed conditional probabilities for sample 1312 / 1312
[t-SNE] Mean sigma: 1.286016
[t-SNE] KL divergence after 250 iterations with early exaggeration: 55.381516
[t-SNE] KL divergence after 300 iterations: 0.558143
(1312, 2)


In [159]:
# Lookup table read from a tab-separated file and keyed by its 'Protein_ID'
# column; the 'Gene_name' column is what later lookups read.
gene_names = (
    pd.read_csv("data/uniprotid_gene_name.txt", header=0, sep='\t')
    .set_index('Protein_ID')
)

# Spot-check one protein ID; as the last expression of the cell, its gene name
# is what gets displayed.
gene_names.loc['A0A024A4F8','Gene_name']

'gL'

In [160]:
def get_df(pca, filename, virusname):
    """Build a plotting frame that pairs 2-D coordinates with PPI labels.

    Parameters
    ----------
    pca : 2-D array
        Coordinates; column 0 becomes 'PCA1' and column 1 becomes 'PCA2'.
    filename : str
        CSV with a header row whose first column is used as the index. The
        next two columns are read as protein IDs and looked up in the
        module-level ``gene_names`` table ('Gene_name' column).
    virusname : str
        Value repeated in the 'virus' column for every row.

    Returns
    -------
    pandas.DataFrame
        Columns 'PCA1', 'PCA2', 'PPI' ("<gene>:<gene>") and 'virus'.
    """
    pairs = pd.read_csv(filename, header=0, index_col=0)

    # One "geneA:geneB" label per interaction row, in file order.
    labels = []
    for row in pairs.values:
        first_gene = gene_names.loc[row[0], 'Gene_name']
        second_gene = gene_names.loc[row[1], 'Gene_name']
        labels.append('%s:%s' % (first_gene, second_gene))

    n_rows = pca.shape[0]
    return pd.DataFrame({
        'PCA1': pca[:, 0],
        'PCA2': pca[:, 1],
        'PPI': np.asarray(labels),
        'virus': [virusname] * n_rows,
    })

In [161]:
# Build one labelled frame per virus from its row range in the embedding, then
# concatenate once. (`DataFrame.append` is gone from current pandas, and
# growing a frame inside a loop re-copies it on every iteration.)
frames = [
    get_df(f_pca[indices[i]:indices[i + 1], :],
           "data/%s_ppis_good.csv" % viruses[i], viruses[i])
    for i in range(len(viruses))
]
df_all = pd.concat(frames)

print(df_all.shape)

# The frame's columns are named PCA1/PCA2, but by this point `f_pca` holds the
# t-SNE embedding, so the axes are relabelled to say what is actually plotted.
fig = px.scatter(df_all, x='PCA1', y='PCA2', hover_data=['PPI'], color='virus',
                 labels={'PCA1': 't-SNE 1', 'PCA2': 't-SNE 2'})
#fig.show()
fig.write_html('ppi_plot.html')

(1312, 4)


In [None]:
######### OLD CODE ###########
#plt.scatter(g_pca[idx3,0],g_pca[idx3,1], c='green', label='hiv')
#plt.scatter(g_pca[idx2,0],g_pca[idx2,1], c='blue', label='sars-cov-2')
#plt.scatter(g_pca[idx1,0],g_pca[idx1,1], c='coral', label='Other corona')
#plt.scatter(g_pca[idx4,0],g_pca[idx4,1], c='yellow', label='Negatives')
#plt.legend()
#plt.savefig('TSNE_viruses.pdf', dpi=300, bbox_inches='tight')