In [88]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

import plotly.express as px
import plotly.graph_objects as go


In [76]:
# Optional negative-pair feature tables, currently disabled:
#X_neg = pd.read_csv("features/pathogen_human_1500pairs_no3merfeats.csv", header=0)
#X_neg = pd.read_csv("features/HIV_7k_negs_no3mer_feats.csv", header=0)
#negs = pd.read_csv("features/pathogen_human_1500pairs_no3merfeats.csv", header=0)

# One feature table per virus; the first row of each CSV is the header.
other_corona = pd.read_csv("features/other_corona_no3merfeats.csv", header=0)
hiv = pd.read_csv("features/HIV_ppis_no3mer_feats.csv", header=0)
covid = pd.read_csv("features/krogan_ppis_no3mer.csv", header=0)

# Stack the rows in the order other_corona -> covid -> hiv. Later cells slice
# the embedding by row position, so this order must not change silently.
# np.vstack is used because np.row_stack is only a deprecated alias of it.
g = np.vstack((other_corona, covid, hiv))
print(g.shape)

(1091, 1526)


In [52]:
# Feature standardisation is currently disabled; PCA runs on the raw matrix.
#scaler = StandardScaler()
#g_norm = scaler.fit_transform(g)
g_norm = g

# Reduce to 100 components. random_state is fixed because scikit-learn may
# select a randomized SVD solver for a matrix of this size, in which case the
# projection would otherwise differ from run to run.
pca = PCA(n_components=100, random_state=0)
pca.fit(g_norm)
print(pca.explained_variance_ratio_[0:10])
g_pca = pca.transform(g_norm)
print(g_pca.shape)

[0.16503159 0.07454589 0.05607358 0.0486156  0.03914892 0.03578439
 0.03336795 0.02861312 0.02632115 0.02278172]
(1091, 100)


In [53]:
# t-SNE is stochastic; random_state makes the 2-D layout reproducible.
# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter` --
# confirm against the installed version before upgrading.
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300, random_state=0)

# Embed the PCA scores. They are recomputed from the fitted `pca` rather than
# read from `g_pca`, because this cell overwrites `g_pca` with the 2-D result:
# feeding `g_pca` back in would embed the previous t-SNE output whenever the
# cell is run a second time.
g_pca = tsne.fit_transform(pca.transform(g_norm))
print(g_pca.shape)

[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 1091 samples in 0.002s...
[t-SNE] Computed neighbors for 1091 samples in 0.211s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1091
[t-SNE] Computed conditional probabilities for sample 1091 / 1091
[t-SNE] Mean sigma: 1.267928
[t-SNE] KL divergence after 250 iterations with early exaggeration: 54.897003
[t-SNE] KL divergence after 300 iterations: 0.532274
(1091, 2)


In [110]:
def get_df(pca1, pca2, filename, virusname):
    """Assemble the plotting table for a single virus.

    The file at `filename` is read as tab-separated text with the first row
    as header and the first column as index. Each remaining row is labelled
    '<value 1>:<value 3>', taken from positions 1 and 3 of that row's values
    (presumably the two gene names of the interacting pair -- verify against
    the input files).

    Parameters
    ----------
    pca1, pca2 : sequence
        Coordinates stored in the 'PCA1' and 'PCA2' columns; must have one
        entry per row of the file.
    filename : str
        Path of the tab-separated interaction file.
    virusname : str
        Label repeated in the 'virus' column for every row.

    Returns
    -------
    pandas.DataFrame
        Columns 'PCA1', 'PCA2', 'PPI' and 'virus'.
    """
    table = pd.read_csv(filename, sep='\t', header=0, index_col=0)

    labels = []
    for row in table.values:
        labels.append('%s:%s' % (row[1], row[3]))

    columns = {
        'PCA1': pca1,
        'PCA2': pca2,
        'PPI': np.asarray(labels),
        'virus': [virusname] * len(pca1),
    }
    return pd.DataFrame(columns)

In [111]:
# Row ranges of each virus inside the stacked matrix, which was built in the
# order other_corona -> covid -> hiv. Each range starts where the previous one
# stops. Starting from max(previous) instead would begin one row early, so
# neighbouring groups would overlap and the last rows would never be plotted.
idx1 = range(other_corona.shape[0])
idx2 = range(idx1.stop, idx1.stop + covid.shape[0])
idx3 = range(idx2.stop, idx2.stop + hiv.shape[0])
#idx4 = range(idx3.stop, idx3.stop + negs.shape[0])

# The three ranges must tile the embedding exactly (extend this check if the
# optional idx4 block is enabled).
assert idx3.stop == g_pca.shape[0], "index ranges do not cover the embedding rows"

df_hiv = get_df(g_pca[idx3,0], g_pca[idx3,1], "data/HIV_ppis_good_with_genenames.txt", 'hiv')
df_covid = get_df(g_pca[idx2,0], g_pca[idx2,1], "data/krogan_ppis_good_with_genenames.txt", 'covid')
df_corona = get_df(g_pca[idx1,0], g_pca[idx1,1], "data/othercorona_ppis_good_with_genenames.txt", 'other corona')


In [112]:
# Combine the per-virus tables in the order hiv, covid, other corona.
# pd.concat is used because DataFrame.append no longer exists in pandas 2.x;
# like append, it keeps each frame's original row index.
df_all = pd.concat([df_hiv, df_covid, df_corona])

# Interactive scatter coloured by virus; hovering a point shows its PPI label.
fig = px.scatter(df_all, x='PCA1',y='PCA2', hover_data=['PPI'], color='virus')

fig.show()
fig.write_html('output.html')

In [None]:
# Static version of the embedding plot: one scatter layer per virus, drawn in
# the order hiv, sars-cov-2, other corona, then written to a PDF.
layers = [
    (idx3, 'green', 'hiv'),
    (idx2, 'blue', 'sars-cov-2'),
    (idx1, 'coral', 'Other corona'),
    #(idx4, 'yellow', 'Negatives'),
]
for rows, colour, name in layers:
    plt.scatter(g_pca[rows,0], g_pca[rows,1], c=colour, label=name)
plt.legend()
plt.savefig('TSNE_viruses.pdf', dpi=300, bbox_inches='tight')