This implementation was tested on TIPs-VF for truncation and fragmentation analysis. For more information on TIPs-VF, please visit the [TIPs portal](https://tips.logiacommunications.com/) or the [GitHub repository](https://github.com/mahvin92/TIPs-VF).

In [None]:
# Import your TIPs-VF encoded file
# NOTE: this cell imports `google.colab`, so it assumes a Google Colab runtime.
from google.colab import files
# Start the Colab upload and keep whatever it returns in `uploaded`.
# The uploaded file's name has to match `file_path` in the analysis cell below.
uploaded = files.upload()


In [None]:
# Load the necessary libraries
!pip install umap-learn

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap
from tensorflow.keras import layers, models
import seaborn as sns

In [None]:

# Run the neural network and data visualization

# Path of the tab-separated TIPs-VF table to analyse.
file_path = 'chr genes TIPS_FR.tsv' # file name should match your upload
# The 'cos_sim', 'theta_s' and 'Gene_name' columns of this table are read further down.
data = pd.read_csv(file_path, sep='\t')

print(data.head()) # for manual inspection

def process_column(col):
    """Convert a comma-separated string of numbers into a 1-D float array.

    Parameters
    ----------
    col : str
        Text such as ``"0.1,0.2,0.3"``; every comma-delimited token must be
        accepted by ``float``.

    Returns
    -------
    numpy.ndarray
        The parsed values, in their original order.

    Raises
    ------
    ValueError
        If any token cannot be converted by ``float`` (for example the empty
        token produced by a trailing comma).
    """
    tokens = col.split(',')
    return np.array(list(map(float, tokens)))

# Parse the two comma-separated vector columns into one float array per row.
X_cos_sim = data['cos_sim'].apply(process_column)
X_theta_s = data['theta_s'].apply(process_column)

# Join each row's cos_sim vector and theta_s vector end to end, then stack the rows.
X_combined = np.array([
    np.concatenate(row_pair)
    for row_pair in zip(X_cos_sim, X_theta_s)
])

print(f"Shape of the data: {X_combined.shape}")

# Standardise the combined features before they are fed to the autoencoder.
feature_scaler = StandardScaler()
X_scaled = feature_scaler.fit_transform(X_combined)

# Gene names are kept aside; they drive the colour coding of the plots.
labels = data['Gene_name']

def color_code_gene_name(gene_name):
    """Return the integer colour code for a gene name.

    The name is searched for a known accession substring; the first match,
    in the order listed below, decides the code.

    Parameters
    ----------
    gene_name : str
        Gene identifier to classify.

    Returns
    -------
    int
        0 for ``NC_000001``, 1 for ``NC_000002``, 2 for ``NC_000003`` and
        3 for anything else (not expected when running the TIPs-VF data).
    """
    accession_codes = (
        ('NC_000001', 0),
        ('NC_000002', 1),
        ('NC_000003', 2),
    )
    for accession, code in accession_codes:
        if accession in gene_name:
            return code
    # Fallback for names that match none of the accessions above.
    return 3

# Translate every gene name into its integer colour code (0-3) for the scatter plots.
color_codes = labels.apply(color_code_gene_name)


def build_autoencoder(input_dim, embedding_dim=10): # building the neural network model
    """Build an uncompiled dense autoencoder.

    The encoder is Dense(128, relu) -> Dense(64, relu) ->
    Dense(embedding_dim, linear); the decoder mirrors it back to
    ``input_dim`` units with a linear output layer.

    Parameters
    ----------
    input_dim : int
        Number of input features (also the width of the reconstruction).
    embedding_dim : int, optional
        Width of the bottleneck layer. Defaults to 10.

    Returns
    -------
    keras Model
        Model mapping the input layer to the reconstruction output.
    """
    hidden_units = (128, 64)
    inputs = layers.Input(shape=(input_dim,))

    # Encoder: progressively narrower ReLU layers, then the linear bottleneck.
    hidden = inputs
    for units in hidden_units:
        hidden = layers.Dense(units, activation='relu')(hidden)
    bottleneck = layers.Dense(embedding_dim, activation='linear')(hidden)

    # Decoder: the same widths in reverse, then a linear reconstruction layer.
    hidden = bottleneck
    for units in reversed(hidden_units):
        hidden = layers.Dense(units, activation='relu')(hidden)
    reconstruction = layers.Dense(input_dim, activation='linear')(hidden)

    return models.Model(inputs, reconstruction)

# The autoencoder's input/output width is the number of scaled feature columns.
autoencoder_model = build_autoencoder(X_scaled.shape[1], embedding_dim=10) # Initialize the model

autoencoder_model.compile(optimizer='adam', loss='mse', metrics=['mae']) # MSE reconstruction

# X_scaled is passed as both input and target: the network learns to reproduce it.
autoencoder_model.fit(X_scaled, X_scaled, epochs=50, batch_size=32, verbose=1) # Training phase

# NOTE(review): layers[3] is presumably the embedding_dim-wide Dense layer (input layer
# at index 0, then the 128- and 64-unit layers) -- confirm with
# autoencoder_model.summary() whenever build_autoencoder changes.
encoder = models.Model(inputs=autoencoder_model.input, outputs=autoencoder_model.layers[3].output) # Extract embeddings
embeddings = encoder.predict(X_scaled)

def plot_embeddings(embeddings, color_codes, title): # Embedding visualization
    """Scatter-plot the first two columns of ``embeddings``, coloured by category.

    Parameters
    ----------
    embeddings : array-like
        Indexed as ``embeddings[:, 0]`` (x axis) and ``embeddings[:, 1]``
        (y axis), so it needs at least two columns.
    color_codes : sequence of int
        One category code per row, in the range 0-3 as produced by
        ``color_code_gene_name``.
    title : str
        Title shown above the plot.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    category_labels = ['NC_000001', 'NC_000002', 'NC_000003', 'Others']
    category_ticks = list(range(len(category_labels)))

    fig, ax = plt.subplots(1, 1, figsize=(10, 8))
    scatter = ax.scatter(
        embeddings[:, 0],
        embeddings[:, 1],
        c=color_codes,
        cmap='viridis',
        # Pin the colour range to the full category range. Without this the
        # range autoscales to the codes actually present, so the same
        # category gets different colours across datasets and some of the
        # fixed colorbar ticks fall outside the bar.
        vmin=category_ticks[0],
        vmax=category_ticks[-1],
        alpha=0.5,
        s=200,
    )
    ax.set_title(title)

    cbar = fig.colorbar(scatter, ax=ax, label='Gene Categories')
    cbar.set_ticks(category_ticks)
    cbar.set_ticklabels(category_labels)
    plt.show()

# Each projection below reduces the autoencoder embeddings to two components and is
# plotted with the same gene-category colour codes.
pca = PCA(n_components=2) # PCA Visualization
pca_result = pca.fit_transform(embeddings)
plot_embeddings(pca_result, color_codes, 'PCA Embeddings')

# random_state is fixed so repeated runs on the same embeddings use the same seed.
tsne = TSNE(n_components=2, random_state=42) # t-SNE Visualization
tsne_result = tsne.fit_transform(embeddings)
plot_embeddings(tsne_result, color_codes, 't-SNE Embeddings')

# Same fixed seed as for t-SNE.
umap_model = umap.UMAP(n_components=2, random_state=42) # UMAP Visualization
umap_result = umap_model.fit_transform(embeddings)
plot_embeddings(umap_result, color_codes, 'UMAP Embeddings')


In [None]:
# Posthoc analyses (pairwise distances)

from scipy.spatial.distance import pdist, squareform


def _show_distance_heatmap(points, title):
    """Plot the pairwise Euclidean distances between rows of `points` and return them.

    `points` is a 2-D array with one sample per row; the returned value is the
    square (n_samples x n_samples) distance matrix that was drawn.
    """
    distance_matrix = squareform(pdist(points, metric='euclidean'))
    plt.figure(figsize=(12, 10))
    sns.heatmap(distance_matrix, cmap='viridis', cbar=True)
    plt.title(title)
    plt.xlabel('Embedding Index')
    plt.ylabel('Embedding Index')
    plt.show()
    return distance_matrix


# 1. Pairwise distances between the PCA-projected embeddings
pca_distances = _show_distance_heatmap(pca_result, 'Heatmap of PCA Embedding Distances')

# 2. Pairwise distances between the UMAP-projected embeddings
umap_distances = _show_distance_heatmap(umap_result, 'Heatmap of UMAP Embedding Distances')

# 3. Pairwise distances between the t-SNE-projected embeddings
tsne_distances = _show_distance_heatmap(tsne_result, 'Heatmap of t-SNE Embedding Distances')

In [None]:
# Posthoc analyses (with dendrograms)

from scipy.cluster.hierarchy import linkage, dendrogram
# Imported in this cell too, so it runs even when the pairwise-distance cell
# has not been executed in the current session.
from scipy.spatial.distance import pdist, squareform


def _plot_distance_heatmap(points, title):
    """Plot the pairwise Euclidean distances between rows of `points` and return them.

    `points` is a 2-D array with one sample per row; the returned value is the
    square (n_samples x n_samples) distance matrix that was drawn.
    """
    distances = squareform(pdist(points, metric='euclidean'))
    plt.figure(figsize=(12, 10))
    sns.heatmap(distances, cmap='viridis', cbar=True)
    plt.title(title)
    plt.xlabel('Embedding Index')
    plt.ylabel('Embedding Index')
    plt.show()
    return distances


def _plot_ward_dendrogram(points, title):
    """Cluster the rows of `points` with Ward linkage, plot the dendrogram, return the linkage matrix."""
    plt.figure(figsize=(15, 10))
    linkage_matrix = linkage(points, method='ward')
    dendrogram(linkage_matrix, leaf_rotation=90, leaf_font_size=10)
    plt.title(title)
    plt.xlabel('Sample Index')
    plt.ylabel('Distance')
    plt.show()
    return linkage_matrix


# 1. Heatmap and dendrogram for the PCA embeddings
pca_distances = _plot_distance_heatmap(pca_result, 'Heatmap of PCA Embedding Distances')
linkage_matrix_pca = _plot_ward_dendrogram(pca_result, 'Dendrogram of PCA Embedding Distances')

# 2. Heatmap and dendrogram for the UMAP embeddings
umap_distances = _plot_distance_heatmap(umap_result, 'Heatmap of UMAP Embedding Distances')
linkage_matrix_umap = _plot_ward_dendrogram(umap_result, 'Dendrogram of UMAP Embedding Distances')

# 3. Heatmap and dendrogram for the t-SNE embeddings
tsne_distances = _plot_distance_heatmap(tsne_result, 'Heatmap of t-SNE Embedding Distances')
linkage_matrix_tsne = _plot_ward_dendrogram(tsne_result, 'Dendrogram of t-SNE Embedding Distances')
