In [None]:
!pip install torch.geometric

Collecting torch.geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━[0m [32m0.7/1.1 MB[0m [31m21.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch.geometric
Successfully installed torch.geometric-2.6.1


# Data Initialization

In [None]:
import torch
from torch_geometric.data import HeteroData
import pandas as pd

# Load Data
gene_cell_matrix = pd.read_csv("/content/GSE135893_filtered_5000.csv", index_col=0)
gene_interactions = pd.read_csv("/content/human_lr_pair.csv")
graph_df = pd.read_csv("/content/GSE135893_graph.csv")
cell_numbers_df = pd.read_csv("/content/cell_numbers_5000.csv")
#cell_metadata_df = pd.read_csv("/content/metadata_5000.csv")

# Initialize HeteroData
data = HeteroData()

# Add Nodes
# Rows of the expression matrix are genes, columns are cells; both node types get a
# constant 1-dimensional feature plus an explicit integer node id.
gene_nodes = gene_cell_matrix.index.tolist()
cell_nodes = gene_cell_matrix.columns.tolist()
data['gene'].x = torch.ones((len(gene_nodes), 1))  # Feature vector size 1 for genes
data['cell'].x = torch.ones((len(cell_nodes), 1))  # Feature vector size 1 for cells
data['gene'].node_id = torch.arange(len(gene_nodes))
data['cell'].node_id = torch.arange(len(cell_nodes))

# Name -> positional index lookups (used for the ligand/receptor edges below and by later cells)
gene_idx_map = {gene: i for i, gene in enumerate(gene_nodes)}
cell_idx_map = {cell: i for i, cell in enumerate(cell_nodes)}

# Add Gene-Cell Edges
# Vectorised replacement for the former per-element iterrows() loop: every strictly
# positive entry (row=gene, col=cell) becomes one edge whose weight is the entry itself.
# torch.nonzero and boolean-mask indexing both walk the matrix in row-major order, so the
# edges and weights stay aligned and keep the order the nested loop produced.
# NOTE: positions are used directly, which equals the old label lookup as long as the
# gene and cell labels are unique.
expression = torch.tensor(gene_cell_matrix.to_numpy(dtype="float64", copy=True))
expressed_mask = expression > 0                      # NaN compares False and is skipped
gene_cell_edges = expressed_mask.nonzero()           # [num_edges, 2] (gene_idx, cell_idx)
gene_cell_weights = expression[expressed_mask].to(torch.float)

data['gene', 'expressed_in', 'cell'].edge_index = gene_cell_edges.t().contiguous()
data['gene', 'expressed_in', 'cell'].edge_attr = gene_cell_weights

# Supervision targets: every existing gene->cell edge is labelled 1 when its weight
# exceeds 0.5 and 0 otherwise.
data['gene', 'expressed_in', 'cell'].edge_label_index = data['gene', 'expressed_in', 'cell'].edge_index
data['gene', 'expressed_in', 'cell'].edge_label = (data['gene', 'expressed_in', 'cell'].edge_attr > 0.5).long()

# Add Gene-Gene Edges
# Keep only ligand/receptor pairs whose two genes are both present in the matrix.
gene_gene_edges = [
    (gene_idx_map[ligand], gene_idx_map[receptor])
    for ligand, receptor in zip(gene_interactions['ligand'], gene_interactions['receptor'])
    if ligand in gene_idx_map and receptor in gene_idx_map
]

# reshape(-1, 2) keeps the result a valid [2, 0] edge_index even when no pair matched
data['gene', 'interacts_with', 'gene'].edge_index = (
    torch.tensor(gene_gene_edges, dtype=torch.long).reshape(-1, 2).t().contiguous()
)

# Map cell names to indices from cell_numbers.csv
# NOTE(review): this lookup is built but never applied below; NodeA/NodeB are used
# as-is, i.e. they are assumed to already be 0-based positions in `cell_nodes` — confirm.
cell_name_to_index = dict(zip(cell_numbers_df['Cell'], cell_numbers_df['Number']))

# Prepare Cell-Cell Edges
cell_cell_edges = list(zip(graph_df['NodeA'], graph_df['NodeB']))
cell_cell_weights = graph_df['Weights'].tolist()

# Convert Cell-Cell Edges to Tensors
cell_cell_edge_index = torch.tensor(cell_cell_edges, dtype=torch.long).reshape(-1, 2).t().contiguous()
cell_cell_edge_attr = torch.tensor(cell_cell_weights, dtype=torch.float)

# Fail early, with a readable message, if an endpoint cannot be a valid cell position.
if cell_cell_edge_index.numel() > 0:
    lowest_cell_idx = int(cell_cell_edge_index.min())
    highest_cell_idx = int(cell_cell_edge_index.max())
    if lowest_cell_idx < 0 or highest_cell_idx >= len(cell_nodes):
        raise ValueError(
            f"Cell-cell edge endpoints span [{lowest_cell_idx}, {highest_cell_idx}] "
            f"but there are only {len(cell_nodes)} cell nodes"
        )

# Add Cell-Cell Edges to HeteroData
data['cell', 'interacts_with', 'cell'].edge_index = cell_cell_edge_index
data['cell', 'interacts_with', 'cell'].edge_attr = cell_cell_edge_attr

# Metadata
metadata = (list(data.node_types), list(data.edge_types))

# Metadata for Debugging
print("Node Types:", list(data.node_types))
print("Edge Types:", list(data.edge_types))
print("Cell-Cell Edge Index Shape:", data['cell', 'interacts_with', 'cell'].edge_index.shape)
print("Cell-Cell Edge Attributes Shape:", data['cell', 'interacts_with', 'cell'].edge_attr.shape)

# Add Cell Type Metadata
# Map unique cell types to integers
#unique_cell_types = cell_metadata_df['Cell_Type'].unique()
#cell_type_to_int = {ctype: idx for idx, ctype in enumerate(unique_cell_types)}

# Map cell barcodes to their integer-encoded cell types
#cell_type_map = {row['Cell_Barcode']: cell_type_to_int[row['Cell_Type']] for _, row in cell_metadata_df.iterrows()}

# Encode cell type IDs for each cell node
#cell_type_ids = [cell_type_map.get(cell, -1) for cell in cell_nodes]  # Use -1 for missing types

# Convert to one-hot encoding
#cell_type_ids_tensor = torch.tensor(cell_type_ids, dtype=torch.long)
#cell_type_one_hot = F.one_hot(cell_type_ids_tensor, num_classes=len(unique_cell_types)).float()

# Append to existing cell features
#data['cell'].x = torch.cat([data['cell'].x, cell_type_one_hot], dim=1)


Node Types: ['gene', 'cell']
Edge Types: [('gene', 'expressed_in', 'cell'), ('gene', 'interacts_with', 'gene'), ('cell', 'interacts_with', 'cell')]
Cell-Cell Edge Index Shape: torch.Size([2, 9990])
Cell-Cell Edge Attributes Shape: torch.Size([9990])


**Split the data into training, validation, and test sets**

In [None]:
import math

# Fractions of the labelled gene->cell edges assigned to each split.
train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1
# Compare with a tolerance: sums of binary floats are not guaranteed to equal 1.0 exactly
# (e.g. 0.7 + 0.2 + 0.1 != 1.0), so an exact `==` would reject perfectly valid ratios.
assert math.isclose(train_ratio + val_ratio + test_ratio, 1.0), "split ratios must sum to 1"

In [None]:
# Pull the gene->cell tensors out of the graph once so the split cells below can index them.
_expression_edges = data[('gene', 'expressed_in', 'cell')]
edge_index = _expression_edges.edge_index              # shape [2, num_edges]
edge_label_index = _expression_edges.edge_label_index  # shape [2, num_edges]
edge_label = _expression_edges.edge_label              # shape [num_edges]

In [None]:
import numpy as np
# Seed the shuffle so Restart & Run All always yields the same train/val/test partition
# (the previous unseeded np.random.shuffle produced a different split on every run).
SPLIT_SEED = 42
num_edges = edge_label.size(0)
indices = np.random.default_rng(SPLIT_SEED).permutation(num_edges)

# Train and validation sizes are floored; the test split absorbs the rounding remainder
# so that the three parts always cover every edge exactly once.
train_size = int(train_ratio * num_edges)
val_size = int(val_ratio * num_edges)
test_size = num_edges - train_size - val_size

train_indices = indices[:train_size]
val_indices = indices[train_size:train_size+val_size]
test_indices = indices[train_size+val_size:]
assert len(train_indices) + len(val_indices) + len(test_indices) == num_edges

In [None]:
# Attach each split's supervision pairs and labels to the gene->cell edge store under
# the keys `<split>_edge_index` / `<split>_edge_label`.
_expression_edges = data[('gene', 'expressed_in', 'cell')]
for _split_name, _split_positions in (
    ('train', train_indices),
    ('val', val_indices),
    ('test', test_indices),
):
    _expression_edges[f'{_split_name}_edge_index'] = edge_label_index[:, _split_positions]
    _expression_edges[f'{_split_name}_edge_label'] = edge_label[_split_positions]

# Link prediction with HGT

In [None]:
import pandas as pd
import torch
from torch_geometric.data import HeteroData
from torch_geometric.utils import train_test_split_edges
import torch.nn.functional as F

In [None]:
# With MLP predictor
import torch
import torch.nn as nn
from torch_geometric.nn import HGTConv

class HGTLinkPredictor(torch.nn.Module):
    """HGT encoder followed by an MLP that scores (source, destination) node pairs.

    `forward` produces one embedding matrix per node type; `predict_edges` turns a set
    of candidate pairs into one raw score (logit) per pair.
    """

    def __init__(self, metadata, hidden_channels, out_channels, num_heads, num_layers, dropout_rate=0.6):
        """Build `num_layers` HGTConv layers, a shared linear projection and the edge MLP.

        Args:
            metadata: graph metadata passed straight to every HGTConv.
            hidden_channels: width of each HGT layer and of the edge MLP's hidden layer.
            out_channels: size of the final per-node embedding.
            num_heads: attention heads per HGT layer.
            num_layers: number of stacked HGT layers.
            dropout_rate: dropout probability used after every HGT layer and in the MLP.
        """
        super().__init__()

        # Encoder stack; in_channels=-1 lets each HGTConv infer its input width lazily.
        self.layers = torch.nn.ModuleList(
            HGTConv(in_channels=-1, out_channels=hidden_channels, metadata=metadata, heads=num_heads)
            for _ in range(num_layers)
        )
        self.dropout_rate = dropout_rate
        self.dropout = nn.Dropout(p=dropout_rate)

        # One projection shared by all node types: hidden_channels -> out_channels.
        self.final_proj = nn.Linear(hidden_channels, out_channels)

        # Scores a pair from the concatenation [src_embedding, dst_embedding].
        pair_width = 2 * out_channels
        self.edge_mlp = nn.Sequential(
            nn.Linear(pair_width, hidden_channels),
            nn.ReLU(),
            nn.Dropout(p=dropout_rate),
            nn.Linear(hidden_channels, 1),
        )

    def forward(self, x_dict, edge_index_dict):
        """Return a dict mapping each node type to its projected embeddings."""
        hidden = x_dict
        for conv in self.layers:
            hidden = conv(hidden, edge_index_dict)
            # Dropout after every HGT layer, applied per node type.
            hidden.update({node_type: self.dropout(emb) for node_type, emb in hidden.items()})

        # Final linear map into the out_channels embedding space.
        hidden.update({node_type: self.final_proj(emb) for node_type, emb in hidden.items()})
        return hidden

    def predict_edges(self, x_dict, edge_label_index, node_type_src, node_type_dst):
        """Score candidate edges.

        Row 0 of `edge_label_index` indexes nodes of `node_type_src`, row 1 nodes of
        `node_type_dst`. Returns a 1-D tensor with one raw score per pair.
        """
        src_positions, dst_positions = edge_label_index[0], edge_label_index[1]
        pair_features = torch.cat(
            (x_dict[node_type_src][src_positions], x_dict[node_type_dst][dst_positions]),
            dim=-1,
        )
        return self.edge_mlp(pair_features).squeeze(-1)

In [None]:
#del data[('cell', 'interacts_with', 'cell')]

In [None]:
# Hyperparameters of the HGT link classifier
hidden_channels = 64
heads = 4
num_layers = 2
# Was declared as 32 but never used: the model below was always built with a literal 64.
# The variable now carries the value that was actually in effect and is passed through.
out_channels = 64

model = HGTLinkPredictor(metadata, hidden_channels, out_channels=out_channels, num_heads=heads, num_layers=num_layers, dropout_rate=0.6)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-4)  # L2 regularization with weight_decay
criterion = torch.nn.BCEWithLogitsLoss()

train_edge_label_index = data[('gene', 'expressed_in', 'cell')].train_edge_index
train_edge_labels = data[('gene', 'expressed_in', 'cell')].train_edge_label

# NOTE(review): message passing uses data.edge_index_dict, which still contains every
# gene->cell edge (the split above only partitions the supervision pairs), so the
# validation/test edges are visible to the encoder during training.
for epoch in range(10):
    model.train()
    optimizer.zero_grad()

    # Full-graph forward pass, then score only the training pairs
    x_dict = model(data.x_dict, data.edge_index_dict)

    edge_scores = model.predict_edges(x_dict, train_edge_label_index, 'gene', 'cell') # Predict on training edges only

    edge_labels = train_edge_labels.to(edge_scores.device)
    loss = criterion(edge_scores, edge_labels.float())

    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

model.eval()
with torch.no_grad():
    # Recompute the embeddings in eval mode. Reusing `x_dict` from the last training step
    # would score the test pairs with dropout-perturbed embeddings that also predate the
    # final optimizer update.
    x_dict = model(data.x_dict, data.edge_index_dict)
    test_edge_label_index = data[('gene', 'expressed_in', 'cell')].test_edge_index
    test_edge_scores = model.predict_edges(x_dict, test_edge_label_index, 'gene', 'cell')
    print("Sample predictions on test set:", test_edge_scores[:5])

Epoch 1, Loss: 0.6773214936256409
Epoch 2, Loss: 0.5622010827064514
Epoch 3, Loss: 0.41519662737846375
Epoch 4, Loss: 0.3007010221481323
Epoch 5, Loss: 0.40280938148498535
Epoch 6, Loss: 0.37188494205474854
Epoch 7, Loss: 0.3104037046432495
Epoch 8, Loss: 0.30102434754371643
Epoch 9, Loss: 0.3130376935005188
Epoch 10, Loss: 0.3146982192993164
Sample predictions on test set: tensor([1.9545, 1.8972, 1.7733, 1.8858, 1.7553])


In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score

def evaluate(model, data, edge_type, src_type, dst_type, subset='test'):
    """Compute accuracy and ROC-AUC of `model` on one split of `edge_type`.

    The split is read from `data[edge_type]` under the keys `<subset>_edge_index` and
    `<subset>_edge_label`. Scores are passed through a sigmoid; a probability above 0.5
    counts as a positive prediction.

    Returns:
        (accuracy, auc) as computed by scikit-learn.
    """
    model.eval()
    with torch.no_grad():
        node_embeddings = model(data.x_dict, data.edge_index_dict)

        pair_index = data[edge_type][f'{subset}_edge_index']
        targets = data[edge_type][f'{subset}_edge_label'].to(next(model.parameters()).device)

        logits = model.predict_edges(node_embeddings, pair_index, src_type, dst_type)
        probabilities = torch.sigmoid(logits)  # logits -> probabilities
        hard_predictions = (probabilities > 0.5).long()

        accuracy = accuracy_score(targets.cpu(), hard_predictions.cpu())
        area_under_curve = roc_auc_score(targets.cpu(), probabilities.cpu())

        return accuracy, area_under_curve

In [None]:
# Report accuracy / AUC of the trained classifier on the train and test supervision pairs.
edge_type = ('gene', 'expressed_in', 'cell')
src_type, dst_type = edge_type[0], edge_type[-1]

train_acc, train_auc = evaluate(model, data, edge_type, src_type, dst_type, subset='train')
print(f"Train Accuracy: {train_acc:.4f}", f"Train AUC: {train_auc:.4f}", sep="\n")

test_acc, test_auc = evaluate(model, data, edge_type, src_type, dst_type, subset='test')
print(f"Test Accuracy: {test_acc:.4f}", f"Test AUC: {test_auc:.4f}", sep="\n")

Train Accuracy: 0.9184
Train AUC: 0.7335
Test Accuracy: 0.9181
Test AUC: 0.7338


# VAE-HGT

In [None]:
import torch.nn as nn

class HGTLinkPredictorVAE(torch.nn.Module):
    """Variational HGT encoder with an MLP decoder that regresses edge weights.

    The encoder maps every node to the mean and log-variance of a diagonal Gaussian;
    the decoder predicts one scalar weight per (source, destination) pair from the
    concatenated latent vectors.
    """

    def __init__(self, metadata, hidden_channels, latent_dim, num_heads, num_layers, dropout=0.6):
        """Create the HGT stack, the two latent projections and the decoder MLP.

        Args:
            metadata: graph metadata passed to every HGTConv.
            hidden_channels: width of the HGT layers and of the decoder's hidden layer.
            latent_dim: dimensionality of the per-node latent vector.
            num_heads: attention heads per HGT layer.
            num_layers: number of stacked HGT layers.
            dropout: dropout probability after each HGT layer and inside the decoder.
        """
        super(HGTLinkPredictorVAE, self).__init__()
        self.layers = torch.nn.ModuleList()
        self.dropout = nn.Dropout(p=dropout)

        # Encoder: HGT layers to process the graph
        for _ in range(num_layers):
            self.layers.append(
                HGTConv(
                    in_channels=-1,  # Automatically infer input channels
                    out_channels=hidden_channels,
                    metadata=metadata,  # Node and edge types
                    heads=num_heads
                )
            )

        # Linear heads producing the latent mean and log-variance (shared by all node types)
        self.mu_proj = nn.Linear(hidden_channels, latent_dim)
        self.logvar_proj = nn.Linear(hidden_channels, latent_dim)

        # Decoder: Use MLP for edge weight prediction
        self.decoder_mlp = nn.Sequential(
            nn.Linear(2 * latent_dim, hidden_channels),
            nn.ReLU(),
            nn.Dropout(p=dropout),
            nn.Linear(hidden_channels, 1)  # Single output for edge weight
        )

    def encode(self, x_dict, edge_index_dict):
        """Run the HGT stack and return `(mu_dict, logvar_dict)` keyed by node type."""
        # Pass node features through HGT layers with dropout
        for layer in self.layers:
            x_dict = layer(x_dict, edge_index_dict)
            x_dict = {key: self.dropout(x) for key, x in x_dict.items()}
        mu_dict = {key: self.mu_proj(x) for key, x in x_dict.items()}
        logvar_dict = {key: self.logvar_proj(x) for key, x in x_dict.items()}
        return mu_dict, logvar_dict

    def decode(self, z_dict, edge_label_index, node_type_src, node_type_dst):
        """Predict one weight per pair in `edge_label_index` (row 0 = source, row 1 = destination)."""
        src_embeddings = z_dict[node_type_src][edge_label_index[0]]
        dst_embeddings = z_dict[node_type_dst][edge_label_index[1]]
        concatenated_embeddings = torch.cat([src_embeddings, dst_embeddings], dim=-1)
        edge_weights = self.decoder_mlp(concatenated_embeddings).squeeze(-1)  # Remove last dim for output
        return edge_weights

    def reparameterize(self, mu, logvar):
        """Reparameterization trick: draw `mu + eps * std` with `eps ~ N(0, I)`."""
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, x_dict, edge_index_dict, edge_label_index, node_type_src, node_type_dst):
        """Encode the graph, obtain latent vectors and decode the requested pairs.

        While training, the latent vectors are sampled with the reparameterization trick.
        In eval mode the posterior mean is used instead, so repeated evaluation or
        imputation calls return identical predictions rather than a fresh noisy draw.

        Returns:
            (edge_weights, mu_dict, logvar_dict)
        """
        mu_dict, logvar_dict = self.encode(x_dict, edge_index_dict) # Encode the graph into latent space
        if self.training:
            z_dict = {
                key: self.reparameterize(mu_dict[key], logvar_dict[key])
                for key in mu_dict
            }
        else:
            # Deterministic inference: decode from the posterior mean
            z_dict = mu_dict

        edge_weights = self.decode(z_dict, edge_label_index, node_type_src, node_type_dst) # Decode latent space to predict edge weights
        return edge_weights, mu_dict, logvar_dict

 # VAE Loss function
def vae_loss(edge_weights_pred, edge_weights_true, mu_dict, logvar_dict, beta=0.1):
    """Beta-weighted VAE objective: mean reconstruction error plus per-node KL divergence.

    Args:
        edge_weights_pred: predicted edge weights.
        edge_weights_true: target edge weights, same shape as the predictions.
        mu_dict: latent means keyed by node type; each tensor's first dimension is
            taken to be the number of nodes of that type.
        logvar_dict: latent log-variances with the same keys and shapes as `mu_dict`.
        beta: weight of the KL term.

    Returns:
        Scalar tensor `mse + beta * kl`, where `kl` is the KL divergence to a standard
        normal averaged over all nodes.

    The KL term used to be a plain sum over every node and latent dimension while the
    reconstruction term is a mean, so the KL grew with graph size and swamped the
    reconstruction signal. Averaging over nodes puts both terms on a per-item scale.
    """
    recon_loss = F.mse_loss(edge_weights_pred, edge_weights_true, reduction='mean')

    kl_sum = recon_loss.new_zeros(())
    node_count = 0
    for node_type, mu in mu_dict.items():
        logvar = logvar_dict[node_type]  # pair by key rather than by iteration order
        kl_sum = kl_sum + (-0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp()))
        node_count += mu.size(0)

    # max(..., 1) keeps the empty-graph case well defined (KL contribution of 0)
    kl_loss = kl_sum / max(node_count, 1)
    return recon_loss + beta * kl_loss

In [None]:
from sklearn.preprocessing import MinMaxScaler
import torch
from torch_geometric.nn import HGTConv
import torch.nn.functional as F
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from torch.optim import Adam

metadata = (list(data.node_types), list(data.edge_types))

# Hyperparameters of the variational HGT
hidden_channels = 64
latent_dim = 32
num_heads = 4
num_layers = 2

# Normalize edge weights
# Keep an untouched copy of the raw weights on the edge store and always fit the scaler
# on that copy. Scaling `edge_attr` in place made this cell non-idempotent: on a second
# run the scaler was fitted on already-scaled values, so the later `inverse_transform`
# no longer mapped predictions back to the original expression scale.
expression_edges = data[('gene', 'expressed_in', 'cell')]
if 'edge_attr_raw' not in expression_edges:
    expression_edges.edge_attr_raw = expression_edges.edge_attr.clone()

scaler = MinMaxScaler()
expression_edges.edge_attr = torch.tensor(
    scaler.fit_transform(expression_edges.edge_attr_raw.numpy().reshape(-1, 1)),  # Reshape to 2D
    dtype=torch.float
)

model = HGTLinkPredictorVAE(
    metadata=metadata,
    hidden_channels=hidden_channels,
    latent_dim=latent_dim,
    num_heads=num_heads,
    num_layers=num_layers,
    dropout=0.6
)
optimizer = Adam(model.parameters(), lr=0.01)

# Inputs and regression targets do not change between epochs, so fetch them once.
x_dict = data.x_dict
edge_index_dict = data.edge_index_dict
edge_label_index = expression_edges.edge_index
edge_attr = expression_edges.edge_attr.float()
edge_targets = edge_attr.squeeze(-1)  # [num_edges, 1] -> [num_edges] to match the predictions

num_epochs = 50
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()

    edge_weights_pred, mu_dict, logvar_dict = model(
        x_dict=x_dict,
        edge_index_dict=edge_index_dict,
        edge_label_index=edge_label_index,
        node_type_src='gene',
        node_type_dst='cell'
    )

    loss = vae_loss(edge_weights_pred, edge_targets, mu_dict, logvar_dict)
    loss.backward()  # Backpropagate gradients
    optimizer.step()  # Update model weights

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item():.4f}")

Epoch 1/50, Loss: 251.9503
Epoch 2/50, Loss: 100.0793
Epoch 3/50, Loss: 45.8169
Epoch 4/50, Loss: 26.5693
Epoch 5/50, Loss: 22.0141
Epoch 6/50, Loss: 22.5512
Epoch 7/50, Loss: 19.8316
Epoch 8/50, Loss: 16.8903
Epoch 9/50, Loss: 12.9964
Epoch 10/50, Loss: 11.5743
Epoch 11/50, Loss: 11.1299
Epoch 12/50, Loss: 10.7856
Epoch 13/50, Loss: 9.2543
Epoch 14/50, Loss: 7.4637
Epoch 15/50, Loss: 6.1150
Epoch 16/50, Loss: 5.7970
Epoch 17/50, Loss: 5.9848
Epoch 18/50, Loss: 5.6359
Epoch 19/50, Loss: 4.9511
Epoch 20/50, Loss: 3.8052
Epoch 21/50, Loss: 3.2883
Epoch 22/50, Loss: 3.0083
Epoch 23/50, Loss: 3.0835
Epoch 24/50, Loss: 3.0340
Epoch 25/50, Loss: 2.5815
Epoch 26/50, Loss: 2.2120
Epoch 27/50, Loss: 1.8707
Epoch 28/50, Loss: 1.8444
Epoch 29/50, Loss: 1.8397
Epoch 30/50, Loss: 1.7028
Epoch 31/50, Loss: 1.4873
Epoch 32/50, Loss: 1.2235
Epoch 33/50, Loss: 1.0971
Epoch 34/50, Loss: 1.1344
Epoch 35/50, Loss: 1.1586
Epoch 36/50, Loss: 1.0924
Epoch 37/50, Loss: 0.9697
Epoch 38/50, Loss: 0.8282
Epoch 3

In [None]:
# Regression quality of the VAE on the (scaled) weights of the existing gene->cell edges.
model.eval()
with torch.no_grad():
    edge_weights_pred, _, _ = model(
        data.x_dict,
        data.edge_index_dict,
        data[('gene', 'expressed_in', 'cell')].edge_index,
        'gene',
        'cell',
    )

true_edge_weights = data[('gene', 'expressed_in', 'cell')].edge_attr.float()

# scikit-learn works on NumPy arrays
predicted = edge_weights_pred.cpu().numpy()
true = true_edge_weights.cpu().numpy()

mse = mean_squared_error(true, predicted)
mae = mean_absolute_error(true, predicted)
r2 = r2_score(true, predicted)

print(
    f"Mean Squared Error (MSE): {mse:.4f}",
    f"Mean Absolute Error (MAE): {mae:.4f}",
    f"R² Score: {r2:.4f}",
    sep="\n",
)

Mean Squared Error (MSE): 0.0092
Mean Absolute Error (MAE): 0.0693
R² Score: -0.0074


**Impute unknown gene-cell interactions**

In [None]:
# Find missing edges (zero or NaN entries in the gene-cell matrix)
# Vectorised replacement for the per-element iterrows() loop. torch.nonzero walks the
# mask in row-major order, i.e. the same (gene, cell) order the nested loop produced.
# NOTE: positions are used directly, which matches the old label lookup as long as gene
# and cell labels are unique.
_expression_values = torch.tensor(gene_cell_matrix.to_numpy(dtype="float64", copy=True))
_missing_mask = torch.isnan(_expression_values) | (_expression_values == 0)
missing_gene_cell_edges = _missing_mask.nonzero()  # [num_missing, 2] (gene_idx, cell_idx)

# Convert missing edges to a [2, num_missing] index tensor (stays [2, 0] when nothing is missing)
missing_edge_index = missing_gene_cell_edges.t().contiguous()

In [None]:
model.eval()

with torch.no_grad():
    missing_edge_weights, _, _ = model(
        x_dict=data.x_dict,
        edge_index_dict=data.edge_index_dict,
        edge_label_index=missing_edge_index,
        node_type_src='gene',
        node_type_dst='cell'
    )

In [None]:
# Convert predicted edge weights back to the original scale
imputed_edge_weights = scaler.inverse_transform(missing_edge_weights.cpu().numpy().reshape(-1, 1))

# Insert imputed values back into the gene-cell matrix
# One fancy-indexed assignment replaces the per-entry `.at` loop, which is very slow
# when most of the matrix is missing. Row 0 of missing_edge_index holds gene positions,
# row 1 holds cell positions.
_gene_positions = missing_edge_index[0].cpu().numpy()
_cell_positions = missing_edge_index[1].cpu().numpy()
_filled_values = gene_cell_matrix.to_numpy(dtype="float64", copy=True)
_filled_values[_gene_positions, _cell_positions] = imputed_edge_weights[:, 0]  # Access the first column
gene_cell_matrix = pd.DataFrame(
    _filled_values,
    index=gene_cell_matrix.index,
    columns=gene_cell_matrix.columns,
)

In [None]:
# Insert predicted values into the gene-cell matrix
# The model is trained on MinMax-scaled weights, so its raw outputs are in [0, 1]-scaled
# units. Writing them directly (as this cell used to) overwrote the correctly rescaled
# imputations and mixed two scales in one matrix. Map them back to the original
# expression scale before writing.
_rescaled_predictions = scaler.inverse_transform(
    missing_edge_weights.cpu().numpy().reshape(-1, 1)
)[:, 0]
_gene_positions = missing_edge_index[0].cpu().numpy()
_cell_positions = missing_edge_index[1].cpu().numpy()
_filled_values = gene_cell_matrix.to_numpy(dtype="float64", copy=True)
_filled_values[_gene_positions, _cell_positions] = _rescaled_predictions
gene_cell_matrix = pd.DataFrame(
    _filled_values,
    index=gene_cell_matrix.index,
    columns=gene_cell_matrix.columns,
)

In [None]:
# Display the expression matrix after the cells above have overwritten its zero/NaN entries.
gene_cell_matrix

Unnamed: 0,F01172_TGTTCCGCAAGCGCTC,F01172_TTCGGTCTCCCAAGTA,F01173_TGGCCAGGTCTCATCC,F01214_ACTTACTTCAGCTTAG,F01214_CATGGCGGTTACAGAA,F01214_CGGTTAAGTTTAGCTG,F01214_GTAGGCCAGATCACGG,F01214_GTCGTAAGTAGATTAG,F01214_TGCCCATGTGTGACCC,F01302_AAAGTAGTCGGACAAG,...,F01379_GTCTTCGAGGGCTTCC,F01380_ATTGGTGCAATCTGCA,ILD53_CTGGTCTAGGCCATAG,ILD59-2_ACGTCAATCGTCTGAA,ILD59-2_ATAGACCCACCACCAG,ILD59-2_GACACGCCATGCATGT,ILD59-2_GACTGCGAGTTATCGC,ILD59-2_GTCACGGCAGGGTACA,ILD59-2_TTTGCGCAGGCGTACA,ILD61-2_AAGCCGCGTAGGGACT
NOC2L,0.132207,0.128700,0.134280,0.130609,0.124514,0.147926,0.118369,0.120092,0.979477,0.130139,...,0.120348,1.113017,0.127541,0.946255,0.129790,0.123947,0.131010,0.131952,0.135419,0.131708
HES4,0.128828,0.136269,0.140693,0.136876,0.121971,0.148905,0.128850,0.123038,0.129690,1.203926,...,0.126060,0.124526,0.122171,0.946255,0.138738,0.126670,1.543990,0.138108,0.127655,1.412584
ISG15,0.721506,0.139763,0.155502,0.131723,0.134529,0.167998,0.141503,0.136822,0.135779,0.140998,...,0.136323,0.131617,0.147290,0.134778,0.867131,0.691402,0.145537,0.135058,0.910451,0.135473
AGRN,0.141832,0.125777,0.143240,0.130732,0.141806,0.159812,0.148623,0.852358,1.464673,0.130021,...,0.136936,0.134507,0.141954,0.128898,0.144517,0.123337,0.132777,0.141687,0.140293,0.139759
TNFRSF18,0.140232,0.153170,0.159076,0.144289,0.134745,0.159970,0.139270,0.140935,0.135894,0.152133,...,0.134679,0.140745,0.144519,0.131350,0.144487,0.134632,0.150834,0.142395,0.149887,0.150068
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MT-ND4L,3.768180,4.205678,3.384758,3.093911,3.138752,2.186628,2.364173,3.844511,2.660575,0.140527,...,3.655191,3.239554,1.016559,3.266397,3.445584,3.292473,3.926394,3.923366,3.374970,3.862798
MT-ND4,3.144502,4.091989,2.650340,2.641848,3.925312,2.069012,2.692234,3.114525,3.256016,4.001189,...,3.114782,2.728159,1.839121,2.720268,2.596794,2.561725,3.416609,1.459706,2.935202,3.467782
MT-ND5,3.097864,3.219836,2.794350,1.611842,3.138752,0.687201,1.322536,2.987317,2.035000,1.734546,...,2.205325,2.584840,0.123954,2.183826,2.066908,1.788849,2.866454,1.459706,1.937557,3.124384
MT-ND6,0.138770,0.789003,0.143816,0.132404,0.144787,0.159284,0.865613,2.841535,0.144649,0.142182,...,0.134272,2.098306,0.143172,0.139087,0.145965,1.096284,1.044375,0.133808,0.910451,0.147786


In [None]:
# Write the imputed matrix to a CSV file; the path is relative to the working directory.
gene_cell_matrix.to_csv("GSE135893_filtered_hgt_imputed.csv")

In [None]:
# Reload the input matrix into a separate variable for comparison, because
# `gene_cell_matrix` itself was modified by the imputation cells above.
# NOTE(review): this path is relative to the working directory, while the first cell read
# the same file name from /content — confirm both resolve to the same file.
gene_cell_matrix_original = pd.read_csv("GSE135893_filtered_5000.csv", index_col=0)

In [None]:
# Display the reloaded (pre-imputation) matrix for comparison with the imputed one above.
gene_cell_matrix_original

Unnamed: 0,F01172_TGTTCCGCAAGCGCTC,F01172_TTCGGTCTCCCAAGTA,F01173_TGGCCAGGTCTCATCC,F01214_ACTTACTTCAGCTTAG,F01214_CATGGCGGTTACAGAA,F01214_CGGTTAAGTTTAGCTG,F01214_GTAGGCCAGATCACGG,F01214_GTCGTAAGTAGATTAG,F01214_TGCCCATGTGTGACCC,F01302_AAAGTAGTCGGACAAG,...,F01379_GTCTTCGAGGGCTTCC,F01380_ATTGGTGCAATCTGCA,ILD53_CTGGTCTAGGCCATAG,ILD59-2_ACGTCAATCGTCTGAA,ILD59-2_ATAGACCCACCACCAG,ILD59-2_GACACGCCATGCATGT,ILD59-2_GACTGCGAGTTATCGC,ILD59-2_GTCACGGCAGGGTACA,ILD59-2_TTTGCGCAGGCGTACA,ILD61-2_AAGCCGCGTAGGGACT
NOC2L,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.979477,0.000000,...,0.000000,1.113017,0.000000,0.946255,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
HES4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.203926,...,0.000000,0.000000,0.000000,0.946255,0.000000,0.000000,1.543990,0.000000,0.000000,1.412584
ISG15,0.721506,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.867131,0.691402,0.000000,0.000000,0.910451,0.000000
AGRN,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.852358,1.464673,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
TNFRSF18,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MT-ND4L,3.768180,4.205678,3.384758,3.093911,3.138752,2.186628,2.364173,3.844511,2.660575,0.000000,...,3.655191,3.239554,1.016559,3.266397,3.445584,3.292473,3.926394,3.923366,3.374970,3.862798
MT-ND4,3.144502,4.091989,2.650340,2.641848,3.925312,2.069012,2.692234,3.114525,3.256016,4.001189,...,3.114782,2.728159,1.839121,2.720268,2.596794,2.561725,3.416609,1.459706,2.935202,3.467782
MT-ND5,3.097864,3.219836,2.794350,1.611842,3.138752,0.687201,1.322536,2.987317,2.035000,1.734546,...,2.205325,2.584840,0.000000,2.183826,2.066908,1.788849,2.866454,1.459706,1.937557,3.124384
MT-ND6,0.000000,0.789003,0.000000,0.000000,0.000000,0.000000,0.865613,2.841535,0.000000,0.000000,...,0.000000,2.098306,0.000000,0.000000,0.000000,1.096284,1.044375,0.000000,0.910451,0.000000
