In [1]:
import io
import requests
import gzip
import shutil
import pandas as pd
import numpy as np

In [2]:
# URL of the file you want to download
url = 'https://stringdb-downloads.org/download/protein.links.v12.0/9606.protein.links.v12.0.txt.gz'

# Send a GET request to the URL
response = requests.get(url)

# Save the content to a file
with open('../data/netics_inputs/raw/protein_info.v12.txt.gz', 'wb') as file:
    file.write(response.content)

print("File downloaded successfully")

File downloaded successfully


In [3]:
# Path to the .gz file
gz_file_path = '../data/netics_inputs/raw/protein_info.v12.txt.gz'

# Path to extract the contents
output_file_path = '../data/netics_inputs/raw/protein_info.v12.txt'

# Open the .gz file and extract its contents
with gzip.open(gz_file_path, 'rb') as f_in:
    with open(output_file_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

print("File unzipped successfully")

File unzipped successfully


#### Construct adjacency matrix

In [4]:
# Load interaction data
interaction_data = pd.read_csv('../data/netics_inputs/raw/protein_info.v12.txt', sep=' ')

In [5]:
interaction_data.head()

Unnamed: 0,protein1,protein2,combined_score
0,9606.ENSP00000000233,9606.ENSP00000356607,173
1,9606.ENSP00000000233,9606.ENSP00000427567,154
2,9606.ENSP00000000233,9606.ENSP00000253413,151
3,9606.ENSP00000000233,9606.ENSP00000493357,471
4,9606.ENSP00000000233,9606.ENSP00000324127,201


In [6]:
interaction_data.protein1.nunique()

19622

In [7]:
interaction_data.shape

(13715404, 3)

In [8]:
# Get unique proteins/genes
proteins = list(set(interaction_data['protein1']).union(set(interaction_data['protein2'])))

# Initialize adjacency matrix
adj_matrix = np.zeros((len(proteins), len(proteins)))

# Create a mapping from protein name to index
protein_to_index = {protein: idx for idx, protein in enumerate(proteins)}

# Set a maximum score for normalization (e.g., 1000)
max_score = interaction_data['combined_score'].max()

# Populate the adjacency matrix with normalized scores
for _, row in interaction_data.iterrows():
    normalized_score = row['combined_score'] / max_score
    i = protein_to_index[row['protein1']]
    j = protein_to_index[row['protein2']]
    adj_matrix[i, j] = normalized_score
    adj_matrix[j, i] = normalized_score  # For undirected networks

# Convert to DataFrame for better visualization
adj_matrix_df = pd.DataFrame(adj_matrix, index=proteins, columns=proteins)

In [9]:
adj_matrix_df.shape

(19622, 19622)

In [10]:
adj_matrix_df.head()

Unnamed: 0,9606.ENSP00000292609,9606.ENSP00000428968,9606.ENSP00000370546,9606.ENSP00000497550,9606.ENSP00000475344,9606.ENSP00000315569,9606.ENSP00000358022,9606.ENSP00000448012,9606.ENSP00000359581,9606.ENSP00000372313,...,9606.ENSP00000297591,9606.ENSP00000426909,9606.ENSP00000301061,9606.ENSP00000271332,9606.ENSP00000301891,9606.ENSP00000336762,9606.ENSP00000357336,9606.ENSP00000361700,9606.ENSP00000367747,9606.ENSP00000304071
9606.ENSP00000292609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9606.ENSP00000428968,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9606.ENSP00000370546,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9606.ENSP00000497550,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.518519,0.0,0.255255,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9606.ENSP00000475344,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Create a StringIO object to capture CSV output in memory
csv_buffer = io.StringIO()

# Export DataFrame to CSV in memory
adj_matrix_df.to_csv(csv_buffer, index=False)

# Get the size of the CSV output in bytes
csv_size = csv_buffer.getvalue().encode('utf-8')

print(f'The size of the DataFrame as a CSV file is {len(csv_size)} bytes.')

The size of the DataFrame as a CSV file is 1754110918 bytes.


In [12]:
adj_matrix_df.to_csv('../data/netics_inputs/processed/adjacency_matrix_headed.csv', sep='\t', index=False)

In [13]:
adj_matrix_df.to_csv('../data/netics_inputs/processed/adjacency_matrix.csv', sep='\t', index=False, header=False)