In [1]:
import pandas as pd

# Load the tab-separated data file provided by the user.
file_path = 'uniprotkb_AND_model_organism_9606_2025_02_07.tsv'
data = pd.read_csv(file_path, sep='\t')

# DataFrame.info() prints its summary and returns None, so it is called for
# its printed output only; capturing the return value would just store None
# and make the cell display a "(None, <frame>)" tuple.
data.info()

# End the cell on the preview itself so it is rendered with the rich
# DataFrame display instead of a plain-text tuple repr.
data_head = data.head()
data_head


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205003 entries, 0 to 205002
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   Entry          205003 non-null  object
 1   Entry Name     205003 non-null  object
 2   Protein names  205003 non-null  object
 3   Gene Names     165804 non-null  object
 4   Organism       205003 non-null  object
 5   Sequence       205003 non-null  object
 6   EC number      22557 non-null   object
 7   InterPro       181534 non-null  object
dtypes: object(8)
memory usage: 12.5+ MB


(None,
         Entry        Entry Name   
 0  A0A024R1X5  A0A024R1X5_HUMAN  \
 1  A0A024R274  A0A024R274_HUMAN   
 2  A0A024R324  A0A024R324_HUMAN   
 3  A0A024R6A3  A0A024R6A3_HUMAN   
 4  A0A024R7I7  A0A024R7I7_HUMAN   
 
                                        Protein names       Gene Names   
 0                                           Beclin-1  BECN1 hCG_16958  \
 1  Mothers against decapentaplegic homolog (MAD h...            SMAD4   
 2  Epididymis secretory sperm binding protein (Ra...   RHOA hCG_20136   
 3                           Presenilin (EC 3.4.23.-)            PSEN1   
 4                          Ras-related protein Rab-3              NaN   
 
                Organism                                           Sequence   
 0  Homo sapiens (Human)  MEGSKTSNNSTMQVSFVCQRCSQPLKLDTSFKILDRVTIQELTAPL...  \
 1  Homo sapiens (Human)  MDNMSITNTPTSNDACLSIVHSLMCHRQGGESETFAKRAIESLVKK...   
 2  Homo sapiens (Human)  MAAIRKKLVIVGDGACGKTCLLIVFSKDQFPEVYVPTVFENYVADI...   
 3  Homo sapi

In [2]:
# Data preprocessing

# Keep only the records that carry InterPro domain annotations, and turn the
# ';'-delimited InterPro string into a Python list for later analysis.
# (.assign returns a new frame, so `data` itself is left untouched.)
protein_data_clean = (
    data.loc[data['InterPro'].notna()]
    .assign(InterPro_list=lambda frame: frame['InterPro'].str.strip(';').str.split(';'))
)

# Keep only the columns needed downstream: identifiers, protein and gene
# names, EC number and the parsed domain list.
final_columns = [
    'Entry',
    'Entry Name',
    'Protein names',
    'Gene Names',
    'EC number',
    'InterPro_list',
]
protein_data_final = protein_data_clean.loc[:, final_columns]

# Preview the processed data
protein_data_final.head()


Unnamed: 0,Entry,Entry Name,Protein names,Gene Names,EC number,InterPro_list
0,A0A024R1X5,A0A024R1X5_HUMAN,Beclin-1,BECN1 hCG_16958,,"[IPR007243, IPR038274, IPR041691, IPR040455, I..."
1,A0A024R274,A0A024R274_HUMAN,Mothers against decapentaplegic homolog (MAD h...,SMAD4,,"[IPR013790, IPR003619, IPR013019, IPR017855, I..."
2,A0A024R324,A0A024R324_HUMAN,Epididymis secretory sperm binding protein (Ra...,RHOA hCG_20136,,"[IPR027417, IPR005225, IPR001806, IPR003578]"
3,A0A024R6A3,A0A024R6A3_HUMAN,Presenilin (EC 3.4.23.-),PSEN1,3.4.23.-,"[IPR002031, IPR001108, IPR006639, IPR042524]"
4,A0A024R7I7,A0A024R7I7_HUMAN,Ras-related protein Rab-3,,,"[IPR027417, IPR037872, IPR005225, IPR001806, I..."


In [None]:
import networkx as nx
from itertools import combinations
from tqdm import tqdm

# Work on a small sample (the first 1000 proteins) to validate the approach.
sample_data = protein_data_final.head(1000).reset_index(drop=True)

# Start from an empty undirected graph.
protein_graph = nx.Graph()

# Add one node per protein, keyed by its Entry; the remaining columns become
# node attributes and the InterPro list is stored as a set for fast overlap tests.
node_columns = ['Entry', 'Entry Name', 'Protein names', 'Gene Names', 'EC number', 'InterPro_list']
for entry, entry_name, protein_names, gene_names, ec_number, domain_list in zip(
    *(sample_data[column] for column in node_columns)
):
    protein_graph.add_node(
        entry,
        entry_name=entry_name,
        protein_names=protein_names,
        gene_names=gene_names,
        ec_number=ec_number,
        interpro_domains=set(domain_list),
    )

# Connect proteins whose InterPro domain sets have a Jaccard coefficient of at
# least this threshold (0.3 is used as an example value).
similarity_threshold = 0.3

# Number of unordered protein pairs, used only to size the progress bar.
n_proteins = len(sample_data)
pair_count = n_proteins * (n_proteins - 1) // 2

# Visit every unordered pair of proteins and score their domain overlap.
for u, v in tqdm(combinations(sample_data['Entry'], 2), total=pair_count):
    domains_u = protein_graph.nodes[u]['interpro_domains']
    domains_v = protein_graph.nodes[v]['interpro_domains']

    shared = len(domains_u & domains_v)
    combined = len(domains_u | domains_v)
    # Guard against an empty union to avoid dividing by zero.
    score = shared / combined if combined else 0

    if score < similarity_threshold:
        continue
    protein_graph.add_edge(u, v, weight=score)

# Brief summary statistics of the constructed graph
graph_info = {
    'Количество узлов': protein_graph.number_of_nodes(),
    'Количество рёбер': protein_graph.number_of_edges(),
}

graph_info


100%|██████████| 499500/499500 [00:00<00:00, 764926.20it/s]


{'Количество узлов': 1000, 'Количество рёбер': 3498}

In [7]:
import math
import os

import networkx as nx
from neo4j import GraphDatabase

# Neo4j connection settings.
# Values are taken from the environment (NEO4J_URI, NEO4J_USER,
# NEO4J_PASSWORD) so that real credentials never have to be written into the
# notebook. The fallbacks below are the previous local-development defaults,
# kept only so an existing local setup keeps working unchanged.
URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
AUTH = (
    os.environ.get("NEO4J_USER", "neo4j"),
    os.environ.get("NEO4J_PASSWORD", "12345678"),
)

driver = GraphDatabase.driver(URI, auth=AUTH)

# Loading the graph into Neo4j
def _none_if_nan(value):
    """Return None when ``value`` is a float NaN, otherwise return it unchanged."""
    if isinstance(value, float) and math.isnan(value):
        return None
    return value


def load_graph(tx, graph):
    """Write the nodes and edges of ``graph`` to Neo4j through ``tx``.

    Args:
        tx: Object exposing ``run(query, **parameters)``; here it is the
            transaction passed in by ``session.execute_write``.
        graph: Graph exposing ``nodes(data=True)`` and ``edges(data=True)``.
            Node attributes ``entry_name``, ``protein_names``, ``gene_names``
            and ``ec_number`` are optional; every edge must carry ``weight``.

    Raises:
        KeyError: If an edge has no ``weight`` attribute.
    """
    # Create / update nodes
    for node, attrs in graph.nodes(data=True):
        tx.run(
            "MERGE (p:Protein {entry: $entry}) "
            "SET p.entry_name = $entry_name, "
            "    p.protein_names = $protein_names, "
            "    p.gene_names = $gene_names, "
            "    p.ec_number = $ec_number",
            entry=node,
            # Missing attributes and float NaN placeholders are sent as None
            # (Cypher null) so no NaN-valued property is written for them.
            entry_name=_none_if_nan(attrs.get('entry_name')),
            protein_names=_none_if_nan(attrs.get('protein_names')),
            gene_names=_none_if_nan(attrs.get('gene_names')),
            ec_number=_none_if_nan(attrs.get('ec_number'))
        )

    # Create / update edges
    for u, v, attrs in graph.edges(data=True):
        tx.run(
            # MERGE on the endpoint pair only and assign the weight with SET:
            # including the weight in the MERGE pattern would add a second
            # relationship whenever the weight differs from a stored one.
            "MATCH (a:Protein {entry: $u}), (b:Protein {entry: $v}) "
            "MERGE (a)-[r:SIMILARITY]->(b) "
            "SET r.weight = $weight",
            u=u, v=v, weight=attrs['weight']
        )

# Load the graph into Neo4j inside a single write transaction.
# The try/finally guarantees the driver is closed even when the write raises,
# so a failed load does not leave the driver open.
try:
    with driver.session() as session:
        session.execute_write(load_graph, protein_graph)
finally:
    driver.close()


In [1]:
import st_link_analysis



ModuleNotFoundError: No module named 'st_link_analysis'