In [1]:
## Install the required packages if needed (uncomment the line below)
# %pip install -r requirements.txt

Collecting pymysql (from -r requirements.txt (line 1))Note: you may need to restart the kernel to use updated packages.

  Downloading PyMySQL-1.1.0-py3-none-any.whl.metadata (4.4 kB)
Collecting cryptography (from -r requirements.txt (line 3))
  Downloading cryptography-41.0.7-cp37-abi3-win_amd64.whl.metadata (5.3 kB)
Collecting neo4j (from -r requirements.txt (line 5))
  Downloading neo4j-5.15.0.tar.gz (196 kB)
     ---------------------------------------- 0.0/196.5 kB ? eta -:--:--
     -- ------------------------------------- 10.2/196.5 kB ? eta -:--:--
     --------------------------------- ---- 174.1/196.5 kB 2.1 MB/s eta 0:00:01
     -------------------------------------- 196.5/196.5 kB 2.0 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Installing backend dependencies: started
  Installing backe

In [2]:
import datetime
import json
import os
from urllib.parse import quote_plus

import neo4j
import numpy as np
import pandas as pd
import pymysql
from sqlalchemy import create_engine
from tqdm import tqdm

# Print the current local date and time so the saved output shows when this
# cell was executed.
print(datetime.datetime.now())

2023-12-15 16:18:54.256836


# Test the containers from podman-compose

First install the following tools:
1. [Podman](https://podman.io/docs/installation)
2. [Podman Desktop](https://podman-desktop.io/)
3. podman-compose (`pip install podman-compose`)

Change the directory in the terminal to the folder where you have copied the [docker-compose.yaml](docker-compose.yaml) file and start the containers:

`podman-compose up -d`

Check that the containers show up in Podman Desktop, then run this Jupyter notebook to test whether all containers work as expected.

When you are finished, stop and remove the containers with:

`podman-compose down`

# MySQL

In [3]:
# Connection settings for the MySQL container. Each value can be overridden
# by an environment variable of the same name, so credentials do not have to
# be edited into the notebook; the second argument is the local default.
MYSQL_USER = os.environ.get("MYSQL_USER", "root")
MYSQL_HOST = os.environ.get("MYSQL_HOST", "127.0.0.1")
# Environment values are always strings, so the port is converted explicitly.
MYSQL_PORT = int(os.environ.get("MYSQL_PORT", "3307"))
MYSQL_ROOT_PASSWORD = os.environ.get("MYSQL_ROOT_PASSWORD", "root_passwd")
MYSQL_DATABASE = os.environ.get("MYSQL_DATABASE", "plab2_db")

Create the database connection and a cursor.

In [4]:
# Collect the connection arguments first, then open a single connection.
# `autocommit=True` makes every statement take effect without an explicit
# commit.
mysql_settings = dict(
    host=MYSQL_HOST,
    port=MYSQL_PORT,
    user=MYSQL_USER,
    password=MYSQL_ROOT_PASSWORD,
    database=MYSQL_DATABASE,
    autocommit=True,
)
conn = pymysql.connect(**mysql_settings)

# Cursor shared by the raw-SQL cells of this notebook.
cursor = conn.cursor()

Create the SQLAlchemy engine.

In [5]:
# Build the SQLAlchemy URL for the pymysql driver. User and password are
# percent-encoded so that characters such as "@", ":" or "/" inside the
# credentials cannot be mistaken for URL delimiters.
connection_url = (
    f"mysql+pymysql://{quote_plus(MYSQL_USER)}:{quote_plus(MYSQL_ROOT_PASSWORD)}"
    f"@{MYSQL_HOST}:{MYSQL_PORT}/{MYSQL_DATABASE}"
)
engine = create_engine(connection_url)

In [6]:
def drop_table_if_exists(table: str):
    """Drop a table from the MySQL database if it exists.

    The statement is executed on the module-level ``cursor``. Identifiers
    cannot be bound as query parameters, so the name is placed in the
    statement as a backtick-quoted identifier; backticks inside the name are
    doubled so the name cannot terminate the quoting early.

    Args:
        table (str): Name of table.
    """
    quoted_name = "`" + table.replace("`", "``") + "`"
    cursor.execute(f"DROP TABLE IF EXISTS {quoted_name}")

## Create tables

[MySQL reference](https://dev.mysql.com/doc/refman/8.0/en/create-table.html)

In [7]:
# Drop `interaction` before `protein`: the `interaction` table is created
# with foreign keys that reference `protein`.
for table_name in ("interaction", "protein"):
    drop_table_if_exists(table=table_name)

In [8]:
# Create the `protein` table. `id` is the primary key (unique, NOT NULL)
# rather than a plain non-unique index: the `interaction` table declares
# foreign keys that reference `protein`.`id`, and a foreign key should point
# at a unique key (newer MySQL versions reject non-unique targets by default).
sql = """CREATE TABLE IF NOT EXISTS `protein` (
  `id` bigint NOT NULL,
  `accession` text,
  `name` text,
  `taxid` bigint,
  PRIMARY KEY (`id`)
)"""
cursor.execute(sql)
conn.commit()

In [9]:
# Create the `interaction` table. `protein_a_id` and `protein_b_id` are both
# foreign keys into `protein`.`id`, so the `protein` table has to exist
# before this statement runs.
sql = """CREATE TABLE IF NOT EXISTS `interaction` (
  `id` bigint,
  `confidence_value` double DEFAULT NULL,
  `detection_method` VARCHAR(255),
  `interaction_type` VARCHAR(255),
  `pmid` double DEFAULT NULL,
  `protein_a_id` bigint,
  `protein_b_id` bigint,
  KEY `ix_interaction_id` (`id`),
  FOREIGN KEY (`protein_a_id`) REFERENCES protein(`id`),
  FOREIGN KEY (`protein_b_id`) REFERENCES protein(`id`)
)"""
# Last expression of the cell, so its return value is shown as cell output.
cursor.execute(sql)

0

## Read the data from file and write into database

### Proteins

In [10]:
# Column names are passed explicitly through `names`, so pandas treats every
# line of the file as data; the `id` column becomes the index.
protein_columns = ["id", "accession", "name", "taxid"]
df_protein: pd.DataFrame = pd.read_csv(
    "./data/protein.tsv.zip", sep="\t", names=protein_columns, index_col="id"
)
df_protein.head()

Unnamed: 0_level_0,accession,name,taxid
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,A0A0B4J2F0,Protein PIGBOS1,9606
2,A0A0H3JRU9,Pyruvate carboxylase,158878
3,A0A0H3K9R3,RuBisCO chaperone RbcX,269084
4,A0A0U1RRE5,Negative regulator of P-body association,9606
5,A0A1L8F5J9,"Glutamate receptor ionotropic, NMDA 1",8355


In [11]:
# Append the rows to the existing `protein` table; the DataFrame index (`id`)
# is written as a regular column.
df_protein.to_sql("protein", con=engine, if_exists="append", index=True)

13438

### Interactions

In [12]:
# Column names are passed explicitly through `names`, so pandas treats every
# line of the file as data; the `id` column becomes the index.
interaction_columns = [
    "id",
    "confidence_value",
    "detection_method",
    "interaction_type",
    "pmid",
    "protein_a_id",
    "protein_b_id",
]
df_interaction: pd.DataFrame = pd.read_csv(
    "./data/interaction.tsv.zip",
    sep="\t",
    names=interaction_columns,
    index_col="id",
)
df_interaction.head()

Unnamed: 0_level_0,confidence_value,detection_method,interaction_type,pmid,protein_a_id,protein_b_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.63,anti tag coimmunoprecipitation,physical association,31653868.0,1,10919
2,0.86,3D electron microscopy,direct interaction,24882745.0,2,2
3,0.6,cross-linking study,direct interaction,21765418.0,3,1422
4,0.64,anti tag coimmunoprecipitation,association,27918561.0,4,8998
5,0.61,pull down,physical association,25008524.0,5,67


In [13]:
# Append the rows to the existing `interaction` table; the DataFrame index
# (`id`) is written as a regular column.
df_interaction.to_sql("interaction", con=engine, if_exists="append", index=True)

39999

# Neo4j

In [14]:
# Connection settings for the Neo4j container. Each value can be overridden
# by an environment variable of the same name, so credentials do not have to
# be edited into the notebook; the second argument is the local default.
NEO_PASSWORD = os.environ.get("NEO_PASSWORD", "neo4j_passwd")
NEO_URI = os.environ.get("NEO_URI", "bolt://localhost:7687")
NEO_USER = os.environ.get("NEO_USER", "neo4j")
NEO_DB = os.environ.get("NEO_DB", "neo4j")

In [15]:
def get_cypher_props(props: dict) -> str:
    """Convert a dictionary into a Cypher property-map literal.

    Values are serialised with ``json.dumps``. NumPy scalars are converted to
    their native Python equivalents first. An entry is left out when its value
    is an empty string, an empty list, a NaN float, or of any type other than
    ``str``, ``bool``, ``int``, ``float`` or ``list`` (this includes ``None``).
    Numeric zero and ``False`` are valid property values and are kept.

    Args:
        props (dict): Property names mapped to their values. May be empty or
            ``None``.

    Returns:
        str: A literal of the form ``{`key`: value, ...}``, or an empty string
        when there is nothing to emit.
    """
    if not props:
        return ""

    props_array: list[str] = []
    for k, v in props.items():
        # NumPy scalars (e.g. np.int64) are not instances of int and are not
        # JSON serialisable, so unwrap them before the type checks.
        if isinstance(v, np.generic):
            v = v.item()

        if isinstance(v, (str, list)):
            keep = bool(v)  # skip empty strings and empty lists
        elif isinstance(v, float):
            keep = not np.isnan(v)  # skip missing values
        else:
            # bool is a subclass of int; 0 and False are real values, so the
            # decision is made on the type and not on truthiness.
            keep = isinstance(v, int)

        if keep:
            # A backtick inside a backtick-quoted name is escaped by doubling.
            key = str(k).replace("`", "``")
            props_array.append(f"`{key}`: " + json.dumps(v))

    if not props_array:
        return ""
    return "{" + ", ".join(props_array) + "}"

In [16]:
# Create the driver for the configured URI, user and database, then open the
# session that the following cells use to run their Cypher statements.
neo_auth = (NEO_USER, NEO_PASSWORD)
neo_driver: neo4j.Driver = neo4j.GraphDatabase.driver(
    NEO_URI, auth=neo_auth, database=NEO_DB
)
neo_session: neo4j.Session = neo_driver.session()

In [17]:
# Start from an empty graph: delete every node together with its relationships.
delete_all_query = "MATCH (n) DETACH DELETE n"
neo_session.run(delete_all_query)

ServiceUnavailable: Couldn't connect to localhost:7687 (resolved to ()):
Failed to establish connection to ResolvedIPv6Address(('::1', 7687, 0, 0)) (reason [WinError 10061] No connection could be made because the target machine actively refused it)
Failed to establish connection to ResolvedIPv4Address(('127.0.0.1', 7687)) (reason [WinError 10061] No connection could be made because the target machine actively refused it)

In [None]:
# Add nodes: build one `(:Protein {...})` pattern per row of the `protein`
# table and create all of them with a single CREATE statement.
df: pd.DataFrame = pd.read_sql_table(table_name="protein", con=engine)
cypher_nodes: list[str] = [
    "(:Protein " + get_cypher_props(props=dict(row)) + ")"
    for _, row in tqdm(df.iterrows(), total=df.shape[0])
]

neo_session.run("CREATE " + ",".join(cypher_nodes))

<neo4j._sync.work.result.Result at 0x7f3a3457d8d0>

In [None]:
# Add relationships: one INTERACTS_WITH relationship per row of the
# `interaction` table, between the two Protein nodes whose `id` matches
# `protein_a_id` and `protein_b_id`.
df: pd.DataFrame = pd.read_sql_table(table_name="interaction", con=engine)
for _, props in tqdm(df.iterrows(), total=df.shape[0]):
    # Relationship properties are rendered as a map literal by the helper.
    cypher_props: str = get_cypher_props(props=dict(props))
    # The endpoint ids are passed as query parameters instead of being
    # formatted into the query text.
    cypher = f"""MATCH
        (protein_a:Protein {{id: $protein_a_id}}),
        (protein_b:Protein {{id: $protein_b_id}})
        MERGE (protein_a)-[r:INTERACTS_WITH {cypher_props}]->(protein_b)"""
    # consume() finishes the statement here, so a failure is raised for the
    # row that caused it rather than on a later call.
    neo_session.run(
        cypher,
        parameters={
            "protein_a_id": int(props.protein_a_id),
            "protein_b_id": int(props.protein_b_id),
        },
    ).consume()

After all cells have run, open http://localhost:7474 in your browser and log in with user `neo4j` and password `neo4j_passwd`.