# Homework week5

**author:** Mehmet Can Ay <br>
2023-11-23

In [1]:
## Uncomment this if needed (%pip installs into the environment of the running kernel)
#%pip install -r requirements.txt

## Import

In [2]:
import os
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

## Getting the Data

### Pathway Table

In [3]:
# Scrape every HTML table found on the WikiPathways browse page and keep the first one.
list_of_table: list[pd.DataFrame] = pd.read_html("https://www.wikipathways.org/browse/table.html")
pathways: pd.DataFrame = list_of_table[0]

# Human-readable column titles, listed in the positional order of the scraped columns.
column_titles: list[str] = [
    "Pathway Title",
    "ID",
    "Organism",
    "Last Edited",
    "Communities",
    "Pathway Terms",
    "Disease Terms",
    "Cell Types",
]

# Map the positional labels 0..7 onto the titles above.
pathways.rename(columns=dict(enumerate(column_titles)), inplace=True)

### Pathway Components for Homo Sapiens

The Homo sapiens database was downloaded from the WikiPathways [Downloads](https://data.wikipathways.org/current/gpml/) section. Unfortunately, the files do not follow the conventions of a regular .xml file, and therefore reading them is troublesome.

In [4]:
# Build one dataframe per pathway file and combine them into a single table.

# An empty list to hold dataframes
dfs: list[pd.DataFrame] = []

# A path to database folder
xml_folder_path: str = "./data/wikipathways/"

# os.listdir returns entries in arbitrary order and also lists sub-directories and
# hidden entries (e.g. ".ipynb_checkpoints", ".DS_Store"). Keep only visible regular
# files and sort them, so that parsing does not fail on a non-file entry and the row
# order (and therefore any later "first N rows" selection) is reproducible.
xml_files: list[str] = sorted(
    file
    for file in os.listdir(xml_folder_path)
    if not file.startswith(".") and os.path.isfile(os.path.join(xml_folder_path, file))
)

# Fail early with a clear message instead of pd.concat's "No objects to concatenate".
if not xml_files:
    raise FileNotFoundError(f"No pathway files found in {xml_folder_path!r}")

# Creating a dataframe from each file and appending it to the list of dfs
for xml in xml_files:
    path: str = os.path.join(xml_folder_path, xml)
    df: pd.DataFrame = pd.read_xml(path, namespaces={"doc": "http://pathvisio.org/GPML/2013a"})
    dfs.append(df)

# Concatenating all dfs in the list; ignore_index renumbers the rows 0..n-1
homo_sapiens: pd.DataFrame = pd.concat(dfs, ignore_index=True)

# Replacing whitespace-only artifacts with NaN.
homo_sapiens = homo_sapiens.replace({"\n      ": np.nan})

## Exporting as .csv file

In [5]:
# Persist the pathway table to disk as a .csv file, leaving the row index out of the file
pathways.to_csv(path_or_buf="./data/pathways.csv", index=False)

In [6]:
# Keep only the leading 10000 rows as a sample of the component table
homo_sapiens = homo_sapiens.head(10000)

# Write the sample to a .csv file without the row index
homo_sapiens.to_csv(path_or_buf="./data/homo_sapiens.csv", index=False)

# Drop both in-memory tables now that they have been exported
del pathways, homo_sapiens

## Loading the Data with Pandas

In [7]:
# Load the pathway table back from the exported .csv file
pathways: pd.DataFrame = pd.read_csv(filepath_or_buffer="./data/pathways.csv")

In [8]:
# Load the sampled component table and clean it in one chain:
#   1. read the exported .csv file,
#   2. drop every column that holds nothing but NaN, to simplify the dataframe,
#   3. strip '\n' from every string cell (non-string cells pass through unchanged).
homo_sapiens: pd.DataFrame = (
    pd.read_csv("./data/homo_sapiens.csv")
    .dropna(axis="columns", how="all")
    .map(lambda cell: cell.replace('\n', '') if isinstance(cell, str) else cell)
)

## Saving as SQL Database

In [9]:
# Creating (or opening) the SQLite database file that stores the pathway table
engine = create_engine("sqlite:///data/pathways.db", echo=False)

# Writing to the created SQL database.
# if_exists="replace" drops and rewrites the table when it is already present; the
# default ("fail") raises ValueError as soon as this cell is run a second time,
# which breaks "Restart Kernel -> Run All" once the .db file exists.
pathways.to_sql("pathways", con=engine, index=False, if_exists="replace")

1922

In [10]:
# Creating (or opening) the SQLite database file that stores the component table
engine = create_engine("sqlite:///data/homo_sapiens.db", echo=False)

# Writing to the created SQL database.
# if_exists="replace" drops and rewrites the table when it is already present; the
# default ("fail") raises ValueError as soon as this cell is run a second time,
# which breaks "Restart Kernel -> Run All" once the .db file exists.
homo_sapiens.to_sql("homo_sapiens", con=engine, index=False, if_exists="replace")

# Both tables now live in their SQLite files; drop the in-memory copies
del pathways
del homo_sapiens

## Opening the Database with SQL

In [11]:
%%capture
# %%capture (which must stay on the first line of the cell) hides this cell's output.
# Load the IPython extension named "sql", which provides the %sql / %%sql magics.
%load_ext sql
# Connect to the SQLite file that holds the pathways table.
%sql sqlite:///data/pathways.db

In [12]:
%%sql
SELECT *
FROM pathways
WHERE Organism = 'Homo sapiens'
LIMIT 10

 * sqlite:///data/pathways.db
Done.


Pathway Title,ID,Organism,Last Edited,Communities,Pathway Terms,Disease Terms,Cell Types
10q11.21q11.23 copy number variation syndrome,WP5352,Homo sapiens,04 Aug 2023,"Diseases, RareDiseases",disease pathway,,
10q22q23 copy number variation,WP5402,Homo sapiens,18 Aug 2023,,disease pathway,"chromosomal duplication syndrome, chromosomal deletion syndrome, genetic disease",
11p11.2 copy number variation syndrome,WP5348,Homo sapiens,05 Aug 2023,"Diseases, RareDiseases",disease pathway,,
13q12 or CRYL1 copy number variation,WP5405,Homo sapiens,07 Aug 2023,,disease pathway,"chromosomal duplication syndrome, chromosomal deletion syndrome, genetic disease",
13q12.12 copy number variation,WP5406,Homo sapiens,08 Aug 2023,,disease pathway,"chromosomal deletion syndrome, chromosomal duplication syndrome, genetic disease",
15q11.2 copy number variation syndrome,WP4940,Homo sapiens,18 Jan 2023,RareDiseases,disease pathway,"genetic disease, chromosome 15q11.2 deletion syndrome",
15q11q13 copy number variation,WP5407,Homo sapiens,10 Aug 2023,,disease pathway,"chromosomal deletion syndrome, chromosomal duplication syndrome, genetic disease",
15q13.3 copy number variation syndrome,WP4942,Homo sapiens,12 Mar 2021,RareDiseases,disease pathway,"chromosome 15q13.3 microdeletion syndrome, genetic disease",
15q25 copy number variation,WP5408,Homo sapiens,15 Aug 2023,,disease pathway,"chromosome 15q25 deletion syndrome, genetic disease",
16p11.2 distal deletion syndrome,WP4950,Homo sapiens,18 Jan 2023,RareDiseases,disease pathway,"chromosome 16p11.2 deletion syndrome, genetic disease",


In [13]:
# Connect to the SQLite file that holds the homo_sapiens table; the following %%sql cell runs against this connection.
%sql sqlite:///data/homo_sapiens.db

Because the .xml files do not follow the conventions of XML, the information is incomplete. Here, the filtering should have been based on the Version column (not shown in the table) rather than on the Comment column.

In [14]:
%%sql
SELECT *
FROM homo_sapiens
WHERE Comment LIKE '10q11.21q11.23%'
LIMIT 10

 * sqlite:///data/homo_sapiens.db
   sqlite:///data/pathways.db
Done.


Source,Comment,BoardWidth,BoardHeight,TextLabel,Type,GraphId,GroupRef,BiopaxRef,GroupId,Style,CenterX,CenterY,GraphRef,Href
WikiPathways-description,"10q11.21q11.23 copy number variation (CNV) syndrome is a rare genetic disorder caused by a deletion or duplication of genetic material on chromosome 10. The exact genetic location chr10:49,390,199-51,058,796 (GRCh37) was taken from Kirov et al. 2014 and literature cited there.",,,,,,,,,,,,,


## How the data should have looked

![How the table should have looked 1](./images/correct_table_1.png)

![How the table should have looked 2](./images/correct_table_2.png)