# Import dataset to the database

In [1]:
import os

import pandas as pd
from sqlalchemy import create_engine
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


Create a database connection:

In [2]:
# Build the SQLAlchemy engine that the to_sql calls in this notebook write through.
# The connection URL can be overridden with the SNPEDIA_DB_URL environment
# variable so that real credentials never have to be committed with the
# notebook; when the variable is unset or empty, the original local URL is used.
engine = create_engine(
    os.environ.get("SNPEDIA_DB_URL")
    or 'mysql+mysqlconnector://root:@localhost:3306/snpedia_db'
)

## SNPs

The SNP relation combines information from two files: snps.csv and rsnums.csv.
Load the data from these files into pandas DataFrames:

In [3]:
# Both files use their first column as the row index, which is what the
# index-based merges further down join on.
snps_df = pd.read_csv('dataset/snps.csv', index_col=0)
# NOTE(review): the recorded output of this cell shows a warning raised by the
# rsnums read, presumably a pandas DtypeWarning about mixed-type columns.
# low_memory=False makes pandas parse the file in one pass so each column gets
# a single consistent dtype; confirm and ideally replace with an explicit dtype=.
rsnums_df = pd.read_csv('dataset/rsnums.csv', index_col=0, low_memory=False)

  rsnums_df = pd.read_csv('dataset/rsnums.csv', index_col=0)


Merge these DataFrames into a single DataFrame:

In [4]:
# Left-join the rsnums columns onto the SNP rows, matching on the row index.
df = snps_df.merge(rsnums_df, how="left", left_index=True, right_index=True)

Drop unused columns, rename columns to match the database schema:

In [5]:
# Capitalised source columns are renamed to the lowercase names used below;
# the lowercase "gene" column is dropped first so the renamed "Gene" replaces it.
snp_renames = {
    "Description": "description",
    "Gene": "gene",
    "Chromosome": "chromosome",
    "Position": "position",
}
snp_columns = ["description", "chromosome", "gene", "position"]
df = (
    df
    .drop(columns=["gene"])
    .rename(columns=snp_renames)
    [snp_columns]
)

Write the DataFrame to the database:


In [7]:
# Append the rows to the SNP table; the DataFrame index is written as column "id".
df.to_sql(name="SNP", con=engine, if_exists="append", index=True, index_label="id")

111725

## Genotypes

Read genotypes from the file:

In [8]:
# Load the genotype file, using its first column as the row index.
df = pd.read_csv(filepath_or_buffer="dataset/genotypes.csv", index_col=0)

Inner join with SNPs to get rid of the records with unknown SNPs (for some reason, there are a few of them):

In [9]:
# Inner join on the row index: only genotype rows whose index also appears in
# snps_df are kept.
df = df.merge(snps_df, how="inner", left_index=True, right_index=True)

Copy the index to the column snp_id (foreign key):

In [10]:
# Expose the row index as a regular column named snp_id.
df = df.assign(snp_id=df.index)

Fix column names to match the database schema:

In [11]:
# Drop the lowercase "description" column, rename "Description" to take its
# place, then keep only the listed columns in this order.
genotype_columns = [
    "snp_id",
    "allele1",
    "allele2",
    "magnitude",
    "repute",
    "summary",
    "description",
]
df = (
    df
    .drop(columns=["description"])
    .rename(columns={"Description": "description"})
    [genotype_columns]
)

Remove rows with a missing snp_id or allele1 value (rows without allele1 are the ones with unknown genotypes):

In [12]:
# Keep only the rows where both snp_id and allele1 are present.
has_snp_id = df["snp_id"].notna()
has_allele1 = df["allele1"].notna()
df = df.loc[has_snp_id & has_allele1]

Lowercase the repute column to match the database schema:

In [13]:
# Replace the repute column with its lowercase form.
repute_lowercased = df["repute"].str.lower()
df["repute"] = repute_lowercased

Save the DataFrame to the database:

In [14]:
# Append the rows to the Genotype table; the DataFrame index is not written.
df.to_sql(name="Genotype", con=engine, if_exists="append", index=False)

104689

## Categories

In [23]:
# Load the categories file; no index column is designated for this one.
df = pd.read_csv(filepath_or_buffer="dataset/categories.csv")

In [29]:
# Collect the distinct non-null category names into a one-column DataFrame.
category_names = df["name"].dropna().unique()
categories_df = pd.DataFrame({"name": category_names})

In [30]:
# Number the categories 1..N. The index is rebuilt from scratch instead of
# being shifted with `+= 1`, so re-running this cell does not keep
# incrementing the ids; an index name that is already set is preserved.
categories_df.index = pd.RangeIndex(
    start=1, stop=len(categories_df) + 1, name=categories_df.index.name
)

In [31]:
# Name the index "id".
categories_df = categories_df.rename_axis("id")

In [32]:
# Display the category table (index "id", column "name") for inspection.
categories_df

Unnamed: 0_level_0,name
id,Unnamed: 1_level_1
1,On chip 23andMe v1
2,On chip 23andMe v2
3,On chip 23andMe v3
4,On chip 23andMe v4
5,On chip 23andMe v5
6,On chip Affy GenomeWide 6
7,On chip Affy500k
8,On chip Ancestry v2
9,On chip HumanOmni1Quad
10,On chip Illumina Human 1M


In [19]:
# Append the categories to the Category table, writing the index as a column too.
categories_df.to_sql(name="Category", con=engine, if_exists="append", index=True)

16

## SNP_Category

In [36]:
# Expose the index as a regular column named category_id.
categories_df = categories_df.assign(category_id=categories_df.index)

In [38]:
# Attach category_id to every row by matching on the category name; rows whose
# name has no match in categories_df are dropped by the inner join.
df = df.merge(categories_df, how="inner", on="name")

In [42]:
# The name column is dropped and ID is renamed to snp_id.
df = df.drop(columns=["name"]).rename(columns={"ID": "snp_id"})

In [51]:
# Keep only the first row for each (snp_id, category_id) pair.
df = df.drop_duplicates(subset=["snp_id", "category_id"])

In [52]:
# Append the rows to the SNP_Category table; the DataFrame index is not written.
df.to_sql(name="SNP_Category", con=engine, if_exists="append", index=False)

363060