<a href="https://colab.research.google.com/github/maya-papaya/ads1-cervical-cancer-analysis/blob/main/First%20Semester%20Project%3A%20Creating%20Relational%20Database%20(3).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Creating Relational Database

In this notebook, I create a relational database from the datasets I cleaned and preprocessed in `First Semester Project: Data Cleaning & Preprocessing (2).ipynb`. When assigning primary keys, I occasionally used a composite key (two or more columns) in order to ensure uniqueness; the syntax for this was sourced from [TablePlus: SQLite – how to specify a primary key on multiple columns](https://tableplus.com/blog/2018/04/sqlite-specify-primary-key-on-multiple-columns.html).

In [None]:
# SETTING UP DRIVE AND MODULES
# Mount Google Drive inside the Colab runtime so the project files are
# reachable under /content/drive (this cell only works on Google Colab).
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np  # not used in the cells visible in this notebook
import warnings
# Blanket filter: hides every Python warning for the rest of the session,
# including pandas deprecation/dtype warnings.
warnings.filterwarnings('ignore')

import os
# Make the prepped-datasets folder the working directory so the relative file
# names used later ('data.csv', ..., 'cervical_cancer.db') resolve inside it.
os.chdir("/content/drive/My Drive/ADS_Maya_Reddy/projects/disease_project_1/prepped_datasets/")

Mounted at /content/drive


In [None]:
# LOADING DATASETS
# Read the six prepped CSVs (paths are relative to the current working
# directory) in a fixed order and bind each DataFrame to its own name.
data, us_data, pap, hpv, adolescent, demo = [
    pd.read_csv(csv_name)
    for csv_name in (
        'data.csv',
        'us_data.csv',
        'pap.csv',
        'hpv.csv',
        'adolescent.csv',
        'demo.csv',
    )
]

In [None]:
# CREATING DATABASE TABLES
import sqlite3
connection = sqlite3.connect('cervical_cancer.db')
cursor = connection.cursor()


def _quote(identifier):
    """Return `identifier` as a double-quoted SQLite identifier."""
    return '"' + str(identifier).replace('"', '""') + '"'


def load_table(frame, name, create_sql):
    """Create table `name` from `create_sql` and fill it with `frame`.

    Why this helper exists: `DataFrame.to_sql(..., if_exists='replace')` DROPS
    the existing table and recreates it from the DataFrame's dtypes, so a
    preceding CREATE TABLE (primary keys, NOT NULL, foreign keys) is thrown
    away.  Here the table is dropped and recreated from the declared DDL on
    every run (so the cell is re-runnable) and the rows are then *appended*,
    which keeps the declared keys and constraints.

    DataFrame columns that the DDL does not declare (for example a saved index
    column such as 'Unnamed: 0') are added to the table as untyped columns, so
    the stored table still contains every column of the DataFrame.

    If the DataFrame lacks a declared column, or the insert is rejected by the
    database (duplicate primary key, NULL in a NOT NULL column, ...), a notice
    is printed and the table is written the way the notebook did before
    (`if_exists='replace'`, schema inferred by pandas, no keys).  `print` is
    used because this notebook silences warnings globally.

    Parameters
    ----------
    frame : pandas.DataFrame
        Rows to store.
    name : str
        Table name; must be the table that `create_sql` creates.
    create_sql : str
        A complete CREATE TABLE statement for `name`.

    Returns
    -------
    bool
        True if the declared schema was applied, False if the fallback ran.
    """
    table = _quote(name)
    cursor.execute(f'DROP TABLE IF EXISTS {table}')
    cursor.execute(create_sql)

    # PRAGMA table_info yields one row per column; index 1 is the column name.
    declared = [row[1] for row in cursor.execute(f'PRAGMA table_info({table})')]
    # SQLite column names are case-insensitive, so compare them that way.
    declared_keys = {str(col).lower() for col in declared}
    frame_keys = {str(col).lower() for col in frame.columns}

    missing = [col for col in declared if str(col).lower() not in frame_keys]
    problem = None
    if missing:
        problem = f'the DataFrame has no column(s) {missing}'
    else:
        for col in frame.columns:
            if str(col).lower() not in declared_keys:
                cursor.execute(f'ALTER TABLE {table} ADD COLUMN {_quote(col)}')
        try:
            frame.to_sql(name, connection, if_exists='append', index=False)
        except sqlite3.DatabaseError as exc:
            problem = f'the database rejected the rows ({exc})'

    if problem is None:
        return True

    print(f"[{name}] declared schema NOT applied: {problem}. "
          "Table stored with the schema inferred by pandas instead.")
    frame.to_sql(name, connection, if_exists='replace', index=False)
    return False


# NOTE(review): SQLite only enforces FOREIGN KEY clauses after
# `PRAGMA foreign_keys = ON`, and it requires the referenced columns to be a
# PRIMARY KEY or UNIQUE.  `data(state)` alone is neither (the key of `data` is
# (state, year)), so the foreign keys below document intent only.

# CREATE THE 'data' TABLE
# Columns follow what `SELECT * FROM data` shows in this notebook's saved
# output (state, year, mort_count, mort_rate, inc_count, inc_rate).  The
# 'Unnamed: 0' column seen there is not declared; load_table appends it.
load_table(data, 'data', '''
CREATE TABLE data (
    state TEXT NOT NULL,
    year INTEGER NOT NULL,
    mort_count FLOAT NOT NULL,
    mort_rate FLOAT NOT NULL,
    inc_count FLOAT NOT NULL,
    inc_rate FLOAT NOT NULL,
    PRIMARY KEY (state, year)
)''')

# CREATE THE 'us_data' TABLE
# NOTE(review): the columns of us_data.csv are not visible in this notebook;
# if they differ from this declaration, load_table reports it and falls back.
load_table(us_data, 'us_data', '''
CREATE TABLE us_data (
    year PRIMARY KEY,
    state TEXT NOT NULL,
    measure TEXT NOT NULL,
    metric TEXT NOT NULL,
    val FLOAT NOT NULL,
    upper FLOAT NOT NULL,
    lower FLOAT NOT NULL
)''')

# CREATE THE 'pap' TABLE
# Fixed: `pop_2010, INTEGER NOT NULL` had a stray comma, which declared an
# untyped `pop_2010` plus a column literally named INTEGER.
# Fixed: the original `FOREIGN KEY (state) REFERENCES data(state)` named a
# column this table does not declare, which makes CREATE TABLE fail on a fresh
# database; it has been removed.  If pap.csv does contain `state`, load_table
# still stores that column.
load_table(pap, 'pap', '''
CREATE TABLE pap (
    city TEXT NOT NULL,
    pop_2010 INTEGER NOT NULL,
    val FLOAT NOT NULL,
    lower FLOAT NOT NULL,
    upper FLOAT NOT NULL,
    year INTEGER NOT NULL,
    PRIMARY KEY (city, year)
)''')

# CREATE THE 'hpv' TABLE
load_table(hpv, 'hpv', '''
CREATE TABLE hpv (
    year PRIMARY KEY,
    cohort_size FLOAT NOT NULL,
    current_cov FLOAT NOT NULL,
    curr_vacc_cohort_size FLOAT NOT NULL,
    future_cov FLOAT NOT NULL,
    future_vacc_cohort_size FLOAT NOT NULL,
    curr_cc_prev FLOAT NOT NULL,
    curr_mort_prev FLOAT NOT NULL,
    curr_cost FLOAT NOT NULL,
    curr_cost_prev FLOAT NOT NULL,
    proj_cc_prev FLOAT NOT NULL,
    proj_mort_prev FLOAT NOT NULL,
    proj_cost FLOAT NOT NULL,
    current_net_cost FLOAT NOT NULL,
    country TEXT NOT NULL
)''')

# CREATE THE 'demo' TABLE
load_table(demo, 'demo', '''
CREATE TABLE demo (
    state TEXT NOT NULL,
    year TEXT NOT NULL,
    dimension_type TEXT NOT NULL,
    dimension_val TEXT NOT NULL,
    val FLOAT NOT NULL,
    FOREIGN KEY (state) REFERENCES data(state),
    PRIMARY KEY (state, year, dimension_type, dimension_val)
)''')

# CREATE THE 'adolescent' TABLE
# NOTE(review): unlike 'demo', the key omits dimension_type/dimension_val;
# if (state, year, dose) is not unique in adolescent.csv, load_table prints
# the constraint error and falls back -- confirm the intended key.
load_table(adolescent, 'adolescent', '''
CREATE TABLE adolescent (
    state TEXT NOT NULL,
    year TEXT NOT NULL,
    dose TEXT NOT NULL,
    dimension_type TEXT NOT NULL,
    dimension_val TEXT NOT NULL,
    val FLOAT NOT NULL,
    lower FLOAT NOT NULL,
    upper FLOAT NOT NULL,
    FOREIGN KEY (state) REFERENCES data(state),
    PRIMARY KEY (state, year, dose)
)''')

# COMMIT TO DATABASE
connection.commit()

In [None]:
# CHECKING DATABASE
# Read the whole 'data' table back through the open connection; as the last
# expression of the cell, the resulting DataFrame is shown as the cell output.
pd.read_sql('SELECT * FROM data', connection)

Unnamed: 0,state,year,mort_count,mort_rate,inc_count,inc_rate
0,Wyoming,1990,7.658057,3.322874,40.244999,17.462532
1,Louisiana,1990,119.025223,5.335498,405.005586,18.155029
2,Oklahoma,1990,83.454526,5.075913,417.677964,25.404219
3,California,1990,622.882547,4.100669,3279.998897,21.593463
4,Nebraska,1991,32.809487,3.968637,158.043604,19.116960
...,...,...,...,...,...,...
1627,Utah,2020,36.911208,2.257494,230.738216,14.111976
1628,New Hampshire,2020,27.911231,4.001137,109.925239,15.758028
1629,Idaho,2020,28.334987,3.067066,106.738541,11.553709
1630,Montana,2020,19.477699,3.616655,73.401450,13.629318


In [None]:
# CLOSING CONNECTION
# Release the SQLite connection. close() does not commit by itself; the
# writes were already committed in the table-creation cell.
connection.close()