## SQL With Pandas CSV Data

In [1]:
#python imports
import csv
import pandas as pd
import sqlite3
import sqlalchemy

In [2]:
%load_ext sql
%sql sqlite://

In [3]:
# Upon uploading our file, we will then load our first table from the CSV file as shown 
# We are going to use one of pandas function which is read_csv(). This function helps us to load data from csv files into python. It takes a several arguments/ parameters but we are going to use only a few for now. 
# The first arugument is the path of the csv file. This tells the function where your csv is located and the name of the csv. It can also be a url.
# The second argument, index_col, tells pandas to use the row indexes as the row labels.

# Kaggle data ~ https://www.kaggle.com/datasets/elnahas/gender-inequality-in-hiv-infections-in-adolescents?resource=download
df_hiv = pd.read_csv('data/Jan2021-GenderInequalityHIVAIDS.csv',encoding='latin-1')

In [4]:
# PRE-CLEANING
df_hiv.head()

Unnamed: 0,Country,UNICEF Region,Year,Sex,Age,Estimated incidence rate of new HIV infection per 1 000 uninfected population,Estimated number of annual AIDS related deaths,Estimated number of annual new HIV infections,Estimated number of people living with HIV,Estimated rate of annual AIDS related deaths per 100 000 population
0,Angola,Eastern and Southern Africa,1990,Female,Age 10-19,0.64,100.0,500.0,860,0.36
1,Angola,Eastern and Southern Africa,1990,Male,Age 10-19,0.15,100.0,100.0,200,0.07
2,Angola,Eastern and Southern Africa,1991,Female,Age 10-19,0.75,100.0,500.0,1100,0.42
3,Angola,Eastern and Southern Africa,1991,Male,Age 10-19,0.17,100.0,200.0,500,0.14
4,Angola,Eastern and Southern Africa,1992,Female,Age 10-19,0.88,100.0,590.0,1300,0.54


In [5]:
# print columns
df_hiv.columns.to_list()

['Country',
 'UNICEF Region',
 'Year',
 'Sex',
 'Age',
 'Estimated incidence rate of new HIV infection per 1 000 uninfected population ',
 'Estimated number of annual AIDS related deaths',
 'Estimated number of annual new HIV infections',
 'Estimated number of people living with HIV',
 'Estimated rate of annual AIDS related deaths  per 100 000 population ']

* We will then drop existing tables similar to the one that we will work with
* in our current sqlite environment. Then push our Interpol data into a new 
* table within our current SQLite Database 

In [6]:
%sql DROP TABLE if EXISTS df_hiv;


 * sqlite://
Done.


[]

In [7]:
%sql --persist df_hiv

 * sqlite://


'Persisted df_hiv'

In [8]:
%%sql 
SELECT * FROM df_hiv
LIMIT 10;

 * sqlite://
Done.


index,Country,UNICEF Region,Year,Sex,Age,Estimated incidence rate of new HIV infection per 1 000 uninfected population,Estimated number of annual AIDS related deaths,Estimated number of annual new HIV infections,Estimated number of people living with HIV,Estimated rate of annual AIDS related deaths per 100 000 population
0,Angola,Eastern and Southern Africa,1990,Female,Age 10-19,0.64,100.0,500.0,860,0.36
1,Angola,Eastern and Southern Africa,1990,Male,Age 10-19,0.15,100.0,100.0,200,0.07
2,Angola,Eastern and Southern Africa,1991,Female,Age 10-19,0.75,100.0,500.0,1100,0.42
3,Angola,Eastern and Southern Africa,1991,Male,Age 10-19,0.17,100.0,200.0,500,0.14
4,Angola,Eastern and Southern Africa,1992,Female,Age 10-19,0.88,100.0,590.0,1300,0.54
5,Angola,Eastern and Southern Africa,1992,Male,Age 10-19,0.19,100.0,200.0,500,0.14
6,Angola,Eastern and Southern Africa,1993,Female,Age 10-19,1.04,100.0,720.0,1600,0.65
7,Angola,Eastern and Southern Africa,1993,Male,Age 10-19,0.23,100.0,200.0,500,0.2
8,Angola,Eastern and Southern Africa,1994,Female,Age 10-19,1.19,100.0,850.0,2000,0.75
9,Angola,Eastern and Southern Africa,1994,Male,Age 10-19,0.26,100.0,200.0,500,0.25


In [9]:
%%sql
---#get max and min year from the dataset
SELECT MAX(Year) as max_yr, MIN(Year) as min_yr FROM df_hiv;

 * sqlite://
Done.


max_yr,min_yr
2019,1990


In [10]:
%%sql
--- rename columns
ALTER TABLE df_hiv
RENAME COLUMN 'UNICEF Region' TO 'UR';
ALTER TABLE df_hiv
RENAME COLUMN 'Estimated incidence rate of new HIV infection per 1 000 uninfected population ' TO 'E_R_NI_per100';
ALTER TABLE df_hiv
RENAME COLUMN 'Estimated number of annual AIDS related deaths' TO 'E_NO_Ann_AidsRelDeaths';
ALTER TABLE df_hiv
RENAME COLUMN 'Estimated number of annual new HIV infections' TO 'E_NO_Ann_New_AidsInfect';
ALTER TABLE df_hiv
RENAME COLUMN 'Estimated number of people living with HIV' TO 'E_NO_Ann_LivingW_Aids';
ALTER TABLE df_hiv
RENAME COLUMN 'Estimated rate of annual AIDS related deaths  per 100 000 population ' TO 'E_R_Deaths_per100K';

 * sqlite://
Done.
Done.
Done.
Done.
Done.
Done.


[]

In [11]:
%%sql
-- # get data where country is kenya and order in desc order
SELECT Country,"UNICEF Region" as UR,Year,Sex,Age FROM df_hiv
WHERE Country = 'Kenya' 
ORDER BY Year DESC
LIMIT 10;

 * sqlite://
Done.


Country,UR,Year,Sex,Age
Kenya,UNICEF Region,2019,Female,Age 10-19
Kenya,UNICEF Region,2019,Male,Age 10-19
Kenya,UNICEF Region,2018,Female,Age 10-19
Kenya,UNICEF Region,2018,Male,Age 10-19
Kenya,UNICEF Region,2017,Female,Age 10-19
Kenya,UNICEF Region,2017,Male,Age 10-19
Kenya,UNICEF Region,2016,Female,Age 10-19
Kenya,UNICEF Region,2016,Male,Age 10-19
Kenya,UNICEF Region,2015,Female,Age 10-19
Kenya,UNICEF Region,2015,Male,Age 10-19


---

* Checking datatypes in our database table

---

In [12]:
%%sql
PRAGMA table_info(df_hiv);

 * sqlite://
Done.


cid,name,type,notnull,dflt_value,pk
0,index,BIGINT,0,,0
1,Country,TEXT,0,,0
2,UR,TEXT,0,,0
3,Year,BIGINT,0,,0
4,Sex,TEXT,0,,0
5,Age,TEXT,0,,0
6,E_R_NI_per100,FLOAT,0,,0
7,E_NO_Ann_AidsRelDeaths,FLOAT,0,,0
8,E_NO_Ann_New_AidsInfect,FLOAT,0,,0
9,E_NO_Ann_LivingW_Aids,BIGINT,0,,0


In [13]:
%%sql
---# check distinct countries in our dataset
SELECT DISTINCT Country FROM df_hiv
ORDER BY Country ASC
LIMIT 10

 * sqlite://
Done.


Country
Angola
Benin
Burkina Faso
Burundi
Cameroon
Central African Republic
Chad
Congo
Côte d'Ivoire
Democratic Republic of the Congo


### Saving Data to A Variable

In [14]:
countries = %sql SELECT DISTINCT Country FROM df_hiv ORDER BY Country DESC LIMIT 10;

 * sqlite://
Done.


In [15]:
countries

Country
Zimbabwe
Zambia
United Republic of Tanzania
Uganda
Togo
Sudan
South Sudan
South Africa
Somalia
Sierra Leone


In [16]:
%%sql
---# view first 10 rows
SELECT * FROM df_hiv
LIMIT 10;

 * sqlite://
Done.


index,Country,UR,Year,Sex,Age,E_R_NI_per100,E_NO_Ann_AidsRelDeaths,E_NO_Ann_New_AidsInfect,E_NO_Ann_LivingW_Aids,E_R_Deaths_per100K
0,Angola,Eastern and Southern Africa,1990,Female,Age 10-19,0.64,100.0,500.0,860,0.36
1,Angola,Eastern and Southern Africa,1990,Male,Age 10-19,0.15,100.0,100.0,200,0.07
2,Angola,Eastern and Southern Africa,1991,Female,Age 10-19,0.75,100.0,500.0,1100,0.42
3,Angola,Eastern and Southern Africa,1991,Male,Age 10-19,0.17,100.0,200.0,500,0.14
4,Angola,Eastern and Southern Africa,1992,Female,Age 10-19,0.88,100.0,590.0,1300,0.54
5,Angola,Eastern and Southern Africa,1992,Male,Age 10-19,0.19,100.0,200.0,500,0.14
6,Angola,Eastern and Southern Africa,1993,Female,Age 10-19,1.04,100.0,720.0,1600,0.65
7,Angola,Eastern and Southern Africa,1993,Male,Age 10-19,0.23,100.0,200.0,500,0.2
8,Angola,Eastern and Southern Africa,1994,Female,Age 10-19,1.19,100.0,850.0,2000,0.75
9,Angola,Eastern and Southern Africa,1994,Male,Age 10-19,0.26,100.0,200.0,500,0.25
