Data Analysis with Pandas and SQL

In [1]:
import numpy as np
import pandas as pd

In [2]:
# we can manually install tools
# outside of anaconda we write pip install pandasql
# !conda install pandasql --yes

In [3]:
# we import the sql tools like this
from pandasql import sqldf

In [4]:
# we need some data
students= {
    'Students':["Sira","Ibrahim","Moussa","Mamadou","Nabintou"],
    'Gender':['Female','Male','Fluid', "Male", "Female"],
    'Age':[18, 27, 19, 22, 21],
    'Email': ["sira@info.com", "ib@info.com", "mouss@info.com", 
             "mam@info.com", "nab@info.com"]
          }
teaching_assistant= {
    'Teacher':["Ibrahim","Nabintou","Mamadou","Fatim","Aziz"],
    'Email':['ib@info.com','nab@info.com','mam@info.com', 
             "fat@info.com", "aziz@info.com"],
    'Degree':["MSc. in Data Science", "BSc. in Statistics", 
              "BSc. Comp Sci", "MSc. Architecture", "BSc. in Accounting"],
    'Department': ["Business", "Statistics", "Comp Sci", 
             "Engineering", "Business"]
          }

In [6]:
# make them into DataFrames
students_df = pd.DataFrame(students)
students_df

Unnamed: 0,Students,Gender,Age,Email
0,Sira,Female,18,sira@info.com
1,Ibrahim,Male,27,ib@info.com
2,Moussa,Fluid,19,mouss@info.com
3,Mamadou,Male,22,mam@info.com
4,Nabintou,Female,21,nab@info.com


In [7]:
teaching_assistant_df = pd.DataFrame(teaching_assistant)
teaching_assistant_df

Unnamed: 0,Teacher,Email,Degree,Department
0,Ibrahim,ib@info.com,MSc. in Data Science,Business
1,Nabintou,nab@info.com,BSc. in Statistics,Statistics
2,Mamadou,mam@info.com,BSc. Comp Sci,Comp Sci
3,Fatim,fat@info.com,MSc. Architecture,Engineering
4,Aziz,aziz@info.com,BSc. in Accounting,Business


We will select data, filter data, aggregate and join data using SQL on DataFrames

In [23]:
# CAREFUL - avoid keywords such as 'all'
all_s = sqldf("SELECT * FROM students_df") # * means all columns
# all_s = sqldf("SELECT Age, Email FROM students_df") # we can specify which clumns
all_s.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Students  5 non-null      object
 1   Gender    5 non-null      object
 2   Age       5 non-null      int64 
 3   Email     5 non-null      object
dtypes: int64(1), object(3)
memory usage: 288.0+ bytes


In [21]:
# select a limited subset
query = "SELECT Students, Email FROM all_s LIMIT 3"
# query = "SELECT Students, Email FROM students_df LIMIT 3"
subset = sqldf(query)
subset # we have a dataframe

Unnamed: 0,Students,Email
0,Sira,sira@info.com
1,Ibrahim,ib@info.com
2,Moussa,mouss@info.com


In [24]:
# we can use "where" to filter results CAREFUL SQL excpets single quotes
query = "SELECT * FROM students_df WHERE Gender='Female' "
fem= sqldf(query)
fem

Unnamed: 0,Students,Gender,Age,Email
0,Sira,Female,18,sira@info.com
1,Nabintou,Female,21,nab@info.com


In [30]:
# NB us triple-quotes so we can put new lines in our statement
query = """SELECT Teacher, Email, Degree 
FROM teaching_assistant_df 
WHERE Degree LIKE 'MS%' """
masters = sqldf(query)
masters

Unnamed: 0,Teacher,Email,Degree
0,Ibrahim,ib@info.com,MSc. in Data Science
1,Fatim,fat@info.com,MSc. Architecture


In [31]:
# We can aggregate
query ="""
SELECT AVG(Age), Gender
FROM students_df
GROUP BY Gender
"""
av_age = sqldf(query)
av_age

Unnamed: 0,AVG(Age),Gender
0,19.5,Female
1,19.0,Fluid
2,24.5,Male


In [33]:
# What would this look like in Pandas?
mean_age = students_df.groupby('Gender')["Age"].mean()
mean_age

Gender
Female    19.5
Fluid     19.0
Male      24.5
Name: Age, dtype: float64

In [None]:
# Remember - we are using SQL agains DataFrames NOT a DataBase

In [37]:
# Here is an inner join in SQL
# Find all members that are both students and teaching assistants
# query = """
# SELECT students_df.Students, students_df.Gender, students_df.Email,
# teaching_assistant_df.Department
# FROM students_df INNER JOIN teaching_assistant_df
# ON students_df.Email = teaching_assistant_df.Email
# """
# or
query = """
SELECT st.Students, st.Gender, st.Email, ta.Department
FROM students_df st INNER JOIN teaching_assistant_df ta
ON st.Email = ta.Email
"""

result = sqldf(query)
result

Unnamed: 0,Students,Gender,Email,Department
0,Ibrahim,Male,ib@info.com,Business
1,Mamadou,Male,mam@info.com,Comp Sci
2,Nabintou,Female,nab@info.com,Statistics


### Review 3 - DB

In [38]:
import sqlite3

In [39]:
# we only need ot run this ONCE to make the table
conn = sqlite3.connect('mydb')
curs = conn.cursor()
st = '''
CREATE TABLE students
(
    email VARCHAR(32) PRIMARY KEY,
    gender VARCHAR(32),
    student VARCHAR(32),
    age INT
)
'''
curs.execute(st)
conn.commit() # commit all changes
conn.close() # tidy up

In [40]:
# we only need ot run this ONCE to make the table
conn = sqlite3.connect('mydb')
curs = conn.cursor()
st = '''
CREATE TABLE assistants
(
    email VARCHAR(32) PRIMARY KEY,
    degree VARCHAR(32),
    teacher VARCHAR(32),
    department VARCHAR(32)
)
'''
curs.execute(st)
conn.commit() # commit all changes
conn.close() # tidy up

In [72]:
# populate the students table
# we already have a DataFrame called students_df
# prepare our database access
conn = sqlite3.connect('mydb')
curs = conn.cursor()
st   = '''
INSERT INTO students
VALUES (?, ?, ?, ?)
'''
# CAREFUL - only run this ONCE to populate the database
# we can loop over the dataframe items (iteration)
num_students = students_df['Students'].nunique()
for i in range(num_students):
    e  = students_df.loc[i]['Email']
    s  = students_df.loc[i]['Students']
    a  = students_df.loc[i]['Age']
    g  = students_df.loc[i]['Gender']
    curs.execute(st, (e, g, s, a))
    conn.commit()
conn.close() # tidy up 

IntegrityError: UNIQUE constraint failed: students.email

In [73]:
# Now read back the students fro mteh DB
conn = sqlite3.connect('mydb')
curs = conn.cursor()
st = '''
SELECT student, gender, age, email FROM students
'''
curs.execute(st)
# we can get the results of this
rows = curs.fetchall() # this gets the results of our executed statement
conn.commit()
conn.close()
rows

[('Sira', 'Female', b'\x12\x00\x00\x00\x00\x00\x00\x00', 'sira@info.com'),
 ('Ibrahim', 'Male', b'\x1b\x00\x00\x00\x00\x00\x00\x00', 'ib@info.com'),
 ('Moussa', 'Fluid', b'\x13\x00\x00\x00\x00\x00\x00\x00', 'mouss@info.com'),
 ('Mamadou', 'Male', b'\x16\x00\x00\x00\x00\x00\x00\x00', 'mam@info.com'),
 ('Nabintou', 'Female', b'\x15\x00\x00\x00\x00\x00\x00\x00', 'nab@info.com')]

In [74]:
teaching_assistant_df

Unnamed: 0,Teacher,Email,Degree,Department
0,Ibrahim,ib@info.com,MSc. in Data Science,Business
1,Nabintou,nab@info.com,BSc. in Statistics,Statistics
2,Mamadou,mam@info.com,BSc. Comp Sci,Comp Sci
3,Fatim,fat@info.com,MSc. Architecture,Engineering
4,Aziz,aziz@info.com,BSc. in Accounting,Business


In [83]:
# Do it all again, for the Teaching Assistants
# ... write all the teaching_assistant_df data into the DB
# prepare our database access
conn = sqlite3.connect('mydb')
curs = conn.cursor()
st   = '''
INSERT INTO assistants
VALUES (?, ?, ?, ?)
'''
# CAREFUL - only run this ONCE to populate the database
num_assistants = teaching_assistant_df['Teacher'].nunique()
for i in range(num_assistants):
    e  = teaching_assistant_df.loc[i]['Email']
    deg  = teaching_assistant_df.loc[i]['Degree']
    t  = teaching_assistant_df.loc[i]['Teacher']
    dep  = teaching_assistant_df.loc[i]['Department']
    curs.execute(st, (e, deg, t, dep))
    conn.commit()
conn.close() # tidy up 

In [87]:
# .. and read back the TAs from the DB
conn = sqlite3.connect('mydb')
curs = conn.cursor()
st = '''
SELECT teacher, email, degree, department FROM assistants
'''
curs.execute(st)
# we can get the results of this
rows = curs.fetchall() # this gets the results of our executed statement
conn.commit()
conn.close()
rows

[('Ibrahim', 'ib@info.com', 'MSc. in Data Science', 'Business'),
 ('Nabintou', 'nab@info.com', 'BSc. in Statistics', 'Statistics'),
 ('Mamadou', 'mam@info.com', 'BSc. Comp Sci', 'Comp Sci'),
 ('Fatim', 'fat@info.com', 'MSc. Architecture', 'Engineering'),
 ('Aziz', 'aziz@info.com', 'BSc. in Accounting', 'Business')]

In [99]:
# Challenge - read back from BOTH tables but only where email matches
conn = sqlite3.connect('mydb')
curs = conn.cursor()
st = '''
SELECT  t.teacher, t.email, s.student, s.email FROM students s, assistants t
WHERE t.email = s.email
'''
curs.execute(st)
# we can get the results of this
rows = curs.fetchall() # this gets the results of our executed statement
conn.commit()
conn.close()
rows

[('Ibrahim', 'ib@info.com', 'Ibrahim', 'ib@info.com'),
 ('Mamadou', 'mam@info.com', 'Mamadou', 'mam@info.com'),
 ('Nabintou', 'nab@info.com', 'Nabintou', 'nab@info.com')]