In [77]:
import pymysql
pymysql.install_as_MySQLdb()

In [78]:
import json
import pandas as pd
from sqlalchemy import create_engine
from urllib.parse import quote_plus
from sqlalchemy import text

## Loading in the CSV files

In [79]:
races_path = 'csv/races.csv'
runners_path = 'csv/runners.csv'
training_path = 'csv/training_details.csv'
users_path = 'csv/users.csv'

In [80]:
races = pd.read_csv(races_path)
runners = pd.read_csv(runners_path)
training_details = pd.read_csv(training_path)
users = pd.read_csv(users_path)

In [81]:
races.head

<bound method NDFrame.head of    id           event  winner_id
0   1  100 meter dash        2.0
1   2  500 meter dash        3.0
2   3   cross-country        2.0
3   4      triathalon        NaN>

In [82]:
races.set_index('id',inplace=True)

In [83]:
races

Unnamed: 0_level_0,event,winner_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,100 meter dash,2.0
2,500 meter dash,3.0
3,cross-country,2.0
4,triathalon,


In [84]:
runners.set_index('id',inplace=True)

In [85]:
runners

Unnamed: 0_level_0,event,winner_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,100 meter dash,2.0
2,500 meter dash,3.0
3,cross-country,2.0
4,triathalon,


In [86]:
training_details.set_index('user_training_id',inplace=True)

In [87]:
training_details

Unnamed: 0_level_0,user_id,training_id,training_date
user_training_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,1,8/2/2015
2,2,1,8/3/2015
3,3,2,8/2/2015
4,4,2,8/4/2015
5,2,2,8/3/2015
6,1,1,8/2/2015
7,3,2,8/4/2015
8,4,3,8/3/2015
9,1,4,8/3/2015
10,3,1,8/2/2015


In [88]:
users.set_index('user_id',inplace=True)

In [89]:
users

Unnamed: 0_level_0,username
user_id,Unnamed: 1_level_1
1,John Doe
2,Jane Doe
3,Alice Jones
4,Lisa Romero


## Setting up MySQL Connection

In [90]:
with open('/Users/Admin/.secret/sql_password.json') as f:
    login = json.load(f)
login.keys()

dict_keys(['username', 'password'])

In [91]:
username = 'root'
password = login['password']
db_name = 'collinson_test_parkerh'
connection = f'mysql+pymysql://{username}:{password}@localhost/{db_name}'
engine = create_engine(connection)
conn = engine.connect()

In [92]:
q = """SHOW TABLES;"""
pd.read_sql(q, conn)

Unnamed: 0,Tables_in_collinson_test_parkerh
0,races
1,runners
2,training_details
3,users


## Inserting Data Into SQL

In [97]:
races.to_sql("races",conn,index=True, if_exists='replace')

4

In [98]:
runners.to_sql("runners",conn,index=True, if_exists='replace')

4

In [99]:
training_details.to_sql("training_details",conn,index=True, if_exists='replace')

14

In [100]:
users.to_sql("users",conn,index=True, if_exists='replace')

4

## Verifying Data has been input into tables

In [102]:
q = """SELECT *
FROM training_details"""
pd.read_sql(q, conn)

Unnamed: 0,user_training_id,user_id,training_id,training_date
0,1,1,1,8/2/2015
1,2,2,1,8/3/2015
2,3,3,2,8/2/2015
3,4,4,2,8/4/2015
4,5,2,2,8/3/2015
5,6,1,1,8/2/2015
6,7,3,2,8/4/2015
7,8,4,3,8/3/2015
8,9,1,4,8/3/2015
9,10,3,1,8/2/2015


In [112]:
q = """SELECT *
FROM runners"""
pd.read_sql(q, conn)

Unnamed: 0,id,event,winner_id
0,1,100 meter dash,2.0
1,2,500 meter dash,3.0
2,3,cross-country,2.0
3,4,triathalon,


In [113]:
q = """SELECT *
FROM races"""
pd.read_sql(q, conn)

Unnamed: 0,id,event,winner_id
0,1,100 meter dash,2.0
1,2,500 meter dash,3.0
2,3,cross-country,2.0
3,4,triathalon,


# Question 1: Given the below query what will the result be and provide an alternate query to correct it.

In [104]:
q = '''SELECT * FROM runners WHERE id NOT IN (SELECT winner_id FROM races)'''
pd.read_sql(q,conn)

Unnamed: 0,id,event,winner_id


## This query is trying to show the runners who have not won any races. It's looking for rows that don't have an ID in the winner_id column from the races table. The issue is that when it see's the NULL value, it doesn't make the connection of it meaning no winner.

In [111]:
q = '''SELECT * FROM runners WHERE id NOT IN (SELECT winner_id FROM races WHERE winner_id IS NOT NULL);'''
pd.read_sql(q,conn)

Unnamed: 0,id,event,winner_id
0,1,100 meter dash,2.0
1,4,triathalon,


In [106]:
q = '''SELECT * FROM runners 
WHERE id NOT IN (SELECT DISTINCT winner_id FROM races WHERE winner_id IS NOT NULL) 
   OR id NOT IN (SELECT id FROM races WHERE winner_id IS NULL);'''
pd.read_sql(q,conn)

Unnamed: 0,id,event,winner_id
0,1,100 meter dash,2.0
1,2,500 meter dash,3.0
2,3,cross-country,2.0
3,4,triathalon,


In [107]:
q = '''SELECT runners.*
FROM runners
LEFT JOIN races ON runners.id = races.winner_id
WHERE races.id IS NULL;'''
pd.read_sql(q,conn)

Unnamed: 0,id,event,winner_id
0,1,100 meter dash,2.0
1,4,triathalon,


In [108]:
q = '''SELECT *
FROM runners
WHERE NOT EXISTS (SELECT 1 FROM races WHERE winner_id = runners.id);
'''
pd.read_sql(q,conn)

Unnamed: 0,id,event,winner_id
0,1,100 meter dash,2.0
1,4,triathalon,
