In [1]:
import pandas as pd
import numpy as np

np.random.seed(123)

### Exercises I

1. Run `python -m pip install pymysql` from your terminal to install pymysql.

2. `cd` into your exercises folder for this module and run `echo env.py >> .gitignore`

In [2]:
# 3. Create a function named `get_db_url`. It should accept a username,
#     hostname, password, and database name and return a url connection
#     string formatted like in the example at the start of this lesson.
from env import host, user, pwd

def get_db_url(host, user, pwd, db):
    return f'mysql+pymysql://{user}:{pwd}@{host}/{db}'

In [None]:
# 4. Use your function to obtain a connection to the `employees` database.
url = get_db_url(host, user, pwd, 'employees')

In [3]:
# 5. Once you have successfully run a query:

#     a. Intentionally make a typo in the database url.
#     What kind of error message do you see?

# *Operational Error, Access Denied for user*

sql = 'SELECT * FROM employees LIMIT 100'

pd.read_sql(sql, get_db_url(host, user, pwd, 'employes'))

OperationalError: (pymysql.err.OperationalError) (1044, "Access denied for user 'quintela_2238'@'%' to database 'employes'")
(Background on this error at: https://sqlalche.me/e/14/e3q8)

In [5]:
#     b. Intentionally make an error in your SQL query.
#     What does the error message look like?

# ProgrammingError, SQL error is given (e.g. syntax, table doesn't exist, etc.)

sql_w_typo = 'SELECT * FROM employees LIMI 100'

pd.read_sql(sql_w_typo, get_db_url(host, user, pwd, 'employees'))

ProgrammingError: (pymysql.err.ProgrammingError) (1064, "You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near '100' at line 1")
[SQL: SELECT * FROM employees LIMI 100]
(Background on this error at: https://sqlalche.me/e/14/f405)

In [6]:
# 6. Read the `employees` and `titles` tables into two separate DataFrames.
sql_e = 'SELECT * FROM employees'
employees_df = pd.read_sql(sql_e, get_db_url( host, user, pwd, 'employees') )
employees_df

Unnamed: 0,emp_no,birth_date,first_name,last_name,gender,hire_date
0,10001,1953-09-02,Georgi,Facello,M,1986-06-26
1,10002,1964-06-02,Bezalel,Simmel,F,1985-11-21
2,10003,1959-12-03,Parto,Bamford,M,1986-08-28
3,10004,1954-05-01,Chirstian,Koblick,M,1986-12-01
4,10005,1955-01-21,Kyoichi,Maliniak,M,1989-09-12
...,...,...,...,...,...,...
300019,499995,1958-09-24,Dekang,Lichtner,F,1993-01-12
300020,499996,1953-03-07,Zito,Baaz,M,1990-09-27
300021,499997,1961-08-03,Berhard,Lenart,M,1986-04-21
300022,499998,1956-09-05,Patricia,Breugel,M,1993-10-13


In [7]:
sql_t = 'SELECT * FROM titles'
titles_df = pd.read_sql(sql_t, get_db_url( host, user, pwd, 'employees') )
titles_df

Unnamed: 0,emp_no,title,from_date,to_date
0,10001,Senior Engineer,1986-06-26,9999-01-01
1,10002,Staff,1996-08-03,9999-01-01
2,10003,Senior Engineer,1995-12-03,9999-01-01
3,10004,Engineer,1986-12-01,1995-12-01
4,10004,Senior Engineer,1995-12-01,9999-01-01
...,...,...,...,...
443303,499997,Engineer,1987-08-30,1992-08-29
443304,499997,Senior Engineer,1992-08-29,9999-01-01
443305,499998,Senior Staff,1998-12-27,9999-01-01
443306,499998,Staff,1993-12-27,1998-12-27


In [8]:
# 7. How many rows and columns do you have in each DataFrame? Is that what you expected?
print(employees_df.shape)
print(titles_df.shape)

(300024, 6)
(443308, 4)


In [9]:
# 8. Display the summary statistics for each DataFrame.
employees_df.describe()

Unnamed: 0,emp_no
count,300024.0
mean,253321.763392
std,161828.23554
min,10001.0
25%,85006.75
50%,249987.5
75%,424993.25
max,499999.0


In [10]:
titles_df.describe()

Unnamed: 0,emp_no
count,443308.0
mean,253075.03443
std,161853.292613
min,10001.0
25%,84855.75
50%,249847.5
75%,424891.25
max,499999.0


In [11]:
# 9. How many unique titles are in the `titles` DataFrame?
titles_df['title'].unique()

array(['Senior Engineer', 'Staff', 'Engineer', 'Senior Staff',
       'Assistant Engineer', 'Technique Leader', 'Manager'], dtype=object)

In [12]:
# 10. What is the oldest date in the `to_date` column?
titles_df['to_date'].min()


datetime.date(1985, 3, 1)

In [13]:
titles_df[ titles_df['to_date'] == titles_df['to_date'].min() ]

Unnamed: 0,emp_no,title,from_date,to_date
16064,20869,Engineer,1985-02-17,1985-03-01


In [14]:
# 11. What is the most recent date in the `to_date` column?
#     assume not current (i.e. to_date == '9999%')
#     otherwist that would be our most recent days
import datetime

# remove current titles
title_df_not_current = titles_df[ titles_df['to_date'] !=  datetime.date(9999, 1, 1) ]

# most recent date
title_df_not_current['to_date'].max()


datetime.date(2002, 8, 1)

In [15]:
# filter titles_df to show where to_date ==
#     most recent date (i.e. title_df_not_current['to_date'].max())
titles_df[ titles_df['to_date'] == title_df_not_current['to_date'].max()]


Unnamed: 0,emp_no,title,from_date,to_date
15192,20278,Senior Staff,1999-11-04,2002-08-01
17375,21763,Staff,1994-08-01,2002-08-01
23636,26000,Staff,1997-08-01,2002-08-01
67905,55876,Staff,1995-08-01,2002-08-01
80193,64174,Engineer,1996-08-01,2002-08-01
116306,88539,Staff,1995-08-01,2002-08-01
118697,90134,Staff,1994-08-01,2002-08-01
119478,90666,Engineer,1997-08-01,2002-08-01
128288,96599,Engineer,1994-08-01,2002-08-01
135654,101563,Staff,1997-08-01,2002-08-01


### Exercises II



In [16]:
# 1. Copy the `users` and `roles` DataFrames from the examples above.
users = pd.DataFrame({
    'id': [1, 2, 3, 4, 5, 6],
    'name': ['bob', 'joe', 'sally', 'adam', 'jane', 'mike'],
    'role_id': [1, 2, 3, 3, np.nan, np.nan]
})
roles = pd.DataFrame({
    'id': [1, 2, 3, 4],
    'name': ['admin', 'author', 'reviewer', 'commenter']
})

In [17]:
# 2. What is the result of using a `right` join on the DataFrames?
users.merge(roles, left_on = 'role_id', right_on = 'id', how = 'right')

Unnamed: 0,id_x,name_x,role_id,id_y,name_y
0,1.0,bob,1.0,1,admin
1,2.0,joe,2.0,2,author
2,3.0,sally,3.0,3,reviewer
3,4.0,adam,3.0,3,reviewer
4,,,,4,commenter


In [18]:
# 3. What is the result of using an `outer` join on the DataFrames?
users.merge(roles, left_on = 'role_id', right_on = 'id', how = 'outer')

Unnamed: 0,id_x,name_x,role_id,id_y,name_y
0,1.0,bob,1.0,1.0,admin
1,2.0,joe,2.0,2.0,author
2,3.0,sally,3.0,3.0,reviewer
3,4.0,adam,3.0,3.0,reviewer
4,5.0,jane,,,
5,6.0,mike,,,
6,,,,4.0,commenter


In [19]:
# 4. What happens if you drop the foreign keys from the 
#     DataFrames and try to merge them?

# users.drop('role_id', axis=1).merge(roles)

# There won't be columns to merge.. ?.. outside of 'id's' but
# we'd be matching different id's

In [20]:
# 5. Load the `mpg` dataset from PyDataset.
from pydataset import data
mpg = data('mpg')
mpg.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


In [21]:
# 6. Output and read the documentation for the `mpg` dataset.
data('mpg', show_doc=True)

mpg

PyDataset Documentation (adopted from R Documentation. The displayed examples are in R)

## Fuel economy data from 1999 and 2008 for 38 popular models of car

### Description

This dataset contains a subset of the fuel economy data that the EPA makes
available on http://fueleconomy.gov. It contains only models which had a new
release every year between 1999 and 2008 - this was used as a proxy for the
popularity of the car.

### Usage

    data(mpg)

### Format

A data frame with 234 rows and 11 variables

### Details

  * manufacturer. 

  * model. 

  * displ. engine displacement, in litres 

  * year. 

  * cyl. number of cylinders 

  * trans. type of transmission 

  * drv. f = front-wheel drive, r = rear wheel drive, 4 = 4wd 

  * cty. city miles per gallon 

  * hwy. highway miles per gallon 

  * fl. 

  * class. 




In [22]:
# 7. How many rows and columns are in the dataset?
# 234 rows and 11 variables
mpg.shape

(234, 11)

In [23]:
# 8. Check out your column names and perform any
#     cleanup you may want on them.
mpg.columns

Index(['manufacturer', 'model', 'displ', 'year', 'cyl', 'trans', 'drv', 'cty',
       'hwy', 'fl', 'class'],
      dtype='object')

In [24]:
mpg = mpg.rename(columns = {'displ':'display','fl':'fuel'})
mpg.head(2)

Unnamed: 0,manufacturer,model,display,year,cyl,trans,drv,cty,hwy,fuel,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact


In [25]:
# 9. Display the summary statistics for the dataset.
mpg.describe()

Unnamed: 0,display,year,cyl,cty,hwy
count,234.0,234.0,234.0,234.0,234.0
mean,3.471795,2003.5,5.888889,16.858974,23.440171
std,1.291959,4.509646,1.611534,4.255946,5.954643
min,1.6,1999.0,4.0,9.0,12.0
25%,2.4,1999.0,4.0,14.0,18.0
50%,3.3,2003.5,6.0,17.0,24.0
75%,4.6,2008.0,8.0,19.0,27.0
max,7.0,2008.0,8.0,35.0,44.0


In [26]:
# 10. How many different manufacturers are there?
print(list(mpg.manufacturer.unique()))
len(mpg.manufacturer.unique())

['audi', 'chevrolet', 'dodge', 'ford', 'honda', 'hyundai', 'jeep', 'land rover', 'lincoln', 'mercury', 'nissan', 'pontiac', 'subaru', 'toyota', 'volkswagen']


15

In [27]:
# 11. How many different models are there?
print(list(mpg.model.unique()))
len(mpg.model.unique())

['a4', 'a4 quattro', 'a6 quattro', 'c1500 suburban 2wd', 'corvette', 'k1500 tahoe 4wd', 'malibu', 'caravan 2wd', 'dakota pickup 4wd', 'durango 4wd', 'ram 1500 pickup 4wd', 'expedition 2wd', 'explorer 4wd', 'f150 pickup 4wd', 'mustang', 'civic', 'sonata', 'tiburon', 'grand cherokee 4wd', 'range rover', 'navigator 2wd', 'mountaineer 4wd', 'altima', 'maxima', 'pathfinder 4wd', 'grand prix', 'forester awd', 'impreza awd', '4runner 4wd', 'camry', 'camry solara', 'corolla', 'land cruiser wagon 4wd', 'toyota tacoma 4wd', 'gti', 'jetta', 'new beetle', 'passat']


38

In [28]:
# 12. Create a column named `mileage_difference` like you did in 
#     the DataFrames exercises; this column should contain
#     the difference between highway and city mileage for each car.
mpg['mileage_difference'] = mpg['hwy'] - mpg['cty'] 
mpg.head()

Unnamed: 0,manufacturer,model,display,year,cyl,trans,drv,cty,hwy,fuel,class,mileage_difference
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact,11
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact,8
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact,11
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact,9
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact,10


In [29]:
# 13. Create a column named `average_mileage`
#     like you did in the DataFrames exercises; 
#     this is the mean of the city and highway mileage.
mpg['average_mileage'] = (mpg['hwy'] + mpg['cty']) / 2
mpg.head()

Unnamed: 0,manufacturer,model,display,year,cyl,trans,drv,cty,hwy,fuel,class,mileage_difference,average_mileage
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact,11,23.5
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact,8,25.0
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact,11,25.5
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact,9,25.5
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact,10,21.0


In [30]:
# 14. Create a new column on the mpg dataset named
#     `is_automatic` that holds boolean values denoting whether
#     the car has an automatic transmission.
mpg['is_automatic'] = mpg['trans'].str.find('auto') >= 0
mpg.head()

Unnamed: 0,manufacturer,model,display,year,cyl,trans,drv,cty,hwy,fuel,class,mileage_difference,average_mileage,is_automatic
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact,11,23.5,True
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact,8,25.0,False
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact,11,25.5,False
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact,9,25.5,True
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact,10,21.0,True


In [31]:
# 15. Using the `mpg` dataset, find out which manufacturer
#     has the best miles per gallon on average?
mpg.groupby('manufacturer').average_mileage \
                            .mean() \
                            .sort_values(ascending=False).head(1)

manufacturer
honda    28.5
Name: average_mileage, dtype: float64

In [32]:
# 16. Do automatic or manual cars have better miles per gallon?
mpg.groupby('is_automatic').average_mileage \
                            .mean()
# manual has better mpg

is_automatic
False    22.227273
True     19.130573
Name: average_mileage, dtype: float64

### Exercises III

In [4]:
# 1. Use your `get_db_url` function to help you explore the data
#     from the `chipotle` database.
chip_url = get_db_url(host, user, pwd, 'chipotle')
sql = 'SELECT * FROM orders'
orders = pd.read_sql(sql, chip_url)
orders.head()

Unnamed: 0,id,order_id,quantity,item_name,choice_description,item_price
0,1,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,2,1,1,Izze,[Clementine],$3.39
2,3,1,1,Nantucket Nectar,[Apple],$3.39
3,4,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,5,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98


If cell below is run twice, an error will be produced. Just run the above on to reload the dataset

In [5]:
# 2. What is the total price for each order?
# convert price to float
orders['item_price'] = orders['item_price'].str[1:].astype(float)
orders.head(8)

Unnamed: 0,id,order_id,quantity,item_name,choice_description,item_price
0,1,1,1,Chips and Fresh Tomato Salsa,,2.39
1,2,1,1,Izze,[Clementine],3.39
2,3,1,1,Nantucket Nectar,[Apple],3.39
3,4,1,1,Chips and Tomatillo-Green Chili Salsa,,2.39
4,5,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",16.98
5,6,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",10.98
6,7,3,1,Side of Chips,,1.69
7,8,4,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Fajita Vegetables...",11.75


Seeing if `item_price` is price for only one `item_name`

In [6]:
orders.sort_values(by= 'quantity', ascending=False).head()

Unnamed: 0,id,order_id,quantity,item_name,choice_description,item_price
3598,3599,1443,15,Chips and Fresh Tomato Salsa,,44.25
4152,4153,1660,10,Bottled Water,,15.0
3887,3888,1559,8,Side of Chips,,13.52
3599,3600,1443,7,Bottled Water,,10.5
2441,2442,970,5,Bottled Water,,7.5


`item_price` only represents price for one `item_name`. So before I group by `order_id`, I want to create a `total_item_price` column that is `quantity * item_price` and sum that column rather than `item_price`

In [7]:
orders['total_item_price'] = orders['quantity'] * orders['item_price']
orders.head(5)

Unnamed: 0,id,order_id,quantity,item_name,choice_description,item_price,total_item_price
0,1,1,1,Chips and Fresh Tomato Salsa,,2.39,2.39
1,2,1,1,Izze,[Clementine],3.39,3.39
2,3,1,1,Nantucket Nectar,[Apple],3.39,3.39
3,4,1,1,Chips and Tomatillo-Green Chili Salsa,,2.39,2.39
4,5,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",16.98,33.96


In [8]:
total_per_order = orders.groupby('order_id').total_item_price.sum()
total_per_order.head()

order_id
1    11.56
2    33.96
3    12.67
4    21.00
5    13.70
Name: total_item_price, dtype: float64

In [18]:
# 3. What are the most popular 3 items?
# sum quantity by item_name, order DESC LIMIT 3
orders.groupby('item_name').quantity.sum().sort_values(ascending=False).head(3)

item_name
Chicken Bowl           761
Chicken Burrito        591
Chips and Guacamole    506
Name: quantity, dtype: int64

In [20]:
# 4. Which item has produced the most revenue?
# Sum total price by item_name, order DESC LIMIT 1
orders.groupby('item_name').total_item_price.sum().sort_values(ascending=False).head(1)

item_name
Chicken Bowl    8044.63
Name: total_item_price, dtype: float64

In [21]:
# 5. Join the `employees` and `titles` DataFrames together.
emp_url = get_db_url(host, user, pwd, 'employees')
emp_sql = 'SELECT * FROM employees'
employees = pd.read_sql(emp_sql, emp_url)
employees.head()

Unnamed: 0,emp_no,birth_date,first_name,last_name,gender,hire_date
0,10001,1953-09-02,Georgi,Facello,M,1986-06-26
1,10002,1964-06-02,Bezalel,Simmel,F,1985-11-21
2,10003,1959-12-03,Parto,Bamford,M,1986-08-28
3,10004,1954-05-01,Chirstian,Koblick,M,1986-12-01
4,10005,1955-01-21,Kyoichi,Maliniak,M,1989-09-12


In [22]:
titles_sql = 'SELECT * FROM titles'
titles = pd.read_sql(titles_sql, emp_url)
titles.head()

Unnamed: 0,emp_no,title,from_date,to_date
0,10001,Senior Engineer,1986-06-26,9999-01-01
1,10002,Staff,1996-08-03,9999-01-01
2,10003,Senior Engineer,1995-12-03,9999-01-01
3,10004,Engineer,1986-12-01,1995-12-01
4,10004,Senior Engineer,1995-12-01,9999-01-01


Assuming inner join

In [84]:
e_t_df = employees.merge(titles, how='inner',
                left_on='emp_no', right_on='emp_no')
e_t_df.head()

Unnamed: 0,emp_no,birth_date,first_name,last_name,gender,hire_date,title,from_date,to_date
0,10001,1953-09-02,Georgi,Facello,M,1986-06-26,Senior Engineer,1986-06-26,9999-01-01
1,10002,1964-06-02,Bezalel,Simmel,F,1985-11-21,Staff,1996-08-03,9999-01-01
2,10003,1959-12-03,Parto,Bamford,M,1986-08-28,Senior Engineer,1995-12-03,9999-01-01
3,10004,1954-05-01,Chirstian,Koblick,M,1986-12-01,Engineer,1986-12-01,1995-12-01
4,10004,1954-05-01,Chirstian,Koblick,M,1986-12-01,Senior Engineer,1995-12-01,9999-01-01


6. For each title, find the hire date of the employee that was hired most recently with that title.

    plan: 
    1. find where hire date matches title start date (hire_date == from_date). Use only these employees
    1. find most recent hire_date (max) for each title

A

In [85]:
condition = e_t_df['hire_date'] == e_t_df['from_date']

e_t_df = e_t_df[condition]

e_t_df.head(3)

Unnamed: 0,emp_no,birth_date,first_name,last_name,gender,hire_date,title,from_date,to_date
0,10001,1953-09-02,Georgi,Facello,M,1986-06-26,Senior Engineer,1986-06-26,9999-01-01
3,10004,1954-05-01,Chirstian,Koblick,M,1986-12-01,Engineer,1986-12-01,1995-12-01
6,10005,1955-01-21,Kyoichi,Maliniak,M,1989-09-12,Staff,1989-09-12,1996-09-12


B. Grouping by title to see most recent hire date

In [86]:
mrhd_by_title = e_t_df.groupby('title').hire_date.max().reset_index()
mrhd_by_title

Unnamed: 0,title,hire_date
0,Assistant Engineer,1999-12-12
1,Engineer,2000-01-23
2,Manager,1985-01-01
3,Senior Engineer,2000-01-01
4,Senior Staff,2000-01-06
5,Staff,2000-01-02
6,Technique Leader,1999-12-15


Use transform to assign each row (employee) the most recent title date for its title

In [87]:
e_t_df.groupby('title')['hire_date'].transform('max').sort_values()

148143    1985-01-01
148179    1985-01-01
148175    1985-01-01
148171    1985-01-01
148161    1985-01-01
             ...    
219520    2000-01-23
219511    2000-01-23
219509    2000-01-23
219556    2000-01-23
443307    2000-01-23
Name: hire_date, Length: 150291, dtype: object

In [88]:
max_hr_dt = e_t_df.groupby('title')['hire_date'].transform('max')
e_t_df['most_recent_hire_date_for_title'] = max_hr_dt
e_t_df.head()

Unnamed: 0,emp_no,birth_date,first_name,last_name,gender,hire_date,title,from_date,to_date,most_recent_hire_date_for_title
0,10001,1953-09-02,Georgi,Facello,M,1986-06-26,Senior Engineer,1986-06-26,9999-01-01,2000-01-01
3,10004,1954-05-01,Chirstian,Koblick,M,1986-12-01,Engineer,1986-12-01,1995-12-01,2000-01-23
6,10005,1955-01-21,Kyoichi,Maliniak,M,1989-09-12,Staff,1989-09-12,1996-09-12,2000-01-02
9,10007,1957-05-23,Tzvetan,Zielinski,F,1989-02-10,Staff,1989-02-10,1996-02-11,2000-01-02
11,10009,1952-04-19,Sumant,Peac,F,1985-02-18,Assistant Engineer,1985-02-18,1990-02-18,1999-12-12


get recent hires where `hire_date = most_recent_hire_date_for_title`

In [90]:
recent_hires_df = e_t_df[e_t_df['hire_date'] == e_t_df['most_recent_hire_date_for_title']]
recent_hires_df.sort_values('title')

Unnamed: 0,emp_no,birth_date,first_name,last_name,gender,hire_date,title,from_date,to_date,most_recent_hire_date_for_title
417215,482240,1959-06-08,Guozhong,Renear,F,1999-12-12,Assistant Engineer,1999-12-12,9999-01-01,1999-12-12
337630,428377,1957-05-09,Yucai,Gerlach,M,2000-01-23,Engineer,2000-01-23,9999-01-01,2000-01-23
148135,110022,1956-09-12,Margareta,Markovitch,M,1985-01-01,Manager,1985-01-01,1991-10-01,1985-01-01
148139,110085,1959-10-28,Ebru,Alpin,M,1985-01-01,Manager,1985-01-01,1989-12-17,1985-01-01
148143,110183,1953-06-24,Shirish,Ossenbruggen,F,1985-01-01,Manager,1985-01-01,1992-03-21,1985-01-01
148147,110303,1956-06-08,Krassimir,Wegerle,F,1985-01-01,Manager,1985-01-01,1988-09-09,1985-01-01
148157,110511,1957-07-08,DeForest,Hagimont,M,1985-01-01,Manager,1985-01-01,1992-04-25,1985-01-01
148161,110725,1961-03-14,Peternela,Onuegbe,F,1985-01-01,Manager,1985-01-01,1989-05-06,1985-01-01
148171,111035,1962-02-24,Przemyslawa,Kaelbling,M,1985-01-01,Manager,1985-01-01,1991-03-07,1985-01-01
148175,111400,1959-11-09,Arie,Staelin,M,1985-01-01,Manager,1985-01-01,1991-04-08,1985-01-01


7. Write the code necessary to create a cross tabulation of
    the number of titles by department. (Hint: this will
    involve a combination of SQL code to pull the necessary
    data and python/pandas code to perform the manipulations.)


In [107]:
sql = """
        SELECT *
        FROM departments 
            JOIN dept_emp USING (dept_no)
            JOIN titles USING (emp_no)
        """
url = get_db_url(host, user, pwd, 'employees')

df7 = pd.read_sql(sql, url)
df7

Unnamed: 0,emp_no,dept_no,dept_name,from_date,to_date,title,from_date.1,to_date.1
0,10011,d009,Customer Service,1990-01-22,1996-11-09,Staff,1990-01-22,1996-11-09
1,10038,d009,Customer Service,1989-09-20,9999-01-01,Senior Staff,1996-09-20,9999-01-01
2,10038,d009,Customer Service,1989-09-20,9999-01-01,Staff,1989-09-20,1996-09-20
3,10049,d009,Customer Service,1992-05-04,9999-01-01,Senior Staff,2000-05-04,9999-01-01
4,10049,d009,Customer Service,1992-05-04,9999-01-01,Staff,1992-05-04,2000-05-04
...,...,...,...,...,...,...,...,...
489898,499986,d007,Sales,1985-08-11,9999-01-01,Senior Staff,1992-08-11,9999-01-01
489899,499986,d007,Sales,1985-08-11,9999-01-01,Staff,1985-08-11,1992-08-11
489900,499987,d007,Sales,1999-12-21,9999-01-01,Staff,1999-12-21,9999-01-01
489901,499988,d007,Sales,1988-07-25,2001-10-09,Senior Staff,1997-07-25,2001-10-09


Number of titles (previous and current) by department

In [112]:
pd.crosstab(df7['dept_name'], df7['title'], margins=True)

title,Assistant Engineer,Engineer,Manager,Senior Engineer,Senior Staff,Staff,Technique Leader,All
dept_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Customer Service,298,2362,4,2027,13925,16150,309,35075
Development,7769,58135,2,49326,1247,1424,7683,125586
Finance,0,0,2,0,12139,13929,0,26070
Human Resources,0,0,2,0,12274,14342,0,26618
Marketing,0,0,2,0,13940,16196,0,30138
Production,6445,49649,4,42205,1270,1478,6557,107608
Quality Management,1831,13852,4,11864,0,0,1795,29346
Research,378,2986,2,2570,11637,13495,393,31461
Sales,0,0,2,0,36191,41808,0,78001
All,16721,126984,24,107992,102623,118822,16737,489903
