In [1]:
import numpy as np
import pandas as pd

from env import get_db_url

1. **Attendance Data**

Read the data from the attendance table and calculate an attendance percentage for each student. One half day is worth 50% of a full day, and 10 tardies is equal to one absence.

You should end up with something like this:
```
name
Billy    0.5250
Jane     0.6875
John     0.9125
Sally    0.7625
Name: grade, dtype: float64

```

In [32]:
url = get_db_url('tidy_data')

In [4]:
sql_att = 'SELECT * FROM attendance'

In [33]:
att_df = pd.read_sql(sql_att, url)
att_df

Unnamed: 0.1,Unnamed: 0,2018-01-01,2018-01-02,2018-01-03,2018-01-04,2018-01-05,2018-01-06,2018-01-07,2018-01-08
0,Sally,P,T,T,H,P,A,T,T
1,Jane,A,P,T,T,T,T,A,T
2,Billy,A,T,A,A,H,T,P,T
3,John,P,T,H,P,P,T,P,P


In [11]:
att_df.rename(columns={'Unnamed: 0' : 'student_name'}, inplace = True)

|Letter|meaning|Value|
|---|---|---|
**P**|present|1.0|
**H**|half day|0.5|
**T**|tardie|0.1|
**A**|absence|0.0|

In [14]:
#replace letters with their values
att_df.replace({'A': 0, 'T': 0.1, 'H': 0.5, 'P': 1}, inplace = True)
att_df

Unnamed: 0,student_name,2018-01-01,2018-01-02,2018-01-03,2018-01-04,2018-01-05,2018-01-06,2018-01-07,2018-01-08
0,Sally,1,0.1,0.1,0.5,1.0,0.0,0.1,0.1
1,Jane,0,1.0,0.1,0.1,0.1,0.1,0.0,0.1
2,Billy,0,0.1,0.0,0.0,0.5,0.1,1.0,0.1
3,John,1,0.1,0.5,1.0,1.0,0.1,1.0,1.0


In [28]:
att_melt = att_df.melt(id_vars = 'student_name', var_name = 'date', value_name = 'score')
att_melt.head()

Unnamed: 0,student_name,date,score
0,Sally,2018-01-01,1.0
1,Jane,2018-01-01,0.0
2,Billy,2018-01-01,0.0
3,John,2018-01-01,1.0
4,Sally,2018-01-02,0.1


In [31]:
att_melt.groupby('student_name').score.mean()

student_name
Billy    0.2250
Jane     0.1875
John     0.7125
Sally    0.3625
Name: score, dtype: float64

2. **Coffee Levels**

- a. Read the ```coffee_levels``` table.
- b. Transform the data so that each carafe is in it's own column.
- c. Is this the best shape for the data?

In [34]:
sql_coffee = 'SELECT * FROM coffee_levels'

In [36]:
coffee_df = pd.read_sql(sql_coffee, url)

In [37]:
coffee_df

Unnamed: 0,hour,coffee_carafe,coffee_amount
0,8,x,0.816164
1,9,x,0.451018
2,10,x,0.843279
3,11,x,0.335533
4,12,x,0.898291
5,13,x,0.310711
6,14,x,0.507288
7,15,x,0.215043
8,16,x,0.183891
9,17,x,0.39156


In [135]:
coffee_pivot = coffee_df.pivot_table(index = 'hour', values = 'coffee_amount', columns = 'coffee_carafe').reset_index().rename_axis(None, axis = 1)
coffee_pivot

Unnamed: 0,hour,x,y,z
0,8,0.816164,0.189297,0.999264
1,9,0.451018,0.521502,0.91599
2,10,0.843279,0.023163,0.144928
3,11,0.335533,0.235529,0.311495
4,12,0.898291,0.017009,0.771947
5,13,0.310711,0.997464,0.39852
6,14,0.507288,0.058361,0.864464
7,15,0.215043,0.144644,0.436364
8,16,0.183891,0.544676,0.280621
9,17,0.39156,0.594126,0.436677


3. **Cake Recipes**

- a. Read the ```cake_recipes``` table. This data set contains cake tastiness scores for combinations of different recipes, oven rack positions, and oven temperatures.
- b. Tidy the data as necessary.
- c. Which recipe, on average, is the best? recipe b
- d. Which oven temperature, on average, produces the best results? 275
- e. Which combination of recipe, rack position, and tempe

In [62]:
sql_cake = 'SELECT * FROM cake_recipes'

In [64]:
cake_df = pd.read_sql(sql_cake, url)

In [65]:
cake_df

Unnamed: 0,recipe:position,225,250,275,300
0,a:bottom,61.738655,53.912627,74.41473,98.786784
1,a:top,51.709751,52.009735,68.576858,50.22847
2,b:bottom,57.09532,61.904369,61.19698,99.248541
3,b:top,82.455004,95.224151,98.594881,58.169349
4,c:bottom,96.470207,52.001358,92.893227,65.473084
5,c:top,71.306308,82.795477,92.098049,53.960273
6,d:bottom,52.799753,58.670419,51.747686,56.18311
7,d:top,96.873178,76.101363,59.57162,50.971626


In [68]:
#split the recipe:position column
cake_df[['recipe','rack_position']] = cake_df['recipe:position'].str.split(':', expand = True)

In [71]:
cake_df.drop(columns = 'recipe:position', inplace = True)

In [85]:
cake_df.head(1)

Unnamed: 0,recipe,rack_position,225,250,275,300
0,a,bottom,61.738655,53.912627,74.41473,98.786784


In [82]:
cake_df = cake_df.reindex(columns = ['recipe', 'rack_position', '225', '250', '275', '300'])

In [84]:
cake_df

Unnamed: 0,recipe,rack_position,225,250,275,300
0,a,bottom,61.738655,53.912627,74.41473,98.786784
1,a,top,51.709751,52.009735,68.576858,50.22847
2,b,bottom,57.09532,61.904369,61.19698,99.248541
3,b,top,82.455004,95.224151,98.594881,58.169349
4,c,bottom,96.470207,52.001358,92.893227,65.473084
5,c,top,71.306308,82.795477,92.098049,53.960273
6,d,bottom,52.799753,58.670419,51.747686,56.18311
7,d,top,96.873178,76.101363,59.57162,50.971626


In [93]:
#Which recipe, on average, is the best? - b
cake_df.pivot_table(index = 'recipe', aggfunc=np.mean).mean(axis=1).idxmax()

'b'

In [101]:
#Which oven temperature, on average, produces the best results? - 275
cake_melt = cake_df.melt(id_vars = ['recipe', 'rack_position'], var_name = 'temperature', value_name = 'taste_score')

In [102]:
cake_melt.head()

Unnamed: 0,recipe,rack_position,temperature,taste_score
0,a,bottom,225,61.738655
1,a,top,225,51.709751
2,b,bottom,225,57.09532
3,b,top,225,82.455004
4,c,bottom,225,96.470207


In [104]:
cake_melt.groupby('temperature').taste_score.mean().idxmax()

'275'

In [None]:
#Which combination of recipe, rack position, and temperature 
#gives the best result? 
#answer: recipe b, bottom rack, 300 degrees

In [115]:
best_combo = cake_melt.iloc[cake_melt.taste_score.idxmax()]
best_combo

recipe                   b
rack_position       bottom
temperature            300
taste_score      99.248541
Name: 26, dtype: object

In [133]:
print('Best results\n')
for i in range(best_combo.size - 1):
    print(f'Best {best_combo.index[i]:20} {best_combo[i]:2}')

Best results

Best recipe               b 
Best rack_position        bottom
Best temperature          300
