In [2]:
import pandas as pd

## Tidy Data

1. Attendance Data

Read the data from the attendance table and calculate an attendance percentage for each student. One half day is worth 50% of a full day, and 10 tardies is equal to one absence.

In [9]:
# Load the wide attendance table. The first column of the CSV holds the
# student names, so label it 'name' whatever header it was read with.
attendance = pd.read_csv('untidy-data/attendance.csv')
student_col = attendance.columns[0]
attendance = attendance.rename(columns={student_col: 'name'})
attendance

Unnamed: 0,name,2018-01-01,2018-01-02,2018-01-03,2018-01-04,2018-01-05,2018-01-06,2018-01-07,2018-01-08
0,Sally,P,T,T,H,P,A,T,T
1,Jane,A,P,T,T,T,T,A,T
2,Billy,A,T,A,A,H,T,P,T
3,John,P,T,H,P,P,T,P,P


In [10]:
# Reshape from one column per date to one row per (student, date) pair.
attendance = pd.melt(
    attendance,
    id_vars=['name'],
    var_name='date',
    value_name='present',
)
attendance.head(3)

Unnamed: 0,name,date,present
0,Sally,2018-01-01,P
1,Jane,2018-01-01,A
2,Billy,2018-01-01,A


Change P, A, H, and T to be 1, 0, .5, and .9, respectively (ten tardies equal one absence, so each tardy costs a tenth of a day).

In [11]:
# Credit toward a full day earned by each attendance code: present = 1,
# absent = 0, half day = .5. Ten tardies equal one absence, so a single
# tardy costs a tenth of a day and earns .9.
attendance_credit = {'P': 1, 'A': 0, 'H':.5, 'T':.9}

# Series.map builds a numeric column directly, whereas replace() on an object
# column depends on implicit downcasting that pandas >= 2.2 deprecates.
present_credit = attendance['present'].map(attendance_credit)

# map() turns unrecognised codes into NaN, which mean() would silently skip;
# fail loudly instead (this also catches re-running the cell on numeric data).
assert present_credit.notna().all(), 'unexpected attendance code'
attendance['present'] = present_credit
attendance.head(3)

Unnamed: 0,name,date,present
0,Sally,2018-01-01,1.0
1,Jane,2018-01-01,0.0
2,Billy,2018-01-01,0.0


Calculate an attendance percentage for each student

In [12]:
# Average the daily credit per student; as_index=False keeps 'name' as a
# regular column rather than the index.
attendance.groupby('name', as_index=False)['present'].mean()

Unnamed: 0,name,present
0,Billy,0.525
1,Jane,0.6875
2,John,0.9125
3,Sally,0.7625


In [24]:
# Same per-student average, computed through agg() so the result is a frame
# indexed by name with a single 'mean' column.
present_by_student = attendance.groupby('name')['present']
present_by_student.agg(['mean'])

Unnamed: 0_level_0,mean
name,Unnamed: 1_level_1
Billy,0.525
Jane,0.6875
John,0.9125
Sally,0.7625


2. Coffee Levels

In [14]:
# Load the coffee_levels table and preview the first rows.
coffee_path = 'untidy-data/coffee_levels.csv'
coffee_df = pd.read_csv(coffee_path)
coffee_df.head(3)

Unnamed: 0,hour,coffee_carafe,coffee_amount
0,8,x,0.816164
1,9,x,0.451018
2,10,x,0.843279


In [15]:
# Transform the data so that each carafe is in its own column, with one row
# per hour (pivot_table aggregates with the mean by default).
pd.pivot_table(
    coffee_df,
    index='hour',
    columns='coffee_carafe',
    values='coffee_amount',
)

coffee_carafe,x,y,z
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8,0.816164,0.189297,0.999264
9,0.451018,0.521502,0.91599
10,0.843279,0.023163,0.144928
11,0.335533,0.235529,0.311495
12,0.898291,0.017009,0.771947
13,0.310711,0.997464,0.39852
14,0.507288,0.058361,0.864464
15,0.215043,0.144644,0.436364
16,0.183891,0.544676,0.280621
17,0.39156,0.594126,0.436677


Is this the best shape for the data?
- It depends on the question we are trying to answer. If each observation should be one hour, then yes.

3. Cake Recipes

Read the cake_recipes table. This data set contains cake tastiness scores for combinations of different recipes, oven rack positions, and oven temperatures

In [16]:
# Load the cake_recipes table and preview the first rows.
cake_path = 'untidy-data/cake_recipes.csv'
cake_df = pd.read_csv(cake_path)
cake_df.head(3)

Unnamed: 0,recipe:position,225,250,275,300
0,a:bottom,61.738655,53.912627,74.41473,98.786784
1,a:top,51.709751,52.009735,68.576858,50.22847
2,b:bottom,57.09532,61.904369,61.19698,99.248541


Tidy the data as necessary.

In [17]:
# Break the combined 'recipe:position' label apart at the colon and store
# the two pieces as their own columns.
recipe_position_parts = cake_df['recipe:position'].str.split(':', expand=True)
cake_df[['recipe', 'position']] = recipe_position_parts

# The combined column is now redundant, so remove it.
cake_df = cake_df.drop(columns='recipe:position')
cake_df.head(3)

Unnamed: 0,225,250,275,300,recipe,position
0,61.738655,53.912627,74.41473,98.786784,a,bottom
1,51.709751,52.009735,68.576858,50.22847,a,top
2,57.09532,61.904369,61.19698,99.248541,b,bottom


In [18]:
# Reshape so each row is one (recipe, position, temperature) combination
# with its score; the former temperature column headers become 'temp'.
cake_df = pd.melt(
    cake_df,
    id_vars=['recipe', 'position'],
    var_name='temp',
    value_name='scores',
)
cake_df.head(3)

Unnamed: 0,recipe,position,temp,scores
0,a,bottom,225,61.738655
1,a,top,225,51.709751
2,b,bottom,225,57.09532


Which recipe, on average, is the best? **b**

In [19]:
# Mean score per recipe, highest first.
recipe_scores = cake_df.groupby('recipe', as_index=False)['scores'].mean()
recipe_scores.sort_values('scores', ascending=False)

Unnamed: 0,recipe,scores
1,b,76.736074
2,c,75.874748
0,a,63.922201
3,d,62.864844


Which oven temperature, on average, produces the best results? **275**

In [20]:
# Mean score per oven temperature, highest first.
temp_scores = cake_df.groupby('temp', as_index=False)['scores'].mean()
temp_scores.sort_values('scores', ascending=False)

Unnamed: 0,temp,scores
2,275,74.886754
0,225,71.306022
3,300,66.627655
1,250,66.577437


Which combination of recipe, rack position, and temperature gives the best result?
- **b, bottom, 300**

In [21]:
# Mean score for every recipe / rack position / temperature combination,
# ranked from best to worst.
(
    cake_df
    .groupby(['recipe', 'position', 'temp'], as_index=False)['scores']
    .mean()
    .sort_values('scores', ascending=False)
)

Unnamed: 0,recipe,position,temp,scores
11,b,bottom,300,99.248541
3,a,bottom,300,98.786784
14,b,top,275,98.594881
28,d,top,225,96.873178
16,c,bottom,225,96.470207
13,b,top,250,95.224151
18,c,bottom,275,92.893227
22,c,top,275,92.098049
21,c,top,250,82.795477
12,b,top,225,82.455004
