In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from pydataset import data

## Tidy Data Exercises

1. Attendance Data

- a. Load the attendance.csv file.
- b. Calculate an attendance percentage for each student. One half day is worth 50% of a full day, and 10 tardies are equal to one absence.

2. Coffee Levels

- a. Read the coffee_levels.csv file.
- b. Transform the data so that each carafe is in its own column.
- c. Is this the best shape for the data?

3. Cake Recipes

- a. Read the cake_recipes.csv data. This data set contains cake tastiness scores for combinations of different recipes, oven rack positions, and oven temperatures.
- b. Tidy the data as necessary.
- c. Which recipe, on average, is the best? **Answer:** recipe b
- d. Which oven temperature, on average, produces the best results? **Answer:** 275
- e. Which combination of recipe, rack position, and temperature gives the best result? **Answer:** recipe b, bottom rack, 300 degrees

#### 1a. Load the attendance.csv file

In [2]:
attendance = pd.read_csv('untidy-data/attendance.csv')

In [None]:
attendance.head()

In [None]:
#check the dimensions (rows, columns) of the attendance dataframe
attendance.shape

In [None]:
attendance.info()

In [3]:
#melt data
attendance_melt= attendance.melt(id_vars='Unnamed: 0', var_name='date',value_name='attendance')
attendance_melt.head()

Unnamed: 0.1,Unnamed: 0,date,attendance
0,Sally,2018-01-01,P
1,Jane,2018-01-01,A
2,Billy,2018-01-01,A
3,John,2018-01-01,P
4,Sally,2018-01-02,T


In [4]:
#rename column
attendance_melt = attendance_melt.rename(columns={'Unnamed: 0': 'name'})
attendance_melt.head()

Unnamed: 0,name,date,attendance
0,Sally,2018-01-01,P
1,Jane,2018-01-01,A
2,Billy,2018-01-01,A
3,John,2018-01-01,P
4,Sally,2018-01-02,T


#### 1b. Calculate an attendance percentage for each student. One half day is worth 50% of a full day, and 10 tardies are equal to one absence.

In [None]:
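#talk it out...
#P (present)  = 1.0 -> counts as 100% of a day
#A (absent)   = 0.0 -> counts as 0% of a day
#T (tardy)    = 0.9 -> 10 tardies = 1 absence, so each tardy costs 0.1 of a day
#H (half day) = 0.5 -> counts as 50% of a day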

In [5]:
#convert a one-letter attendance code into the fraction of a day it is worth
def attendance_percent(letter):
    """Return the share of a full day credited for an attendance code.

    'P' (present) gives 1, 'A' (absent) gives 0, 'T' (tardy) gives 0.9 and
    'H' (half day) gives 0.5. Any other value matches nothing and the
    function returns None.
    """
    #(code, credit) pairs checked in the same order as the original chain
    credit_by_code = (('P', 1), ('A', 0), ('T', 0.9), ('H', 0.5))
    for code, credit in credit_by_code:
        if letter == code:
            return credit
    return None

In [6]:
#pattern for applying a function element-wise: df['column']=df['column'].apply(function)
#replace each attendance code (P/A/T/H) with its numeric value
#NOTE: the column is overwritten in place, so re-running this cell passes the
#already-numeric values to attendance_percent, which returns None for them
attendance_melt['attendance']=attendance_melt['attendance'].apply(attendance_percent)

In [8]:
attendance_melt.head()

Unnamed: 0,name,date,attendance
0,Sally,2018-01-01,1.0
1,Jane,2018-01-01,0.0
2,Billy,2018-01-01,0.0
3,John,2018-01-01,1.0
4,Sally,2018-01-02,0.9


In [11]:
#group by the name of the student to get average for each person
attendance_melt.groupby('name').attendance.mean()

name
Billy    0.5250
Jane     0.6875
John     0.9125
Sally    0.7625
Name: attendance, dtype: float64

_________________________

#### 2a. Read the coffee_levels.csv file.

In [12]:
coffee = pd.read_csv('untidy-data/coffee_levels.csv')

In [15]:
#get to know data
coffee

Unnamed: 0,hour,coffee_carafe,coffee_amount
0,8,x,0.816164
1,9,x,0.451018
2,10,x,0.843279
3,11,x,0.335533
4,12,x,0.898291
5,13,x,0.310711
6,14,x,0.507288
7,15,x,0.215043
8,16,x,0.183891
9,17,x,0.39156


#### 2b. Transform the data so that each carafe is in its own column.

In [17]:
#pivot table
coffee.pivot_table(index=['hour'], columns='coffee_carafe', values = 'coffee_amount')

coffee_carafe,x,y,z
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8,0.816164,0.189297,0.999264
9,0.451018,0.521502,0.91599
10,0.843279,0.023163,0.144928
11,0.335533,0.235529,0.311495
12,0.898291,0.017009,0.771947
13,0.310711,0.997464,0.39852
14,0.507288,0.058361,0.864464
15,0.215043,0.144644,0.436364
16,0.183891,0.544676,0.280621
17,0.39156,0.594126,0.436677


#### 2c. Is this the best shape for the data?

In [None]:
#yes. easier to read

_____________________

#### 3a. Read the cake_recipes.csv data. 

In [59]:
cake = pd.read_csv('untidy-data/cake_recipes.csv')

In [60]:
cake

Unnamed: 0,recipe:position,225,250,275,300
0,a:bottom,61.738655,53.912627,74.41473,98.786784
1,a:top,51.709751,52.009735,68.576858,50.22847
2,b:bottom,57.09532,61.904369,61.19698,99.248541
3,b:top,82.455004,95.224151,98.594881,58.169349
4,c:bottom,96.470207,52.001358,92.893227,65.473084
5,c:top,71.306308,82.795477,92.098049,53.960273
6,d:bottom,52.799753,58.670419,51.747686,56.18311
7,d:top,96.873178,76.101363,59.57162,50.971626


In [61]:
cake.shape

(8, 5)

#### 3b. Tidy the data as necessary.

In [62]:
#rename recipe:position column
cake = cake.rename(columns={'recipe:position': 'delete'})
cake

Unnamed: 0,delete,225,250,275,300
0,a:bottom,61.738655,53.912627,74.41473,98.786784
1,a:top,51.709751,52.009735,68.576858,50.22847
2,b:bottom,57.09532,61.904369,61.19698,99.248541
3,b:top,82.455004,95.224151,98.594881,58.169349
4,c:bottom,96.470207,52.001358,92.893227,65.473084
5,c:top,71.306308,82.795477,92.098049,53.960273
6,d:bottom,52.799753,58.670419,51.747686,56.18311
7,d:top,96.873178,76.101363,59.57162,50.971626


In [63]:
cake.delete.str.split(':')

0    [a, bottom]
1       [a, top]
2    [b, bottom]
3       [b, top]
4    [c, bottom]
5       [c, top]
6    [d, bottom]
7       [d, top]
Name: delete, dtype: object

In [64]:
#split the combined recipe:position value on ':' and store the pieces in two new columns
#(this cell only adds columns; the combined column is dropped in a later cell)
cake[['recipe', 'position']] = cake.delete.str.split(':', expand = True)
cake

Unnamed: 0,delete,225,250,275,300,recipe,position
0,a:bottom,61.738655,53.912627,74.41473,98.786784,a,bottom
1,a:top,51.709751,52.009735,68.576858,50.22847,a,top
2,b:bottom,57.09532,61.904369,61.19698,99.248541,b,bottom
3,b:top,82.455004,95.224151,98.594881,58.169349,b,top
4,c:bottom,96.470207,52.001358,92.893227,65.473084,c,bottom
5,c:top,71.306308,82.795477,92.098049,53.960273,c,top
6,d:bottom,52.799753,58.670419,51.747686,56.18311,d,bottom
7,d:top,96.873178,76.101363,59.57162,50.971626,d,top


In [65]:
#remove delete column
cake = cake.drop(columns='delete')
cake

Unnamed: 0,225,250,275,300,recipe,position
0,61.738655,53.912627,74.41473,98.786784,a,bottom
1,51.709751,52.009735,68.576858,50.22847,a,top
2,57.09532,61.904369,61.19698,99.248541,b,bottom
3,82.455004,95.224151,98.594881,58.169349,b,top
4,96.470207,52.001358,92.893227,65.473084,c,bottom
5,71.306308,82.795477,92.098049,53.960273,c,top
6,52.799753,58.670419,51.747686,56.18311,d,bottom
7,96.873178,76.101363,59.57162,50.971626,d,top


In [70]:
#melt
cake_melt = cake.melt(id_vars=['recipe', 'position'], var_name='temp', value_name='score')
cake_melt

Unnamed: 0,recipe,position,temp,score
0,a,bottom,225,61.738655
1,a,top,225,51.709751
2,b,bottom,225,57.09532
3,b,top,225,82.455004
4,c,bottom,225,96.470207
5,c,top,225,71.306308
6,d,bottom,225,52.799753
7,d,top,225,96.873178
8,a,bottom,250,53.912627
9,a,top,250,52.009735


#### 3c. Which recipe, on average, is the best? recipe b

In [66]:
#row-wise mean across the temperature columns for each recipe/position row
#numeric_only=True skips the text columns (recipe, position) explicitly instead
#of relying on pandas silently dropping them, which newer versions no longer do
cake.mean(axis=1, numeric_only=True)
#highest row mean is index 3 -> b top

0    72.213199
1    55.631204
2    69.861302
3    83.610846
4    76.709469
5    75.040027
6    54.850242
7    70.879447
dtype: float64

In [71]:
#another way, using cake_melt
cake_melt.groupby('recipe').score.mean()
#answer:recipe b

recipe
a    63.922201
b    76.736074
c    75.874748
d    62.864844
Name: score, dtype: float64

#### 3d. Which oven temperature, on average, produces the best results? 275

In [67]:
#column-wise mean of each temperature column
#numeric_only=True leaves out the text columns (recipe, position) explicitly
#instead of relying on pandas silently dropping them
cake.mean(axis=0, numeric_only=True)
#highest column mean -> 275

225    71.306022
250    66.577437
275    74.886754
300    66.627655
dtype: float64

In [72]:
#another way, using cake_melt
cake_melt.groupby('temp').score.mean()

temp
225    71.306022
250    66.577437
275    74.886754
300    66.627655
Name: score, dtype: float64

#### 3e. Which combination of recipe, rack position, and temperature gives the best result? recipe b, bottom rack, 300 degrees

In [77]:
#another way, using cake_melt
cake_melt['score'].max()

99.2485405378462

In [79]:
#NOTE: .max() is computed separately for every column, so the values shown
#below are not taken from one row (no d / top / 300 cake scored 99.2485);
#the tail of the sorted frame shows the real best row: b / bottom / 300
cake_melt.sort_values(by='score').max()

recipe            d
position        top
temp            300
score       99.2485
dtype: object

In [81]:
cake_melt.sort_values(by='score').tail()

Unnamed: 0,recipe,position,temp,score
4,c,bottom,225,96.470207
7,d,top,225,96.873178
19,b,top,275,98.594881
24,a,bottom,300,98.786784
26,b,bottom,300,99.248541
