In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import acquire
import prepare

### 1. Attendance Data

Read the data from the attendance table and calculate an attendance percentage for each student. One half day is worth 50% of a full day, and 10 tardies are equal to one absence.

In [3]:
# Load every row of the attendance table, using the connection URL that
# acquire.get_db_url builds for 'tidy_data'.
attendance = pd.read_sql(
    sql='''SELECT * FROM attendance;''',
    con=acquire.get_db_url('tidy_data'),
)

In [4]:
# Show the table as loaded: one row per student, one column per date.
attendance

Unnamed: 0.1,Unnamed: 0,2018-01-01,2018-01-02,2018-01-03,2018-01-04,2018-01-05,2018-01-06,2018-01-07,2018-01-08
0,Sally,P,T,T,H,P,A,T,T
1,Jane,A,P,T,T,T,T,A,T
2,Billy,A,T,A,A,H,T,P,T
3,John,P,T,H,P,P,T,P,P


In [7]:
# The student names arrive in a column labelled 'Unnamed: 0'; relabel it 'name'.
attendance = attendance.rename(columns={'Unnamed: 0': 'name'})

In [8]:
# Confirm the first column is now called 'name'.
attendance.head()

Unnamed: 0,name,2018-01-01,2018-01-02,2018-01-03,2018-01-04,2018-01-05,2018-01-06,2018-01-07,2018-01-08
0,Sally,P,T,T,H,P,A,T,T
1,Jane,A,P,T,T,T,T,A,T
2,Billy,A,T,A,A,H,T,P,T
3,John,P,T,H,P,P,T,P,P


In [10]:
# Reshape wide -> long: every date column becomes a row, giving one row per
# (name, date) pair with the attendance code as the value.
attendance = pd.melt(
    attendance,
    id_vars='name',
    var_name='date',
    value_name='attendance',
)

In [12]:
# Preview the long-format frame (columns: name, date, attendance).
attendance.head()

Unnamed: 0,name,date,attendance
0,Sally,2018-01-01,P
1,Jane,2018-01-01,A
2,Billy,2018-01-01,A
3,John,2018-01-01,P
4,Sally,2018-01-02,T


In [13]:
# Convert each attendance code to the fraction of a day it is worth:
#   A -> 0, P -> 1, H -> 0.5 (a half day is worth 50% of a full day),
#   T -> 0.9 (10 tardies equal one absence, so each tardy costs 0.1).
#
# Assign the result back rather than calling replace(..., inplace=True) on
# attendance['attendance']: that form mutates an intermediate Series (chained
# assignment), which pandas 2.2 warns about and which leaves the parent frame
# unchanged once Copy-on-Write is enabled.
#
# astype(float) makes the column numeric regardless of how replace() infers
# the resulting dtype, and fails loudly if an unexpected code is left over.
attendance['attendance'] = (
    attendance['attendance']
    .replace({'A': 0, 'P': 1, 'T': .9, 'H': 0.5})
    .astype(float)
)



In [15]:
# Attendance percentage per student = mean of that student's daily values.
# Select the numeric column explicitly: the frame also carries the string
# 'date' column, and pandas >= 2.0 raises a TypeError in groupby().mean()
# instead of silently dropping non-numeric columns.
attendance.groupby('name')[['attendance']].mean()

Unnamed: 0_level_0,attendance
name,Unnamed: 1_level_1
Billy,0.525
Jane,0.6875
John,0.9125
Sally,0.7625


### 2. Coffee Levels

a. Read the coffee_levels table.

In [7]:
# Load every row of the coffee_levels table, using the connection URL that
# acquire.get_db_url builds for 'tidy_data'.
coffee_levels = pd.read_sql(
    sql='''SELECT * 
                            FROM coffee_levels;
                            ''',
    con=acquire.get_db_url('tidy_data'),
)

In [8]:
# Show the table as loaded (columns: hour, coffee_carafe, coffee_amount).
coffee_levels

Unnamed: 0,hour,coffee_carafe,coffee_amount
0,8,x,0.816164
1,9,x,0.451018
2,10,x,0.843279
3,11,x,0.335533
4,12,x,0.898291
5,13,x,0.310711
6,14,x,0.507288
7,15,x,0.215043
8,16,x,0.183891
9,17,x,0.39156


b. Transform the data so that each carafe is in its own column.

In [9]:
# Spread the carafes into columns: one row per hour, one column per carafe,
# cells holding coffee_amount. The result is only displayed, not stored.
pd.pivot_table(
    coffee_levels,
    index='hour',
    columns='coffee_carafe',
    values='coffee_amount',
)

coffee_carafe,x,y,z
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8,0.816164,0.189297,0.999264
9,0.451018,0.521502,0.91599
10,0.843279,0.023163,0.144928
11,0.335533,0.235529,0.311495
12,0.898291,0.017009,0.771947
13,0.310711,0.997464,0.39852
14,0.507288,0.058361,0.864464
15,0.215043,0.144644,0.436364
16,0.183891,0.544676,0.280621
17,0.39156,0.594126,0.436677


c. Is this the best shape for the data?

***No, the original data was better.***

### 3. Cake Recipes

a. Read the cake_recipes table. This data set contains cake tastiness scores for combinations of different recipes, oven rack positions, and oven temperatures.

In [25]:
# Load every row of the cake_recipes table, using the connection URL that
# acquire.get_db_url builds for 'tidy_data'.
cake_recipes = pd.read_sql(
    sql='''SELECT * FROM cake_recipes;''',
    con=acquire.get_db_url('tidy_data'),
)

In [26]:
# Show the table as loaded: a combined 'recipe:position' column plus one
# score column per oven temperature.
cake_recipes

Unnamed: 0,recipe:position,225,250,275,300
0,a:bottom,61.738655,53.912627,74.41473,98.786784
1,a:top,51.709751,52.009735,68.576858,50.22847
2,b:bottom,57.09532,61.904369,61.19698,99.248541
3,b:top,82.455004,95.224151,98.594881,58.169349
4,c:bottom,96.470207,52.001358,92.893227,65.473084
5,c:top,71.306308,82.795477,92.098049,53.960273
6,d:bottom,52.799753,58.670419,51.747686,56.18311
7,d:top,96.873178,76.101363,59.57162,50.971626


b. Tidy the data as necessary.

In [27]:
# Reshape wide -> long: the temperature columns become a single 'temp'
# column and their scores go into 'rate'.
cake_melt = pd.melt(
    cake_recipes,
    id_vars='recipe:position',
    var_name='temp',
    value_name='rate',
)

In [29]:
# Preview the melted frame (columns: recipe:position, temp, rate).
cake_melt.head()

Unnamed: 0,recipe:position,temp,rate
0,a:bottom,225,61.738655
1,a:top,225,51.709751
2,b:bottom,225,57.09532
3,b:top,225,82.455004
4,c:bottom,225,96.470207


In [34]:
# 'recipe:position' packs two variables into one string (e.g. 'a:bottom');
# split on ':' into separate 'recipe' and 'position' columns.
# The column name contains a colon, so attribute access
# (cake_melt.recipe:position) is a SyntaxError; bracket indexing is required.
cake_melt[['recipe', 'position']] = cake_melt['recipe:position'].str.split(':', expand=True)

In [35]:
# The combined column is redundant once 'recipe' and 'position' exist; remove it.
cake_melt = cake_melt.drop(columns='recipe:position')

In [37]:
# Preview the tidy frame (columns: temp, rate, recipe, position).
cake_melt.head()

Unnamed: 0,temp,rate,recipe,position
0,225,61.738655,a,bottom
1,225,51.709751,a,top
2,225,57.09532,b,bottom
3,225,82.455004,b,top
4,225,96.470207,c,bottom


c. Which recipe, on average, is the best? ***recipe b***

In [38]:
# Mean tastiness score per recipe.
# Select 'rate' explicitly: 'temp' and 'position' hold strings, and
# pandas >= 2.0 raises a TypeError in groupby().mean() instead of silently
# dropping non-numeric columns.
cake_melt.groupby('recipe')[['rate']].mean()

Unnamed: 0_level_0,rate
recipe,Unnamed: 1_level_1
a,63.922201
b,76.736074
c,75.874748
d,62.864844


d. Which oven temperature, on average, produces the best results? ***275***

In [52]:
# Mean tastiness score per oven temperature.
# Select 'rate' explicitly: 'recipe' and 'position' hold strings, and
# pandas >= 2.0 raises a TypeError in groupby().mean() instead of silently
# dropping non-numeric columns.
cake_melt.groupby('temp')[['rate']].mean()

Unnamed: 0_level_0,rate
temp,Unnamed: 1_level_1
225,71.306022
250,66.577437
275,74.886754
300,66.627655


e. Which combination of recipe, rack position, and temperature gives the best result? ***recipe b, bottom rack, 300 degrees***

In [50]:
# Score for every (recipe, position, temp) combination; the three grouping
# keys leave 'rate' as the only column being averaged.
(
    cake_melt
    .groupby(by=['recipe', 'position', 'temp'])
    .mean()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,rate
recipe,position,temp,Unnamed: 3_level_1
a,bottom,225,61.738655
a,bottom,250,53.912627
a,bottom,275,74.41473
a,bottom,300,98.786784
a,top,225,51.709751
a,top,250,52.009735
a,top,275,68.576858
a,top,300,50.22847
b,bottom,225,57.09532
b,bottom,250,61.904369


In [51]:
# Index label (recipe, position, temp) of the highest mean rate.
(
    cake_melt
    .groupby(by=['recipe', 'position', 'temp'])
    .mean()
    .idxmax()
)

rate    (b, bottom, 300)
dtype: object

## Bonus

### Billboard

In [53]:
# Load every row of the billboard table, using the connection URL that
# acquire.get_db_url builds for 'tidy_data'.
billboard = pd.read_sql(
    sql='''
                            SELECT * 
                            FROM billboard;
                            ''',
    con=acquire.get_db_url('tidy_data'),
)

In [54]:
# Preview the wide table: five descriptive columns followed by wk1..wk76.
billboard.head()

Unnamed: 0,year,artist,track,time,date.entered,wk1,wk2,wk3,wk4,wk5,...,wk67,wk68,wk69,wk70,wk71,wk72,wk73,wk74,wk75,wk76
0,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,87,82.0,72.0,77.0,87.0,...,,,,,,,,,,
1,2000,2Ge+her,The Hardest Part Of ...,3:15,2000-09-02,91,87.0,92.0,,,...,,,,,,,,,,
2,2000,3 Doors Down,Kryptonite,3:53,2000-04-08,81,70.0,68.0,67.0,66.0,...,,,,,,,,,,
3,2000,3 Doors Down,Loser,4:24,2000-10-21,76,76.0,72.0,69.0,67.0,...,,,,,,,,,,
4,2000,504 Boyz,Wobble Wobble,3:35,2000-04-15,57,34.0,25.0,17.0,17.0,...,,,,,,,,,,


In [55]:
# Column dtypes and non-null counts; later weeks have progressively fewer values.
billboard.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 317 entries, 0 to 316
Data columns (total 81 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   year          317 non-null    int64  
 1   artist        317 non-null    object 
 2   track         317 non-null    object 
 3   time          317 non-null    object 
 4   date.entered  317 non-null    object 
 5   wk1           317 non-null    int64  
 6   wk2           312 non-null    float64
 7   wk3           307 non-null    float64
 8   wk4           300 non-null    float64
 9   wk5           292 non-null    float64
 10  wk6           280 non-null    float64
 11  wk7           269 non-null    float64
 12  wk8           260 non-null    float64
 13  wk9           253 non-null    float64
 14  wk10          244 non-null    float64
 15  wk11          236 non-null    float64
 16  wk12          222 non-null    float64
 17  wk13          210 non-null    float64
 18  wk14          204 non-null    

In [58]:
# Relabel the weekly columns 'wk1', 'wk2', ... as the integers 1, 2, ... so
# that 'week' is numeric after melting.
# The labels are derived from the column names themselves instead of
# assuming exactly five leading id columns followed by exactly 76 weeks in
# order, so a different column count or order cannot mislabel a column.
# Non-'wkN' columns (and already-converted integer labels, on a re-run) pass
# through unchanged.
billboard.columns = [
    int(col[2:])
    if isinstance(col, str) and col.startswith('wk') and col[2:].isdigit()
    else col
    for col in billboard.columns
]

In [59]:
# Show the frame after relabelling: week columns are now 1..76.
billboard

Unnamed: 0,year,artist,track,time,date.entered,1,2,3,4,5,...,67,68,69,70,71,72,73,74,75,76
0,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,87,82.0,72.0,77.0,87.0,...,,,,,,,,,,
1,2000,2Ge+her,The Hardest Part Of ...,3:15,2000-09-02,91,87.0,92.0,,,...,,,,,,,,,,
2,2000,3 Doors Down,Kryptonite,3:53,2000-04-08,81,70.0,68.0,67.0,66.0,...,,,,,,,,,,
3,2000,3 Doors Down,Loser,4:24,2000-10-21,76,76.0,72.0,69.0,67.0,...,,,,,,,,,,
4,2000,504 Boyz,Wobble Wobble,3:35,2000-04-15,57,34.0,25.0,17.0,17.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
312,2000,Yankee Grey,Another Nine Minutes,3:10,2000-04-29,86,83.0,77.0,74.0,83.0,...,,,,,,,,,,
313,2000,"Yearwood, Trisha",Real Live Woman,3:55,2000-04-01,85,83.0,83.0,82.0,81.0,...,,,,,,,,,,
314,2000,Ying Yang Twins,Whistle While You Tw...,4:19,2000-03-18,95,94.0,91.0,85.0,84.0,...,,,,,,,,,,
315,2000,Zombie Nation,Kernkraft 400,3:30,2000-09-02,99,99.0,,,,...,,,,,,,,,,


In [62]:
# Reshape wide -> long: keep the first five columns as identifiers and fold
# every week column into a (week, rating) pair.
billboard_melt = pd.melt(
    billboard,
    id_vars=billboard.columns[:5],
    var_name='week',
    value_name='rating',
)

billboard_melt.head()

Unnamed: 0,year,artist,track,time,date.entered,week,rating
0,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,1,87
1,2000,2Ge+her,The Hardest Part Of ...,3:15,2000-09-02,1,91
2,2000,3 Doors Down,Kryptonite,3:53,2000-04-08,1,81
3,2000,3 Doors Down,Loser,4:24,2000-10-21,1,76
4,2000,504 Boyz,Wobble Wobble,3:35,2000-04-15,1,57


In [66]:
# Artist whose mean 'rating' across all weeks is numerically largest.
# NOTE(review): if 'rating' is a chart position (1 = top), the largest mean
# identifies the lowest-charting artist — confirm whether idxmin was intended.
billboard_melt.groupby('artist')['rating'].mean().idxmax()

'Fragma'

### Gapminder1

In [68]:
# Load every row of the gapminder1 table, using the connection URL that
# acquire.get_db_url builds for 'tidy_data'.
gapminder1 = pd.read_sql(
    sql='''
                            SELECT * 
                            FROM gapminder1;
                            ''',
    con=acquire.get_db_url('tidy_data'),
)

In [69]:
# Preview the long table (columns: year, country, measure, measurement).
gapminder1.head()

Unnamed: 0,year,country,measure,measurement
0,1955,Afghanistan,pop,8891209.0
1,1960,Afghanistan,pop,9829450.0
2,1965,Afghanistan,pop,10997885.0
3,1970,Afghanistan,pop,12430623.0
4,1975,Afghanistan,pop,14132019.0


In [70]:
# Column dtypes and non-null counts.
gapminder1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2079 entries, 0 to 2078
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   year         2079 non-null   int64  
 1   country      2079 non-null   object 
 2   measure      2079 non-null   object 
 3   measurement  2079 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 65.1+ KB


In [72]:
# List the distinct variable names stored in the 'measure' column.
gapminder1.measure.unique()

array(['pop', 'life_expect', 'fertility'], dtype=object)

In [75]:
# Give each measure its own column: one row per (country, year), with
# reset_index turning the keys back into ordinary columns.
# The result is only displayed, not stored.
pd.pivot_table(
    gapminder1,
    index=['country', 'year'],
    columns='measure',
    values='measurement',
).reset_index()

measure,country,year,fertility,life_expect,pop
0,Afghanistan,1955,7.7000,30.332,8891209.0
1,Afghanistan,1960,7.7000,31.997,9829450.0
2,Afghanistan,1965,7.7000,34.020,10997885.0
3,Afghanistan,1970,7.7000,36.088,12430623.0
4,Afghanistan,1975,7.7000,38.438,14132019.0
...,...,...,...,...,...
688,Venezuela,1985,3.6485,70.190,16997509.0
689,Venezuela,1990,3.2500,71.150,19325222.0
690,Venezuela,1995,2.9415,72.146,21555902.0
691,Venezuela,2000,2.7230,72.766,23542649.0


### Gapminder2

In [76]:
# Load every row of the gapminder2 table, using the connection URL that
# acquire.get_db_url builds for 'tidy_data'.
gapminder2 = pd.read_sql(
    sql='''
                            SELECT * 
                            FROM gapminder2;
                            ''',
    con=acquire.get_db_url('tidy_data'),
)

In [77]:
# Preview the wide table: one '<measure>_<year>' column per measure/year pair.
gapminder2.head()

Unnamed: 0,country,life_expect_1955,life_expect_1960,life_expect_1965,life_expect_1970,life_expect_1975,life_expect_1980,life_expect_1985,life_expect_1990,life_expect_1995,...,pop_1960,pop_1965,pop_1970,pop_1975,pop_1980,pop_1985,pop_1990,pop_1995,pop_2000,pop_2005
0,Afghanistan,30.332,31.997,34.02,36.088,38.438,39.854,40.822,41.674,41.763,...,9829450,10997885,12430623,14132019,15112149,13796928,14669339,20881480,23898198,29928987
1,Argentina,64.399,65.142,65.634,67.065,68.481,69.942,70.774,71.868,73.275,...,20616009,22283100,23962313,26081880,28369799,30675059,33022202,35311049,37497728,39537943
2,Aruba,64.381,66.606,68.336,70.941,71.83,74.116,74.494,74.108,73.011,...,57203,59020,59039,59390,60266,64129,66653,67836,69539,71566
3,Australia,70.33,70.93,71.1,71.93,73.49,74.74,76.32,77.56,78.83,...,10361273,11439384,12660160,13771400,14615900,15788300,17022133,18116171,19164620,20090437
4,Austria,67.48,69.54,70.14,70.63,72.17,73.18,74.94,76.04,77.51,...,7047437,7270889,7467086,7578903,7549433,7559776,7722953,8047433,8113413,8184691


In [80]:
# Reshape wide -> long: each '<measure>_<year>' column name moves into
# 'measure_year' and its value into 'measurement'.
gapminder2_melt = pd.melt(
    gapminder2,
    id_vars='country',
    var_name='measure_year',
    value_name='measurement',
)

gapminder2_melt

Unnamed: 0,country,measure_year,measurement
0,Afghanistan,life_expect_1955,3.033200e+01
1,Argentina,life_expect_1955,6.439900e+01
2,Aruba,life_expect_1955,6.438100e+01
3,Australia,life_expect_1955,7.033000e+01
4,Austria,life_expect_1955,6.748000e+01
...,...,...,...
1381,Switzerland,pop_2005,7.489370e+06
1382,Turkey,pop_2005,6.966056e+07
1383,United Kingdom,pop_2005,6.044146e+07
1384,United States,pop_2005,2.957341e+08


In [90]:
# Preview the split: cut at the LAST underscore only (n=1), so a measure
# name such as 'life_expect' stays intact and the year lands in column 1.
gapminder2_melt['measure_year'].str.rsplit('_', n=1, expand=True)

Unnamed: 0,0,1
0,life_expect,1955
1,life_expect,1955
2,life_expect,1955
3,life_expect,1955
4,life_expect,1955
...,...,...
1381,pop,2005
1382,pop,2005
1383,pop,2005
1384,pop,2005


In [91]:
# Store the two halves of 'measure_year' as separate 'measure' and 'year'
# columns (split at the last underscore only).
gapminder2_melt[['measure', 'year']] = (
    gapminder2_melt['measure_year'].str.rsplit('_', n=1, expand=True)
)

In [92]:
# Remove the combined column now that 'measure' and 'year' exist.
# drop() returns a new frame; without assigning it back the column was only
# hidden in this cell's output and stayed in gapminder2_melt.
gapminder2_melt = gapminder2_melt.drop(columns='measure_year')
gapminder2_melt

Unnamed: 0,country,measurement,measure,year
0,Afghanistan,3.033200e+01,life_expect,1955
1,Argentina,6.439900e+01,life_expect,1955
2,Aruba,6.438100e+01,life_expect,1955
3,Australia,7.033000e+01,life_expect,1955
4,Austria,6.748000e+01,life_expect,1955
...,...,...,...,...
1381,Switzerland,7.489370e+06,pop,2005
1382,Turkey,6.966056e+07,pop,2005
1383,United Kingdom,6.044146e+07,pop,2005
1384,United States,2.957341e+08,pop,2005


In [94]:
# Give each measure its own column: one row per (country, year), with
# reset_index turning the keys back into ordinary columns.
# The result is only displayed, not stored.
pd.pivot_table(
    gapminder2_melt,
    index=['country', 'year'],
    columns='measure',
    values='measurement',
).reset_index()

measure,country,year,life_expect,pop
0,Afghanistan,1955,30.332,8891209.0
1,Afghanistan,1960,31.997,9829450.0
2,Afghanistan,1965,34.020,10997885.0
3,Afghanistan,1970,36.088,12430623.0
4,Afghanistan,1975,38.438,14132019.0
...,...,...,...,...
688,Venezuela,1985,70.190,16997509.0
689,Venezuela,1990,71.150,19325222.0
690,Venezuela,1995,72.146,21555902.0
691,Venezuela,2000,72.766,23542649.0
