# Advanced pandas for Data Analysis

## Agenda

 - Lab 1: Groupby
 - Lab 2: Data Joins

## Data I/O

In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Location of the cars dataset, relative to the notebook's working directory.
CARS_CSV_PATH = 'resources/cars.csv'

# Load the raw data once; every lab below reads from `df`.
df = pd.read_csv(CARS_CSV_PATH)

## Lab 1: Groupby

 - Determine the average `price` grouped by `make`
 - Determine the average and standard deviation of `price` grouped by `make`
 - Determine the median `price` and average `engine_size` grouped by `make`
 - Create a variable `mpg_bins` that cuts the `city_mpg` column into four groups:
     - 10 - 20
     - 20 - 30
     - 30 - 40
     - 40 - 50
     - Name these groups 'low','medium','high','very_high'
 - Determine the average `price` grouped by `mpg_bins`
 

In [36]:
# Determine the average price grouped by make, cheapest make first.
#
# `price` is selected *before* aggregating. Calling .mean() on the whole
# grouped frame also tries to average the string columns (fuel_type,
# aspiration, ...), which raises a TypeError on pandas >= 2.0 and is wasted
# work on older versions. The double brackets keep the result a DataFrame.
df.groupby('make')[['price']].mean().sort_values(by='price')

Unnamed: 0_level_0,price
make,Unnamed: 1_level_1
chevrolet,6007.0
dodge,7875.444444
plymouth,7963.428571
honda,8184.692308
subaru,8541.25
isuzu,8916.5
mitsubishi,9239.769231
renault,9595.0
toyota,9885.8125
volkswagen,10077.5


In [28]:
# Alternative to the groupby solution: build the same per-make average price
# with a pivot table (rows come back ordered by make, not by price).
pd.pivot_table(df, values='price', index='make', aggfunc='mean')

Unnamed: 0_level_0,price
make,Unnamed: 1_level_1
alfa-romero,15498.333333
audi,17859.166667
bmw,26118.75
chevrolet,6007.0
dodge,7875.444444
honda,8184.692308
isuzu,8916.5
jaguar,34600.0
mazda,10652.882353
mercedes-benz,33647.0


In [29]:
# Determine the average and standard deviation of price grouped by make,
# ordered by the average.
#
# `price` is selected before aggregating so that 'mean' and 'std' are only
# computed for that column. Aggregating the whole grouped frame first would
# also hit the string columns, which raises a TypeError on pandas >= 2.0.
df.groupby('make')['price'].agg(['mean', 'std']).sort_values(by='mean')

Unnamed: 0_level_0,mean,std
make,Unnamed: 1_level_1,Unnamed: 2_level_1
chevrolet,6007.0,754.421633
dodge,7875.444444,2213.386044
plymouth,7963.428571,2395.544257
honda,8184.692308,2061.672112
subaru,8541.25,1940.191468
isuzu,8916.5,3014.396208
mitsubishi,9239.769231,3042.99869
renault,9595.0,424.264069
toyota,9885.8125,3204.982114
volkswagen,10077.5,2178.549872


In [30]:
# Determine the median price and average engine_size grouped by make.
(
    df.groupby('make')
    # One aggregation per column: median for price, mean for engine_size.
    .agg({'price': 'median', 'engine_size': 'mean'})
    # Order by the (still un-renamed) median price column.
    .sort_values(by='price')
    # Make the column name say which statistic it holds.
    .rename(columns={'price': 'median_price'})
)

Unnamed: 0_level_0,median_price,engine_size
make,Unnamed: 1_level_1,Unnamed: 2_level_1
chevrolet,6295.0,80.333333
honda,7295.0,99.307692
dodge,7609.0,102.666667
plymouth,7609.0,106.285714
subaru,7894.0,107.083333
nissan,8124.0,127.888889
mitsubishi,8499.0,118.307692
isuzu,8916.5,102.5
toyota,9103.0,118.8125
renault,9595.0,132.0


In [31]:
# Create a variable `mpg_bins` that cuts the `city_mpg` column into four groups.
# Bin edges are 10, 20, 30, 40, 50; pd.cut closes each interval on the right
# by default, so the groups are (10, 20], (20, 30], (30, 40], (40, 50].
mpg_bin_edges = list(range(10, 51, 10))
mpg_bin_labels = ['low', 'medium', 'high', 'very_high']
df['mpg_bins'] = pd.cut(df['city_mpg'], bins=mpg_bin_edges, labels=mpg_bin_labels)

# Find average price by bin.
# - `price` is selected before .mean() so the string columns are never
#   averaged (that raises a TypeError on pandas >= 2.0).
# - `mpg_bins` is categorical; observed=False keeps every bin in the result
#   even if it is empty, and avoids the FutureWarning newer pandas emits when
#   `observed` is left unspecified.
df.groupby('mpg_bins', observed=False)[['price']].mean()

Unnamed: 0_level_0,price
mpg_bins,Unnamed: 1_level_1
low,21169.285714
medium,11502.979592
high,7343.909091
very_high,6243.0


## Lab 2: Data Joins

 - Create a dataframe `make_grouped_df` which describes the median `city_mpg` and `highway_mpg` grouped by `make`
 - Join `df` and `make_grouped_df` on `make`. Use a logical suffix for this join.
 - Create a dataframe `fancy_stuff` from the subset of `make_grouped_df` whose `make` is one of 'mercedes-benz', 'porsche', 'jaguar'
 - Do a right or left join between `df` and `fancy_stuff` on `make` so that all rows of `df` still show. Use a logical suffix for this join.

In [32]:
# Build `make_grouped_df`: the median `city_mpg` and `highway_mpg` per `make`.
# as_index=False keeps `make` as an ordinary column so it can be used as a
# join key later on.
make_grouped_df = (
    df.loc[:, ['make', 'city_mpg', 'highway_mpg']]
    .groupby('make', as_index=False)
    .median()
)

# Show the makes ordered from lowest to highest median city mpg.
make_grouped_df.sort_values(by='city_mpg')

Unnamed: 0,make,city_mpg,highway_mpg
7,jaguar,15.0,19.0
15,porsche,17.0,25.0
10,mercury,19.0,24.0
1,audi,19.0,25.0
13,peugot,19.0,24.0
9,mercedes-benz,19.0,21.5
2,bmw,20.5,26.5
17,saab,21.0,28.0
0,alfa-romero,21.0,27.0
16,renault,23.0,31.0


In [33]:
# Join `df` and `make_grouped_df` on `make`.
# Columns coming from `df` keep their names; the overlapping per-make columns
# get the `_make_median` suffix. how='inner' is the default join type and is
# only spelled out here for clarity.
df.merge(
    make_grouped_df,
    how='inner',
    on='make',
    suffixes=('', '_make_median'),
).head()


Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_of_doors,drive_wheels,engine_location,wheel_base,length,...,highway_mpg,price,convertible,hardtop,hatchback,sedan,wagon,mpg_bins,city_mpg_make_median,highway_mpg_make_median
0,3,,alfa-romero,gas,std,two,rwd,front,88.6,168.8,...,27,13495.0,1,0,0,0,0,medium,21.0,27.0
1,1,,alfa-romero,gas,std,two,rwd,front,94.5,171.2,...,26,16500.0,0,0,1,0,0,low,21.0,27.0
2,3,,alfa-romero,gas,std,two,rwd,front,88.6,168.8,...,27,16500.0,1,0,0,0,0,medium,21.0,27.0
3,2,164.0,audi,gas,std,four,fwd,front,99.8,176.6,...,30,13950.0,0,0,0,1,0,medium,19.0,25.0
4,2,164.0,audi,gas,std,four,4wd,front,99.4,176.6,...,22,17450.0,0,0,0,1,0,low,19.0,25.0


In [34]:
# Create `fancy_stuff`: the rows of `make_grouped_df` whose make is one of
# 'mercedes-benz', 'porsche' or 'jaguar'.
fancy_makes = ['mercedes-benz', 'porsche', 'jaguar']
is_fancy = make_grouped_df['make'].isin(fancy_makes)
fancy_stuff = make_grouped_df[is_fancy]

# Left-join `df` with `fancy_stuff` on make so that every row of `df` is kept;
# rows for other makes get NaN in the `_make_median` columns.
# A fixed random_state makes the 10-row sample reproducible.
pd.merge(
    df,
    fancy_stuff,
    how='left',
    on='make',
    suffixes=('', '_make_median'),
).sample(10, random_state=4)

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_of_doors,drive_wheels,engine_location,wheel_base,length,...,highway_mpg,price,convertible,hardtop,hatchback,sedan,wagon,mpg_bins,city_mpg_make_median,highway_mpg_make_median
1,2,164.0,audi,gas,std,four,fwd,front,99.8,176.6,...,30,13950.0,0,0,0,1,0,medium,,
82,0,108.0,nissan,gas,std,four,fwd,front,100.4,184.6,...,22,14399.0,0,0,0,0,1,low,,
76,1,128.0,nissan,gas,std,two,fwd,front,94.5,165.6,...,37,7799.0,0,0,1,0,0,high,,
102,3,,porsche,gas,std,two,rwd,rear,89.5,168.9,...,25,32528.0,0,1,0,0,0,low,17.0,25.0
71,1,125.0,mitsubishi,gas,turbo,four,fwd,front,96.3,172.4,...,30,9279.0,0,0,0,1,0,medium,,
185,0,,peugot,gas,std,four,rwd,front,114.2,198.9,...,24,12440.0,0,0,0,0,1,low,,
105,1,,porsche,gas,std,two,rwd,front,98.4,175.7,...,28,,0,0,1,0,0,low,17.0,25.0
191,3,197.0,toyota,gas,std,two,rwd,front,102.9,183.5,...,24,16558.0,0,0,1,0,0,low,,
134,1,168.0,toyota,gas,std,two,rwd,front,94.5,168.7,...,34,8058.0,0,0,0,1,0,medium,,
14,2,121.0,chevrolet,gas,std,two,fwd,front,88.4,141.1,...,53,5151.0,0,0,1,0,0,very_high,,
