# Demo 2.6: *groupby()*: On *Two* Columns   

- **Demonstrates**:  
  - groupby() with Two Columns 




- Data file:  **Cars.csv** 


In [1]:
import pandas as pd

### Read the Data File into a *pandas* DataFrame  

In [2]:
# Load the cars dataset from disk
df = pd.read_csv('Data/Cars.csv')

# Report (rows, columns), then preview the first five records
print(df.shape)
df.head()

(428, 13)


Unnamed: 0,Vehicle_Make,Vehicle_Model,Vehicle_Type,Manufacturing_Origin,MPG_City,MPG_Hwy,MSRP,Invoice,Weight,Wheelbase,DriveTrain,EngineSize,Horsepower
0,Acura,MDX,SUV,Asia,17,23,36945,33337,4451,106,All,3.5,265
1,Acura,RSX Type S 2dr,Sedan,Asia,24,31,23820,21761,2778,101,Front,2.0,200
2,Acura,TSX 4dr,Sedan,Asia,22,29,26990,24647,3230,105,Front,2.4,200
3,Acura,TL 4dr,Sedan,Asia,20,28,33195,30299,3575,108,Front,3.2,270
4,Acura,3.5 RL 4dr,Sedan,Asia,18,24,43755,39014,3880,115,Front,3.5,225


# Change Data Types as Needed   
- If we want to do numeric calculations on a column it is important that pandas recognizes it as numeric. 
- We also want to make sure a column is a float (rather than integer) if needed.
- Otherwise either errors or weird results are going to happen!  


In [3]:
# Data types 'Before' the conversion: show the dtype pandas inferred for each column
df.dtypes

Vehicle_Make             object
Vehicle_Model            object
Vehicle_Type             object
Manufacturing_Origin     object
MPG_City                  int64
MPG_Hwy                   int64
MSRP                      int64
Invoice                   int64
Weight                    int64
Wheelbase                 int64
DriveTrain               object
EngineSize              float64
Horsepower                int64
dtype: object

In [4]:
# Cast the price and fuel-economy columns to float, one column at a time
for col in ['MSRP', 'Invoice', 'MPG_City', 'MPG_Hwy']:
    df[col] = df[col].astype(float)

In [5]:
# Data types 'After' the conversion: MSRP, Invoice, MPG_City and MPG_Hwy should now be float64
df.dtypes

Vehicle_Make             object
Vehicle_Model            object
Vehicle_Type             object
Manufacturing_Origin     object
MPG_City                float64
MPG_Hwy                 float64
MSRP                    float64
Invoice                 float64
Weight                    int64
Wheelbase                 int64
DriveTrain               object
EngineSize              float64
Horsepower                int64
dtype: object

# Question:  What is the Average City MPG by Vehicle_Type *and* Manufacturing_Origin?  
- Categorical Variable(s) to Group On:  **Vehicle_Type** and **Manufacturing_Origin**   
- Continuous Variable We're Interested In:  **MPG_City** 
- Aggregation Function:  **mean** 


In [6]:
# Optional: display the unique values of the first grouping column, Vehicle_Type
df['Vehicle_Type'].unique()

array(['SUV', 'Sedan', 'Sports', 'Wagon', 'Truck', 'Hybrid'], dtype=object)

In [7]:
# Optional: display the unique values of the second grouping column, Manufacturing_Origin
df['Manufacturing_Origin'].unique()

array(['Asia', 'Europe', 'USA'], dtype=object)

In [8]:
# Peek at the first two rows before grouping
df.head(2)

Unnamed: 0,Vehicle_Make,Vehicle_Model,Vehicle_Type,Manufacturing_Origin,MPG_City,MPG_Hwy,MSRP,Invoice,Weight,Wheelbase,DriveTrain,EngineSize,Horsepower
0,Acura,MDX,SUV,Asia,17.0,23.0,36945.0,33337.0,4451,106,All,3.5,265
1,Acura,RSX Type S 2dr,Sedan,Asia,24.0,31.0,23820.0,21761.0,2778,101,Front,2.0,200


# Aggregate on *Two* Columns:  *Vehicle_Type* and *Manufacturing_Origin*     

In [9]:
# Group on both categorical columns, then average MPG_City within each
# (Vehicle_Type, Manufacturing_Origin) combination. The result is a Series
# whose index is built from the two grouping columns.
group_cols = ['Vehicle_Type', 'Manufacturing_Origin']
ser = df.groupby(group_cols)['MPG_City'].mean()

ser

Vehicle_Type  Manufacturing_Origin
Hybrid        Asia                    55.000000
SUV           Asia                    17.320000
              Europe                  14.500000
              USA                     15.520000
Sedan         Asia                    22.840426
              Europe                  19.512821
              USA                     20.611111
Sports        Asia                    20.235294
              Europe                  17.652174
              USA                     16.888889
Truck         Asia                    17.875000
              USA                     15.812500
Wagon         Asia                    22.363636
              Europe                  19.250000
              USA                     22.285714
Name: MPG_City, dtype: float64

# Convert the pandas ***Series*** to a Dataframe

In [10]:
# First, confirm that the groupby result is a pandas Series (not yet a DataFrame)
type(ser)

pandas.core.series.Series

In [11]:
# If it is, convert it to a DataFrame; the group keys remain in the row index.
# NOTE(review): this rebinds `df`, so the raw cars table loaded by read_csv is no
# longer reachable under that name. Re-run the read_csv cell before re-running
# any earlier cell that expects the raw data.
df = ser.to_frame()

print(df.shape)
df

(15, 1)


Unnamed: 0_level_0,Unnamed: 1_level_0,MPG_City
Vehicle_Type,Manufacturing_Origin,Unnamed: 2_level_1
Hybrid,Asia,55.0
SUV,Asia,17.32
SUV,Europe,14.5
SUV,USA,15.52
Sedan,Asia,22.840426
Sedan,Europe,19.512821
Sedan,USA,20.611111
Sports,Asia,20.235294
Sports,Europe,17.652174
Sports,USA,16.888889


# Move the Index Levels into the DataFrame as Columns  
- Since `Vehicle_Type` and `Manufacturing_Origin` are no longer the index, pandas creates a new default index with values 0, 1, 2, etc...  

In [12]:
# Move both index levels (Vehicle_Type, Manufacturing_Origin) into ordinary
# columns; pandas then assigns a fresh default 0, 1, 2, ... index.
# The frame is rebuilt from `ser` with a non-mutating reset_index() instead of
# reset_index(inplace=True) on `df`, so this cell is idempotent: re-running the
# in-place version would insert a spurious 'index' column on every extra run.
df = ser.to_frame().reset_index()

print(df.shape)
df

(15, 3)


Unnamed: 0,Vehicle_Type,Manufacturing_Origin,MPG_City
0,Hybrid,Asia,55.0
1,SUV,Asia,17.32
2,SUV,Europe,14.5
3,SUV,USA,15.52
4,Sedan,Asia,22.840426
5,Sedan,Europe,19.512821
6,Sedan,USA,20.611111
7,Sports,Asia,20.235294
8,Sports,Europe,17.652174
9,Sports,USA,16.888889
