# Pandas: grouping

In [1]:
import os

import numpy as np
import pandas as pd

In [5]:
# Show the kernel's working directory: relative paths such as the CSV path used
# when loading the data are resolved against it.
os.getcwd()

'/Users/marcelzhang/Documents/GitHub/IronMarcel'

In [2]:
# Load the vehicles dataset. The path is relative, so it is resolved against the
# kernel's current working directory.
cars = pd.read_csv(filepath_or_buffer="data/vehicles.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'data/vehicles.csv'

In [None]:
# Preview the first five rows of the dataset.
cars.head(n=5)

How many car models are there?

In [None]:
# Number of distinct models: one value_counts() entry per distinct non-null model.
len(cars['Model'].value_counts())

In [None]:
# Same question answered via unique(): the size of the array of distinct values.
cars['Model'].unique().size

In [None]:
# Third way to count the distinct models. nunique() is used instead of
# describe()[1]: an integer key on the label-indexed describe() result relies on
# deprecated positional lookup, and position 1 is only the 'unique' entry when
# the column is non-numeric.
cars['Model'].nunique()

Group the data by `Make` using the `count` function.

In [None]:
# Number of rows recorded for each make.
cars.groupby('Make')['Make'].agg('count')

Converting Grams/Mile to Grams/Km

1 Mile = 1.60934 Km

Converting Gallons to Liters

1 Gallon = 3.78541 Liters

What brand has the most cars?

In [None]:
# Rows per make, largest first, so the make with the most cars is at the top.
(
    cars
    .groupby('Make')['Make']
    .count()
    .sort_values(ascending=False)
)

<b>Show the average CO2 Emission Grams/Km by brand</b>

In [None]:
# Rename the 'CO2 Emission Grams/Mile' column and convert its values to grams/km.
# Grams per km = grams per mile / km per mile (1 mile = 1.60934 km).
KM_PER_MILE = 1.60934
# Guard on the source column so that re-running this cell cannot divide the
# already-converted values a second time.
if 'CO2 Emission Grams/Mile' in cars.columns:
    cars = cars.rename(columns={'CO2 Emission Grams/Mile': 'CO2 Emission Grams/Km'})
    cars['CO2 Emission Grams/Km'] = cars['CO2 Emission Grams/Km'] / KM_PER_MILE
# Preview only the first rows instead of displaying the whole frame.
cars.head()

In [None]:
# Mean CO2 emissions (grams/km) per make.
# Several columns must be selected with a list inside [] (here a one-element
# list, which keeps the result a DataFrame); passing multiple bare keys is the
# deprecated form. 'Make' is the grouping key, so it does not need selecting.
(
    cars
    .groupby('Make')[['CO2 Emission Grams/Km']]
    .mean()
    .reset_index()
)

<b>Show the average CO2 Emission Grams/Km by brand, sorted</b>

In [None]:
# Mean CO2 emissions per make, highest emitters first.
(
    cars
    .groupby('Make')[['CO2 Emission Grams/Km']]
    .mean()
    .sort_values('CO2 Emission Grams/Km', ascending=False)
    .reset_index()
)

# (Optional) 

Use `pd.cut` or `pd.qcut` to create 4 groups (bins) of cars, by Year. We want to explore how cars have evolved decade by decade.

In [None]:
# Summary statistics of the model year, used to choose the bin boundaries.
cars.loc[:, 'Year'].describe()

In [None]:
# Quantile-based binning: six bins holding roughly equal numbers of rows.
pd.qcut(cars['Year'], q=6)

In [None]:
# Width-based binning: four equal-width intervals over the range of Year.
pd.cut(cars['Year'], bins=4)

In [None]:
# Labels for the four decade bins: '1980s', '1990s', '2000s', '2010s'.
decade_labels = [f'{decade_start}s' for decade_start in range(1980, 2020, 10)]

In [None]:
# Bin the model year into calendar decades. Explicit right-closed edges are used
# instead of pd.qcut: quantile bins split the rows into equal-sized groups, so
# their boundaries would not line up with the decade labels.
# Intervals: (1979, 1989], (1989, 1999], (1999, 2009], (2009, 2019].
decade_edges = [1979, 1989, 1999, 2009, 2019]
cars['Decade'] = pd.cut(cars['Year'], bins=decade_edges, labels=decade_labels)
# NOTE(review): years outside 1980-2019 become NaN here — confirm the range
# against cars['Year'].describe().
cars['Decade']

### Did cars consume more gas in the eighties?

Show the average City_Km/Liter by year_range (the `Decade` column created above).

In [None]:
# Average city fuel economy per decade, converted from miles/gallon to km/liter
# (1 mile = 1.60934 km, 1 gallon = 3.78541 liters). The converted values live in
# a temporary column, so `cars` is not modified and the cell can be re-run safely.
KM_PER_MILE = 1.60934
LITERS_PER_GALLON = 3.78541
(
    cars
    .assign(**{'City Km/Liter': cars['City MPG'] * KM_PER_MILE / LITERS_PER_GALLON})
    # observed=True: report only decades present in the data; passing it
    # explicitly also avoids the warning newer pandas emits for categorical keys.
    .groupby('Decade', observed=True)[['City Km/Liter']]
    .mean()
    .reset_index()
)

Which brands are more environmentally friendly?

In [None]:
# Mean CO2 emissions per (decade, make), lowest emitters first within each decade.
# To group by two variables, pass the column names as a single list.
# observed=True keeps only the (Decade, Make) pairs that occur in the data; with
# the categorical 'Decade' key, observed=False would add an all-NaN row for
# every unobserved combination.
(
    cars
    .groupby(['Decade', 'Make'], observed=True)[['CO2 Emission Grams/Km']]
    .mean()
    .sort_values(['Decade', 'CO2 Emission Grams/Km'])
)

In [None]:
# Same ranking with the sort keys and direction spelled out explicitly.
# observed=True keeps only the (Decade, Make) pairs that occur in the data; with
# the categorical 'Decade' key, observed=False would add an all-NaN row for
# every unobserved combination.
(
    cars
    .groupby(['Decade', 'Make'], observed=True)[['CO2 Emission Grams/Km']]
    .mean()
    .sort_values(by=['Decade', 'CO2 Emission Grams/Km'], ascending=True)
)

Does the drivetrain affect fuel consumption?

In [None]:
# Mean yearly fuel use per (decade, drivetrain), lowest consumers first within
# each decade.
# observed=True keeps only the (Decade, Drivetrain) pairs that occur in the
# data; with the categorical 'Decade' key, observed=False would add an all-NaN
# row for every unobserved combination.
(
    cars
    .groupby(['Decade', 'Drivetrain'], observed=True)[['Fuel Barrels/Year']]
    .mean()
    .sort_values(['Decade', 'Fuel Barrels/Year'])
)

In [None]:
# Split the distinct transmission labels into two lists by prefix.
# Note: the string method is spelled `.startswith()`, not `.startwith()`.
auto = [
    label
    for label in cars['Transmission'].unique().tolist()
    if label.startswith('Auto')
]
man = [
    label
    for label in cars['Transmission'].unique().tolist()
    if label.startswith('Man')
]

In [None]:
# Collapse the detailed transmission labels into 'Automatic' and 'Manual';
# any label in neither list is left unchanged.
cars['Transmission'] = cars['Transmission'].map(
    lambda label: 'Automatic' if label in auto else ('Manual' if label in man else label)
)
# Check which labels remain after the merge.
cars['Transmission'].unique()

Do cars with automatic transmission consume more fuel than cars with manual transmission?

In [None]:
# Mean yearly fuel use per transmission group, lowest first.
(
    cars
    .groupby('Transmission')[['Fuel Barrels/Year']]
    .mean()
    .sort_values('Fuel Barrels/Year')
)

Use `groupby` and `agg` with different aggregation measures for different columns:

Aggregate with the average City_Km/Liter and the count of the Trans (transmission) column.

In [None]:
# Per transmission group: mean city fuel economy in km/liter and the row count.
# City MPG is converted on a temporary column (1 mile = 1.60934 km,
# 1 gallon = 3.78541 liters), so `cars` itself is not modified.
KM_PER_MILE = 1.60934
LITERS_PER_GALLON = 3.78541
(
    cars
    .assign(**{'City Km/Liter': cars['City MPG'] * KM_PER_MILE / LITERS_PER_GALLON})
    .groupby('Transmission')
    .agg({'City Km/Liter': 'mean', 'Transmission': 'count'})
)

Aggregate with the average City_Km/Liter and the minimum of the Trans (transmission) column.

In [None]:
# Per transmission group: mean city fuel economy in km/liter and the minimum
# transmission label (the aggregation the task asks for, which was missing).
# City MPG is converted on a temporary column (1 mile = 1.60934 km,
# 1 gallon = 3.78541 liters), so `cars` itself is not modified.
KM_PER_MILE = 1.60934
LITERS_PER_GALLON = 3.78541
(
    cars
    .assign(**{'City Km/Liter': cars['City MPG'] * KM_PER_MILE / LITERS_PER_GALLON})
    .groupby('Transmission')
    .agg({'City Km/Liter': 'mean', 'Transmission': 'min'})
)