# Pandas: grouping

## Importing relevant libraries / modules

In [1]:
import pandas as pd

In [2]:
# Path of the raw vehicle dataset, resolved against the notebook's working directory.
VEHICLES_CSV = 'vehicles.csv'

# Read the CSV into the frame used by every later cell, then preview it.
cars = pd.read_csv(VEHICLES_CSV)
cars.head()

Unnamed: 0,Make,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
2,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,16,17,16,555.4375,2100
3,AM General,Post Office DJ8 2WD,1985,4.2,6.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
4,ASC Incorporated,GNX,1987,3.8,6.0,Automatic 4-spd,Rear-Wheel Drive,Midsize Cars,Premium,20.600625,14,21,16,555.4375,2550


## Understanding some of the columns

- 'Engine Displacement': total volume of air/fuel mixture an engine can draw in during one complete engine cycle. It influences the power of the car.
<br>

- 'Cylinders': where the combustion happens. More cylinders means more power, but also more gasoline consumption.<br>
<br>

- 'Transmission': the gearbox; it lets the driver control how much power is delivered to the wheels without changing how fast the engine runs. Automatic transmissions serve the same purpose as manual ones, but they select the gear for the driver, which removes shifting errors.<br>
<br>

- 'Drivetrain': couples the engine that produces the power to the driving wheels that use this mechanical power to rotate the axle.

## Standardizing column names

In [3]:
# Normalise the headers in two steps: lower-case every name, then replace
# each space or slash with an underscore.
lowercase_headers = cars.columns.str.lower()
cars.columns = lowercase_headers.str.replace('[ /]', '_', regex=True)
cars.columns

Index(['make', 'model', 'year', 'engine_displacement', 'cylinders',
       'transmission', 'drivetrain', 'vehicle_class', 'fuel_type',
       'fuel_barrels_year', 'city_mpg', 'highway_mpg', 'combined_mpg',
       'co2_emission_grams_mile', 'fuel_cost_year'],
      dtype='object')

## Standardizing data in columns

**'make'**

In [4]:
# List the distinct values of 'make' so that variants of the same manufacturer can be spotted.
cars['make'].unique()

array(['AM General', 'ASC Incorporated', 'Acura', 'Alfa Romeo',
       'American Motors Corporation', 'Aston Martin', 'Audi',
       'Aurora Cars Ltd', 'Autokraft Limited', 'BMW', 'BMW Alpina',
       'Bentley', 'Bertone', 'Bill Dovell Motor Car Company',
       'Bitter Gmbh and Co. Kg', 'Bugatti', 'Buick', 'CCC Engineering',
       'CX Automotive', 'Cadillac', 'Chevrolet', 'Chrysler',
       'Consulier Industries Inc', 'Dabryan Coach Builders Inc', 'Dacia',
       'Daewoo', 'Daihatsu', 'Dodge', 'E. P. Dutton, Inc.', 'Eagle',
       'Environmental Rsch and Devp Corp', 'Evans Automobiles',
       'Excalibur Autos', 'Federal Coach', 'Ferrari', 'Fiat', 'Fisker',
       'Ford', 'GMC', 'General Motors', 'Genesis', 'Geo', 'Goldacre',
       'Grumman Allied Industries', 'Grumman Olson', 'Honda', 'Hummer',
       'Hyundai', 'Import Foreign Auto Sales Inc',
       'Import Trade Services', 'Infiniti', 'Isis Imports Ltd', 'Isuzu',
       'J.K. Motors', 'JBA Motorcars, Inc.', 'Jaguar', 'Jeep', 'Ki

In [5]:
# (substring, canonical label) pairs, checked in order; the first hit wins.
# None of the labels contains a later substring, so one pass over this table
# gives the same result as applying the five replacements one after another.
# NOTE(review): the matches are case-sensitive substring tests; 'AM' in
# particular would also catch any other make containing "AM" - confirm
# against the full list of makes.
MAKE_ALIASES = (
    ('BMW', 'BMW'),
    ('AM', 'AMG'),
    ('ASC', 'ASC'),
    ('Grumman', 'Grumman'),
    ('PAS', 'PAS, Inc'),
)


def canonical_make(name):
    """Return the canonical label for ``name``, or ``name`` unchanged if no alias matches."""
    for needle, label in MAKE_ALIASES:
        if needle in name:
            return label
    return name


cars['make'] = cars['make'].apply(canonical_make)

**'transmission'**

In [6]:
# List the distinct raw transmission labels before they are collapsed.
cars['transmission'].unique()

array(['Automatic 3-spd', 'Automatic 4-spd', 'Manual 5-spd',
       'Automatic (S5)', 'Manual 6-spd', 'Automatic 5-spd', 'Auto(AM8)',
       'Auto(AM-S8)', 'Auto(AV-S7)', 'Automatic (S6)', 'Automatic (S9)',
       'Automatic (S4)', 'Auto(AM-S9)', 'Automatic (S7)', 'Auto(AM7)',
       'Auto(AM-S7)', 'Auto(AM6)', 'Automatic 6-spd', 'Manual 4-spd',
       'Automatic (S8)', 'Manual(M7)', 'Auto(AM-S6)',
       'Automatic (variable gear ratios)', 'Automatic (AV)',
       'Auto(AV-S8)', 'Automatic (AM6)', 'Automatic 8-spd', 'Auto(A1)',
       'Automatic (A1)', 'Automatic (A6)', 'Auto(AV-S6)', 'Manual 3-spd',
       'Manual 7-spd', 'Automatic 9-spd', 'Auto (AV)', 'Automatic 6spd',
       'Auto(L4)', 'Auto(L3)', 'Auto (AV-S6)', 'Auto (AV-S8)',
       'Automatic (AV-S6)', 'Automatic 7-spd', 'Manual 5 spd',
       'Auto(AM5)', 'Automatic (AM5)'], dtype=object)

In [7]:
def simplify_transmission(label):
    """Collapse a raw transmission label to 'Automatic' or 'Manual'.

    Any label containing the substring 'Auto' is treated as automatic;
    everything else is treated as manual.
    """
    if 'Auto' in label:
        return 'Automatic'
    return 'Manual'


cars['transmission'] = cars['transmission'].apply(simplify_transmission)
cars['transmission'].unique()

array(['Automatic', 'Manual'], dtype=object)

## Converting units

**Grams per mile to grams per km**

$$
\frac{\text{grams}}{\text{mile}} \times \frac{1\ \text{mile}}{1.60934\ \text{km}}
$$

**Miles per gallon (mpg) to km per liter (kml)**

$$
\frac{\text{miles}}{\text{gallon}} \times \frac{1.60934\ \text{km}}{1\ \text{mile}} \times \frac{1\ \text{gallon}}{3.78541\ \text{liters}}
$$

In [8]:
# Conversion factors used by the formulas in the markdown above.
KM_PER_MILE = 1.60934
LITERS_PER_GALLON = 3.78541
MPG_TO_KML = KM_PER_MILE / LITERS_PER_GALLON

# grams/mile -> grams/km
cars['co2_emission_grams_km'] = cars['co2_emission_grams_mile'] / KM_PER_MILE

# miles/gallon -> km/liter, for the city and highway figures
for mpg_col, kml_col in (('city_mpg', 'city_kml'), ('highway_mpg', 'highway_kml')):
    cars[kml_col] = cars[mpg_col] * MPG_TO_KML

cars.columns

Index(['make', 'model', 'year', 'engine_displacement', 'cylinders',
       'transmission', 'drivetrain', 'vehicle_class', 'fuel_type',
       'fuel_barrels_year', 'city_mpg', 'highway_mpg', 'combined_mpg',
       'co2_emission_grams_mile', 'fuel_cost_year', 'co2_emission_grams_km',
       'city_kml', 'highway_kml'],
      dtype='object')

In [9]:
# Remove the imperial-unit columns now that metric versions exist.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second run finds the columns already gone
# instead of raising KeyError.
# NOTE(review): 'combined_mpg' is dropped here although this notebook never
# creates a metric counterpart for it - confirm that is intended.
cars = cars.drop(
    columns=['co2_emission_grams_mile', 'city_mpg', 'highway_mpg', 'combined_mpg'],
    errors='ignore',
)

In [10]:
# Display the frame after the conversions; pandas abbreviates it to the first and last rows.
cars

Unnamed: 0,make,model,year,engine_displacement,cylinders,transmission,drivetrain,vehicle_class,fuel_type,fuel_barrels_year,fuel_cost_year,co2_emission_grams_km,city_kml,highway_kml
0,AMG,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,1950,324.831736,7.652571,7.227428
1,AMG,FJ8c Post Office,1984,4.2,6.0,Automatic,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,2550,424.779962,5.526857,5.526857
2,AMG,Post Office DJ5 2WD,1985,2.5,4.0,Automatic,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,2100,345.133719,6.802286,7.227428
3,AMG,Post Office DJ8 2WD,1985,4.2,6.0,Automatic,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,2550,424.779962,5.526857,5.526857
4,ASC,GNX,1987,3.8,6.0,Automatic,Rear-Wheel Drive,Midsize Cars,Premium,20.600625,2550,345.133719,5.952000,8.928000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35947,smart,fortwo coupe,2013,1.0,3.0,Automatic,Rear-Wheel Drive,Two Seaters,Premium,9.155833,1100,151.614948,14.454857,16.155428
35948,smart,fortwo coupe,2014,1.0,3.0,Automatic,Rear-Wheel Drive,Two Seaters,Premium,9.155833,1100,150.993575,14.454857,16.155428
35949,smart,fortwo coupe,2015,1.0,3.0,Automatic,Rear-Wheel Drive,Two Seaters,Premium,9.155833,1100,151.614948,14.454857,16.155428
35950,smart,fortwo coupe,2016,0.9,3.0,Automatic,Rear-Wheel Drive,Two Seaters,Premium,9.155833,1100,152.857693,14.454857,16.580571


In [11]:
# Confirm which columns remain after the drop.
cars.columns

Index(['make', 'model', 'year', 'engine_displacement', 'cylinders',
       'transmission', 'drivetrain', 'vehicle_class', 'fuel_type',
       'fuel_barrels_year', 'fuel_cost_year', 'co2_emission_grams_km',
       'city_kml', 'highway_kml'],
      dtype='object')

In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
cars.describe()

Unnamed: 0,year,engine_displacement,cylinders,fuel_barrels_year,fuel_cost_year,co2_emission_grams_km,city_kml,highway_kml
count,35952.0,35952.0,35952.0,35952.0,35952.0,35952.0,35952.0,35952.0
mean,2000.7164,3.338493,5.765076,17.609056,1892.598465,295.348614,7.50213,10.152686
std,10.08529,1.359395,1.755268,4.467283,506.958627,73.981118,2.027655,2.504464
min,1984.0,0.6,2.0,0.06,600.0,22.990791,2.550857,3.826286
25%,1991.0,2.2,4.0,14.699423,1500.0,245.442231,6.377143,8.502857
50%,2001.0,3.0,6.0,17.347895,1850.0,290.638922,7.227428,10.203428
75%,2010.0,4.3,6.0,20.600625,2200.0,345.133719,8.502857,11.478857
max,2017.0,8.4,16.0,47.087143,5800.0,788.877073,24.658285,25.933714


In [13]:
# Column dtypes and non-null counts; used to check for missing values.
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35952 entries, 0 to 35951
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   make                   35952 non-null  object 
 1   model                  35952 non-null  object 
 2   year                   35952 non-null  int64  
 3   engine_displacement    35952 non-null  float64
 4   cylinders              35952 non-null  float64
 5   transmission           35952 non-null  object 
 6   drivetrain             35952 non-null  object 
 7   vehicle_class          35952 non-null  object 
 8   fuel_type              35952 non-null  object 
 9   fuel_barrels_year      35952 non-null  float64
 10  fuel_cost_year         35952 non-null  int64  
 11  co2_emission_grams_km  35952 non-null  float64
 12  city_kml               35952 non-null  float64
 13  highway_kml            35952 non-null  float64
dtypes: float64(6), int64(2), object(6)
memory usage: 3.8+ 

### Gathering insights:

1. How many car makers are there? How many models? Which car maker has the most cars in the dataset?

In [14]:
# Number of distinct makes (after the label clean-up applied to 'make').
cars['make'].nunique()

124

In [15]:
# Number of distinct model names.
cars['model'].nunique()

3608

In [16]:
# Rows per make, keeping the five makes with the most rows.
(
    cars.groupby('make')
        .size()
        .nlargest(n=5)
        .reset_index(name='count')
)

Unnamed: 0,make,count
0,Chevrolet,3643
1,Ford,2946
2,Dodge,2360
3,GMC,2347
4,Toyota,1836


2. When were these cars made?

From `cars.describe()`, all models are from between 1984 and 2017.

3. How big is the engine of these cars?

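From `cars.describe()`, engine displacement ranges from 0.6 to 8.4, with a mean of about 3.3. The dataset does not state the unit; the magnitudes suggest liters (to be confirmed against the data source).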

4. What's the frequency of different transmissions, drivetrains and fuel types?

In [17]:
# Frequency of each transmission type.
# Naming the index and the count column explicitly gives the same
# ('transmission', 'count') headers on every pandas version; a bare
# reset_index() labels them ('index', 'transmission') on pandas < 2.0.
(
    cars['transmission']
        .value_counts()
        .rename_axis('transmission')
        .reset_index(name='count')
)

Unnamed: 0,index,transmission
0,Automatic,24290
1,Manual,11662


In [18]:
# Frequency of each drivetrain type.
# Naming the index and the count column explicitly gives the same
# ('drivetrain', 'count') headers on every pandas version; a bare
# reset_index() labels them ('index', 'drivetrain') on pandas < 2.0.
(
    cars['drivetrain']
        .value_counts()
        .rename_axis('drivetrain')
        .reset_index(name='count')
)

Unnamed: 0,index,drivetrain
0,Front-Wheel Drive,13044
1,Rear-Wheel Drive,12726
2,4-Wheel or All-Wheel Drive,6503
3,All-Wheel Drive,2039
4,4-Wheel Drive,1058
5,2-Wheel Drive,423
6,Part-time 4-Wheel Drive,158
7,"2-Wheel Drive, Front",1


In [19]:
# Frequency of each fuel type.
# Naming the index and the count column explicitly gives the same
# ('fuel_type', 'count') headers on every pandas version; a bare
# reset_index() labels them ('index', 'fuel_type') on pandas < 2.0.
(
    cars['fuel_type']
        .value_counts()
        .rename_axis('fuel_type')
        .reset_index(name='count')
)

Unnamed: 0,index,fuel_type
0,Regular,23587
1,Premium,9921
2,Gasoline or E85,1195
3,Diesel,911
4,Premium or E85,121
5,Midgrade,74
6,CNG,60
7,Premium and Electricity,20
8,Gasoline or natural gas,20
9,Premium Gas or Electricity,17


5. What's the car that consumes the least/most fuel?

In [20]:
# Highest yearly fuel use recorded for each (make, model), then the single largest of those.
(
    cars.groupby(['make', 'model'])['fuel_barrels_year']
        .max()
        .nlargest(n=1)
        .reset_index()
)

Unnamed: 0,make,model,fuel_barrels_year
0,Lamborghini,Countach,47.087143


In [21]:
# Lowest yearly fuel use recorded for each (make, model), then the single smallest of those.
(
    cars.groupby(['make', 'model'])['fuel_barrels_year']
        .min()
        .nsmallest(n=1)
        .reset_index()
)

Unnamed: 0,make,model,fuel_barrels_year
0,Honda,Civic Natural Gas,0.06


6. Which brand has the worst CO2 emissions on average?

In [22]:
# Make with the highest mean CO2 emission per km.
# nlargest(1) already picks the maximum, so no separate sort is needed; the
# parenthesised chain also avoids a dangling backslash continuation at the
# end of the cell.
(
    cars.groupby('make')['co2_emission_grams_km']
        .mean()
        .nlargest(1)
        .reset_index()
)

Unnamed: 0,make,co2_emission_grams_km
0,Vector,651.919248


7. Use `pd.cut` or `pd.qcut` to create 4 groups (bins) of cars, by 'Year'. We want to explore how cars have evolved decade by decade.

In [23]:
# Label each row with its decade.
# pd.cut builds right-closed intervals by default, so the edges below give
# (1979, 1989], (1989, 1999], (1999, 2009] and (2009, 2019]. The lowest edge
# is 1979 rather than 1980 so that the year 1980 itself falls in the "80s"
# bin instead of becoming NaN.
cars['decade'] = pd.cut(
    cars['year'],
    bins=[1979, 1989, 1999, 2009, 2019],
    labels=["80s", "90s", "00s", "10s"],
)
cars[['year', 'decade']]

Unnamed: 0,year,decade
0,1984,80s
1,1984,80s
2,1985,80s
3,1985,80s
4,1987,80s
...,...,...
35947,2013,10s
35948,2014,10s
35949,2015,10s
35950,2016,10s


8. Did cars consume more gas in the 80s?

In [24]:
# Mean city / highway km-per-liter for each decade.
# 'decade' is categorical (it comes from pd.cut); passing observed=False
# explicitly keeps one row per category and avoids the FutureWarning that
# recent pandas versions raise when the argument is omitted.
(
    cars.groupby('decade', observed=False)[['city_kml', 'highway_kml']]
        .mean()
        .reset_index()
)

Unnamed: 0,decade,city_kml,highway_kml
0,80s,7.366539,9.518724
1,90s,7.207911,9.710109
2,00s,7.15694,9.865684
3,10s,8.310167,11.422087


9. Which brands are the most environmentally friendly?

In [25]:
# The five makes with the lowest mean CO2 emission per km.
(
    cars.groupby('make')['co2_emission_grams_km']
        .mean()
        .nsmallest(n=5)
        .reset_index()
)

Unnamed: 0,make,co2_emission_grams_km
0,Fisker,105.011992
1,smart,153.498052
2,Fiat,189.311494
3,Daihatsu,192.742404
4,MINI,194.935105


10. Does the drivetrain affect fuel consumption?

In [26]:
# Mean city / highway km-per-liter per drivetrain, ordered by ascending city
# figure (lowest km per liter first).
(
    cars.groupby('drivetrain')[['city_kml', 'highway_kml']]
        .mean()
        .sort_values('city_kml')
        .reset_index()
)

Unnamed: 0,drivetrain,city_kml,highway_kml
0,Part-time 4-Wheel Drive,6.215696,8.115385
1,4-Wheel or All-Wheel Drive,6.392049,8.34713
2,Rear-Wheel Drive,6.556574,9.023946
3,2-Wheel Drive,6.64248,8.222444
4,4-Wheel Drive,7.190861,9.668584
5,All-Wheel Drive,7.785598,10.882531
6,Front-Wheel Drive,9.002214,12.16621
7,"2-Wheel Drive, Front",10.628571,14.029714


11. Do cars with automatic transmission consume more fuel than cars with manual transmission?

In [27]:
# Mean city / highway km-per-liter per transmission type, ordered by
# ascending city figure.
(
    cars.groupby("transmission")[['city_kml', 'highway_kml']]
        .mean()
        .sort_values('city_kml')
        .reset_index()
)

Unnamed: 0,transmission,city_kml,highway_kml
0,Automatic,7.278292,9.88248
1,Manual,7.968348,10.715481


12. Use `groupby` and `aggregate` with different aggregation measures for different columns:

    a. aggregate with average 'city_kml' and the count of 'transmission'   
    
    b. aggregate with average 'city_kml' and the minimum of 'transmission'

In [28]:
# Per make: mean city km-per-liter and the number of non-null 'transmission'
# entries. The count column keeps the name 'transmission'.
mean_kml_and_count = {'city_kml': 'mean', 'transmission': 'count'}
cars.groupby('make').agg(mean_kml_and_count).reset_index()

Unnamed: 0,make,city_kml,transmission
0,AMG,6.377143,4
1,ASC,5.952000,1
2,Acura,8.031258,302
3,Alfa Romeo,7.268906,41
4,American Motors Corporation,6.821610,22
...,...,...,...
119,Volkswagen,9.024236,1047
120,Volvo,7.644863,717
121,Wallace Environmental,5.287714,32
122,Yugo,9.778286,8


In [29]:
# Per make: mean city km-per-liter and the minimum 'transmission' value.
# On a string column the minimum is the alphabetically first label.
mean_kml_and_min = {'city_kml': 'mean', 'transmission': 'min'}
cars.groupby('make').agg(mean_kml_and_min).reset_index()

Unnamed: 0,make,city_kml,transmission
0,AMG,6.377143,Automatic
1,ASC,5.952000,Automatic
2,Acura,8.031258,Automatic
3,Alfa Romeo,7.268906,Automatic
4,American Motors Corporation,6.821610,Automatic
...,...,...,...
119,Volkswagen,9.024236,Automatic
120,Volvo,7.644863,Automatic
121,Wallace Environmental,5.287714,Automatic
122,Yugo,9.778286,Manual
