# Exercise 04. Enrichment and transformations

In [1]:
import pandas as pd
import numpy as np
import requests

## Read the JSON file that you saved in ex02

In [2]:
# Render every float in displayed frames with two decimal places.
pd.set_option('display.float_format', '{:.2f}'.format)

# Parse the file as a list of row records (one JSON object per row).
df = pd.read_json('../data/auto.json', orient='records')
df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,Y163O8161RUS,2,3200.00,Ford,Focus
1,E432XX77RUS,1,6500.00,Toyota,Camry
2,7184TT36RUS,1,2100.00,Ford,Focus
3,X582HE161RUS,2,2000.00,Ford,Focus
4,92918M178RUS,1,5700.00,Ford,Focus
...,...,...,...,...,...
720,Y163O8161RUS,2,1600.00,Ford,Focus
721,M0309X197RUS,1,22300.00,Ford,Focus
722,O673E8197RUS,2,600.00,Ford,Focus
723,8610T8154RUS,1,2000.00,Ford,Focus


## Enrich the dataframe using a sample from that dataframe

In [3]:
# A fixed random_state makes the 200-row draw reproducible between runs.
sample = df.sample(random_state=21, n=200)
sample

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
445,M0299X197RUS,2,19200.00,Ford,Focus
22,83298C154RUS,2,8594.60,Ford,Focus
93,H957HY161RUS,1,2000.00,Ford,Focus
173,T941CC96RUS,1,2000.00,Ford,Focus
697,H966HY161RUS,1,500.00,Ford,Focus
...,...,...,...,...,...
14,8182XX154RUS,1,200.00,Ford,Focus
623,X796TH96RUS,1,500.00,Ford,Focus
498,T011MY163RUS,2,4000.00,Ford,Focus
536,T341CC96RUS,2,1000.00,Volkswagen,Passat


In [4]:
# Stack the sampled rows under the original ones and renumber the index
# from 0 so the duplicated rows do not keep their old labels.
concat_rows = pd.concat([df, sample], ignore_index=True)
concat_rows

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,Y163O8161RUS,2,3200.00,Ford,Focus
1,E432XX77RUS,1,6500.00,Toyota,Camry
2,7184TT36RUS,1,2100.00,Ford,Focus
3,X582HE161RUS,2,2000.00,Ford,Focus
4,92918M178RUS,1,5700.00,Ford,Focus
...,...,...,...,...,...
920,8182XX154RUS,1,200.00,Ford,Focus
921,X796TH96RUS,1,500.00,Ford,Focus
922,T011MY163RUS,2,4000.00,Ford,Focus
923,T341CC96RUS,2,1000.00,Volkswagen,Passat


## Enrich the `concat_rows` dataframe with a new column containing generated data

In [5]:
# Seed the global NumPy generator so the generated years are reproducible.
np.random.seed(21)

# One random year per row; `high` is exclusive, so values span 1980..2018.
random_years = np.random.randint(low=1980, high=2019, size=len(concat_rows))
Year = pd.Series(random_years, name='Year')

# Attach the new column side by side (both objects share a 0..n-1 index).
fines = pd.concat([concat_rows, Year], axis=1)
fines

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.00,Ford,Focus,1989
1,E432XX77RUS,1,6500.00,Toyota,Camry,1995
2,7184TT36RUS,1,2100.00,Ford,Focus,1984
3,X582HE161RUS,2,2000.00,Ford,Focus,2015
4,92918M178RUS,1,5700.00,Ford,Focus,2014
...,...,...,...,...,...,...
920,8182XX154RUS,1,200.00,Ford,Focus,1996
921,X796TH96RUS,1,500.00,Ford,Focus,2002
922,T011MY163RUS,2,4000.00,Ford,Focus,1996
923,T341CC96RUS,2,1000.00,Volkswagen,Passat,2012


## Enrich the dataframe with data from another dataframe

In [6]:
# orient='values' reads the file as a bare 2-D array, so the header row
# arrives as the first data row and has to be promoted to column labels.
raw_surnames = pd.read_json('../data/surname.json', orient='values')
surname_header = raw_surnames.iloc[0]

# Drop the header row from the data and use it as the column index instead.
surnames = raw_surnames.drop(index=0)
surnames.columns = surname_header
surnames


Unnamed: 0,NAME,COUNT,RANK
1,ADAMS,427865,42
2,ALLEN,482607,33
3,ALVAREZ,233983,92
4,ANDERSON,784404,15
5,BAILEY,277845,72
...,...,...,...
96,WILLIAMS,1625252,3
97,WILSON,801882,14
98,WOOD,250715,84
99,WRIGHT,458980,35


In [7]:
# One row per distinct car number (first occurrence kept), as a
# single-column frame with a fresh 0..n-1 index.
car_numbers = (
    concat_rows[['CarNumber']]
    .drop_duplicates()
    .reset_index(drop=True)
)
car_numbers

Unnamed: 0,CarNumber
0,Y163O8161RUS
1,E432XX77RUS
2,7184TT36RUS
3,X582HE161RUS
4,92918M178RUS
...,...
526,O136HO197RUS
527,O22097197RUS
528,M0309X197RUS
529,O673E8197RUS


In [8]:
# Draw one surname per unique car number. Sampling is done with replacement,
# so the same surname can be assigned to several cars.
# NOTE: this rebinds `surnames` to the sampled single-column frame, so the
# cell that loads the surname file must be re-run before re-running this one.
surnames = (
    surnames['NAME']
    .sample(n=len(car_numbers), replace=True, random_state=21)
    .rename('SURNAME')
    .to_frame()
    .reset_index(drop=True)
)
surnames

Unnamed: 0,SURNAME
0,RICHARDSON
1,ROSS
2,MORGAN
3,BAILEY
4,LOPEZ
...,...
526,CAMPBELL
527,HALL
528,BAKER
529,DIAZ


In [9]:
# Both frames were given a fresh 0..n-1 index and have the same length,
# so a column-wise concat pairs each car number with one surname.
owners = pd.concat([car_numbers, surnames], axis=1)
owners

Unnamed: 0,CarNumber,SURNAME
0,Y163O8161RUS,RICHARDSON
1,E432XX77RUS,ROSS
2,7184TT36RUS,MORGAN
3,X582HE161RUS,BAILEY
4,92918M178RUS,LOPEZ
...,...,...
526,O136HO197RUS,CAMPBELL
527,O22097197RUS,HALL
528,M0309X197RUS,BAKER
529,O673E8197RUS,DIAZ


In [10]:
# Five extra fine records: all are 2007 Toyota Camrys, differing only in
# car number, refund code and fine amount.
new_obs = pd.DataFrame({
    'CarNumber': ['A000AA50', 'B111BB50', 'C222CC50', 'X999XX50', 'A123BC50'],
    'Refund': [1, 2, 1, 2, 1],
    'Fines': [3000, 4000, 5000, 6000, 7000],
    'Make': ['Toyota'] * 5,
    'Model': ['Camry'] * 5,
    'Year': [2007] * 5,
})

# Append the new rows and renumber the index.
# NOTE: `fines` is rebound here, so re-running this cell appends the rows again.
fines = pd.concat([fines, new_obs], ignore_index=True)
fines

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.00,Ford,Focus,1989
1,E432XX77RUS,1,6500.00,Toyota,Camry,1995
2,7184TT36RUS,1,2100.00,Ford,Focus,1984
3,X582HE161RUS,2,2000.00,Ford,Focus,2015
4,92918M178RUS,1,5700.00,Ford,Focus,2014
...,...,...,...,...,...,...
925,A000AA50,1,3000.00,Toyota,Camry,2007
926,B111BB50,2,4000.00,Toyota,Camry,2007
927,C222CC50,1,5000.00,Toyota,Camry,2007
928,X999XX50,2,6000.00,Toyota,Camry,2007


In [11]:
# Three extra owner records to append after trimming.
new_obs = pd.DataFrame(
    [('Q666QQ70', 'HILL'), ('W000WW70', 'MORGAN'), ('R888RR70', 'MARTIN')],
    columns=['CarNumber', 'SURNAME'],
)

# Keep everything except the last 20 owners, then append the new records
# and renumber the index.
# NOTE: `owners` is rebound here, so re-running this cell trims and appends again.
owners = pd.concat([owners.iloc[:-20], new_obs], ignore_index=True)
owners

Unnamed: 0,CarNumber,SURNAME
0,Y163O8161RUS,RICHARDSON
1,E432XX77RUS,ROSS
2,7184TT36RUS,MORGAN
3,X582HE161RUS,BAILEY
4,92918M178RUS,LOPEZ
...,...,...
509,O50197197RUS,WRIGHT
510,7608EE777RUS,HILL
511,Q666QQ70,HILL
512,W000WW70,MORGAN


In [12]:
# Inner join: keep only the car numbers present in both `fines` and `owners`.
# `CarNumber` is the only column the two frames share, so it is the join key.
inner_df = fines.merge(owners, on='CarNumber', how='inner')
inner_df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2,3200.00,Ford,Focus,1989,RICHARDSON
1,E432XX77RUS,1,6500.00,Toyota,Camry,1995,ROSS
2,7184TT36RUS,1,2100.00,Ford,Focus,1984,MORGAN
3,X582HE161RUS,2,2000.00,Ford,Focus,2015,BAILEY
4,92918M178RUS,1,5700.00,Ford,Focus,2014,LOPEZ
...,...,...,...,...,...,...,...
894,8182XX154RUS,1,200.00,Ford,Focus,1996,SMITH
895,X796TH96RUS,1,500.00,Ford,Focus,2002,WATSON
896,T011MY163RUS,2,4000.00,Ford,Focus,1996,SANDERS
897,T341CC96RUS,2,1000.00,Volkswagen,Passat,2012,PEREZ


In [13]:
# Outer join: keep every car number from either frame; columns with no
# match on the other side are filled with NaN.
outer_df = fines.merge(owners, on='CarNumber', how='outer')
outer_df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,704687163RUS,2.00,1400.00,Ford,Focus,2014.00,ADAMS
1,704787163RUS,2.00,2800.00,Ford,Focus,2005.00,MORGAN
2,704987163RUS,2.00,8594.60,Ford,Focus,2014.00,MITCHELL
3,705287163RUS,2.00,2000.00,Ford,Focus,1990.00,GOMEZ
4,705387163RUS,2.00,700.00,Ford,Focus,2005.00,STEWART
...,...,...,...,...,...,...,...
928,Y973O8197RUS,2.00,8594.60,Ford,Focus,2005.00,YOUNG
929,Y973O8197RUS,1.00,34800.00,Ford,Focus,2013.00,YOUNG
930,Y973O8197RUS,1.00,69600.00,Ford,Focus,1989.00,YOUNG
931,Y973O8197RUS,1.00,34800.00,Ford,Focus,2009.00,YOUNG


In [14]:
# Left join: keep every row of `fines`; SURNAME is NaN where the car number
# has no entry in `owners`.
left_df = fines.merge(owners, on='CarNumber', how='left')
left_df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2,3200.00,Ford,Focus,1989,RICHARDSON
1,E432XX77RUS,1,6500.00,Toyota,Camry,1995,ROSS
2,7184TT36RUS,1,2100.00,Ford,Focus,1984,MORGAN
3,X582HE161RUS,2,2000.00,Ford,Focus,2015,BAILEY
4,92918M178RUS,1,5700.00,Ford,Focus,2014,LOPEZ
...,...,...,...,...,...,...,...
925,A000AA50,1,3000.00,Toyota,Camry,2007,
926,B111BB50,2,4000.00,Toyota,Camry,2007,
927,C222CC50,1,5000.00,Toyota,Camry,2007,
928,X999XX50,2,6000.00,Toyota,Camry,2007,


In [15]:
# Right join: keep every owner; the fine columns are NaN for owners whose
# car number has no row in `fines`.
right_df = fines.merge(owners, on='CarNumber', how='right')
right_df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2.00,3200.00,Ford,Focus,1989.00,RICHARDSON
1,Y163O8161RUS,2.00,1600.00,Ford,Focus,1999.00,RICHARDSON
2,E432XX77RUS,1.00,6500.00,Toyota,Camry,1995.00,ROSS
3,E432XX77RUS,2.00,13000.00,Toyota,Camry,1992.00,ROSS
4,7184TT36RUS,1.00,2100.00,Ford,Focus,1984.00,MORGAN
...,...,...,...,...,...,...,...
897,7608EE777RUS,1.00,4000.00,Skoda,Octavia,2013.00,HILL
898,7608EE777RUS,1.00,4000.00,Skoda,Octavia,1987.00,HILL
899,Q666QQ70,,,,,,HILL
900,W000WW70,,,,,,MORGAN


## Create a pivot table from the `fines` dataframe. It should show the sum of the fines for each make and model (rows) across all the years (columns). The values may be different for you.

In [16]:
# Total fines for each (Make, Model) row and Year column; combinations with
# no fines show up as NaN.
# The string alias 'sum' replaces the np.sum callable, which recent pandas
# releases warn about; the resulting table is the same.
fines.pivot_table(index=['Make', 'Model'], columns='Year', values='Fines', aggfunc='sum')

  pd.pivot_table(fines, columns='Year', values='Fines', index=['Make', 'Model'], aggfunc={'Fines': np.sum})


Unnamed: 0_level_0,Year,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
Make,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Ford,Focus,89194.6,266783.8,107283.8,147289.2,106000.0,307494.6,69700.0,98189.2,69667.6,200889.2,...,159894.6,96000.0,117194.6,152989.2,297378.4,90378.4,172700.0,110789.2,114089.2,107800.0
Ford,Mondeo,,,46200.0,,,,,,,,...,,,,,41100.0,,,,8600.0,
Skoda,Octavia,13794.6,1900.0,8894.6,,1300.0,153594.6,,6000.0,5100.0,8594.6,...,,3000.0,3000.0,1700.0,11800.0,18900.0,16394.6,35700.0,2400.0,153200.0
Toyota,Camry,12000.0,,1000.0,8594.6,1000.0,,19800.0,,,800.0,...,,22400.0,,7500.0,,,,800.0,,
Toyota,Corolla,,6800.0,,12800.0,,4400.0,,54300.0,,7800.0,...,8594.6,6000.0,3400.0,,,,30300.0,,6900.0,
Volkswagen,Golf,20800.0,8594.6,5000.0,200.0,,168000.0,,300.0,,300.0,...,,,,,,13900.0,4600.0,,,1000.0
Volkswagen,Jetta,,1000.0,,,,9000.0,,,46000.0,4000.0,...,,,,,,,,,,
Volkswagen,Passat,900.0,12500.0,,1100.0,8594.6,,16000.0,2000.0,8594.6,,...,3200.0,9500.0,,1000.0,1600.0,15000.0,,,,
Volkswagen,Touareg,,,,,,,,,,,...,5800.0,,,,,,,,,


## Save both the `fines` and `owners` dataframes to CSV files without an index

In [17]:
# Write both frames to CSV; index=False leaves the row index out of the files.
for frame, csv_path in ((fines, '../data/fines.csv'), (owners, '../data/owners.csv')):
    frame.to_csv(csv_path, index=False)