## Imports

In [1]:
import pandas as pd
import numpy as np
import requests

## read the JSON file that you saved in ex02

In [2]:
df = pd.read_json('../data/auto.json', orient='records')

In [3]:
pd.options.display.float_format = '{:.2f}'.format

In [4]:
df.head()

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,Y163O8161RUS,2,3200.0,Ford,Focus
1,E432XX77RUS,1,6500.0,Toyota,Camry
2,7184TT36RUS,1,2100.0,Ford,Focus
3,X582HE161RUS,2,2000.0,Ford,Focus
4,92918M178RUS,1,5700.0,Ford,Focus


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 725 entries, 0 to 724
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   CarNumber  725 non-null    object 
 1   Refund     725 non-null    int64  
 2   Fines      725 non-null    float64
 3   Make       725 non-null    object 
 4   Model      716 non-null    object 
dtypes: float64(1), int64(1), object(3)
memory usage: 28.4+ KB


## enrich the dataframe using a sample from that dataframe

In [6]:
sample_200 = df.sample(n=200, random_state=21).reset_index(drop=True)
sample_200.head()

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,M0299X197RUS,2,19200.0,Ford,Focus
1,83298C154RUS,2,8594.59,Ford,Focus
2,H957HY161RUS,1,2000.0,Ford,Focus
3,T941CC96RUS,1,2000.0,Ford,Focus
4,H966HY161RUS,1,500.0,Ford,Focus


In [7]:
concat_rows = pd.concat([df, sample_200])
concat_rows

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,Y163O8161RUS,2,3200.00,Ford,Focus
1,E432XX77RUS,1,6500.00,Toyota,Camry
2,7184TT36RUS,1,2100.00,Ford,Focus
3,X582HE161RUS,2,2000.00,Ford,Focus
4,92918M178RUS,1,5700.00,Ford,Focus
...,...,...,...,...,...
195,8182XX154RUS,1,200.00,Ford,Focus
196,X796TH96RUS,1,500.00,Ford,Focus
197,T011MY163RUS,2,4000.00,Ford,Focus
198,T341CC96RUS,2,1000.00,Volkswagen,Passat


In [8]:
concat_rows.count()

CarNumber    925
Refund       925
Fines        925
Make         925
Model        914
dtype: int64

## enrich the dataframe concat_rows by a new column with the data generated

In [9]:
np.random.seed(21)
Year = pd.Series([np.random.randint(1980, 2020)
                 for i in range(len(concat_rows))], name='Year')
fines = concat_rows
fines['Year'] = Year
fines.head()

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.0,Ford,Focus,1989
1,E432XX77RUS,1,6500.0,Toyota,Camry,1995
2,7184TT36RUS,1,2100.0,Ford,Focus,1984
3,X582HE161RUS,2,2000.0,Ford,Focus,2015
4,92918M178RUS,1,5700.0,Ford,Focus,2014


In [10]:
fines.head()

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.0,Ford,Focus,1989
1,E432XX77RUS,1,6500.0,Toyota,Camry,1995
2,7184TT36RUS,1,2100.0,Ford,Focus,1984
3,X582HE161RUS,2,2000.0,Ford,Focus,2015
4,92918M178RUS,1,5700.0,Ford,Focus,2014


In [11]:
fines.count()

CarNumber    925
Refund       925
Fines        925
Make         925
Model        914
Year         925
dtype: int64

## enrich the dataframe with the data from another dataframe

In [12]:
owners = pd.read_json('../data/surname.json')
owners.head()

Unnamed: 0,0,1,2
0,NAME,COUNT,RANK
1,ADAMS,427865,42
2,ALLEN,482607,33
3,ALVAREZ,233983,92
4,ANDERSON,784404,15


In [13]:
owners.columns = owners.iloc[0]
owners = owners.drop(0)
owners.head()

Unnamed: 0,NAME,COUNT,RANK
1,ADAMS,427865,42
2,ALLEN,482607,33
3,ALVAREZ,233983,92
4,ANDERSON,784404,15
5,BAILEY,277845,72


In [14]:
owners = owners[owners['NAME'].str.contains('[a-zA-Z]')]
surname = [np.random.choice(owners['NAME']) for x in range(len(fines['CarNumber'].drop_duplicates()))]
owners = pd.DataFrame(fines['CarNumber'].drop_duplicates().copy())
owners['SURNAME'] = surname
owners

Unnamed: 0,CarNumber,SURNAME
0,Y163O8161RUS,BAKER
1,E432XX77RUS,CRUZ
2,7184TT36RUS,MARTIN
3,X582HE161RUS,REED
4,92918M178RUS,COOPER
...,...,...
715,O136HO197RUS,HOWARD
719,O22097197RUS,EVANS
721,M0309X197RUS,ROGERS
722,O673E8197RUS,WILSON


In [15]:
def random_num_car():
    string = np.random.choice(['A', 'B', 'E', 'K', 'M', 'H', 'O', 'P', 'C', 'T', 'Y', 'X'])\
        + str(np.random.randint(100, 999))\
        + np.random.choice(['A', 'B', 'E', 'K', 'M', 'H', 'O', 'P', 'C', 'T', 'Y', 'X'])\
        + np.random.choice(['A', 'B', 'E', 'K', 'M', 'H', 'O', 'P', 'C', 'T', 'Y', 'X'])\
        + '77RUS'
    return string

In [16]:
new_fines = pd.DataFrame(
    [fines[['Make', 'Model']].iloc[np.random.randint(0, len(fines))] for i in range(5)])
new_fines['CarNumber'] = [random_num_car() for i in range(5)]
new_fines['Refund'] = [np.random.choice(fines['Refund']) for i in range(5)]
new_fines['Fines'] = [np.random.choice(fines['Fines']) for i in range(5)]
new_fines['Year'] = [np.random.choice(fines['Year']) for i in range(5)]
new_fines = new_fines[['CarNumber', 'Refund',
                       'Fines', 'Make', 'Model', 'Year']]
new_fines

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
397,K106AM77RUS,2,19200.0,Toyota,Camry,1996
176,Y343TH77RUS,2,6700.0,Ford,Focus,2016
386,O380BY77RUS,2,600.0,Ford,Focus,1991
720,X134YY77RUS,1,500.0,Ford,Focus,2018
325,H980KC77RUS,2,1500.0,Ford,Focus,2011


In [17]:
fines = pd.concat([fines, new_fines])
fines

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.00,Ford,Focus,1989
1,E432XX77RUS,1,6500.00,Toyota,Camry,1995
2,7184TT36RUS,1,2100.00,Ford,Focus,1984
3,X582HE161RUS,2,2000.00,Ford,Focus,2015
4,92918M178RUS,1,5700.00,Ford,Focus,2014
...,...,...,...,...,...,...
397,K106AM77RUS,2,19200.00,Toyota,Camry,1996
176,Y343TH77RUS,2,6700.00,Ford,Focus,2016
386,O380BY77RUS,2,600.00,Ford,Focus,1991
720,X134YY77RUS,1,500.00,Ford,Focus,2018


In [18]:
owners

Unnamed: 0,CarNumber,SURNAME
0,Y163O8161RUS,BAKER
1,E432XX77RUS,CRUZ
2,7184TT36RUS,MARTIN
3,X582HE161RUS,REED
4,92918M178RUS,COOPER
...,...,...
715,O136HO197RUS,HOWARD
719,O22097197RUS,EVANS
721,M0309X197RUS,ROGERS
722,O673E8197RUS,WILSON


In [19]:
owners = owners[:-20]
owners

Unnamed: 0,CarNumber,SURNAME
0,Y163O8161RUS,BAKER
1,E432XX77RUS,CRUZ
2,7184TT36RUS,MARTIN
3,X582HE161RUS,REED
4,92918M178RUS,COOPER
...,...,...
681,T914CT197RUS,BAILEY
682,E41977152RUS,EDWARDS
684,9464EX178RUS,RIVERA
685,O50197197RUS,JACKSON


In [20]:
add_owners = pd.DataFrame({'CarNumber': [random_num_car() for i in range(3)],
                           'SURNAME': [np.random.choice(owners['SURNAME']) for i in range(3)]})
owners = pd.concat([owners, add_owners])

In [21]:
owners

Unnamed: 0,CarNumber,SURNAME
0,Y163O8161RUS,BAKER
1,E432XX77RUS,CRUZ
2,7184TT36RUS,MARTIN
3,X582HE161RUS,REED
4,92918M178RUS,COOPER
...,...,...
685,O50197197RUS,JACKSON
686,7608EE777RUS,MILLER
0,B511EC77RUS,COX
1,H961OC77RUS,KELLY


In [22]:
fines

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.00,Ford,Focus,1989
1,E432XX77RUS,1,6500.00,Toyota,Camry,1995
2,7184TT36RUS,1,2100.00,Ford,Focus,1984
3,X582HE161RUS,2,2000.00,Ford,Focus,2015
4,92918M178RUS,1,5700.00,Ford,Focus,2014
...,...,...,...,...,...,...
397,K106AM77RUS,2,19200.00,Toyota,Camry,1996
176,Y343TH77RUS,2,6700.00,Ford,Focus,2016
386,O380BY77RUS,2,600.00,Ford,Focus,1991
720,X134YY77RUS,1,500.00,Ford,Focus,2018


In [23]:
result1 = fines.merge(owners, how='inner', on='CarNumber')
result1

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2,3200.00,Ford,Focus,1989,BAKER
1,Y163O8161RUS,2,1600.00,Ford,Focus,1980,BAKER
2,E432XX77RUS,1,6500.00,Toyota,Camry,1995,CRUZ
3,E432XX77RUS,2,13000.00,Toyota,Camry,2018,CRUZ
4,7184TT36RUS,1,2100.00,Ford,Focus,1984,MARTIN
...,...,...,...,...,...,...,...
894,E41977152RUS,2,2400.00,Ford,Focus,1989,EDWARDS
895,9464EX178RUS,2,2100.00,Ford,Focus,1988,RIVERA
896,O50197197RUS,2,7800.00,Ford,Focus,1992,JACKSON
897,7608EE777RUS,1,4000.00,Skoda,Octavia,2000,MILLER


In [24]:
result2 = fines.merge(owners, how='outer', on='CarNumber')
result2

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2.00,3200.00,Ford,Focus,1989.00,BAKER
1,Y163O8161RUS,2.00,1600.00,Ford,Focus,1980.00,BAKER
2,E432XX77RUS,1.00,6500.00,Toyota,Camry,1995.00,CRUZ
3,E432XX77RUS,2.00,13000.00,Toyota,Camry,2018.00,CRUZ
4,7184TT36RUS,1.00,2100.00,Ford,Focus,1984.00,MARTIN
...,...,...,...,...,...,...,...
928,X134YY77RUS,1.00,500.00,Ford,Focus,2018.00,
929,H980KC77RUS,2.00,1500.00,Ford,Focus,2011.00,
930,B511EC77RUS,,,,,,COX
931,H961OC77RUS,,,,,,KELLY


In [25]:
result3 = fines.merge(owners, how='left', on='CarNumber')
result3

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2,3200.00,Ford,Focus,1989,BAKER
1,E432XX77RUS,1,6500.00,Toyota,Camry,1995,CRUZ
2,7184TT36RUS,1,2100.00,Ford,Focus,1984,MARTIN
3,X582HE161RUS,2,2000.00,Ford,Focus,2015,REED
4,92918M178RUS,1,5700.00,Ford,Focus,2014,COOPER
...,...,...,...,...,...,...,...
925,K106AM77RUS,2,19200.00,Toyota,Camry,1996,
926,Y343TH77RUS,2,6700.00,Ford,Focus,2016,
927,O380BY77RUS,2,600.00,Ford,Focus,1991,
928,X134YY77RUS,1,500.00,Ford,Focus,2018,


In [26]:
result4 = fines.merge(owners, how='right', on='CarNumber')
result4

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2.00,3200.00,Ford,Focus,1989.00,BAKER
1,Y163O8161RUS,2.00,1600.00,Ford,Focus,1980.00,BAKER
2,E432XX77RUS,1.00,6500.00,Toyota,Camry,1995.00,CRUZ
3,E432XX77RUS,2.00,13000.00,Toyota,Camry,2018.00,CRUZ
4,7184TT36RUS,1.00,2100.00,Ford,Focus,1984.00,MARTIN
...,...,...,...,...,...,...,...
897,7608EE777RUS,1.00,4000.00,Skoda,Octavia,2000.00,MILLER
898,7608EE777RUS,1.00,4000.00,Skoda,Octavia,1996.00,MILLER
899,B511EC77RUS,,,,,,COX
900,H961OC77RUS,,,,,,KELLY


## create a pivot table from the fines dataframe, it should look like this (the values are the sums of the fines), but with all the years (the values may be different for you):

In [27]:
pivot_table = pd.pivot_table(result1,
                             values='Fines',
                             index=['Make', 'Model'],
                             columns=['Year'],
                             aggfunc=np.sum)
pivot_table

Unnamed: 0_level_0,Year,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Make,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Ford,Focus,74094.59,423283.76,161883.76,97194.59,96589.17,146972.93,90394.59,80600.0,84194.59,73300.0,...,125778.35,90789.17,127789.17,148289.17,117394.59,221000.0,96989.17,233594.59,257994.59,84400.0
Ford,Mondeo,,,,,,,,,,8600.0,...,,,34400.0,,,,46200.0,,,
Skoda,Octavia,1900.0,8594.59,6900.0,21189.17,,10294.59,600.0,5200.0,,91400.0,...,3600.0,500.0,500.0,15594.59,300.0,46394.59,300.0,,156200.0,9500.0
Toyota,Camry,12000.0,8594.59,,7200.0,,,22400.0,,,22400.0,...,,,8594.59,,,,,,14000.0,18100.0
Toyota,Corolla,,,2000.0,,,,,8000.0,,11600.0,...,24000.0,8594.59,30300.0,,,,3400.0,9600.0,,
Volkswagen,Golf,31900.0,,,8594.59,300.0,24300.0,,9300.0,,5800.0,...,,300.0,,,,2300.0,,,,300.0
Volkswagen,Jetta,,4000.0,,,,,,,,,...,,,,,,,,,,
Volkswagen,Passat,,11100.0,,3200.0,25000.0,5000.0,15000.0,12300.0,,,...,5700.0,,,,1600.0,1600.0,2100.0,,,9900.0
Volkswagen,Touareg,,,,,,5800.0,,,,,...,6300.0,,,,1300.0,500.0,,,,


## save both the fines and owners dataframes to CSV files without an index

In [28]:
fines.to_csv('../data/fines.csv', index=False)
owners.to_csv('../data/owners.csv', index=False)