In [1]:
import pandas as pd
import numpy as np
import requests

## read the JSON file

In [2]:
# Load the raw records from the JSON export; the bare name on the last line
# renders the frame as the cell output.
df = pd.read_json(path_or_buf='../data/auto.json')
df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,Y163O8161RUS,2,3200.000000,Ford,Focus
1,E432XX77RUS,1,6500.000000,Toyota,Camry
2,7184TT36RUS,1,2100.000000,Ford,Focus
3,X582HE161RUS,2,2000.000000,Ford,Focus
4,92918M178RUS,1,5700.000000,Ford,Focus
...,...,...,...,...,...
720,Y163O8161RUS,2,1600.000000,Ford,Focus
721,M0309X197RUS,1,22300.000000,Ford,Focus
722,O673E8197RUS,2,600.000000,Ford,Focus
723,8610T8154RUS,1,2000.000000,Ford,Focus


## define the display format of float cells

In [3]:
# Register a display-only formatter for floats; the stored values are untouched.
pd.set_option('display.float_format', '{:,.2f}'.format)
df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,Y163O8161RUS,2,3200.00,Ford,Focus
1,E432XX77RUS,1,6500.00,Toyota,Camry
2,7184TT36RUS,1,2100.00,Ford,Focus
3,X582HE161RUS,2,2000.00,Ford,Focus
4,92918M178RUS,1,5700.00,Ford,Focus
...,...,...,...,...,...
720,Y163O8161RUS,2,1600.00,Ford,Focus
721,M0309X197RUS,1,22300.00,Ford,Focus
722,O673E8197RUS,2,600.00,Ford,Focus
723,8610T8154RUS,1,2000.00,Ford,Focus


## enrich the dataframe using a sample from that dataframe

## create a sample with 200 new observations with random_state = 21 

In [4]:
# Draw 200 existing rows (with replacement), so every CarNumber/Make/Model
# combination in the sample already occurs in df; only Refund and Fines are
# re-drawn below.
sample = df.sample(n=200, replace=True, random_state=21)

# Use a dedicated seeded generator for the re-draws. The previous
# np.random.choice calls read the unseeded global NumPy state, so Refund and
# Fines changed on every run and the notebook was not reproducible.
rng = np.random.default_rng(21)
sample['Refund'] = rng.choice(df['Refund'].to_numpy(), size=200)
sample['Fines'] = rng.choice(df['Fines'].to_numpy(), size=200)
sample

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
207,Y351O8197RUS,1,8594.59,Ford,Focus
48,H917TC36RUS,1,2100.00,Ford,Focus
368,C589EY154RUS,2,1000.00,Ford,Focus
120,K846YE77RUS,1,3800.00,Volkswagen,Passat
419,X4108H125RUS,2,1000.00,Ford,Focus
...,...,...,...,...,...
587,M942OT152RUS,1,2000.00,Ford,Focus
595,Y187O8161RUS,2,24000.00,Ford,Focus
365,7064C8197RUS,1,2000.00,Volkswagen,Passat
474,8437XX154RUS,2,4600.00,Ford,Focus


## concatenate the sample with the initial dataframe to a new dataframe concat_rows

In [5]:
# Stack the sampled rows under the original ones and renumber the index from 0.
concat_rows = pd.concat(objs=(df, sample), axis='index', ignore_index=True)
# Non-null count per column.
concat_rows.count()

CarNumber    925
Refund       925
Fines        925
Make         925
Model        914
dtype: int64

## enrich the dataframe concat_rows by a new column with the data generated

## create a series with the name Year using random integers from 1980 to 2019

In [6]:
# Seed the global NumPy RNG so the generated years are the same on every run.
np.random.seed(21)
# randint's upper bound is exclusive, so this yields integers in 1980..2019,
# one per row of concat_rows.
Year = pd.Series(
    np.random.randint(low=1980, high=2020, size=len(concat_rows)),
    name='Year',
)
Year

0      1989
1      1995
2      1984
3      2015
4      2014
       ... 
920    1981
921    1992
922    2007
923    2005
924    1997
Name: Year, Length: 925, dtype: int64

## concatenate the series with the dataframe and name it fines

In [7]:
# Attach the Year series as a new right-most column (aligned on the row index).
fines = pd.concat(objs=(concat_rows, Year), axis=1)
fines

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.00,Ford,Focus,1989
1,E432XX77RUS,1,6500.00,Toyota,Camry,1995
2,7184TT36RUS,1,2100.00,Ford,Focus,1984
3,X582HE161RUS,2,2000.00,Ford,Focus,2015
4,92918M178RUS,1,5700.00,Ford,Focus,2014
...,...,...,...,...,...,...
920,M942OT152RUS,1,2000.00,Ford,Focus,1981
921,Y187O8161RUS,2,24000.00,Ford,Focus,1992
922,7064C8197RUS,1,2000.00,Volkswagen,Passat,2007
923,8437XX154RUS,2,4600.00,Ford,Focus,2005


## enrich the dataframe with the data from another dataframe

In [8]:
# The JSON file is read as a plain array of rows, so the header arrives as the
# first data row.
surname_raw = pd.read_json('../data/surname.json', orient='values')
header = surname_raw.iloc[0]

# Keep only the data rows, renumber them from 0, then promote the header row
# to column labels.
surname = surname_raw.iloc[1:].reset_index(drop=True)
surname.columns = header

# Strip every character that is neither a word character nor whitespace.
surname['NAME'] = surname['NAME'].str.replace(r'[^\w\s]', '', regex=True)
surname

Unnamed: 0,NAME,COUNT,RANK
0,ADAMS,427865,42
1,ALLEN,482607,33
2,ALVAREZ,233983,92
3,ANDERSON,784404,15
4,BAILEY,277845,72
...,...,...,...
95,WILLIAMS,1625252,3
96,WILSON,801882,14
97,WOOD,250715,84
98,WRIGHT,458980,35


## get the most popular surnames

In [9]:
# COUNT must be an integer column before sorting on it.
surname = surname.astype({'COUNT': int})
# Largest COUNT first.
popular_surnames = surname.sort_values('COUNT', ascending=False)
popular_surnames

Unnamed: 0,NAME,COUNT,RANK
84,SMITH,2442977,1
40,JOHNSON,1932812,2
95,WILLIAMS,1625252,3
8,BROWN,1437026,4
41,JONES,1425470,5
...,...,...,...
59,MYERS,229895,96
47,LONG,229374,97
79,ROSS,229368,98
24,FOSTER,227764,99


In [10]:
# Build the owners table: one row per distinct CarNumber in df, paired with a
# surname drawn (with replacement) from surname['NAME'].
#
# The earlier `car_numbers` / `surnames` intermediates were removed: they were
# computed here but never read by this or any later cell.
# NOTE(review): the other cells seed with 21; 42 is kept here so the resulting
# owners table is unchanged — confirm which seed is intended.
unique_car_numbers = df['CarNumber'].unique()
sample_surnames = (
    surname['NAME']
    .sample(n=len(unique_car_numbers), random_state=42, replace=True)
    .reset_index(drop=True)  # positional 0..n-1 index so rows pair up with the plate array
)
owners = pd.DataFrame({
    'CarNumber': unique_car_numbers,
    'SURNAME': sample_surnames,
})
owners

Unnamed: 0,CarNumber,SURNAME
0,Y163O8161RUS,MENDOZA
1,E432XX77RUS,WARD
2,7184TT36RUS,COLLINS
3,X582HE161RUS,REED
4,92918M178RUS,NELSON
...,...,...
526,O136HO197RUS,PRICE
527,O22097197RUS,YOUNG
528,M0309X197RUS,HERNANDEZ
529,O673E8197RUS,MENDOZA


## append 5 more observations to the fines dataframe 

In [11]:
# Five hand-written observations, one record per row.
new_fines = [
    {'CarNumber': 'S21RUS', 'Refund': 1, 'Fines': 1500.00, 'Make': 'Honda', 'Model': 'Civic', 'Year': 2020},
    {'CarNumber': 'S22RUS', 'Refund': 1, 'Fines': 2300.00, 'Make': 'Nissan', 'Model': 'Altima', 'Year': 2018},
    {'CarNumber': 'S23RUS', 'Refund': 2, 'Fines': 4000.00, 'Make': 'Chevrolet', 'Model': 'Malibu', 'Year': 2019},
    {'CarNumber': 'S24RUS', 'Refund': 1, 'Fines': 3500.00, 'Make': 'Hyundai', 'Model': 'Elantra', 'Year': 2021},
    {'CarNumber': 'S25RUS', 'Refund': 1, 'Fines': 5600.00, 'Make': 'Kia', 'Model': 'Soul', 'Year': 2022},
]
# Reuse the column order of fines so the frames line up when stacked.
data = pd.DataFrame(new_fines, columns=fines.columns)
# Append below the existing rows and renumber the index from 0.
fines = pd.concat(objs=(fines, data), ignore_index=True)
fines

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.00,Ford,Focus,1989
1,E432XX77RUS,1,6500.00,Toyota,Camry,1995
2,7184TT36RUS,1,2100.00,Ford,Focus,1984
3,X582HE161RUS,2,2000.00,Ford,Focus,2015
4,92918M178RUS,1,5700.00,Ford,Focus,2014
...,...,...,...,...,...,...
925,S21RUS,1,1500.00,Honda,Civic,2020
926,S22RUS,1,2300.00,Nissan,Altima,2018
927,S23RUS,2,4000.00,Chevrolet,Malibu,2019
928,S24RUS,1,3500.00,Hyundai,Elantra,2021


## delete the last 20 observations from the owners dataframe and add 3 new observations

In [12]:
# Remove the last 20 owners. Reassigning the result (instead of inplace=True)
# leaves the previously built frame object untouched and keeps the step chainable.
# Note: re-running this cell on its own removes a further 20 rows.
owners = owners.drop(index=owners.tail(20).index)
owners

Unnamed: 0,CarNumber,SURNAME
0,Y163O8161RUS,MENDOZA
1,E432XX77RUS,WARD
2,7184TT36RUS,COLLINS
3,X582HE161RUS,REED
4,92918M178RUS,NELSON
...,...,...
506,T914CT197RUS,GRAY
507,E41977152RUS,RODRIGUEZ
508,9464EX178RUS,WALKER
509,O50197197RUS,PRICE


In [13]:
# Three hand-written owners, one record per row.
extra_owners = [
    {'CarNumber': 'S26RUS', 'SURNAME': 'POLLER'},
    {'CarNumber': 'ABIBARUS', 'SURNAME': 'SMITH'},
    {'CarNumber': 'AGORARUS', 'SURNAME': 'FORD'},
]
# Reuse the column order of owners so the frames line up when stacked.
data_new = pd.DataFrame(extra_owners, columns=owners.columns)
# Append below the existing rows and renumber the index from 0.
owners = pd.concat(objs=(owners, data_new), ignore_index=True)
owners

Unnamed: 0,CarNumber,SURNAME
0,Y163O8161RUS,MENDOZA
1,E432XX77RUS,WARD
2,7184TT36RUS,COLLINS
3,X582HE161RUS,REED
4,92918M178RUS,NELSON
...,...,...
509,O50197197RUS,PRICE
510,7608EE777RUS,LEWIS
511,S26RUS,POLLER
512,ABIBARUS,SMITH


## join both dataframes

## the new dataframe should have only the car numbers that exist in both dataframes

In [14]:
# Inner join: keep only rows whose CarNumber is present in both fines and owners.
first_update = fines.merge(owners, how='inner', on='CarNumber')
first_update

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2,3200.00,Ford,Focus,1989,MENDOZA
1,E432XX77RUS,1,6500.00,Toyota,Camry,1995,WARD
2,7184TT36RUS,1,2100.00,Ford,Focus,1984,COLLINS
3,X582HE161RUS,2,2000.00,Ford,Focus,2015,REED
4,92918M178RUS,1,5700.00,Ford,Focus,2014,NELSON
...,...,...,...,...,...,...,...
898,M942OT152RUS,1,2000.00,Ford,Focus,1981,BAILEY
899,Y187O8161RUS,2,24000.00,Ford,Focus,1992,LEWIS
900,7064C8197RUS,1,2000.00,Volkswagen,Passat,2007,FLORES
901,8437XX154RUS,2,4600.00,Ford,Focus,2005,CASTILLO


## the new dataframe should have all the car numbers that exist in either dataframe

In [15]:
# Outer join: keep every CarNumber found in fines or in owners; columns coming
# from the side with no match are left missing.
second_update = fines.merge(owners, how='outer', on='CarNumber')
second_update

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,704687163RUS,2.00,1400.00,Ford,Focus,2004.00,NGUYEN
1,704787163RUS,2.00,2800.00,Ford,Focus,1992.00,ALVAREZ
2,704987163RUS,2.00,8594.59,Ford,Focus,1985.00,ADAMS
3,705287163RUS,2.00,2000.00,Ford,Focus,1980.00,STEWART
4,705387163RUS,2.00,700.00,Ford,Focus,1987.00,RAMIREZ
...,...,...,...,...,...,...,...
928,Y969O8197RUS,2.00,7800.00,Ford,Focus,1992.00,LONG
929,Y973O8197RUS,2.00,8594.59,Ford,Focus,2005.00,BENNETT
930,Y973O8197RUS,1.00,34800.00,Ford,Focus,2003.00,BENNETT
931,Y973O8197RUS,1.00,69600.00,Ford,Focus,2017.00,BENNETT


## the new dataframe should have only the car numbers from the fines dataframe

In [16]:
# Left join: keep every row of fines; SURNAME is missing where owners has no match.
third_update = fines.merge(owners, how='left', on='CarNumber')
third_update

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2,3200.00,Ford,Focus,1989,MENDOZA
1,E432XX77RUS,1,6500.00,Toyota,Camry,1995,WARD
2,7184TT36RUS,1,2100.00,Ford,Focus,1984,COLLINS
3,X582HE161RUS,2,2000.00,Ford,Focus,2015,REED
4,92918M178RUS,1,5700.00,Ford,Focus,2014,NELSON
...,...,...,...,...,...,...,...
925,S21RUS,1,1500.00,Honda,Civic,2020,
926,S22RUS,1,2300.00,Nissan,Altima,2018,
927,S23RUS,2,4000.00,Chevrolet,Malibu,2019,
928,S24RUS,1,3500.00,Hyundai,Elantra,2021,


## the new dataframe should have only the car numbers from the owners dataframe

In [17]:
# Right join: keep every CarNumber of owners; the fines columns are missing
# where a car number has no fines row.
fourth_update = fines.merge(owners, how='right', on='CarNumber')
fourth_update


Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2.00,3200.00,Ford,Focus,1989.00,MENDOZA
1,Y163O8161RUS,2.00,1600.00,Ford,Focus,1980.00,MENDOZA
2,Y163O8161RUS,2.00,7500.00,Ford,Focus,2019.00,MENDOZA
3,Y163O8161RUS,2.00,1500.00,Ford,Focus,2017.00,MENDOZA
4,Y163O8161RUS,2.00,4000.00,Ford,Focus,2017.00,MENDOZA
...,...,...,...,...,...,...,...
901,O50197197RUS,2.00,7800.00,Ford,Focus,1992.00,PRICE
902,7608EE777RUS,1.00,4000.00,Skoda,Octavia,2000.00,LEWIS
903,S26RUS,,,,,,POLLER
904,ABIBARUS,,,,,,SMITH


## create a pivot table from the fines dataframe

In [18]:
# Sum of Fines for each (Make, Model) row, spread across one column per Year.
fines.pivot_table(
    index=['Make', 'Model'],
    columns='Year',
    values='Fines',
    aggfunc={'Fines': 'sum'},
)

Unnamed: 0_level_0,Year,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
Make,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Chevrolet,Malibu,,,,,,,,,,,...,,,,,,,4000.0,,,
Ford,Focus,69594.59,440089.17,144578.35,59600.0,87894.59,228778.35,87094.59,99594.59,129489.17,69489.17,...,173894.59,120194.59,209989.17,105994.59,265094.59,321989.17,83900.0,,,
Ford,Mondeo,,,,,,,,,,8600.0,...,,,,46200.0,,,,,,
Honda,Civic,,,,,,,,,,,...,,,,,,,,1500.0,,
Hyundai,Elantra,,,,,,,,,,,...,,,,,,,,,3500.0,
Kia,Soul,,,,,,,,,,,...,,,,,,,,,,5600.0
Nissan,Altima,,,,,,,,,,,...,,,,,,2300.0,,,,
Skoda,Octavia,32294.59,,6900.0,11594.59,1100.0,10294.59,600.0,21400.0,,91400.0,...,25494.59,11700.0,46394.59,300.0,,156200.0,9500.0,,,
Toyota,Camry,12300.0,8594.59,,7200.0,,,,,,22400.0,...,,,,1000.0,19200.0,13000.0,18100.0,,,
Toyota,Corolla,,,2000.0,,,,1100.0,8000.0,,4000.0,...,,,,,9600.0,,500.0,,,


## save both the fines and owners dataframes to CSV files without an index

In [19]:
# Write both frames to CSV, leaving the row index out of the files.
for frame, destination in (
    (fines, '../data/fines.csv'),
    (owners, '../data/owners.csv'),
):
    frame.to_csv(destination, index=False)