## read the JSON file that you saved in ex02

In [35]:
import pandas as pd
import numpy as np
import requests


- one of the columns has the float type, so let us define the format of it in
pandas using pd.options.display.float_format: floats should be displayed with
two decimals
- there are values missing from the Model, do not do anything with them

In [37]:
pd.options.display.float_format = '{:.2f}'.format
df = pd.read_json('../../data-samples/auto.json', orient='records')
df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,Y163O8161RUS,2,3200.00,Ford,Focus
1,E432XX77RUS,1,6500.00,Toyota,Camry
2,7184TT36RUS,1,2100.00,Ford,Focus
3,X582HE161RUS,2,2000.00,Ford,Focus
4,92918M178RUS,1,5700.00,Ford,Focus
...,...,...,...,...,...
720,Y163O8161RUS,2,1600.00,Ford,Focus
721,M0309X197RUS,1,22300.00,Ford,Focus
722,O673E8197RUS,2,600.00,Ford,Focus
723,8610T8154RUS,1,2000.00,Ford,Focus


## enrich the dataframe using a sample from that dataframe

- create a sample with 200 new observations with random_state = 21

- the sample should not have new combinations of the car number, make and model, so the whole dataset will be consistent in these terms there are no restrictions on the refund and fines, you can take any value from these columns at random and use it towards any car number

- concatenate the sample with the initial dataframe to a new dataframe concat_rows

In [None]:
np.random.seed(21)
sample_size = 200
sample_df = df.sample(sample_size, replace=True, random_state=21)
sample_df['Refund'] = np.random.choice(df['Refund'], size=sample_size, replace=True)
sample_df['Fines'] = np.random.choice(df['Fines'], size=sample_size, replace=True)
concat_rows = pd.concat([df, sample_df], ignore_index=True)
concat_rows.head()
concat_rows.count()


CarNumber    925
Refund       925
Fines        925
Make         925
Model        914
dtype: int64

### enrich the dataframe concat_rows by a new column with the data generated
- use np.random.seed(21) before generating the years
- create a series with the name Year using random integers from 1980 to 2019
- concatenate the series with the dataframe and name it fines

In [None]:
np.random.seed(21)
year_series = pd.Series(np.random.randint(1980, 2020, size=len(concat_rows)), name='Year')
fines = pd.concat([concat_rows, year_series], axis=1)
fines.count()

CarNumber    925
Refund       925
Fines        925
Make         925
Model        914
Year         925
dtype: int64

### enrich the dataframe with the data from another dataframe

#### create a new dataframe with the car numbers and their owners

- get the most popular surnames (you can find the file surname.json in the attachments) in the US

In [None]:
surname_df = pd.read_json('../../datasets/surname.json')
surname_df.head()

Unnamed: 0,0,1,2
0,NAME,COUNT,RANK
1,ADAMS,427865,42
2,ALLEN,482607,33
3,ALVAREZ,233983,92
4,ANDERSON,784404,15


In [None]:
surname_df.columns = ['NAME', 'COUNT', 'RANK']
surname_df.drop(index=0, inplace=True)
most_popular_surnames = surname_df.sort_values(by='COUNT', ascending=False)
most_popular_surnames.head()

Unnamed: 0,NAME,COUNT,RANK
49,LOPEZ,874523,12
28,GONZALEZ,841025,13
97,WILSON,801882,14
4,ANDERSON,784404,15
88,THOMAS,756142,16


- create a new series with the surnames (they should not have special characters like commas, brackets, etc.) from the data you gathered, the count
should be equal to the number of unique car numbers using the sample(use random_state = 21)
- create the dataframe owners with 2 columns: CarNumber and SURNAME

In [None]:
surnames_cleaned = surname_df['NAME'].str.replace(r'[^\w\s]', '', regex=True)
surnames_cleaned.head()

1       ADAMS
2       ALLEN
3     ALVAREZ
4    ANDERSON
5      BAILEY
Name: NAME, dtype: object

In [None]:
np.random.seed(21)

In [None]:
unique_car_numbers = concat_rows['CarNumber'].unique()

num_unique_car_numbers = len(concat_rows['CarNumber'].unique())
surnames_extended = surnames_cleaned.tolist() * ((num_unique_car_numbers // len(surnames_cleaned)) + 1)

surname_series = pd.Series(np.random.choice(surnames_extended, size=len(unique_car_numbers), replace=False), name='SURNAME')
owners = pd.DataFrame({
    'CarNumber': unique_car_numbers,
    'SURNAME': surname_series
})
owners.head()
len(owners)

531

#### append 5 more observations to the fines dataframe (come up with your own ideas of CarNumber, etc.)

In [None]:
new_observations = pd.DataFrame({
    'CarNumber': ['A123BC456RUS', 'B234CD567RUS', 'O630MX750RUS', 'D456EF789RUS', 'E567FG890RUS'],
    'Refund': [2.5, 0.5, 3.0, 1.5, 2.0],
    'Fines': [2200, 1800, 45000, 3000, 2700],
    'Make': ['Toyota', 'Honda', 'Ford', 'Chevrolet', 'Nissan'],
    'Model': ['Corolla', 'Civic', 'Focus', 'Malibu', 'Altima'],
    'Year': [2010, 2015, 2018, 2020, 2012]
})
fines = pd.concat([fines, new_observations], ignore_index=True)
fines.head()
len(fines)

930

#### delete the dataframe last 20 observations from the owners and add 3 new observations 
(they are not the same as those you add to the fines dataframe)

In [None]:
owners = owners.iloc[:-20]
new_owner_observations = pd.DataFrame({
    'CarNumber': ['F678GH901RUS', 'G890IJ234RUS', 'H234JK567RUS'],
    'SURNAME': ['Clark', 'Lewis', 'Walker']
})
owners = pd.concat([owners, new_owner_observations], ignore_index=True)
owners.head()

Unnamed: 0,CarNumber,SURNAME
0,Y163O8161RUS,EVANS
1,E432XX77RUS,ORTIZ
2,7184TT36RUS,GOMEZ
3,X582HE161RUS,LEWIS
4,92918M178RUS,LEE


#### join both dataframes:

- the new dataframe should have only the car numbers that exist in both dataframes


In [38]:
inner_join = pd.merge(fines, owners, on='CarNumber', how='inner')
inner_join.head()
inner_join.count()

CarNumber    903
Refund       903
Fines        903
Make         903
Model        892
Year         903
SURNAME      903
dtype: int64

- the new dataframe should have all the car numbers that exist in both dataframes

In [39]:
outer_join = pd.merge(fines, owners, on='CarNumber', how='outer')
outer_join.head()
outer_join.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 933 entries, 0 to 932
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   CarNumber  933 non-null    object 
 1   Refund     930 non-null    float64
 2   Fines      930 non-null    float64
 3   Make       930 non-null    object 
 4   Model      919 non-null    object 
 5   Year       930 non-null    float64
 6   SURNAME    906 non-null    object 
dtypes: float64(3), object(4)
memory usage: 51.2+ KB


- the new dataframe should have only the car numbers from the fines dataframe

In [40]:
left_join = pd.merge(fines, owners, on='CarNumber', how='left')
left_join.head()
left_join.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   CarNumber  930 non-null    object 
 1   Refund     930 non-null    float64
 2   Fines      930 non-null    float64
 3   Make       930 non-null    object 
 4   Model      919 non-null    object 
 5   Year       930 non-null    int64  
 6   SURNAME    903 non-null    object 
dtypes: float64(2), int64(1), object(4)
memory usage: 51.0+ KB



- the new dataframe should have only the car numbers from the owners dataframe

In [41]:
right_join = pd.merge(fines, owners, on='CarNumber', how='right')
right_join.head()
right_join.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 906 entries, 0 to 905
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   CarNumber  906 non-null    object 
 1   Refund     903 non-null    float64
 2   Fines      903 non-null    float64
 3   Make       903 non-null    object 
 4   Model      892 non-null    object 
 5   Year       903 non-null    float64
 6   SURNAME    906 non-null    object 
dtypes: float64(3), object(4)
memory usage: 49.7+ KB


#### create a pivot table from the fines dataframe, it should look like this (the values are the sums of the fines), but with all the years (the values may be different for you):

In [None]:
pivot_table = fines.pivot_table(
    values='Fines',
    index=['Make', 'Model'],
    columns='Year',
    aggfunc='sum',
    fill_value=0
)
pivot_table

Unnamed: 0_level_0,Year,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
Make,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Chevrolet,Malibu,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3000.0
Ford,Focus,57289.17,649989.17,167578.35,71489.17,102994.59,112783.76,93494.59,96700.0,168594.59,82300.0,...,92489.17,92300.0,195094.59,98794.59,202800.0,91194.59,304900.0,334294.59,79700.0,0.0
Ford,Mondeo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8600.0,...,0.0,34400.0,0.0,0.0,0.0,46200.0,0.0,0.0,0.0,0.0
Honda,Civic,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1800.0,0.0,0.0,0.0,0.0,0.0
Nissan,Altima,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2700.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Skoda,Octavia,27900.0,0.0,6900.0,11594.59,3000.0,10294.59,600.0,34000.0,0.0,91400.0,...,500.0,500.0,14494.59,14800.0,46394.59,300.0,0.0,156200.0,9500.0,0.0
Toyota,Camry,13000.0,8594.59,0.0,7200.0,0.0,0.0,0.0,0.0,0.0,22400.0,...,0.0,10594.59,0.0,0.0,0.0,9600.0,6000.0,13000.0,18100.0,0.0
Toyota,Corolla,0.0,0.0,2000.0,0.0,0.0,0.0,500.0,8000.0,0.0,4000.0,...,8594.59,0.0,0.0,0.0,0.0,0.0,9600.0,0.0,9000.0,0.0
Volkswagen,Golf,30900.0,0.0,0.0,8594.59,300.0,24000.0,0.0,9300.0,0.0,10300.0,...,30000.0,0.0,2600.0,0.0,2300.0,0.0,0.0,0.0,0.0,0.0
Volkswagen,Jetta,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### save both the fines and owners dataframes to CSV files without an index

In [None]:
fines.to_csv('fines.csv', index=False)
owners.to_csv('owners.csv', index=False)