In [1]:
import pandas as pd

## read the CSV file and make ID the index column

In [2]:
# Load the raw records, then promote the CSV's ID column to the row index.
df = pd.read_csv('../data/auto.csv').set_index('ID')
df

Unnamed: 0_level_0,CarNumber,Make_n_model,Refund,Fines,History
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Y163O8161RUS,Ford Focus,2.0,3200.0,
1,E432XX77RUS,Toyota Camry,1.0,6500.0,
2,7184TT36RUS,Ford Focus,1.0,2100.0,
3,X582HE161RUS,Ford Focus,2.0,2000.0,
4,E34877152RUS,Ford Focus,2.0,6100.0,
...,...,...,...,...,...
926,Y163O8161RUS,Ford Focus,2.0,1600.0,
927,M0309X197RUS,Ford Focus,1.0,22300.0,
928,O673E8197RUS,Ford Focus,2.0,600.0,
929,8610T8154RUS,Ford Focus,1.0,2000.0,


## count the number of observations (non-null values in each column)

In [3]:
# Non-null values per column; columns with gaps report fewer than the row total.
df.count(axis='index')

CarNumber       931
Make_n_model    931
Refund          914
Fines           869
History          82
dtype: int64

## drop the duplicates, taking into account only the following columns: CarNumber, Make_n_model, Fines

In [4]:
# Rows count as duplicates when CarNumber, Make_n_model and Fines all match;
# only the last occurrence of each duplicate group is kept.
df_cleaned = (
    df.copy()
      .drop_duplicates(subset=['CarNumber', 'Make_n_model', 'Fines'], keep='last')
)
df_cleaned

Unnamed: 0_level_0,CarNumber,Make_n_model,Refund,Fines,History
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Y163O8161RUS,Ford Focus,2.0,3200.0,
1,E432XX77RUS,Toyota Camry,1.0,6500.0,
2,7184TT36RUS,Ford Focus,1.0,2100.0,
3,X582HE161RUS,Ford Focus,2.0,2000.0,
5,92918M178RUS,Ford Focus,1.0,5700.0,
...,...,...,...,...,...
926,Y163O8161RUS,Ford Focus,2.0,1600.0,
927,M0309X197RUS,Ford Focus,1.0,22300.0,
928,O673E8197RUS,Ford Focus,2.0,600.0,
929,8610T8154RUS,Ford Focus,1.0,2000.0,


In [5]:
# Non-null values per column after de-duplication.
df_cleaned.count(axis='index')

CarNumber       725
Make_n_model    725
Refund          713
Fines           665
History          65
dtype: int64

## work with missing values

In [6]:
# Missing values per column.
df_cleaned.isna().sum()

CarNumber         0
Make_n_model      0
Refund           12
Fines            60
History         660
dtype: int64

In [7]:
# Keep only the columns that have at least 500 non-null values; sparser
# columns are removed from df_cleaned in place.
df_cleaned.dropna(axis=1, thresh=500, inplace=True)
df_cleaned

Unnamed: 0_level_0,CarNumber,Make_n_model,Refund,Fines
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Y163O8161RUS,Ford Focus,2.0,3200.0
1,E432XX77RUS,Toyota Camry,1.0,6500.0
2,7184TT36RUS,Ford Focus,1.0,2100.0
3,X582HE161RUS,Ford Focus,2.0,2000.0
5,92918M178RUS,Ford Focus,1.0,5700.0
...,...,...,...,...
926,Y163O8161RUS,Ford Focus,2.0,1600.0
927,M0309X197RUS,Ford Focus,1.0,22300.0
928,O673E8197RUS,Ford Focus,2.0,600.0
929,8610T8154RUS,Ford Focus,1.0,2000.0


In [8]:
# Fill each missing Refund with the closest preceding non-missing value,
# then re-check the per-column missing counts.
df_cleaned = df_cleaned.assign(Refund=df_cleaned['Refund'].ffill())
df_cleaned.isna().sum()

CarNumber        0
Make_n_model     0
Refund           0
Fines           60
dtype: int64

In [9]:
# Impute missing Fines with the mean of the observed (non-missing) Fines.
mean_fines = df_cleaned['Fines'].mean()
df_cleaned = df_cleaned.assign(Fines=df_cleaned['Fines'].fillna(mean_fines))
df_cleaned.isna().sum()

CarNumber       0
Make_n_model    0
Refund          0
Fines           0
dtype: int64

## split and parse the make and model

In [10]:
# Split "Make Model" on the first space only, so multi-word model names stay
# in one piece. The vectorized .str accessor replaces a row-by-row
# apply(pd.Series), which constructed one Series per row. Values that contain
# no space end up with a null Model.
df_cleaned[['Make', 'Model']] = df_cleaned['Make_n_model'].str.split(' ', n=1, expand=True)
# The combined column is redundant once it has been split.
df_cleaned = df_cleaned.drop(columns=['Make_n_model'])
# Export as one JSON array of row objects; orient='records' does not write the index.
df_cleaned.to_json('../data/auto.json', orient='records', lines=False)

In [11]:
# Mean fine after imputation.
df_cleaned.loc[:, 'Fines'].mean()

8594.586466165412

In [12]:
# Mean refund after forward-filling.
df_cleaned.loc[:, 'Refund'].mean()

1.5172413793103448

In [13]:
# Final non-null counts per column; Model can be lower than the rest because
# values without a space produce no model part.
df_cleaned.count(axis='index')

CarNumber    725
Refund       725
Fines        725
Make         725
Model        716
dtype: int64