Goals:
- Handle null data points
- Clean the data (strip the unit suffixes from the values)
- Split the data (train / test)
- Save the cleaned data in data/processed

In [35]:
import pandas as pd

In [36]:
# Read the raw CSV into a DataFrame and show the dtype pandas inferred per column.
file_loc_1 = "../data/raw/Car details v3.csv"
df = pd.read_csv(filepath_or_buffer=file_loc_1)
df.dtypes

name              object
year               int64
selling_price      int64
km_driven          int64
fuel              object
seller_type       object
transmission      object
owner             object
mileage           object
engine            object
max_power         object
torque            object
seats            float64
dtype: object

In [37]:
# Drop the rows that contain missing values (dropna with its default arguments).
df = df.dropna()
columns = [*df.columns]
categorical_features = [
    'fuel',
    'seller_type',
    'transmission',
    'owner',
    'seats',
]

# Print the distinct values of each categorical column, one line per feature.
for feature in categorical_features:
    print(f"{feature:<20} {df[feature].unique()}")

fuel                 ['Diesel' 'Petrol' 'LPG' 'CNG']
seller_type          ['Individual' 'Dealer' 'Trustmark Dealer']
transmission         ['Manual' 'Automatic']
owner                ['First Owner' 'Second Owner' 'Third Owner' 'Fourth & Above Owner'
 'Test Drive Car']
seats                [ 5.  4.  7.  8.  6.  9. 10. 14.  2.]


In [38]:
# Preview the first ten rows after the null rows were dropped.
df.head(n=10)

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0
5,Hyundai Xcent 1.2 VTVT E Plus,2017,440000,45000,Petrol,Individual,Manual,First Owner,20.14 kmpl,1197 CC,81.86 bhp,113.75nm@ 4000rpm,5.0
6,Maruti Wagon R LXI DUO BSIII,2007,96000,175000,LPG,Individual,Manual,First Owner,17.3 km/kg,1061 CC,57.5 bhp,"7.8@ 4,500(kgm@ rpm)",5.0
7,Maruti 800 DX BSII,2001,45000,5000,Petrol,Individual,Manual,Second Owner,16.1 kmpl,796 CC,37 bhp,59Nm@ 2500rpm,4.0
8,Toyota Etios VXD,2011,350000,90000,Diesel,Individual,Manual,First Owner,23.59 kmpl,1364 CC,67.1 bhp,170Nm@ 1800-2400rpm,5.0
9,Ford Figo Diesel Celebration Edition,2013,200000,169000,Diesel,Individual,Manual,First Owner,20.0 kmpl,1399 CC,68.1 bhp,160Nm@ 2000rpm,5.0


In [39]:
# List the distinct fuel types present in the data.
df.loc[:, "fuel"].unique()

array(['Diesel', 'Petrol', 'LPG', 'CNG'], dtype=object)

In [40]:
# Trial run of one-hot encoding: add a single indicator column for Diesel cars.
df["fuel_Diesel"] = pd.get_dummies(df["fuel"]).loc[:, "Diesel"]

In [41]:
# One-hot encode every categorical feature into "<feature>_<category>" columns.
# The indicator frame is built once per feature and reused for all of its
# categories, instead of calling pd.get_dummies again for each category.
for feature in categorical_features:
    indicators = pd.get_dummies(df[feature])
    # Walk the categories in first-appearance order so the new columns are
    # appended in that same order.
    for category in df[feature].unique():
        df[f"{feature}_{category}"] = indicators[category]

df.head(15)

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,...,owner_Test Drive Car,seats_5.0,seats_4.0,seats_7.0,seats_8.0,seats_6.0,seats_9.0,seats_10.0,seats_14.0,seats_2.0
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,...,False,True,False,False,False,False,False,False,False,False
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,...,False,True,False,False,False,False,False,False,False,False
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,...,False,True,False,False,False,False,False,False,False,False
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,...,False,True,False,False,False,False,False,False,False,False
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,...,False,True,False,False,False,False,False,False,False,False
5,Hyundai Xcent 1.2 VTVT E Plus,2017,440000,45000,Petrol,Individual,Manual,First Owner,20.14 kmpl,1197 CC,...,False,True,False,False,False,False,False,False,False,False
6,Maruti Wagon R LXI DUO BSIII,2007,96000,175000,LPG,Individual,Manual,First Owner,17.3 km/kg,1061 CC,...,False,True,False,False,False,False,False,False,False,False
7,Maruti 800 DX BSII,2001,45000,5000,Petrol,Individual,Manual,Second Owner,16.1 kmpl,796 CC,...,False,False,True,False,False,False,False,False,False,False
8,Toyota Etios VXD,2011,350000,90000,Diesel,Individual,Manual,First Owner,23.59 kmpl,1364 CC,...,False,True,False,False,False,False,False,False,False,False
9,Ford Figo Diesel Celebration Edition,2013,200000,169000,Diesel,Individual,Manual,First Owner,20.0 kmpl,1399 CC,...,False,True,False,False,False,False,False,False,False,False


In [42]:
# Sanity check: collect the token after the first space of each engine value
# to see which unit labels occur.
df["engine"].map(lambda text: text.split(" ")[1]).unique()

array(['CC'], dtype=object)

In [43]:
# Extract the numeric part of values such as "1248 CC" into a float column.
# The .str accessor performs the split / first-token step for the whole column,
# and astype(float) still raises if a leading token is not a valid number.
df["engine_value_cc"] = df["engine"].str.split(" ").str[0].astype(float)

In [44]:
# Preview the first twenty rows, now including the engine_value_cc column.
df.head(n=20)

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,...,seats_5.0,seats_4.0,seats_7.0,seats_8.0,seats_6.0,seats_9.0,seats_10.0,seats_14.0,seats_2.0,engine_value_cc
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,...,True,False,False,False,False,False,False,False,False,1248.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,...,True,False,False,False,False,False,False,False,False,1498.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,...,True,False,False,False,False,False,False,False,False,1497.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,...,True,False,False,False,False,False,False,False,False,1396.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,...,True,False,False,False,False,False,False,False,False,1298.0
5,Hyundai Xcent 1.2 VTVT E Plus,2017,440000,45000,Petrol,Individual,Manual,First Owner,20.14 kmpl,1197 CC,...,True,False,False,False,False,False,False,False,False,1197.0
6,Maruti Wagon R LXI DUO BSIII,2007,96000,175000,LPG,Individual,Manual,First Owner,17.3 km/kg,1061 CC,...,True,False,False,False,False,False,False,False,False,1061.0
7,Maruti 800 DX BSII,2001,45000,5000,Petrol,Individual,Manual,Second Owner,16.1 kmpl,796 CC,...,False,True,False,False,False,False,False,False,False,796.0
8,Toyota Etios VXD,2011,350000,90000,Diesel,Individual,Manual,First Owner,23.59 kmpl,1364 CC,...,True,False,False,False,False,False,False,False,False,1364.0
9,Ford Figo Diesel Celebration Edition,2013,200000,169000,Diesel,Individual,Manual,First Owner,20.0 kmpl,1399 CC,...,True,False,False,False,False,False,False,False,False,1399.0


In [45]:
# Sanity check: collect the token after the first space of each max_power value
# to see which unit labels occur.
df["max_power"].map(lambda text: text.split(" ")[1]).unique()

array(['bhp'], dtype=object)

In [46]:
# Extract the numeric part of values such as "74 bhp" into a float column.
# The .str accessor performs the split / first-token step for the whole column,
# and astype(float) still raises if a leading token is not a valid number.
# NOTE(review): the column name spells "bph" although the unit is bhp; it is
# left unchanged here because other cells may refer to it by this name.
df["max_power_bph"] = df["max_power"].str.split(" ").str[0].astype(float)

In [47]:
# Sanity check: collect the token after the first space of each mileage value
# to see which unit labels occur.
df["mileage"].map(lambda text: text.split(" ")[1]).unique()

array(['kmpl', 'km/kg'], dtype=object)