Import Packages


In [308]:
import pandas as pd
import numpy as np


Load the CSV into a pandas DataFrame

In [309]:
# Load the raw car listings and take a first look at the contents.
df = pd.read_csv('car_price.csv')
print(df.head())
# DataFrame.info() prints its summary itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
df.info()

   Unnamed: 0                                car_name car_prices_in_rupee  \
0           0  Jeep Compass 2.0 Longitude Option BSIV          10.03 Lakh   
1           1            Renault Duster RXZ Turbo CVT          12.83 Lakh   
2           2                      Toyota Camry 2.5 G          16.40 Lakh   
3           3                       Honda Jazz VX CVT           7.77 Lakh   
4           4        Volkswagen Polo 1.2 MPI Highline           5.15 Lakh   

   kms_driven fuel_type transmission  ownership  manufacture   engine    Seats  
0  86,226 kms    Diesel       Manual  1st Owner         2017  1956 cc  5 Seats  
1  13,248 kms    Petrol    Automatic  1st Owner         2021  1330 cc  5 Seats  
2  60,343 kms    Petrol    Automatic  1st Owner         2016  2494 cc  5 Seats  
3  26,696 kms    Petrol    Automatic  1st Owner         2018  1199 cc  5 Seats  
4  69,414 kms    Petrol       Manual  1st Owner         2016  1199 cc  5 Seats  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5

In [310]:
# Drop the 'Unnamed: 0' column (it looks like a saved row index -- confirm).
# errors='ignore' keeps this cell re-runnable: when the column is already
# gone, the drop is a no-op instead of raising KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
print(df.head())

                                 car_name car_prices_in_rupee  kms_driven  \
0  Jeep Compass 2.0 Longitude Option BSIV          10.03 Lakh  86,226 kms   
1            Renault Duster RXZ Turbo CVT          12.83 Lakh  13,248 kms   
2                      Toyota Camry 2.5 G          16.40 Lakh  60,343 kms   
3                       Honda Jazz VX CVT           7.77 Lakh  26,696 kms   
4        Volkswagen Polo 1.2 MPI Highline           5.15 Lakh  69,414 kms   

  fuel_type transmission  ownership  manufacture   engine    Seats  
0    Diesel       Manual  1st Owner         2017  1956 cc  5 Seats  
1    Petrol    Automatic  1st Owner         2021  1330 cc  5 Seats  
2    Petrol    Automatic  1st Owner         2016  2494 cc  5 Seats  
3    Petrol    Automatic  1st Owner         2018  1199 cc  5 Seats  
4    Petrol       Manual  1st Owner         2016  1199 cc  5 Seats  


Detect Missing Values

In [311]:
# Tally the null entries in each column and print the per-column counts.
missing_values = df.isna().sum()
print("Missing Values:", missing_values, sep="\n")

Missing Values:
car_name               0
car_prices_in_rupee    0
kms_driven             0
fuel_type              0
transmission           0
ownership              0
manufacture            0
engine                 0
Seats                  0
dtype: int64


Convert Car Prices to a Numeric Rupee Amount

In [312]:
def convert_price(price_str):
    """Convert a textual price such as ``"10.03 Lakh"`` into rupees.

    The text before the first space is read as the amount (thousands
    separators are removed); whatever follows is read as the unit.

    Supported units (case-insensitive):
        * ``Lakh``  -> amount * 100,000
        * ``Crore`` -> amount * 10,000,000
        * no unit   -> the amount is taken to already be in rupees
          (assumption based on the column name -- confirm against the data)

    Args:
        price_str: Price text, expected to be a ``str``.

    Returns:
        The price in rupees as a ``float``, or ``None`` when the amount is
        not a valid number or the unit is not recognised.
    """
    # Rupee value of one unit for each recognised suffix.
    unit_multipliers = {'': 1, 'lakh': 100000, 'crore': 10000000}

    amount_text, _, unit_text = price_str.strip().partition(" ")
    multiplier = unit_multipliers.get(unit_text.strip().lower())
    if multiplier is None:
        # Unknown unit: report as missing rather than guessing a scale.
        return None

    try:
        # Values like "95,000" use comma grouping that float() rejects.
        return float(amount_text.replace(',', '')) * multiplier
    except ValueError:
        return None
# Overwrite the textual price column with what convert_price returns per row.
df['car_prices_in_rupee'] = df.loc[:, 'car_prices_in_rupee'].apply(convert_price)
print(df.head(n=5))
        

                                 car_name  car_prices_in_rupee  kms_driven  \
0  Jeep Compass 2.0 Longitude Option BSIV            1003000.0  86,226 kms   
1            Renault Duster RXZ Turbo CVT            1283000.0  13,248 kms   
2                      Toyota Camry 2.5 G            1640000.0  60,343 kms   
3                       Honda Jazz VX CVT             777000.0  26,696 kms   
4        Volkswagen Polo 1.2 MPI Highline             515000.0  69,414 kms   

  fuel_type transmission  ownership  manufacture   engine    Seats  
0    Diesel       Manual  1st Owner         2017  1956 cc  5 Seats  
1    Petrol    Automatic  1st Owner         2021  1330 cc  5 Seats  
2    Petrol    Automatic  1st Owner         2016  2494 cc  5 Seats  
3    Petrol    Automatic  1st Owner         2018  1199 cc  5 Seats  
4    Petrol       Manual  1st Owner         2016  1199 cc  5 Seats  


In [313]:
def convert_engine(cc_str):
    """Parse the leading number of an engine-size string such as ``"1956 cc"``.

    Args:
        cc_str: Text whose part before the first space is the number.

    Returns:
        That leading part as a ``float``, or ``None`` when it is not a
        valid number.
    """
    leading_token, _, _ = cc_str.partition(" ")
    try:
        displacement = float(leading_token)
    except ValueError:
        return None
    return displacement
# Replace each textual engine entry with the value convert_engine returns.
df['engine'] = df.loc[:, 'engine'].apply(convert_engine)
print(df.head(n=5))

                                 car_name  car_prices_in_rupee  kms_driven  \
0  Jeep Compass 2.0 Longitude Option BSIV            1003000.0  86,226 kms   
1            Renault Duster RXZ Turbo CVT            1283000.0  13,248 kms   
2                      Toyota Camry 2.5 G            1640000.0  60,343 kms   
3                       Honda Jazz VX CVT             777000.0  26,696 kms   
4        Volkswagen Polo 1.2 MPI Highline             515000.0  69,414 kms   

  fuel_type transmission  ownership  manufacture  engine    Seats  
0    Diesel       Manual  1st Owner         2017  1956.0  5 Seats  
1    Petrol    Automatic  1st Owner         2021  1330.0  5 Seats  
2    Petrol    Automatic  1st Owner         2016  2494.0  5 Seats  
3    Petrol    Automatic  1st Owner         2018  1199.0  5 Seats  
4    Petrol       Manual  1st Owner         2016  1199.0  5 Seats  


In [314]:
def convert_kms(kms_str):
    """Parse a distance string such as ``"86,226 kms"`` into an integer.

    Args:
        kms_str: Text holding the distance, optionally with comma grouping
            and a trailing ``" kms"``.

    Returns:
        The distance as an ``int``, or ``None`` when the remaining text is
        not a valid integer.
    """
    # Strip the thousands separators first, then the unit suffix.
    without_commas = kms_str.replace(',', '')
    digits_text = without_commas.replace(' kms', '')
    try:
        distance = int(digits_text)
    except ValueError:
        distance = None
    return distance

# Replace each 'kms_driven' entry with the value convert_kms returns for it.
df['kms_driven'] = df.loc[:, 'kms_driven'].apply(convert_kms)

# Preview the first rows of the updated frame.
print(df.head(n=5))


                                 car_name  car_prices_in_rupee  kms_driven  \
0  Jeep Compass 2.0 Longitude Option BSIV            1003000.0       86226   
1            Renault Duster RXZ Turbo CVT            1283000.0       13248   
2                      Toyota Camry 2.5 G            1640000.0       60343   
3                       Honda Jazz VX CVT             777000.0       26696   
4        Volkswagen Polo 1.2 MPI Highline             515000.0       69414   

  fuel_type transmission  ownership  manufacture  engine    Seats  
0    Diesel       Manual  1st Owner         2017  1956.0  5 Seats  
1    Petrol    Automatic  1st Owner         2021  1330.0  5 Seats  
2    Petrol    Automatic  1st Owner         2016  2494.0  5 Seats  
3    Petrol    Automatic  1st Owner         2018  1199.0  5 Seats  
4    Petrol       Manual  1st Owner         2016  1199.0  5 Seats  


In [315]:
def convert_seats(seats_str):
    """Parse a seat-count string such as ``"5 Seats"`` into an integer.

    Args:
        seats_str: Text holding the count, optionally followed by
            ``" Seats"``.

    Returns:
        The count as an ``int``, or ``None`` when the remaining text is not
        a valid integer.
    """
    count_text = seats_str.replace(' Seats', '')
    try:
        seat_count = int(count_text)
    except ValueError:
        seat_count = None
    return seat_count
# Replace each textual 'Seats' entry with the value convert_seats returns.
df['Seats'] = df.loc[:, 'Seats'].apply(convert_seats)
print(df.head(n=5))

                                 car_name  car_prices_in_rupee  kms_driven  \
0  Jeep Compass 2.0 Longitude Option BSIV            1003000.0       86226   
1            Renault Duster RXZ Turbo CVT            1283000.0       13248   
2                      Toyota Camry 2.5 G            1640000.0       60343   
3                       Honda Jazz VX CVT             777000.0       26696   
4        Volkswagen Polo 1.2 MPI Highline             515000.0       69414   

  fuel_type transmission  ownership  manufacture  engine  Seats  
0    Diesel       Manual  1st Owner         2017  1956.0      5  
1    Petrol    Automatic  1st Owner         2021  1330.0      5  
2    Petrol    Automatic  1st Owner         2016  2494.0      5  
3    Petrol    Automatic  1st Owner         2018  1199.0      5  
4    Petrol       Manual  1st Owner         2016  1199.0      5  


In [316]:
def extract_first_integer(ownership_str):
    """Return the first run of consecutive digits in a string as an int.

    For example ``"1st Owner"`` gives ``1`` and ``"12th Owner"`` gives ``12``.
    Only the first digit run is used, so later numbers in the same string
    (e.g. ``"2nd Owner 2019"``) are not merged into the result.

    Args:
        ownership_str: Text to scan, expected to be a ``str``.

    Returns:
        The first integer found, or ``None`` when the string contains no
        digits that ``int()`` can parse.
    """
    first_run = []
    for char in ownership_str:
        if char.isdigit():
            first_run.append(char)
        elif first_run:
            # The first digit run has ended; ignore everything after it.
            break

    try:
        # An empty run makes int('') raise ValueError, reported as None.
        return int(''.join(first_run))
    except ValueError:
        return None

def preprocess_data(df):
    """Replace the textual 'ownership' column with a numeric 'num_owners' one.

    The input frame is left untouched; a new frame is returned, so calling
    this function does not silently alter the caller's data.

    Args:
        df: DataFrame containing an 'ownership' column.

    Returns:
        A new DataFrame with 'num_owners' appended (the result of
        ``extract_first_integer`` on each 'ownership' value) and the
        original 'ownership' column removed.

    Raises:
        KeyError: If ``df`` has no 'ownership' column.
    """
    return (
        df
        # Derive the owner count from the ownership text.
        .assign(num_owners=df['ownership'].apply(extract_first_integer))
        # The textual column is redundant once the count exists.
        .drop(columns=['ownership'])
    )

# Swap the 'ownership' text column for the numeric 'num_owners' column.
df = preprocess_data(df)

# Preview the first rows of the result.
print(df.head(n=5))

                                 car_name  car_prices_in_rupee  kms_driven  \
0  Jeep Compass 2.0 Longitude Option BSIV            1003000.0       86226   
1            Renault Duster RXZ Turbo CVT            1283000.0       13248   
2                      Toyota Camry 2.5 G            1640000.0       60343   
3                       Honda Jazz VX CVT             777000.0       26696   
4        Volkswagen Polo 1.2 MPI Highline             515000.0       69414   

  fuel_type transmission  manufacture  engine  Seats  num_owners  
0    Diesel       Manual         2017  1956.0      5           1  
1    Petrol    Automatic         2021  1330.0      5           1  
2    Petrol    Automatic         2016  2494.0      5           1  
3    Petrol    Automatic         2018  1199.0      5           1  
4    Petrol       Manual         2016  1199.0      5           1  


In [317]:
# Write the cleaned frame to disk, leaving out the row index.
df.to_csv(path_or_buf='cleaned_data.csv', index=False)