In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [3]:
# Load the used-car listings; 'train.csv' is resolved relative to the notebook's working directory.
df = pd.read_csv('train.csv')
# Preview the first rows to check the column names and the raw value formats.
df.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.5
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74


In [4]:
# Summary statistics for the columns pandas already parsed as numeric
# (by default describe() leaves out object-typed columns).
df.describe()

Unnamed: 0,Year,Kilometers_Driven,Seats,Price
count,6019.0,6019.0,5977.0,6019.0
mean,2013.358199,58738.38,5.278735,9.479468
std,3.269742,91268.84,0.80884,11.187917
min,1998.0,171.0,0.0,0.44
25%,2011.0,34000.0,5.0,3.5
50%,2014.0,53000.0,5.0,5.64
75%,2016.0,73000.0,5.0,9.95
max,2019.0,6500000.0,10.0,160.0


In [5]:
# Column dtypes and non-null counts: shows which columns have gaps and which numeric-looking columns are stored as text.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               6019 non-null   object 
 1   Location           6019 non-null   object 
 2   Year               6019 non-null   int64  
 3   Kilometers_Driven  6019 non-null   int64  
 4   Fuel_Type          6019 non-null   object 
 5   Transmission       6019 non-null   object 
 6   Owner_Type         6019 non-null   object 
 7   Mileage            6017 non-null   object 
 8   Engine             5983 non-null   object 
 9   Power              5983 non-null   object 
 10  Seats              5977 non-null   float64
 11  New_Price          824 non-null    object 
 12  Price              6019 non-null   float64
dtypes: float64(2), int64(2), object(9)
memory usage: 611.4+ KB


In [6]:
# Engine, Power and Mileage are stored as text with a unit suffix
# (e.g. '998 CC', '58.16 bhp'), so the unit has to go before they can become numbers.
# Missing entries are not strings and are passed through unchanged.

def _drop_unit(raw, unit):
    """Return `raw` with every occurrence of `unit` removed; non-string values (e.g. NaN) come back as-is."""
    if not isinstance(raw, str):
        return raw
    return raw.replace(unit, '')

# Engine: remove the 'CC' suffix
df['Engine'] = df['Engine'].apply(lambda raw: _drop_unit(raw, 'CC'))
# Power: remove the 'bhp' suffix
df["Power"] = df['Power'].apply(lambda raw: _drop_unit(raw, 'bhp'))

# now the mileage column
def mileage_converter(value, kg_to_litre_factor=1.4):
    """Convert one raw Mileage entry to a float.

    Parameters
    ----------
    value : str, number or NaN
        Raw cell such as '26.6 km/kg' or '19.67 kmpl', or an already numeric value.
    kg_to_litre_factor : float, default 1.4
        Multiplier applied to 'km/kg' readings so they sit on the same scale
        as the 'kmpl' readings. 1.4 is the factor this notebook has always used.

    Returns
    -------
    float
        NaN for missing input, otherwise the numeric mileage.

    Raises
    ------
    ValueError
        If what is left after removing the unit is not a valid number.
    """
    if pd.isna(value):
        return np.nan

    # Trim first: a stray trailing space would otherwise hide the unit from
    # endswith() and send a perfectly valid reading to the bare float() below.
    text = str(value).strip()
    if text.endswith('km/kg'):
        return float(text.replace('km/kg', '')) * kg_to_litre_factor
    if text.endswith('kmpl'):
        return float(text.replace('kmpl', ''))
    # No recognised unit: treat the original value as a plain number.
    return float(value)

# Turn every Mileage entry into a float, then count what is still missing per column.
df['Mileage'] = df['Mileage'].map(mileage_converter)
df.isnull().sum()

    
    



Name                    0
Location                0
Year                    0
Kilometers_Driven       0
Fuel_Type               0
Transmission            0
Owner_Type              0
Mileage                 2
Engine                 36
Power                  36
Seats                  42
New_Price            5195
Price                   0
dtype: int64

In [7]:
# Engine had its unit stripped earlier, so the remaining text casts straight to float
# (float() tolerates the leftover trailing space; NaN stays NaN).
df['Engine'] = df['Engine'].astype(float)
# Some Power entries are the text 'null' once the unit is removed; turn those into NaN before casting.
df['Power'] = df['Power'].apply(lambda x: np.nan if str(x).strip() == 'null' else x).astype(float)
# Engine, Power and Mileage should now show up in the numeric summary.
df.describe()


Unnamed: 0,Year,Kilometers_Driven,Mileage,Engine,Power,Seats,Price
count,6019.0,6019.0,6017.0,5983.0,5876.0,5977.0,6019.0
mean,2013.358199,58738.38,18.242474,1621.27645,113.25305,5.278735,9.479468
std,3.269742,91268.84,4.879633,601.355233,53.874957,0.80884,11.187917
min,1998.0,171.0,0.0,72.0,34.2,0.0,0.44
25%,2011.0,34000.0,15.26,1198.0,75.0,5.0,3.5
50%,2014.0,53000.0,18.19,1493.0,97.7,5.0,5.64
75%,2016.0,73000.0,21.1,1984.0,138.1,5.0,9.95
max,2019.0,6500000.0,46.956,5998.0,560.0,10.0,160.0


In [8]:
# Fill the remaining gaps: median for Engine and Seats, mean for Power and Mileage.
df['Engine'] = df['Engine'].fillna(df['Engine'].median())
# NOTE(review): the filled values are written to a NEW lower-case 'power' column;
# the original 'Power' column keeps its NaNs. Later cells drop 'Power' and read
# 'power', so the name is deliberately left unchanged here.
df['power'] = df['Power'].fillna(df['Power'].mean())
df['Seats'] = df['Seats'].fillna(df['Seats'].median())
df['Mileage'] = df['Mileage'].fillna(df['Mileage'].mean())
# All columns that can be numeric are numeric now; check the counts are complete.
df.describe()

Unnamed: 0,Year,Kilometers_Driven,Mileage,Engine,Power,Seats,Price,power
count,6019.0,6019.0,6019.0,6019.0,5876.0,6019.0,6019.0,6019.0
mean,2013.358199,58738.38,18.242474,1620.509221,113.25305,5.27679,9.479468,113.25305
std,3.269742,91268.84,4.878822,599.635458,53.874957,0.806346,11.187917,53.231019
min,1998.0,171.0,0.0,72.0,34.2,0.0,0.44,34.2
25%,2011.0,34000.0,15.26,1198.0,75.0,5.0,3.5,78.0
50%,2014.0,53000.0,18.19,1493.0,97.7,5.0,5.64,98.6
75%,2016.0,73000.0,21.1,1969.0,138.1,5.0,9.95,138.03
max,2019.0,6500000.0,46.956,5998.0,560.0,10.0,160.0,560.0


In [9]:
# List the distinct ownership labels so the encoding can cover each of them.
df['Owner_Type'].unique()

array(['First', 'Second', 'Fourth & Above', 'Third'], dtype=object)

In [10]:
# Keep only the manufacturer (the first word of Name); it is far easier to encode than the full model name.
df["Brand"] = df["Name"].apply(lambda full_name: full_name.split()[0])
# Drop what is no longer needed: Name (summarised by Brand), New_Price (mostly empty)
# and Power (superseded by the filled 'power' column).
df = df.drop(columns=['Name', 'New_Price', 'Power'])

# now to the owner type column 
def owner_type(value):
    """Map an Owner_Type label to its ordinal rank.

    'First' -> 1, 'Second' -> 2, 'Third' -> 3, 'Fourth & Above' -> 4.
    Any other value returns 0.
    """
    ranked_labels = (
        ('First', 1),
        ('Second', 2),
        ('Third', 3),
        ('Fourth & Above', 4),
    )
    for label, rank in ranked_labels:
        if value == label:
            return rank
    return 0
    
# Ordinal-encode the ownership history in place (1 = first owner ... 4 = fourth or later).
# Running this cell a second time maps everything to 0, because the integers no longer match any label.
df['Owner_Type'] = df['Owner_Type'].map(owner_type)



In [11]:
# Re-check the numeric summary: Owner_Type should now appear as a numeric column.
df.describe()

Unnamed: 0,Year,Kilometers_Driven,Owner_Type,Mileage,Engine,Seats,Price,power
count,6019.0,6019.0,6019.0,6019.0,6019.0,6019.0,6019.0,6019.0
mean,2013.358199,58738.38,1.202858,18.242474,1620.509221,5.27679,9.479468,113.25305
std,3.269742,91268.84,0.456356,4.878822,599.635458,0.806346,11.187917,53.231019
min,1998.0,171.0,1.0,0.0,72.0,0.0,0.44,34.2
25%,2011.0,34000.0,1.0,15.26,1198.0,5.0,3.5,78.0
50%,2014.0,53000.0,1.0,18.19,1493.0,5.0,5.64,98.6
75%,2016.0,73000.0,1.0,21.1,1969.0,5.0,9.95,138.03
max,2019.0,6500000.0,4.0,46.956,5998.0,10.0,160.0,560.0


In [12]:
# now we will encode the location and the fuel type and transmission 
from sklearn.preprocessing import  MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
def preprocess_car_data(df, categorical_columns=None, numerical_columns=None):
    """One-hot encode the categorical columns and min-max scale the numeric ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned frame containing every column named in the two lists.
    categorical_columns : list of str, optional
        Columns to one-hot encode. Defaults to
        ['Location', 'Fuel_Type', 'Transmission'].
    numerical_columns : list of str, optional
        Columns to rescale with MinMaxScaler. Defaults to ['Year',
        'Kilometers_Driven', 'Owner_Type', 'Mileage', 'Engine', 'power',
        'Seats', 'Price'].

    Returns
    -------
    pandas.DataFrame
        The scaled numeric columns followed by the one-hot columns, on the
        same index as `df`. Columns named in neither list are not carried over.

    Notes
    -----
    - Both transformers are fitted on the whole frame passed in, and the
      default numeric list includes the target 'Price'.
      NOTE(review): if the result is split into train/test afterwards, the
      test rows have already influenced the scaling - confirm this is acceptable.
    - The fitted encoder and scaler are local and are not returned, so the
      caller cannot invert the scaling.
    """
    if categorical_columns is None:
        categorical_columns = ['Location', 'Fuel_Type', 'Transmission']
    if numerical_columns is None:
        numerical_columns = ['Year', 'Kilometers_Driven', 'Owner_Type', 'Mileage', 'Engine', 'power', 'Seats', 'Price']
    # Work on lists so a tuple argument is not mistaken for a single column key.
    categorical_columns = list(categorical_columns)
    numerical_columns = list(numerical_columns)

    # Step 1: one-hot encode the categorical columns (dense output so it fits a DataFrame directly).
    encoder = OneHotEncoder(sparse_output=False)
    encoded_data = encoder.fit_transform(df[categorical_columns])

    # Column names in the form '<column>_<category>'.
    encoded_columns = encoder.get_feature_names_out(categorical_columns)
    print(encoded_columns.shape)
    print(f"encode data {encoded_data.shape}")

    encoded_df = pd.DataFrame(encoded_data, columns=encoded_columns, index=df.index)

    # Step 2: min-max scale the numeric columns.
    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(df[numerical_columns])
    scaled_df = pd.DataFrame(scaled_data, columns=numerical_columns, index=df.index)

    # Step 3: put the scaled and encoded parts side by side; both share df's index.
    processed_df = pd.concat([scaled_df, encoded_df], axis=1)

    return processed_df

# Build the model-ready frame from the cleaned notebook-level `df`.
processed_df = preprocess_car_data(df)

# Preview the result: scaled numeric columns first, then the one-hot columns.
processed_df.head()

(18,)
encode data (6019, 18)


Unnamed: 0,Year,Kilometers_Driven,Owner_Type,Mileage,Engine,power,Seats,Price,Location_Ahmedabad,Location_Bangalore,...,Location_Kolkata,Location_Mumbai,Location_Pune,Fuel_Type_CNG,Fuel_Type_Diesel,Fuel_Type_Electric,Fuel_Type_LPG,Fuel_Type_Petrol,Transmission_Automatic,Transmission_Manual
0,0.571429,0.011051,0.0,0.793083,0.156261,0.045569,0.5,0.00821,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.809524,0.006282,0.0,0.418903,0.254809,0.174971,0.5,0.075583,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.619048,0.007051,0.0,0.387597,0.190179,0.103652,0.5,0.025445,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,0.666667,0.013359,0.0,0.442329,0.198448,0.103766,0.7,0.034846,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,0.714286,0.006231,0.333333,0.323707,0.319946,0.202739,0.5,0.108423,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [14]:
# now we have the data ready for the model building. which is the linear regression model
import mlflow
import mlflow.sklearn as mlflow_sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error , r2_score

# Point MLflow at the tracking server; this URI expects one to be listening on localhost:5000.
mlflow.set_tracking_uri("http://localhost:5000")

# Train and evaluate inside one MLflow run so the metrics and the model are recorded together.
with mlflow.start_run():
    # Features are every processed column except the target.
    x = processed_df.drop(columns = ['Price'])
    # NOTE(review): 'Price' here is the min-max scaled value produced by preprocess_car_data,
    # so the MSE below is in scaled units rather than in the original price units.
    y = processed_df['Price']
    # Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    lr = LinearRegression()
    lr.fit(x_train, y_train)
    # Score on the held-out rows only.
    y_pred = lr.predict(x_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    # Log the two evaluation metrics, then the fitted model under the artifact path "linear_regression".
    mlflow.log_metric("Mean Squared Error", mse)
    mlflow.log_metric("R-squared", r2)
    mlflow.sklearn.log_model(lr, "linear_regression")
