One-Hot Encoding


In [217]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, KFold, StratifiedKFold, LeaveOneOut, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV, RidgeCV, LogisticRegressionCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.utils import shuffle
from sklearn.metrics import make_scorer, r2_score, mean_squared_error

import warnings
warnings.filterwarnings("ignore")

In [218]:
# Read the cleaned data from the CSV file
cleaned_df = pd.read_csv('cleaned_data.csv')

# Now you can use 'cleaned_df' for further data preparation
cleaned_df.head()

Unnamed: 0,car_name,car_prices_in_rupee,kms_driven,fuel_type,transmission,manufacture,engine,Seats,num_owners
0,Jeep Compass 2.0 Longitude Option BSIV,1003000.0,86226,Diesel,0,2017,1956.0,5,1
1,Renault Duster RXZ Turbo CVT,1283000.0,13248,Petrol,1,2021,1330.0,5,1
2,Toyota Camry 2.5 G,1640000.0,60343,Petrol,1,2016,2494.0,5,1
3,Honda Jazz VX CVT,777000.0,26696,Petrol,1,2018,1199.0,5,1
4,Volkswagen Polo 1.2 MPI Highline,515000.0,69414,Petrol,0,2016,1199.0,5,1


In [219]:
unique_fuel_types = cleaned_df['fuel_type'].unique()
print("Unique Fuel Types:", unique_fuel_types)

unique_transmissions = cleaned_df['transmission'].unique()
print("Unique Transmissions:", unique_transmissions)

Unique Fuel Types: ['Diesel' 'Petrol' 'Cng' 'Electric' 'Lpg']
Unique Transmissions: [0 1]


In [220]:
def one_hot_encode(df, columns_to_encode):
    df_encoded = pd.get_dummies(df, columns=columns_to_encode, drop_first=True)
    return df_encoded

# Define the columns to one-hot encode
columns_to_encode = ['fuel_type']

# Apply the one_hot_encode function
df_encoded = one_hot_encode(cleaned_df, columns_to_encode)

#prepared_df.head()
df_encoded.head(20)

Unnamed: 0,car_name,car_prices_in_rupee,kms_driven,transmission,manufacture,engine,Seats,num_owners,fuel_type_Diesel,fuel_type_Electric,fuel_type_Lpg,fuel_type_Petrol
0,Jeep Compass 2.0 Longitude Option BSIV,1003000.0,86226,0,2017,1956.0,5,1,1,0,0,0
1,Renault Duster RXZ Turbo CVT,1283000.0,13248,1,2021,1330.0,5,1,0,0,0,1
2,Toyota Camry 2.5 G,1640000.0,60343,1,2016,2494.0,5,1,0,0,0,1
3,Honda Jazz VX CVT,777000.0,26696,1,2018,1199.0,5,1,0,0,0,1
4,Volkswagen Polo 1.2 MPI Highline,515000.0,69414,0,2016,1199.0,5,1,0,0,0,1
5,Volkswagen Vento 1.2 TSI Highline AT,766000.0,49719,1,2017,1197.0,5,1,0,0,0,1
6,Volkswagen Vento 1.2 TSI Highline Plus AT,758000.0,43688,1,2017,1197.0,5,1,0,0,0,1
7,Honda WR-V VX Diesel,1160000.0,14470,0,2021,1498.0,5,1,1,0,0,0
8,Honda City i VTEC CVT SV,699000.0,21429,1,2015,1497.0,5,1,0,0,0,1
9,Renault Duster Petrol RXS CVT,753000.0,31750,1,2017,1498.0,5,1,0,0,0,1


# Models

In [226]:
X = df_encoded.drop('car_prices_in_rupee', axis = 1) # Selecting independent features 
y = df_encoded['car_prices_in_rupee'] # Selecting target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [230]:
# Linear Regression
def linear_regression_train(X,y,df_t):
    model = LinearRegression()
    model.fit(X, y)
    y_predic = model.predict(X)
    y_test = model.predict(df_t)
    mse = mean_squared_error(y, y_predic)
    mse_std = np.std(y-y_predic)
    r2 = r2_score(y, y_predic)   
    return mse,mse_std, r2, y_test

# Polynomial Regression
def non_linear_regression_train(X,y,df_t,degree=2):
    poly_features = PolynomialFeatures(degree=degree)
    X_poly = poly_features.fit_transform(X)
    t_poly = poly_features.fit_transform(df_t)
    poly_model = LinearRegression()
    poly_model.fit(X_poly, y)
    y_test = poly_model.predict(t_poly)
    y_poly_pred = poly_model.predict(X_poly)
    poly_mse = mean_squared_error(y, y_poly_pred)
    mse_std = np.std(y-y_poly_pred)
    r2 = r2_score(y, y_poly_pred)
    return poly_mse,mse_std, r2, y_test

# Lasso Regression
def lasso_regression_train(X,y,df_t,alpha):
    # alpha between 0 and 1
    model = Lasso(alpha)
    model.fit(X, y)
    y_predic = model.predict(X)
    y_test = model.predict(df_t)
    mse = mean_squared_error(y, y_predic)
    r2 = r2_score(y, y_predic)
    return mse, r2, model, y_test

def lasso_regression_train_cv(X,y,cv,df_t):
    lasso_model = LassoCV(alphas=None, cv=cv, max_iter=100000)
    lasso_model.fit(X, y)
    lasso = Lasso(alpha=lasso_model.alpha_)
    lasso.fit(X, y)
    lasso_y_predic = lasso.predict(X)
    y_pred = lasso.predict(df_t)
    mse = mean_squared_error(y, lasso_y_predic)
    mse_std = np.std(y-lasso_y_predic)
    r2 = r2_score(y, lasso_y_predic)
    return mse,mse_std,r2,y_pred

# Polynomial Lasso Regression
def ploy_lasso_regression_train_cv(X,y,cv,df_t, degree=2):
    scaler = StandardScaler()
    standarized_x = scaler.fit_transform(X)
    ploy_model = LassoCV(alphas=None, cv=cv, max_iter=100000).fit(standarized_x, y)
    poly_lasso = Lasso(alpha=ploy_model.alpha_)
    poly_lasso.fit(X, y)
    poly_lasso_y_predic = poly_lasso.predict(X)
    y_pred = poly_lasso.predict(df_t)
    mse = mean_squared_error(y, poly_lasso_y_predic)
    mse_std = np.std(y-poly_lasso_y_predic)
    r2 = r2_score(y, poly_lasso_y_predic)
    return mse,mse_std,r2,y_pred

# Ridge Regression
def ridge_regression_train(X,y,df_t, alpha):
    # alpha 0 and Inf
    model = Ridge(alpha)
    model.fit(X, y)
    y_predic = model.predict(X)
    y_test = model.predict(df_t)
    mse = mean_squared_error(y, y_predic)
    r2 = r2_score(y, y_predic)
    return mse, r2, model, y_test

def ridge_regression_train_cv(X,y,cv,alpha_range_ridge,df_t):
    ridge_model = RidgeCV(alphas=alpha_range_ridge, scoring= 'neg_mean_squared_error', cv=cv)
    ridge_model.fit(X, y)
    ridge = Ridge(alpha=ridge_model.alpha_)
    ridge.fit(X, y)
    ridge_y_predic = ridge.predict(X)
    y_pred = ridge.predict(df_t)
    mse = mean_squared_error(y, ridge_y_predic)
    mse_std = np.std(y-ridge_y_predic)
    r2 = r2_score(y, ridge_y_predic)
    return mse,mse_std,r2,y_pred

# Polynomial Ridge Regression
def poly_ridge_regression_train_cv(X,y,alphas_ridge,cv,df_t,degree=2):
    scaler = StandardScaler()
    standarized_x = scaler.fit_transform(X)
    ploy_model = RidgeCV(alphas=alphas_ridge,scoring= 'neg_mean_squared_error', cv=cv).fit(standarized_x, y)
    poly_ridge = Ridge(alpha=ploy_model.alpha_)
    poly_ridge.fit(X, y)
    poly_ridge_y_predic = poly_ridge.predict(X)
    y_pred = poly_ridge.predict(df_t)
    mse = mean_squared_error(y, poly_ridge_y_predic)
    mse_std = np.std(y-poly_ridge_y_predic)
    r2 = r2_score(y, poly_ridge_y_predic)
    return mse,mse_std,r2,y_pred

# Gradient Boosting Regression
def gradient_boosting_train(X,y,df_t):
    model = HistGradientBoostingRegressor()
    model.fit(X, y)
    y_predic = model.predict(X)
    y_test = model.predict(df_t)
    mse = mean_squared_error(y, y_predic)
    r2 = r2_score(y, y_predic)
    return mse, r2, model, y_test

# Function that categorizes the columns of a dataframe into continuous, binary and string
def categorize_columns(df):
    categorized_columns = {
        'continuous': [],
        'binary': [],
        'string': []
    }
    
    for column in df.columns:
        unique_values = df[column].dropna().unique()
        if len(unique_values) <= 2 and (0 in unique_values or 1 in unique_values or True in unique_values or False in unique_values):
            categorized_columns['binary'].append(column)
        elif df[column].dtype in ['int64', 'float64','Int64']:
            categorized_columns['continuous'].append(column)
        else:
            categorized_columns['string'].append(column)
    
    return categorized_columns

# Function to evaluate the models
def evaluate(model, X, y, cv):
    cv_results = cross_val_score(model, X, y, cv=cv, scoring=['test_neg_mean_squared_error','test_r2'], return_estimator=True)
    mae = -cv_results['test_neg_mean_squared_error']
    r2 = cv_results['test_r2']
    eval_model = cv_results['estimator']
    print(
        f"Mean Absolute Error:      {mae.mean():.3f} +/- {mae.std():.3f}\n"
        f"R2:                       {r2.mean():.3f} +/- {r2.std():.3f}"
    )
    return eval_model

# Function to run a cross validation
def cross_val(split):
    if split == 1:
        cross = KFold(n_splits=3, shuffle=True, random_state=42)
    elif split == 2:
        cross = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    elif split == 3:
        cross = KFold(n_splits=6, shuffle=True, random_state=42)
    elif split == 4:
        cross = StratifiedKFold(n_splits=6, shuffle=True, random_state=42)
    #elif split == 5:
        #cross = LeaveOneOut()
    else:
        cross = None
    return cross
        


# ED's PART

In [None]:
# Split the data into train and test sets
X = prepared_df.drop('car_prices_in_rupee', axis=1)
y = prepared_df['car_prices_in_rupee']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the train and test datasets
print("X_train:")
print(X_train.head())

print("\nX_test:")
print(X_test.head())

print("\ny_train:")
print(y_train.head())

print("\ny_test:")
print(y_test.head())

X_train:
                                     car_name  kms_driven fuel_type  \
1839                 Jeep Compass 2.0 Limited       98492    Diesel   
4826                         Maruti Swift VXI        6356    Petrol   
957                            Maruti Zen LXI      100000    Petrol   
1158                Volvo XC60 Inscription D5        7500    Diesel   
1215  Mercedes-Benz C-Class Progressive C 200       16500    Petrol   

     transmission  manufacture  engine  Seats  num_owners  \
1839       Manual         2018  1968.0      5           1   
4826       Manual         2021  1248.0      5           1   
957        Manual         2000  1396.0      5           2   
1158    Automatic         2020  1798.0      5           1   
1215    Automatic         2018  1998.0      5           1   

                                     car_name  kms_driven  manufacture  \
1839                 Jeep Compass 2.0 Limited       98492         2018   
4826                         Maruti Swift VXI    

In [None]:
# Create a linear regression model
model = LinearRegression()

# Fit the model to the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Print the model's coefficients and performance metrics
print("Model Coefficients:", model.coef_)
print("Model Intercept:", model.intercept_)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)

ValueError: could not convert string to float: 'Jeep Compass 2.0 Limited'