In [4]:
import joblib

import pandas as pd
import numpy as np

from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

In [5]:
# Keep the raw CSV and the exported table in the same directory: the training
# cells read 'E:/Python/3SP25/example.csv', so exporting to a path relative to
# the kernel's working directory could leave them reading a stale file.
data_dir = "E:/Python/3SP25"
file_path = f"{data_dir}/BIKE DETAILS.csv"
df = pd.read_csv(file_path)

# Binary encodings: 'Individual' -> 1, 'Dealer' -> 0 (any other label -> NaN);
# '1st owner' -> 1, every other owner value (including missing) -> 0.
df['seller_type'] = df['seller_type'].map({'Individual': 1, 'Dealer': 0})
df['owner'] = df['owner'].eq('1st owner').astype(int)

# Mean resale ratio (selling price / ex-showroom price) over the rows where
# both prices are usable. Non-positive ex-showroom prices are excluded so a
# zero cannot turn the ratio (and therefore the mean) into inf.
valid_df = df[df["selling_price"].notna() & df["ex_showroom_price"].gt(0)].copy()
valid_df["weight"] = valid_df["selling_price"] / valid_df["ex_showroom_price"]
avg = valid_df["weight"].mean()

# Impute a missing ex-showroom price from the selling price and the mean ratio.
mask = df["ex_showroom_price"].isna() & df["selling_price"].notna()
df.loc[mask, "ex_showroom_price"] = df.loc[mask, "selling_price"] / avg

# Replace each bike name by an integer code and persist the code -> name index
# so the inference code can apply the same encoding.
df["name"], name_labels = pd.factorize(df["name"])
joblib.dump(name_labels, 'name_labels.pkl')

print(df.head())

output_path = f"{data_dir}/example.csv"
df.to_csv(output_path, index=False)
print("Done.")

   name  selling_price  year  seller_type  owner  km_driven  ex_showroom_price
0     0         175000  2019            1      1        350      267815.043790
1     1          45000  2017            1      1       5650       68866.725546
2     2         150000  2018            1      1      12000      148114.000000
3     3          65000  2015            1      1      23000       89643.000000
4     4          20000  2011            1      0      21000       30607.433576
Done.


In [7]:
# Load the preprocessed table (expected to be the CSV exported by the
# preprocessing step).
df = pd.read_csv('E:/Python/3SP25/example.csv')

# The selling price is the target; every other column is a feature.
y = df['selling_price']
X = df.drop(columns=['selling_price'])

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Alternative estimator that was tried here: LinearRegression()
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the fitted forest on the held-out rows.
y_pred = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f'MSE: {mse}')
print(f'R2 Score: {r2}')

# Persisting is currently disabled: joblib.dump(model, 'bike_price_model.pkl')

MSE: 233340199.70561096
R2 Score: 0.9115059582082943


In [8]:
# Load the preprocessed table (expected to be the CSV exported by the
# preprocessing step).
df = pd.read_csv('E:/Python/3SP25/example.csv')

# The selling price is the target; every other column is a feature.
y = df['selling_price']
X = df.drop(columns=['selling_price'])

# Same 80/20 split and seed as the random-forest run, so scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Alternative estimator that was tried here:
# RandomForestRegressor(n_estimators=100, random_state=42)
model = LinearRegression().fit(X_train, y_train)

# Score the fitted linear model on the held-out rows.
y_pred = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f'MSE: {mse}')
print(f'R2 Score: {r2}')

# Persisting is currently disabled: joblib.dump(model, 'bike_price_model.pkl')

MSE: 272012036.9113196
R2 Score: 0.8968397018917161


In [None]:
# Restore the fitted estimator and the bike-name label index from disk.
# NOTE(review): joblib.load unpickles arbitrary objects; only load files from a
# trusted source.
# NOTE(review): the training cells visible in this notebook only contain
# commented-out joblib.dump calls for 'bike_price_model.pkl', so that file has
# to exist already for this cell to run.
model = joblib.load('bike_price_model.pkl')
name_labels = joblib.load('name_labels.pkl')

def preprocess_input(user_input):
    """Turn one raw listing into the single-row feature frame the model expects.

    The categorical fields are encoded with the same rules the preprocessing
    step applies to the training data ('Individual' -> 1, 'Dealer' -> 0;
    '1st owner' -> 1, anything else -> 0). Previously these labels were pushed
    through ``pd.to_numeric(..., errors='coerce')`` and zero-filled, so every
    request was scored as a dealer sale by a non-first owner.

    Parameters
    ----------
    user_input : dict
        Must contain the keys ``name``, ``year``, ``seller_type``, ``owner``,
        ``km_driven`` and ``ex_showroom_price``. Values may be strings or
        numbers; ``seller_type`` and ``owner`` accept either the raw labels or
        already-encoded numeric codes.

    Returns
    -------
    pandas.DataFrame
        One row whose columns follow ``model_columns``. ``name`` is the
        position of the name inside ``name_labels`` (-1 when it is absent);
        values that cannot be interpreted end up as 0.

    Raises
    ------
    KeyError
        If a required key is missing from ``user_input``.
    """
    model_columns = ['name', 'year', 'seller_type', 'owner', 'km_driven', 'ex_showroom_price']
    seller_type_codes = {'Individual': 1, 'Dealer': 0}
    first_owner_label = '1st owner'

    df_input = pd.DataFrame([user_input])

    # get_indexer yields -1 for a name that is not part of name_labels.
    df_input['name'] = name_labels.get_indexer([df_input['name'][0]])

    # Known labels get their training code; anything else falls back to a
    # numeric parse so callers that already pass 0/1 keep working.
    seller_raw = df_input['seller_type']
    df_input['seller_type'] = seller_raw.map(seller_type_codes).fillna(
        pd.to_numeric(seller_raw, errors='coerce')
    )

    # A numeric value is taken as an existing code; a text value is 1 only when
    # it is exactly the first-owner label.
    owner_raw = df_input['owner']
    owner_from_label = owner_raw.eq(first_owner_label).astype(int)
    df_input['owner'] = pd.to_numeric(owner_raw, errors='coerce').fillna(owner_from_label)

    for col in ['year', 'km_driven', 'ex_showroom_price']:
        df_input[col] = pd.to_numeric(df_input[col], errors='coerce')

    # Whatever could not be parsed becomes 0, then enforce the column order.
    df_input = df_input.fillna(0)

    return df_input[model_columns]

def predict_price(data_input):
    """Predict the price for a single raw listing.

    The listing is converted with ``preprocess_input`` and passed to the
    module-level ``model``; the first (and only) element of the prediction
    array is returned.
    """
    features = preprocess_input(data_input)
    first_prediction = model.predict(features)[0]
    return first_prediction

# Example listing; every value is supplied as text, the way a form would send it.
data_input = dict(
    year='2016',
    km_driven='21100',
    name='Honda X-Blade',
    ex_showroom_price='40000000',
    seller_type='Individual',
    owner='1st owner',
)

# Run the full preprocess -> predict pipeline and show the formatted estimate.
price = predict_price(data_input)
print(f"Giá xe dự đoán: {price:,.0f} VNĐ")