In [98]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

In [99]:
df = pd.read_excel(r'../Datasets/Turbo_az_Kia_all_vehicles.xlsx')
df.head(2)

Unnamed: 0,City,Brand,Model,Year,Body Type,Color,Engine Details,Mileage,Transmission,Drive Type,İs_New?,Seat Count,Owner Count,Condition,Origin,Price,Product_link,Saler_name
0,Bakı,Kia,Cerato,2019,Sedan,Boz,2.0 L/150 a.g./Benzin,25 000 km,Avtomat,Ön,Xeyr,5,değer yok,"Vuruğu yoxdur, rənglənməyib",Amerika,30 500 AZN,https://turbo.az/autos/7577541-kia-cerato,"Avtosalon ""Eurocar"""
1,Bakı,Kia,Sorento,2015,Offroader / SUV,Qara,2.0 L/184 a.g./Dizel,44 000 km,Avtomat,Tam,Xeyr,değer yok,1,"Vuruğu yoxdur, rənglənməyib",Koreya,46 500 AZN,https://turbo.az/autos/7509258-kia-sorento,"Avtosalon ""AEN Cars"""


In [100]:
# Note: These steps assume 'Price' and 'Mileage' are initially strings and contain units
df['Price'] = df['Price'].str.extract('(\d+)', expand=False).astype(float)
df['Mileage'] = df['Mileage'].str.replace(' km', '').str.replace(' ', '').astype(float)

In [101]:
# Handling non-numeric entries for 'Owner Count'
df['Owner Count'] = pd.to_numeric(df['Owner Count'], errors='coerce')
df['Owner Count'].fillna(df['Owner Count'].median(), inplace=True)

In [102]:
# Feature Selection
features = ['Year', 'Mileage', 'Transmission', 'Drive Type', 'Body Type', 'Owner Count']
X = df[features]
y = df['Price']

In [103]:
# Encoding and Scaling
numeric_features = ['Year', 'Mileage', 'Owner Count']
categorical_features = ['Transmission', 'Drive Type', 'Body Type']
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

In [104]:
# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [105]:
# Model Pipeline
model = make_pipeline(preprocessor, LinearRegression())

In [106]:
# Fitting the Model
model.fit(X_train, y_train)

In [107]:
# Predictions
y_pred = model.predict(X_test)

In [108]:
# Evaluation
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [109]:
# Output the performance metrics
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 48.82219848450217
R-squared: 0.7449798198141178
