In [1]:
import pandas as pd
import numpy as np

# Column names as given
columns = ["symboling","normalized_losses","make","fuel_type","aspiration","num_doors","body_style",
           "drive_wheels","engine_location","wheel_base","length","width","height","curb_weight",
           "engine_type","num_cylinders","engine_size","fuel_system","bore","stroke","compression_ratio",
           "horsepower","peak_rpm","city_mpg","highway_mpg","price"]

# Load dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
df = pd.read_csv(url, names=columns)

# Replace '?' with NaN
df.replace("?", np.nan, inplace=True)


In [3]:
df.head(3)

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500


In [7]:
# Numeric (continuous) columns → fill with mean
df.fillna({col: df[col].mean() for col in numeric_cols if col != "price"}, inplace=True)

# Categorical columns → fill with mode
df.fillna({col: df[col].mode()[0] for col in categorical_cols}, inplace=True)

# Drop rows where price is missing
df.dropna(subset=["price"], inplace=True)



In [11]:
df.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,122.0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,122.0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,122.0,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [13]:
from sklearn.preprocessing import LabelEncoder

# (i) Convert num_doors and num_cylinders words to numbers
door_map = {"two":2, "four":4}
cyl_map = {"two":2, "three":3, "four":4, "five":5, "six":6, "eight":8, "twelve":12}

df["num_doors"] = df["num_doors"].map(door_map)
df["num_cylinders"] = df["num_cylinders"].map(cyl_map)




In [15]:
df.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,122.0,alfa-romero,gas,std,2,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,122.0,alfa-romero,gas,std,2,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,122.0,alfa-romero,gas,std,2,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,4,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,4,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [17]:
 # (ii) Dummy encoding
df = pd.get_dummies(df, columns=["body_style","drive_wheels"], drop_first=True)



In [19]:
df.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,engine_location,wheel_base,length,width,...,peak_rpm,city_mpg,highway_mpg,price,body_style_hardtop,body_style_hatchback,body_style_sedan,body_style_wagon,drive_wheels_fwd,drive_wheels_rwd
0,3,122.0,alfa-romero,gas,std,2,front,88.6,168.8,64.1,...,5000.0,21,27,13495.0,False,False,False,False,False,True
1,3,122.0,alfa-romero,gas,std,2,front,88.6,168.8,64.1,...,5000.0,21,27,16500.0,False,False,False,False,False,True
2,1,122.0,alfa-romero,gas,std,2,front,94.5,171.2,65.5,...,5000.0,19,26,16500.0,False,True,False,False,False,True
3,2,164.0,audi,gas,std,4,front,99.8,176.6,66.2,...,5500.0,24,30,13950.0,False,False,True,False,True,False
4,2,164.0,audi,gas,std,4,front,99.4,176.6,66.4,...,5500.0,18,22,17450.0,False,False,True,False,False,False


In [21]:
# (iii) Label encoding
for col in ["make","aspiration","engine_location","fuel_type"]:
    df[col] = LabelEncoder().fit_transform(df[col])



In [23]:
df.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,engine_location,wheel_base,length,width,...,peak_rpm,city_mpg,highway_mpg,price,body_style_hardtop,body_style_hatchback,body_style_sedan,body_style_wagon,drive_wheels_fwd,drive_wheels_rwd
0,3,122.0,0,1,0,2,0,88.6,168.8,64.1,...,5000.0,21,27,13495.0,False,False,False,False,False,True
1,3,122.0,0,1,0,2,0,88.6,168.8,64.1,...,5000.0,21,27,16500.0,False,False,False,False,False,True
2,1,122.0,0,1,0,2,0,94.5,171.2,65.5,...,5000.0,19,26,16500.0,False,True,False,False,False,True
3,2,164.0,1,1,0,4,0,99.8,176.6,66.2,...,5500.0,24,30,13950.0,False,False,True,False,True,False
4,2,164.0,1,1,0,4,0,99.4,176.6,66.4,...,5500.0,18,22,17450.0,False,False,True,False,False,False


In [25]:
# (iv) Fuel system → "pfi" = 1, else 0
df["fuel_system"] = df["fuel_system"].apply(lambda x: 1 if x=="pfi" else 0)

# (v) Engine type → "ohc" = 1, else 0
df["engine_type"] = df["engine_type"].apply(lambda x: 1 if x=="ohc" else 0)

In [27]:
df.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,engine_location,wheel_base,length,width,...,peak_rpm,city_mpg,highway_mpg,price,body_style_hardtop,body_style_hatchback,body_style_sedan,body_style_wagon,drive_wheels_fwd,drive_wheels_rwd
0,3,122.0,0,1,0,2,0,88.6,168.8,64.1,...,5000.0,21,27,13495.0,False,False,False,False,False,True
1,3,122.0,0,1,0,2,0,88.6,168.8,64.1,...,5000.0,21,27,16500.0,False,False,False,False,False,True
2,1,122.0,0,1,0,2,0,94.5,171.2,65.5,...,5000.0,19,26,16500.0,False,True,False,False,False,True
3,2,164.0,1,1,0,4,0,99.8,176.6,66.2,...,5500.0,24,30,13950.0,False,False,True,False,True,False
4,2,164.0,1,1,0,4,0,99.4,176.6,66.4,...,5500.0,18,22,17450.0,False,False,True,False,False,False


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 201 entries, 0 to 204
Data columns (total 30 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   symboling             201 non-null    int64  
 1   normalized_losses     201 non-null    float64
 2   make                  201 non-null    int32  
 3   fuel_type             201 non-null    int32  
 4   aspiration            201 non-null    int32  
 5   num_doors             201 non-null    int64  
 6   engine_location       201 non-null    int32  
 7   wheel_base            201 non-null    float64
 8   length                201 non-null    float64
 9   width                 201 non-null    float64
 10  height                201 non-null    float64
 11  curb_weight           201 non-null    int64  
 12  engine_type           201 non-null    int64  
 13  num_cylinders         201 non-null    int64  
 14  engine_size           201 non-null    int64  
 15  fuel_system           201 no

In [31]:
X = df.drop("price", axis=1).values
y = df["price"].values


In [33]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score




In [35]:
# Split (70-30)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale inputs
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train model
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)

# Evaluate
y_pred = lr.predict(X_test_scaled)
print("R2 Score:", r2_score(y_test, y_pred))

R2 Score: 0.8771068326600799


In [37]:
from sklearn.decomposition import PCA


In [39]:
 # Apply PCA (keep 95% variance)
pca = PCA(0.95)
X_reduced = pca.fit_transform(X)

# Split reduced dataset
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X_reduced, y, test_size=0.3, random_state=42)

# Train again
lr_pca = LinearRegression()
lr_pca.fit(X_train_r, y_train_r)

y_pred_r = lr_pca.predict(X_test_r)
print("R2 Score after PCA:", r2_score(y_test_r, y_pred_r))

R2 Score after PCA: 0.7039833610028075
