In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest, f_regression
import time
from sklearn.model_selection import train_test_split

In [2]:
# Read 'cleaneddata.csv' into the working DataFrame.
# NOTE(review): the previous comment on this cell was garbled and mentioned
# "usedcars.csv" -- presumably an earlier input path; confirm it is obsolete.
data = 'cleaneddata.csv'
df = pd.read_csv(filepath_or_buffer=data)

In [3]:
# Encode the 'isOneOwner' flag as integers: 'f' -> 0, 't' -> 1.
# Series.map turns every value that is not a key of the mapping into NaN, so
# running this cell a second time (when the column already holds 0/1) would
# silently wipe the column. Only map while raw 'f'/'t' flags are still
# present, which makes the cell safe to re-run.
one_owner_codes = {'f': 0, 't': 1}
if df['isOneOwner'].isin(list(one_owner_codes)).any():
    df['isOneOwner'] = df['isOneOwner'].map(one_owner_codes)

In [4]:
# Replace each category in the listed columns with a numeric code.
# NOTE(review): ordinal codes impose an arbitrary order on these categories,
# and the linear models fitted later treat the codes as numeric magnitudes --
# confirm this encoding is intended.
categorical_columns = ['trim', 'color', 'fuel', 'region']
ordinal_encoder = OrdinalEncoder()
ordinal_encoded = ordinal_encoder.fit_transform(df[categorical_columns])
# Reuse df's index: this frame is later concatenated column-wise with df, and
# pd.concat aligns rows on index labels, so a fresh 0..n-1 index would
# misalign (and NaN-pad) rows whenever df's index is not the default one.
ordinal_encoded_df = pd.DataFrame(
    ordinal_encoded,
    columns=categorical_columns,
    index=df.index,
)
ordinal_encoded_df

Unnamed: 0,trim,color,fuel,region
0,0.0,0.0,1.0,6.0
1,0.0,5.0,1.0,2.0
2,0.0,4.0,1.0,2.0
3,3.0,3.0,1.0,2.0
4,3.0,3.0,1.0,6.0
...,...,...,...,...
20058,7.0,0.0,1.0,8.0
20059,2.0,0.0,2.0,6.0
20060,2.0,0.0,2.0,8.0
20061,2.0,4.0,2.0,6.0


In [5]:
# Swap the raw categorical columns for their ordinal-encoded counterparts:
# drop the originals from df, then append the encoded columns side by side.
df = pd.concat(
    objs=[df.drop(columns=categorical_columns), ordinal_encoded_df],
    axis='columns',
)

In [6]:
# Separate the target column ('price') from the predictor columns.
y = df['price']
X = df.drop(columns=['price'])

In [7]:
# Fit a random forest regressor on all features so its feature importances
# can be inspected.
# random_state is fixed because the forest is built from random bootstrap
# samples and random feature subsets; without a seed the importances change
# on every run. 42 matches the seed used for the train/test split later on.
rf = RandomForestRegressor(random_state=42)
rf.fit(X, y)

RandomForestRegressor()

In [8]:
# Report every feature next to the importance the fitted forest assigned it
for feature, importance in zip(X.columns, rf.feature_importances_):
    print(f'{feature}: {importance}')

isOneOwner: 0.002295652348513824
mileage: 0.14779406230999423
year: 0.8025725462060611
displacement: 0.005555288172719665
trim: 0.025485906556256
color: 0.006784533301012487
fuel: 0.0005332794467498618
region: 0.008978731658692733


In [9]:
# Build a Ridge regression (regularization strength alpha=1.0) and fit it on
# all features; fit() returns the estimator itself, so it can be chained.
ridge = Ridge(alpha=1.0).fit(X, y)
ridge

Ridge()

In [10]:
# Recursive feature elimination with the Ridge estimator: remove one feature
# per iteration (step=1) until five features remain.
# NOTE(review): X is not standardized before this step, so the magnitudes of
# the Ridge coefficients depend on each feature's units -- confirm that the
# resulting ranking is what is wanted.
rfe = RFE(estimator=ridge, step=1, n_features_to_select=5).fit(X, y)
rfe

RFE(estimator=Ridge(), n_features_to_select=5)

In [11]:
# Show which columns survived the recursive feature elimination
selected_features = X.columns[rfe.get_support()]
print(selected_features)

Index(['isOneOwner', 'year', 'displacement', 'trim', 'fuel'], dtype='object')


In [12]:
# Univariate feature selection: keep the 5 features with the highest
# f_regression scores against the target.
skb = SelectKBest(f_regression, k=5)
# perf_counter is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() follows the wall clock, which can be adjusted
# mid-measurement and may have coarser resolution.
skb_start = time.perf_counter()
X_new = skb.fit_transform(X, y)
skb_stop = time.perf_counter()

# Elapsed seconds for fitting the selector and transforming X
print(skb_stop-skb_start)

selected_columns = X.columns[skb.get_support()]
print("Selected columns using SelectKBest:", list(selected_columns))

0.011017084121704102
Selected columns using SelectKBest: ['isOneOwner', 'mileage', 'year', 'displacement', 'trim']


In [13]:
# Keep only the columns chosen by SelectKBest. Deriving the list from
# selected_columns (instead of retyping the names by hand) keeps this frame
# in sync with the selection if the data or k changes.
X_final = X[list(selected_columns)]
X_final

Unnamed: 0,isOneOwner,mileage,year,displacement,trim
0,0,193.296,1995,3.2,0.0
1,0,129.948,1995,3.2,0.0
2,0,140.428,1997,3.2,0.0
3,0,113.622,1999,4.2,3.0
4,0,167.673,1999,4.2,3.0
...,...,...,...,...,...
20058,1,17.181,2013,4.6,7.0
20059,1,53.885,2010,3.5,2.0
20060,0,47.484,2010,3.5,2.0
20061,0,42.972,2010,3.5,2.0


In [18]:
# Assemble one frame with the target ('price') first, followed by the
# selected feature columns.
new_df = pd.concat(objs=[df['price'], X_final], axis='columns')
new_df

Unnamed: 0,price,isOneOwner,mileage,year,displacement,trim
0,2.988,0,193.296,1995,3.2,0.0
1,6.595,0,129.948,1995,3.2,0.0
2,7.993,0,140.428,1997,3.2,0.0
3,5.995,0,113.622,1999,4.2,3.0
4,3.000,0,167.673,1999,4.2,3.0
...,...,...,...,...,...,...
20058,67.950,1,17.181,2013,4.6,7.0
20059,31.995,1,53.885,2010,3.5,2.0
20060,34.995,0,47.484,2010,3.5,2.0
20061,38.991,0,42.972,2010,3.5,2.0


In [19]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is reproducible.
# The combined frame is split (rather than X_final and y separately) so the
# target stays next to its features and each split can be saved as one file.
train_df, test_df = train_test_split(
    new_df,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Persist each split to its own CSV file; index=False leaves the row index
# out of the written files.
train_df.to_csv(path_or_buf='train_data.csv', index=False)
test_df.to_csv(path_or_buf='test_data.csv', index=False)