Step 1: Data Cleaning & Feature Engineering

Data Cleaning

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [3]:
# Location of the raw dataset -- replace with the path to your own copy.
raw_csv_path = '/home/lola/machine-learning-project/data.csv'
data = pd.read_csv(raw_csv_path)

In [None]:
# Build `clean_data` from the raw `data` frame:
#   1. drop exact duplicate rows and the unused 'Market Category' column,
#   2. mark engine figures of electric cars (and 'Power' of all other cars) as 'not applicable',
#   3. collapse cars that are identical in everything but price/popularity into one averaged row.
clean_data = data.copy()
clean_data = clean_data.drop_duplicates() # remove exact duplicate rows
clean_data = clean_data.drop('Market Category', axis = 1) # remove column 'Market Category'

# The engine columns are about to hold the string marker next to numbers, so make them
# object dtype explicitly instead of relying on pandas' deprecated silent upcast.
clean_data = clean_data.astype({'Engine HP': object, 'Engine Cylinders': object})

# set columns 'Engine HP' and 'Engine Cylinders' to 'not applicable' for electric cars
is_electric = clean_data['Engine Fuel Type'] == 'electric'
clean_data.loc[is_electric, ['Engine HP', 'Engine Cylinders']] = 'not applicable'

# add column 'Power' for electric cars and initialise it with null values
# (object dtype, because the non-electric rows receive a string right below)
clean_data['Power'] = pd.Series(np.nan, index=clean_data.index, dtype=object)
# fill column 'Power' for non-electric cars with 'not applicable'
clean_data.loc[~is_electric, 'Power'] = 'not applicable'

# check for identical cars with different price and popularity rating
columns_to_check  = ['Make', 'Model', 'Year', 'Engine Fuel Type', 'Engine HP', 'Engine Cylinders', 'Transmission Type', 'Driven_Wheels', 'Number of Doors', 'Vehicle Size', 'Vehicle Style', 'highway MPG', 'city mpg', 'Power']
columns_to_ignore = ['MSRP', 'Popularity']
duplicates = clean_data.duplicated(subset=columns_to_check, keep=False)

# calculate the mean value for price and popularity for identical cars and substitute the multiple identical rows with the summarized rows
duplicate_rows = clean_data[duplicates]
# dropna=False keeps groups whose key columns contain NaN (e.g. the still-empty 'Power'
# of electric cars); with the default they would vanish from the summary.
summary = duplicate_rows.groupby(columns_to_check, dropna=False)[columns_to_ignore].mean().reset_index()
# Remove EVERY member of a duplicate group before appending the averaged row. Keeping the
# first member (as drop_duplicates does by default) would leave each such car in twice.
clean_data = pd.concat([clean_data[~duplicates], summary], ignore_index=True)

Disclaimer: Please do not overwrite an existing clean_data.csv. The file was edited manually to replace the null values in the 'Power' column of the electric cars with actual (researched) power values, and re-exporting it would discard those edits.

In [None]:
# clean_data.csv is completed by hand after the first export (researched power values for
# the electric cars), so an existing file must never be replaced by a re-run of this cell.
# Mode 'x' makes open() raise FileExistsError instead of truncating an existing file.
try:
    with open('clean_data.csv', 'x', newline='', encoding='utf-8') as csv_file:
        clean_data.to_csv(csv_file, index=False)
except FileExistsError:
    print('clean_data.csv already exists - export skipped to preserve the manual edits.')

In [4]:
# Reload the cleaned dataset from disk.
data = pd.read_csv('/home/lola/machine-learning-project/clean_data.csv')

# Discard every row that still contains a genuine missing value.
data = data.dropna()

workable_data = data.copy()

# The 'not applicable' placeholder is not a number: blank it out and give each of the
# three affected columns a numeric dtype (anything non-numeric is coerced to NaN).
for column in ('Engine HP', 'Engine Cylinders', 'Power'):
    placeholder_mask = workable_data[column] == 'not applicable'
    workable_data.loc[placeholder_mask, column] = pd.NA
    workable_data[column] = pd.to_numeric(workable_data[column], errors='coerce')

Feature Engineering

In [21]:
# Encode the categorical columns, standardise the numerical ones (both in place on
# `workable_data`) and export the result to 'binarized_standardized_data.csv'.
categorical_features = ['Make', 'Model', 'Engine Fuel Type', 'Transmission Type', 'Driven_Wheels', 'Vehicle Size', 'Vehicle Style', 'Engine Cylinders', 'Number of Doors', 'Year']
numerical_features = ['Engine HP','highway MPG', 'city mpg', 'Popularity', 'Power']

target_variable = 'MSRP'

# Label-encode every categorical feature and store the integer code as its base-2 digit
# string (e.g. 5 -> '101'). Each feature stays a single column; the fitted encoder is
# kept per feature so the codes can be mapped back to the original labels.
label_encoders = {}
for feature in categorical_features:
    encoder = LabelEncoder()
    codes = encoder.fit_transform(workable_data[feature])
    label_encoders[feature] = encoder
    workable_data[feature] = [format(int(code), 'b') for code in codes]

# Standardise all numerical features with ONE fit. StandardScaler computes mean and
# variance per column, so the values match a column-by-column fit, but `scaler` now
# retains the parameters of every feature instead of only the last one fitted.
scaler = StandardScaler()
workable_data[numerical_features] = scaler.fit_transform(workable_data[numerical_features])

preprocessed_data = pd.DataFrame(workable_data)

preprocessed_data.to_csv('binarized_standardized_data.csv', index=False)

In [8]:
import pandas as pd
from prince import FAMD

# Reduce the preprocessed features to three FAMD components and export them together
# with the untouched target column.
data = pd.read_csv('/home/lola/machine-learning-project/binarized_standardized_data.csv')

# fill null values with an extreme value that is meant to be recognised as invalid by the model
# NOTE(review): the numerical columns were standardised upstream, so -9999 is a huge
# outlier for them and may dominate the extracted components -- confirm this is intended.
data = data.fillna(-9999)

# transform categorical features to the datatype 'category'
# This has to happen BEFORE `features` is split off: `drop` returns a new frame, so a
# cast applied to `data` afterwards never reaches the frame that is handed to FAMD.
categorical_features = ['Make', 'Model', 'Engine Fuel Type', 'Transmission Type', 'Driven_Wheels', 'Vehicle Size', 'Vehicle Style', 'Engine Cylinders', 'Number of Doors', 'Year']
data = data.astype({col: 'category' for col in categorical_features})

target = data['MSRP']
features = data.drop(columns=['MSRP'])

# define desired dimensions
famd = FAMD(n_components=3)

# dimension reduction of the data features with famd
famd.fit(features)
reduced_data = famd.transform(features)
reduced_data_with_target = pd.concat([pd.DataFrame(reduced_data), target], axis=1)

# save as csv
reduced_data_with_target.to_csv('famd_data.csv', index=False)

  X = self.scaler_.transform(X.to_numpy())
  X = self.scaler_.transform(X.to_numpy())
