In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import warnings

In [3]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

from sklearn.preprocessing import PolynomialFeatures
import re

def extract_engine_features(engine_str):
    # Extract displacement
#     displacement = re.search(r"(\d+\.\d+)L", engine_str)
    # engine_displacement = float(displacement.group(1)) if displacement else None

    # Extract horsepower
    horsepower = re.search(r"(\d+\.\d+)HP", engine_str)
    engine_horsepower = float(horsepower.group(1)) if horsepower else None

    # # Extract number of cylinders
    # num_cylinders = re.search(r"\d+\s?Cylinder", engine_str)
    # num_cylinders = int(num_cylinders.group().split()[0]) if num_cylinders else None

    # # Extract cylinder configuration
    # cylinder_config = "V" if "V" in engine_str else "Straight" if "Straight" in engine_str else None

    # # # Extract fuel type
    # # fuel_type = re.search(r"(Gasoline|Diesel|Electric|Flex Fuel)", engine_str)
    # # fuel_type = fuel_type.group(1) if fuel_type else None

    return engine_horsepower #, num_cylinders, cylinder_config, engine_displacement

def extract_engine_features_2(engine_str):
    """
    This Functipn Extracts the following features from the engine column:
    - Displacement
    - Engine Config
        1. V-shaped
        2. Straight or inline
        3. Rotary
        4. Flat
        5. Other, for electric car which do not have engines 
    - Number of Cylinders
    - PDI(Port Direct Injection)
    - DOHC(Double OverHead Camshaft)
    - SOHC(Single OverHead Camshaft)
    - Turbo, categorical data with Three values Turbo=1. Twin Turbo = 2 and None  = 0
    - GDI(Gasoline Direct injection)
    - MPFI (Multi Point Injection)
    """
    features = {}

    # Displacement
    displacement_match = re.search(r'(\d+(?:\.\d+)?)\s*L(?:iter)?', engine_str)
    if 'electric' in engine_str or 'Electric' in engine_str and 'hybrid' not in engine_str: # Check if car is solely electric
        features['displacement'] = float(0)
    else:
        features['displacement'] = float(displacement_match.group(1)) if displacement_match else None

    # Engine Configuration
    config_match = re.search(r'(V\d+|I\d+|Straight|Rotary|Flat)', engine_str, re.IGNORECASE)
    if 'electric' in engine_str or 'Electric' in engine_str and 'hybrid' not in engine_str: # check if car is solely electric
        features['Engine_Config'] = 'electric'
    elif config_match:
        config = config_match.group(1).lower()
        if config.startswith('i'):
            features['Engine_Config'] = 'straight'
        elif config.startswith('v'):
            features['Engine_Config'] = 'v'
        else:
            features['Engine_Config'] = config
    else:
        features['Engine_Config'] = 'Unknown'

    # Number of cylinders
    cylinder_match = re.search(r'(?:V|I)(\d+)|(\d+)\s*Cylinder', engine_str)
    if 'electric' in engine_str or 'Electric' in engine_str and 'hybrid' not in engine_str: # check if car is solely electric
        features['Num_Cylinders'] = int(0)
    elif cylinder_match:
        features['Num_Cylinders'] = cylinder_match.group(1) or cylinder_match.group(2) if cylinder_match else None
    else:
        features['Num_Cylinders'] = None

    # Categorical Features
    features['PDI'] =  True if 'PDI' in engine_str else False
    features['DOHC'] = True if 'DOHC' in engine_str else False
    features['SOHC'] = True if 'SOHC' in engine_str else False
    features['OHV'] =  True if 'OHV' in engine_str else False
    features['MPFI'] = True if 'MPFI' in engine_str else False 

    # Turbo 
    if 'Twin Turbo' in engine_str:
        features['Turbo'] = 2
    elif 'Turbo' in engine_str and 'Twin' not in engine_str:
        features['Turbo'] = 1
    else:
        features['Turbo'] = 0
    
    return features


def extract_all_features(data):
    extracted_features = data['engine'].apply(extract_engine_features_2)
    return pd.DataFrame(extracted_features.tolist())


engine_features_train = extract_all_features(train)
engine_features_test = extract_all_features(test)


train.head()

test.head()

def car_sales_preprocessing_train(data):
    # Drop columns which do not improve model predictions
    data.drop(['model', 'id', 'clean_title'], axis=1, inplace=True)
    
    # Handle missing fuel_type 
    data.fuel_type.replace('–', 'Electric', inplace=True)
    data.fuel_type.replace('not supported', 'Electric', inplace=True)

    # Extract engine features from test data
    data[['horsepower']] = data.engine.apply(extract_engine_features).apply(pd.Series)

    # Fill missing data in extracted engine features 
    data.horsepower.fillna(data.horsepower.mean(), inplace =True)
    #data.displacement.fillna(data.displacement.mean(), inplace=True)

    # Drop engine column in test data
    data.drop('engine', axis=1, inplace=True)

    # Extract car age from model_year column
    data["age"] = 2024 - data.model_year

    # Drop model_year column from test dataset
    data.drop('model_year', axis=1, inplace=True)

    for label, content in data.items():
        if pd.api.types.is_string_dtype(content) or pd.api.types.is_object_dtype(content):
            # Convert all object and string data types into categories 
            data[label] = content.astype('category').cat.as_ordered()

    # Turn categorical values into numbers 
    for label, content in data.items():
        if not pd.api.types.is_numeric_dtype(content):
            data[label] = pd.Categorical(content).codes + 1

    Q1 = data.price.quantile(0.25)
    Q3 = data.price.quantile(0.75)
    IQR = Q3 - Q1
    # Using the IQR value to filter out the outliers 
    data = data[~((data.price < (Q1 - 1.5 * IQR)) | (data.price > (Q3 + 0.7 * IQR)))]
    

    return data

def car_sales_preprocessing_test(data):
    # Drop columns which do not improve model predictions
    data.drop(['model', 'id', 'clean_title'], axis=1, inplace=True)
    
    # Handle missing fuel_type 
    data.fuel_type.replace('–', 'Electric', inplace=True)
    data.fuel_type.replace('not supported', 'Electric', inplace=True)

    # Extract engine features from test data
    data[['horsepower']] = data.engine.apply(extract_engine_features).apply(pd.Series)

    # Fill missing data in extracted engine features 
    data.horsepower.fillna(data.horsepower.mean(), inplace =True)
#     data.displacement.fillna(data.displacement.mean(), inplace=True)

    # Drop engine column in test data
    data.drop('engine', axis=1, inplace=True)

    # Extract car age from model_year column
    data["age"] = 2024 - data.model_year

    # Drop model_year column from test dataset
    data.drop('model_year', axis=1, inplace=True)

    for label, content in data.items():
        if pd.api.types.is_string_dtype(content) or pd.api.types.is_object_dtype(content):
            # Convert all object and string data types into categories 
            data[label] = content.astype('category').cat.as_ordered()

    # Turn categorical values into numbers 
    for label, content in data.items():
        if not pd.api.types.is_numeric_dtype(content):
            data[label] = pd.Categorical(content).codes + 1

#     Q1 = data.price.quantile(0.25)
#     Q3 = data.price.quantile(0.75)
#     IQR = Q3 - Q1
#     # Using the IQR value to filter out the outliers 
#     data = data[~((data.price < (Q1 - 1.5 * IQR)) | (data.price > (Q3 + 0.7 * IQR)))]
    

    return data

train = car_sales_preprocessing_train(train)
test = car_sales_preprocessing_test(test)

engine_features_train.head()

train[['displacement', 'engine_config', 'cylinder_count', 'turbo']] = engine_features_train[['displacement', 'Engine_Config', 'Num_Cylinders', 'Turbo']]
test[['displacement', 'engine_config', 'cylinder_count', 'turbo']] = engine_features_test[['displacement', 'Engine_Config', 'Num_Cylinders', 'Turbo']]

train.head(3)

test.head(3)

test.displacement.fillna(np.mean(test.displacement), inplace=True)

train.info()

train.isna().sum()

test.isna().sum()

train.displacement.fillna(np.mean(train.displacement), inplace = True)
train.cylinder_count.fillna(np.mean(int(6)), inplace = True)
test.cylinder_count.fillna(np.mean(int(6)), inplace = True)

train.head(3)

test.head(3)

# Handling Engine Config

def convert_object_to_int(data1):
    data = data1.copy()
    for label, content in data.items():
            if pd.api.types.is_string_dtype(content) or pd.api.types.is_object_dtype(content):
                # Convert all object and string data types into categories 
                data[label] = content.astype('category').cat.as_ordered()

    # Turn categorical values into numbers 
    for label, content in data.items():
        if not pd.api.types.is_numeric_dtype(content):
            data[label] = pd.Categorical(content).codes + 1
            
    return data

# train['cylinder_count'] = train.cylinder_Count
# train.drop('cylinder_Count', axis=1, inplace=True)

train

test.head()

train = convert_object_to_int(train)
test  = convert_object_to_int(test)

# split data into features and target
X = train.drop('price', axis=1)
y = train.price

# Supress all warnings 
warnings.filterwarnings("ignore")

X.head()

test.head()

train.drop(['int_col', 'ext_col'], axis=1, inplace=True)
test.drop(['int_col', 'ext_col'], axis=1, inplace=True)


train.head()

degree = [2, 4, 5]
hyper_parameters = {
    'n_estimators': [100, 200, 300], 
    'max_depth': [4, 8, 12],
    'min_samples_split': [2, 5, 10], 
    'min_samples_leaf': [1, 2, 4]
}

for deg in degree:
    poly = PolynomialFeatures(degree=deg)
    train_poly = poly.fit_transform(X)
    test_poly = poly.fit_transform(test)
    
    rf = RandomForestRegressor(n_jobs=-1, random_state=42)
    grid_search = GridSearchCV(estimator=rf, 
                               cv=5, 
                               param_grid=hyper_parameters, 
                               scoring='neg_mean_squared_error' 
                               )
    # create polynomial regressor
    poly_reg = grid_search
    
    # Fitl the model 
    poly_reg.fit(train_poly, y)
    
    print(f"Poly Degree: {deg} Best Score: {poly_reg.best_score_}")
    print(f"Best Params: {poly_reg.best_params_}")



<class 'pandas.core.frame.DataFrame'>
Index: 47678 entries, 0 to 54272
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   brand           47678 non-null  int8   
 1   milage          47678 non-null  int64  
 2   fuel_type       47678 non-null  int8   
 3   transmission    47678 non-null  int8   
 4   ext_col         47678 non-null  int16  
 5   int_col         47678 non-null  int8   
 6   accident        47678 non-null  int8   
 7   price           47678 non-null  int64  
 8   horsepower      47678 non-null  float64
 9   age             47678 non-null  int64  
 10  displacement    47357 non-null  float64
 11  engine_config   47678 non-null  object 
 12  cylinder_count  47217 non-null  object 
 13  turbo           47678 non-null  int64  
dtypes: float64(2), int16(1), int64(4), int8(5), object(2)
memory usage: 3.6+ MB
