# Training and finding the best model for the submission

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [2]:
# Load the training and test sets, using the `id` column as the row index.
# NOTE(review): the paths are relative, so they resolve against the notebook's
# working directory — confirm `../data2/` exists wherever this is run.
df = pd.read_csv("../data2/train.csv", index_col='id')
test_df = pd.read_csv("../data2/test.csv", index_col='id')

In [3]:
# Preview the first rows of the training frame as a quick sanity check on the load.
df.head()

Unnamed: 0_level_0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


In [4]:
# Group the raw columns by kind.
# NOTE(review): presumably these lists select columns for per-group
# encoding/scaling in later cells — confirm against their usage.
# Text columns: make, model, exterior colour and interior colour.
ord_col = ['brand', 'model', 'ext_col', 'int_col']
# Status-style text columns (accident report and title status).
cat_col = ['accident', 'clean_title']
# Numeric columns.
num_col = ['model_year', 'milage']

In [64]:
# functions
def handle_nan(df):
    '''Return a new frame with missing values replaced by placeholder labels.

    The input frame is left untouched. Three columns are rewritten:

    * ``fuel_type``   - NaN and the en-dash placeholder become ``'missing'``
    * ``accident``    - NaN becomes ``'None reported'``
    * ``clean_title`` - NaN becomes ``'no'``
    '''
    fuel_filled = df['fuel_type'].replace([np.nan, '–'], 'missing')
    accident_filled = df['accident'].replace([np.nan], 'None reported')
    title_filled = df['clean_title'].replace(np.nan, 'no')

    # assign() builds a fresh frame, so the caller's frame is never mutated.
    return df.assign(
        fuel_type=fuel_filled,
        accident=accident_filled,
        clean_title=title_filled,
    )

# function to handle transmission 
def get_transmission_type(text):
    '''Classify a transmission description as Manual, Automatic, CVT or Other.

    Rules are tried in the order listed and the first hit wins, so a string
    that mentions both a manual and an automatic marker is labelled
    ``'Manual'``, and one that mentions both an automatic marker and ``CVT``
    is labelled ``'Automatic'``. Matching is case-sensitive.
    '''
    rules = (
        ('Manual', ('M/T', 'Manual')),
        ('Automatic', ('A/T', 'Automatic')),
        ('CVT', ('CVT',)),
    )
    for label, markers in rules:
        if any(marker in text for marker in markers):
            return label
    return 'Other'

def get_num_speeds(text):
    '''Return the number of gears named in a transmission description.

    Recognises a number followed by ``Speed`` or the abbreviation ``Spd``,
    joined by a hyphen, a single whitespace character or nothing, in any
    letter case (e.g. ``'7-Speed A/T'``, ``'6 Speed Mt'``, ``'Manual, 6-Spd'``,
    ``'8-SPEED A/T'``).

    Parameters
    ----------
    text : str
        Raw transmission description. Non-string values (NaN / None from a
        missing cell) are accepted and treated as "no gear count".

    Returns
    -------
    int
        The first gear count found, or 0 when the text names none.
    '''
    if not isinstance(text, str):
        # A missing cell arrives as float NaN or None; re.search would raise
        # TypeError on it, so map it to the existing "no count" value.
        return 0
    match = re.search(r'(\d+)[-\s]?(?:Speed|Spd)', text, flags=re.IGNORECASE)
    return int(match.group(1)) if match else 0

def get_shift_mode(text):
    '''Label the shift mode mentioned in a transmission description.

    Returns ``'Dual/Auto-Shift'`` when the text contains ``Dual Shift`` or
    ``Auto-Shift``; otherwise ``'Manual'`` when it contains the word
    ``Manual``; otherwise ``'Standard'``. Matching is case-sensitive, and the
    abbreviation ``M/T`` is not treated as ``Manual`` here.
    '''
    dual_markers = ('Dual Shift', 'Auto-Shift')
    if any(marker in text for marker in dual_markers):
        return 'Dual/Auto-Shift'
    return 'Manual' if 'Manual' in text else 'Standard'

def get_transmission_technology(text):
    '''Return the first technology keyword found in a transmission description.

    Keywords are checked in priority order ``DCT``, ``CVT``, ``Auto-Shift``
    (case-sensitive substring match); the matched keyword itself is returned.
    When none is present the result is ``'unknown'``.
    '''
    for keyword in ('DCT', 'CVT', 'Auto-Shift'):
        if keyword in text:
            return keyword
    return 'unknown'

def get_production_status(text):
    '''Flag descriptions that contain the upper-case word ``SCHEDULED``.

    Returns the integer 1 when the substring is present and 0 otherwise.
    '''
    is_scheduled = 'SCHEDULED' in text
    return int(is_scheduled)



def feature_extractor(df):
    '''Return a new frame with features parsed from the text columns appended.

    The input frame is not modified. Two columns are derived from ``engine``
    (``horse_power``, ``fuel_capacity``) and five from ``transmission``
    (``trans_type``, ``num_speed``, ``shift_mode``, ``trans_tech``,
    ``prod_status``), each by applying the matching helper element-wise.
    '''
    engine_text = df['engine']
    transmission_text = df['transmission']

    # assign() builds a fresh frame; keyword order fixes the new column order.
    return df.assign(
        horse_power=engine_text.apply(extract_hp),
        fuel_capacity=engine_text.apply(extract_fc),
        trans_type=transmission_text.apply(get_transmission_type),
        num_speed=transmission_text.apply(get_num_speeds),
        shift_mode=transmission_text.apply(get_shift_mode),
        trans_tech=transmission_text.apply(get_transmission_technology),
        prod_status=transmission_text.apply(get_production_status),
    )


# extract horse power
def extract_hp(text):
    '''Extract the horsepower figure from an engine description.

    Looks for a number (optionally with a decimal part) immediately followed
    by the upper-case letters ``HP``, e.g. ``'172.0HP 1.6L 4 Cylinder ...'``.

    Parameters
    ----------
    text : str
        Raw engine description. Non-string values (NaN / None from a missing
        cell) are accepted and yield NaN.

    Returns
    -------
    float
        The horsepower value, or NaN when the text carries no ``<number>HP``.
    '''
    if not isinstance(text, str):
        # re.search raises TypeError on float NaN / None; a missing engine
        # description simply has no horsepower to report.
        return np.nan
    hp_match = re.search(r'(\d+\.?\d*)HP', text)
    return float(hp_match.group(1)) if hp_match else np.nan

# extract fuel capacity
def extract_fc(text):
    '''Extract the litre figure from an engine description.

    Looks for a number (optionally with a decimal part) immediately followed
    by an upper-case ``L``, e.g. the ``1.6`` in ``'172.0HP 1.6L 4 Cylinder'``.

    NOTE(review): the function and the column it feeds are named "fuel
    capacity", but in the sample rows this figure looks like engine
    displacement in litres rather than tank size — confirm the intended
    meaning before interpreting the feature.

    Parameters
    ----------
    text : str
        Raw engine description. Non-string values (NaN / None from a missing
        cell) are accepted and yield NaN.

    Returns
    -------
    float
        The litre value, or NaN when the text carries no ``<number>L``.
    '''
    if not isinstance(text, str):
        # re.search raises TypeError on float NaN / None; treat a missing
        # engine description as "no value".
        return np.nan
    fc_match = re.search(r'(\d+\.?\d*)L', text)
    return float(fc_match.group(1)) if fc_match else np.nan




In [None]:
# applying transformation
df = 