# Data preparation 
U ovoj svesci cemo pripremiti podatke za modelovanje: skaliranjem i adaptacijom postojecih atributa, kao i izvodjenjem novih atributa.

In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns


In [2]:
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the three NA-free splits, in the order train, test, valid.
train, test, valid = (
    pd.read_csv(csv_path)
    for csv_path in ('train_bezNA.csv', 'test_bezNA.csv', 'valid_bezNA.csv')
)

In [4]:
# (rows, columns) of the raw test split.
test.shape

(10948, 25)

In [5]:
# Preview the first rows of the raw training split.
train.head()

Unnamed: 0,IsBadBuy,Auction,VehYear,VehicleAge,Make,Model,Trim,Color,Transmission,WheelType,...,MMRAcquisitionAuctionCleanPrice,MMRAcquisitionRetailAveragePrice,MMRAcquisitonRetailCleanPrice,MMRCurrentAuctionAveragePrice,MMRCurrentAuctionCleanPrice,MMRCurrentRetailAveragePrice,MMRCurrentRetailCleanPrice,VNST,VehBCost,WarrantyCost
0,0,MANHEIM,2007,3,CHEVROLET,IMPALA V6,LT,WHITE,AUTO,Alloy,...,10270.0,11926.0,13603.0,8167.0,9866.0,12135.0,13694.0,MA,7480.0,1974
1,0,MANHEIM,2006,3,DODGE,STRATUS V6 2.7L V6 M,SXT,BLUE,AUTO,Covers,...,6461.0,6418.0,7478.0,5638.0,6512.0,6589.0,7533.0,NC,4700.0,1389
2,0,MANHEIM,2007,3,FORD,ESCAPE 2WD 4C,XLS,BLUE,AUTO,Alloy,...,10246.0,12297.0,13975.0,8223.0,10358.0,12126.0,14046.0,MD,7970.0,920
3,0,MANHEIM,2006,4,FORD,TAURUS,SE,GREY,AUTO,Alloy,...,5759.0,7637.0,8842.0,4216.0,5578.0,7533.0,8973.0,NC,5200.0,1053
4,1,ADESA,2006,3,FORD,FIVE HUNDRED 3.0L V6,SE,BEIGE,AUTO,Alloy,...,7716.0,7267.0,8833.0,6266.0,7716.0,7267.0,8833.0,OH,6630.0,1506


In [6]:
# Column names of the raw training split.
train.columns

Index(['IsBadBuy', 'Auction', 'VehYear', 'VehicleAge', 'Make', 'Model', 'Trim',
       'Color', 'Transmission', 'WheelType', 'VehOdo', 'Nationality', 'Size',
       'TopThreeAmericanName', 'MMRAcquisitionAuctionAveragePrice',
       'MMRAcquisitionAuctionCleanPrice', 'MMRAcquisitionRetailAveragePrice',
       'MMRAcquisitonRetailCleanPrice', 'MMRCurrentAuctionAveragePrice',
       'MMRCurrentAuctionCleanPrice', 'MMRCurrentRetailAveragePrice',
       'MMRCurrentRetailCleanPrice', 'VNST', 'VehBCost', 'WarrantyCost'],
      dtype='object')

In [7]:
# Split the training columns by dtype: numeric vs. object (categorical).
numeric_columns = train.select_dtypes(include='number').columns
categorical_columns = train.select_dtypes(include='object').columns

# For each group print a heading, the number of columns and the column names.
print("Numeričke kolone:", len(numeric_columns), numeric_columns, sep="\n")
print("\nKategorijske kolone:", len(categorical_columns), categorical_columns, sep="\n")

Numeričke kolone:
14
Index(['IsBadBuy', 'VehYear', 'VehicleAge', 'VehOdo',
       'MMRAcquisitionAuctionAveragePrice', 'MMRAcquisitionAuctionCleanPrice',
       'MMRAcquisitionRetailAveragePrice', 'MMRAcquisitonRetailCleanPrice',
       'MMRCurrentAuctionAveragePrice', 'MMRCurrentAuctionCleanPrice',
       'MMRCurrentRetailAveragePrice', 'MMRCurrentRetailCleanPrice',
       'VehBCost', 'WarrantyCost'],
      dtype='object')

Kategorijske kolone:
11
Index(['Auction', 'Make', 'Model', 'Trim', 'Color', 'Transmission',
       'WheelType', 'Nationality', 'Size', 'TopThreeAmericanName', 'VNST'],
      dtype='object')


In [8]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

def scale_column(train, test, valid, column_name):
    """Standardize one column in all three splits using training statistics.

    A ``StandardScaler`` is fitted on ``train[column_name]`` only and then
    applied to the same column of ``train``, ``test`` and ``valid``, so the
    held-out splits never contribute to the fitted mean/std.

    The three frames are modified in place and also returned.

    Parameters
    ----------
    train, test, valid : pandas.DataFrame
        Splits that all contain ``column_name``.
    column_name : str
        Name of the column to standardize.

    Returns
    -------
    tuple
        ``(train, test, valid)`` with the column overwritten by scaled values.
    """
    target = [column_name]

    # Fit on the training split only.
    scaler = StandardScaler().fit(train[target])

    # Apply the same fitted transformation to every split.
    for frame in (train, test, valid):
        frame[target] = scaler.transform(frame[target])

    return train, test, valid


Ovo je funkcija koja standardizuje vrednosti prosledjenog atributa, koristeci prosek i standardnu devijaciju sa train seta, 
kako bismo izbegli data leakage. Koristimo standardizaciju, jer je normalizacija osetljiva na autlajere, koje smo odlucili da ostavimo jer ih ima malo i mogu biti od znacaja za predvidjanje.

### VehOdo

Kilometrazu cemo standardizovati posebno za svaku godinu starosti vozila, na osnovu zapazanja u delu Data Introduction.

In [9]:
def standardize_vehodo(train, df):
    """Add ``VehOdo_Standard``: odometer z-score within each ``VehicleAge`` group.

    The per-age mean and (sample) standard deviation of ``VehOdo`` are computed
    on ``train`` only and then applied to ``df``.

    Rows whose ``VehicleAge`` has no usable statistics in ``train`` (age not
    present in train, or a group whose std is NaN/zero) fall back to the
    overall training mean/std instead of producing NaN or inf.

    Parameters
    ----------
    train : pandas.DataFrame
        Reference split; must contain ``VehicleAge`` and ``VehOdo``.
    df : pandas.DataFrame
        Split to transform; must contain ``VehicleAge`` and ``VehOdo``.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame equal to ``df`` plus the ``VehOdo_Standard`` column.
    """
    odo_by_age = train.groupby('VehicleAge')['VehOdo']
    age_mean = odo_by_age.mean()
    age_std = odo_by_age.std()

    # Overall training statistics, used when a per-age value is unavailable.
    overall_mean = train['VehOdo'].mean()
    overall_std = train['VehOdo'].std()

    # Row-aligned statistics kept in local Series, so no temporary columns
    # are ever written into the caller's frame.
    row_mean = df['VehicleAge'].map(age_mean).fillna(overall_mean)
    row_std = df['VehicleAge'].map(age_std)
    # NaN > 0 is False, so missing and zero stds are both replaced here.
    row_std = row_std.where(row_std > 0, overall_std)

    return df.assign(VehOdo_Standard=(df['VehOdo'] - row_mean) / row_std)


### Veh year i vehicle age
Koristicemo vehicle age, tako da cemo veh year izbaciti
Age mozemo da standardizujemo

In [10]:
def drop_unwanted_columns(df):
    """Remove the columns that are not used for modelling.

    The drop is done in place and the same frame is returned. Every listed
    column must exist in ``df``; otherwise pandas raises ``KeyError``.
    """
    discarded = (
        # superseded numeric columns
        'VehYear', 'VehOdo',
        # vehicle description columns
        'Make', 'Model', 'Trim', 'TrimCategory',
        # remaining categorical columns
        'Auction', 'Transmission', 'Color', 'WheelType',
        'Nationality', 'TopThreeAmericanName', 'VNST',
        # "current" MMR price indices
        'MMRCurrentAuctionAveragePrice', 'MMRCurrentAuctionCleanPrice',
        'MMRCurrentRetailAveragePrice', 'MMRCurrentRetailCleanPrice',
    )
    df.drop(columns=list(discarded), inplace=True)
    return df

### Auction

Primetili smo da postoji razlika ako je u pitanju Adesa (15% losih vozila) u odnosu na kategorije Manheim i Other (po 11%).
Napravicemo binarnu (0/1) kolonu Adesa_Auction.


In [11]:
def categorize_auction(df):
    """Add ``Adesa_Auction``: 1 where ``Auction`` equals 'ADESA', else 0.

    The column is added to ``df`` in place and ``df`` is returned.
    """
    is_adesa = df['Auction'].eq('ADESA')
    df['Adesa_Auction'] = is_adesa.astype(int)
    return df

### Make - naziv proizvodjaca

Napravicemo 10 dummy kolona za najcesce proizvodjace, ostali ce biti u other

In [12]:
def process_make(train, df):
    """One-hot encode ``Make`` using the 10 most frequent makes in ``train``.

    Makes outside the training top 10 are collapsed into ``'other'``. The
    dummy columns are generated from a fixed category list (top makes plus
    ``'other'``), so every split gets the same ``make_*`` columns in the same
    order even when a make does not occur in that split.

    Parameters
    ----------
    train : pandas.DataFrame
        Reference split; its ``Make`` column defines the top 10.
    df : pandas.DataFrame
        Split to encode; must contain ``Make``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``df`` with ``Make`` collapsed, plus the ``make_*``
        dummy columns appended.
    """
    top_makes = train['Make'].value_counts().nlargest(10).index

    # Keep top makes, replace everything else (including missing) by 'other'.
    collapsed = df['Make'].where(df['Make'].isin(top_makes), 'other')

    # Sorted to reproduce the column order get_dummies gives for plain strings.
    categories = sorted(set(top_makes) | {'other'})
    dummies = pd.get_dummies(
        collapsed.astype(pd.CategoricalDtype(categories)), prefix='make'
    )

    return pd.concat([df.assign(Make=collapsed), dummies], axis=1)

### Trim - paket opreme

Zakljucak iz prethodne faze je bio da mozemo grupisati sve pakete opreme iz seta u nekoliko kategorija. Zatim cemo ih poredjati u tri nivoa - basic mid i upper i dodelicemo im vrednosti 0 1 i 2


In [13]:
# Trim codes grouped by category, listed in lookup priority order
# (a code that appears in two groups belongs to the earlier one).
_TRIM_GROUPS = (
    ('Basic', (
        "W/T", "Nor", "i", "Bas", "Cla", "1", "ES", "L", "S", "SE", "LX", "EX",
        "LE", "GLS", "DX", "XE", "GL", "SXT", "CX", "150", "ZX3", "ZX4", "ZX5",
        "XL", "LXi", "ZXW", "GXE",
    )),
    ('Mid-Range', (
        "L30", "SLE", "SES", "LS", "LT", "SEL", "XLT", "XLS", "SLT", "CXL",
        "Adv", "Cus", "SV6", "CE", "Tou", "CXS", "GS", "STX", "2",
    )),
    ('Sport', (
        "SX", "s", "GT", "ST", "R/T", "GTS", "SS", "Spo", "GTP", "ZTS", "Z71",
        "ZX2", "SVT", "XRS", "FX4",
    )),
    ('Luxury', (
        # NOTE(review): "GLS" is also listed under Basic, which takes
        # precedence, so this entry never applies - confirm the intended tier.
        "Lim", "Lar", "Lux", "LTZ", "Den", "GLS", "GTC", "L10", "L20", "Pro",
        "Max", "OZ",
    )),
    ('Special', (
        "Edg", "3", "Edd", "Z24", "Har", "LL", "JLX", "JLS", "Hyb", "Ent", "Ral",
        "ZR2", "Spe", "Ove", "RS", "Hig", "3 R", "Cin", "Exe", "Val", "Sta", "VP",
        "Pre", "Spy", "SC2", "SC1", "XR", "eC", "Out", "Ult", "Maz",
    )),
)

# Flat lookup built once. Groups are walked in reverse so that earlier
# (higher-priority) groups overwrite later ones for duplicated codes.
_TRIM_TO_CATEGORY = {
    code: category
    for category, codes in reversed(_TRIM_GROUPS)
    for code in codes
}


def categorize_trim(trim):
    """Map a trim code to its category name.

    Returns one of 'Basic', 'Mid-Range', 'Sport', 'Luxury' or 'Special'.
    Any value that is not a known code (including missing values) is
    treated as 'Basic'.
    """
    return _TRIM_TO_CATEGORY.get(trim, 'Basic')

In [14]:
# Derive the trim category for every row of each split, element by element.
train['TrimCategory'] = train['Trim'].map(categorize_trim)
test['TrimCategory'] = test['Trim'].map(categorize_trim)
valid['TrimCategory'] = valid['Trim'].map(categorize_trim)

In [15]:
def encode_trim_category(df):
    """Add ``TrimCategory_Encoded``: a numeric level derived from ``TrimCategory``.

    'Basic' -> 0, 'Mid-Range' -> 1, and 'Sport' / 'Luxury' / 'Special' -> 2.
    Categories outside this mapping become NaN. The column is added in place
    and ``df`` is returned.
    """
    level_by_category = {'Basic': 0, 'Mid-Range': 1}
    # The three remaining categories share the top level.
    level_by_category.update(dict.fromkeys(('Sport', 'Luxury', 'Special'), 2))

    df['TrimCategory_Encoded'] = df['TrimCategory'].map(level_by_category)
    return df

### Transmission 
veoma je neizbalansiran podatak i nema nikakve razlike po pitanju izlazne promenljive, te cemo ga izbaciti

### Color

Ne pravi nikakvu razliku, te cemo izbaciti 

### WheelType 
Imamo tri tipa - Alloy, Covers i Special. Napravicemo dve kolone, Alloy i Covers. Kombinacija 0 0 ce da oznaci da je tip Special.


In [16]:
def map_wheel_type(df):
    """Add 0/1 indicator columns ``Alloy`` and ``Covers`` from ``WheelType``.

    A row with both indicators at 0 has a wheel type other than 'Alloy' or
    'Covers'. Columns are added in place and ``df`` is returned.
    """
    wheel = df['WheelType']
    df['Alloy'] = wheel.eq('Alloy').astype(int)
    df['Covers'] = wheel.eq('Covers').astype(int)
    return df

### Nationality

Ovu kategoriju smo pretvorili u bool kolonu isAmerican

In [17]:
def map_nationality(df):
    """Add ``isAmerican``: 1 where ``Nationality`` equals 'AMERICAN', else 0.

    The column is added in place and ``df`` is returned.
    """
    american = df['Nationality'].eq('AMERICAN')
    df['isAmerican'] = american.astype(int)
    return df

### TopThreeAmericanName

Kako bismo izbegli veliku multikolinearnost, ovu kolonu cemo izbaciti. Proizvodjaci ce biti pokriveni kolonama Make.
Vecina GM automobila je Chevrolet, a Ford i Chrysler ce vec postojati kao zasebne kolone.


### Size 

Poredjali smo po velicini vozila kategorije koje su nam date

In [18]:
# Ordinal rank assigned to each vehicle size label (some labels share a rank).
size_class = {
    'COMPACT': 0,
    'MEDIUM': 1,
    'LARGE': 2,
    'SPORTS': 3,
    'SPECIALTY': 3,
    'VAN': 4,
    'SMALL SUV': 5,
    'CROSSOVER': 5,
    'MEDIUM SUV': 6,
    'LARGE SUV': 7,
    'SMALL TRUCK': 8,
    'LARGE TRUCK': 9,
}

In [19]:
def encode_size_category(df):
    """Replace the ``Size`` labels with their ordinal rank (0-9).

    Labels that share a tier get the same rank; labels not listed become NaN.
    ``Size`` is overwritten in place and ``df`` is returned, so calling this
    twice on the same frame turns the already-numeric values into NaN.
    """
    # One tuple per rank, from rank 0 upwards.
    tiers = (
        ('COMPACT',),
        ('MEDIUM',),
        ('LARGE',),
        ('SPORTS', 'SPECIALTY'),
        ('VAN',),
        ('SMALL SUV', 'CROSSOVER'),
        ('MEDIUM SUV',),
        ('LARGE SUV',),
        ('SMALL TRUCK',),
        ('LARGE TRUCK',),
    )
    rank_by_label = {label: rank for rank, tier in enumerate(tiers) for label in tier}

    df['Size'] = df['Size'].map(rank_by_label)
    return df

### MMR indeksi


Current indekse cemo da izbacimo iz vise razloga; na primer, ako se pojavi model automobila koji nije postojao u vreme kreiranja seta, u buducnosti necemo moci da imamo taj podatak.

Ostali indeksi su izrazito korelisani, te cemo za svaki automobil racunati prosek 4 MMR indeksa, a potom cemo taj prosek standardizovati.

In [20]:
def mmr_average(df):
    """Collapse the four acquisition MMR prices into one ``MMRAverage`` column.

    ``MMRAverage`` is the plain sum of the four columns divided by 4, so a
    missing value in any of them makes the average missing. The four source
    columns are then dropped in place and ``df`` is returned.
    """
    acquisition_cols = [
        'MMRAcquisitionAuctionCleanPrice',
        'MMRAcquisitonRetailCleanPrice',
        'MMRAcquisitionAuctionAveragePrice',
        'MMRAcquisitionRetailAveragePrice',
    ]

    # Accumulate left to right over the list above.
    running_total = df[acquisition_cols[0]]
    for col in acquisition_cols[1:]:
        running_total = running_total + df[col]

    df['MMRAverage'] = running_total / 4
    df.drop(columns=acquisition_cols, inplace=True)
    return df


### VehBCost i WarrantyCost

Ove vrednosti cemo standardizovati na svim setovima, koristeci statistike sa train seta.

In [21]:
def standardize(train, test, valid, columns):
    """Apply ``scale_column`` to each name in ``columns``, one after another.

    Returns the ``(train, test, valid)`` tuple produced by the last call, or
    the inputs unchanged when ``columns`` is empty.
    """
    splits = (train, test, valid)
    for name in columns:
        splits = scale_column(*splits, name)
    return tuple(splits)

In [22]:
# Continuous columns passed to standardize() after feature engineering.
columns_to_scale = [
    'VehBCost',
    'WarrantyCost',
    'MMRAverage',
    'CostPerYear',
    'WarrantyPerMile',
    'CostPerMile',
]


In [23]:
def process_dataset(train, df):
    """Run the full feature-engineering pipeline on ``df``.

    ``train`` is the reference split handed to the steps that need training
    statistics (``standardize_vehodo`` and ``process_make``); it must still
    contain its raw ``VehicleAge``, ``VehOdo`` and ``Make`` columns.

    Steps, in order: ratio features, per-age odometer standardization,
    auction flag, make dummies, trim level, wheel-type flags, nationality
    flag, MMR average, size rank, and finally removal of unused columns.
    """
    return (
        df
        .pipe(cost_miles_cols)
        .pipe(lambda frame: standardize_vehodo(train, frame))
        .pipe(categorize_auction)
        .pipe(lambda frame: process_make(train, frame))
        .pipe(encode_trim_category)
        .pipe(map_wheel_type)
        .pipe(map_nationality)
        .pipe(mmr_average)
        .pipe(encode_size_category)
        .pipe(drop_unwanted_columns)
    )

In [25]:
def cost_miles_cols(df):
    """Add three ratio features derived from cost, warranty, mileage and age.

    * ``CostPerMile``     = ``VehBCost`` / ``VehOdo``
    * ``WarrantyPerMile`` = ``WarrantyCost`` / ``VehOdo``
    * ``CostPerYear``     = ``VehBCost`` / (``VehicleAge`` + 1)

    The columns are added to ``df`` in place and ``df`` is returned. No
    temporary column is created, so an existing column named ``Age_help`` is
    left untouched.

    NOTE(review): a ``VehOdo`` of 0 produces inf/NaN in the per-mile columns;
    nothing here guards against it.
    """
    df['CostPerMile'] = df['VehBCost'] / df['VehOdo']
    df['WarrantyPerMile'] = df['WarrantyCost'] / df['VehOdo']

    # +1 keeps the denominator non-zero for vehicles with VehicleAge == 0.
    df['CostPerYear'] = df['VehBCost'] / (df['VehicleAge'] + 1)
    return df

In [26]:
# Columns of the test split before processing.
test.columns

Index(['IsBadBuy', 'Auction', 'VehYear', 'VehicleAge', 'Make', 'Model', 'Trim',
       'Color', 'Transmission', 'WheelType', 'VehOdo', 'Nationality', 'Size',
       'TopThreeAmericanName', 'MMRAcquisitionAuctionAveragePrice',
       'MMRAcquisitionAuctionCleanPrice', 'MMRAcquisitionRetailAveragePrice',
       'MMRAcquisitonRetailCleanPrice', 'MMRCurrentAuctionAveragePrice',
       'MMRCurrentAuctionCleanPrice', 'MMRCurrentRetailAveragePrice',
       'MMRCurrentRetailCleanPrice', 'VNST', 'VehBCost', 'WarrantyCost',
       'TrimCategory'],
      dtype='object')

In [27]:

# Order matters: process_dataset reads raw columns of `train` (VehicleAge,
# VehOdo, Make) to compute reference statistics, and the processed result no
# longer has VehOdo/Make. The held-out splits are therefore transformed first.
valid = process_dataset(train, valid)

test = process_dataset(train,test)

# `train` itself is transformed last, once nothing else needs its raw columns.
train = process_dataset(train,train)

# Scale the continuous columns of all splits via standardize().
train,test,valid = standardize(train,test,valid,columns_to_scale)

In [28]:
# Total number of missing values in the processed validation split.
print(valid.isnull().sum().sum())

0


In [29]:
# (rows, columns) of the processed training split.
train.shape

(51088, 26)

def process_dataset_cat(train, df):
    """Run the feature pipeline on ``df`` without the cost/mileage ratio step.

    ``train`` is the reference split handed to ``standardize_vehodo`` and
    ``process_make``; it must still contain its raw ``VehicleAge``,
    ``VehOdo`` and ``Make`` columns.
    """
    steps = (
        # Standardize 'VehOdo' within each vehicle age
        lambda frame: standardize_vehodo(train, frame),
        # Auction flag
        categorize_auction,
        # Car manufacturer dummies
        lambda frame: process_make(train, frame),
        # Trim category level
        encode_trim_category,
        # Wheel type flags
        map_wheel_type,
        # Nationality flag
        map_nationality,
        # Average MMR value
        mmr_average,
        # Size category rank
        encode_size_category,
        # Remove the columns that are no longer needed
        drop_unwanted_columns,
    )
    for step in steps:
        df = step(df)
    return df

# Reload the raw splits for the "_cat" variant of the processed data.
train_cat = pd.read_csv('train_bezNA.csv')
test_cat = pd.read_csv('test_bezNA.csv')
valid_cat = pd.read_csv('valid_bezNA.csv')

train_cat['TrimCategory'] = train_cat['Trim'].apply(categorize_trim)
test_cat['TrimCategory'] = test_cat['Trim'].apply(categorize_trim)
valid_cat['TrimCategory'] = valid_cat['Trim'].apply(categorize_trim)

# The held-out splits must be processed while `train_cat` still has its raw
# 'VehOdo' and 'Make' columns: process_dataset_cat reads them for reference
# statistics and drops them from its result. Processing train first makes the
# next two calls fail with KeyError.
test_cat = process_dataset_cat(train_cat, test_cat)
valid_cat = process_dataset_cat(train_cat, valid_cat)

# Train is transformed last.
train_cat = process_dataset_cat(train_cat, train_cat)



# Display the list of columns that will be standardized.
columns_to_scale

# process_dataset_cat does not create the ratio features (CostPerYear,
# WarrantyPerMile, CostPerMile) listed in columns_to_scale, so passing the full
# list raises KeyError. Scale only the columns that exist in the "_cat" frames.
# NOTE(review): if the ratio features were meant to be part of the "_cat" data,
# add cost_miles_cols to process_dataset_cat instead - confirm intent.
cat_columns_to_scale = [col for col in columns_to_scale if col in train_cat.columns]
train_cat, test_cat, valid_cat = standardize(train_cat, test_cat, valid_cat, cat_columns_to_scale)

# Display the processed "_cat" training frame.
train_cat

# Write the "_cat" splits to CSV without the index column.
for cat_frame, cat_path in (
    (train_cat, 'train_cat.csv'),
    (test_cat, 'test_cat.csv'),
    (valid_cat, 'valid_cat.csv'),
):
    cat_frame.to_csv(cat_path, index=False)

In [30]:
# Write the processed splits to CSV without the index column.
for processed_frame, processed_path in (
    (train, 'train_processed.csv'),
    (valid, 'valid_processed.csv'),
    (test, 'test_processed.csv'),
):
    processed_frame.to_csv(processed_path, index=False)


In [33]:
# Total number of rows across the three processed splits.
train.shape[0] + test.shape[0] + valid.shape[0]

72983