# 1. 데이터셋 읽고 구조 확인

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score
from sklearn.metrics import mean_absolute_error
from sklearn import preprocessing

In [2]:
# Read the raw training CSV and preview its first rows.
train_csv = './data/car_train.csv'
dummy_data = pd.read_csv(train_csv)
dummy_data.head()

Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.5
2,2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5
3,3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0
4,4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74


In [4]:
# (rows, columns) of the raw frame as loaded from the CSV.
dummy_data.shape

(6019, 14)

# 2. 데이터 전처리

In [5]:
# Remove missing values.
# Work on a deep copy so the raw frame in `dummy_data` stays untouched.
clean_data = dummy_data.copy(deep=True)

# Report how many New_Price values are missing before the column is dropped.
print(clean_data['New_Price'].isna().sum())

# Drop New_Price together with 'Unnamed: 0'.
# NOTE(review): 'Unnamed: 0' looks like a row counter saved with the CSV -- confirm.
clean_data = clean_data.drop(columns=['Unnamed: 0', 'New_Price'])

# Drop every row that still contains a missing value and renumber the index.
# `axis` is passed by keyword: pandas 2.0 made the arguments of dropna
# keyword-only, so the positional form dropna("index") raises TypeError there.
clean_data = clean_data.dropna(axis='index').reset_index(drop=True)
clean_data.shape

5195


(5975, 12)

In [9]:
# Number of distinct full model names.
len(set(clean_data['Name']))  # 1855
# Even the models that occur two or more times number 1034;
# keeping them all could make the model too complex and cause overfitting.

# Reduce each name to its brand, i.e. the text before the first space.
names = [full_name.split(' ', 1)[0] for full_name in clean_data['Name']]
clean_data['Name'] = names
clean_data.head()

# Number of distinct brands that remain.
len(set(clean_data['Name']))

31

In [10]:
# Strip the unit suffix from Mileage ("km/kg", "kmpl"), Engine ("CC") and
# Power ("bhp"), keeping only the text before the first space.

def _leading_token(values):
    """Return, for every string in `values`, the part before its first space."""
    return [value.split(' ', 1)[0] for value in values]

# Each list is derived from its own column, so the cell does not depend on
# the length of a list left behind by another cell.
mileage = _leading_token(clean_data.Mileage)
engine = _leading_token(clean_data.Engine)
power = _leading_token(clean_data.Power)

# The stripped values are still strings here; no numeric conversion is done
# in this cell.
clean_data.Mileage = mileage
clean_data.Engine = engine
clean_data.Power = power
clean_data.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Maruti,Mumbai,2010,72000,CNG,Manual,First,26.6,998,58.16,5.0,1.75
1,Hyundai,Pune,2015,41000,Diesel,Manual,First,19.67,1582,126.2,5.0,12.5
2,Honda,Chennai,2011,46000,Petrol,Manual,First,18.2,1199,88.7,5.0,4.5
3,Maruti,Chennai,2012,87000,Diesel,Manual,First,20.77,1248,88.76,7.0,6.0
4,Audi,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2,1968,140.8,5.0,17.74


In [11]:
# Cast the numeric columns to float in a single astype call, then list the
# resulting dtypes. Power is not converted in this cell.
float_columns = ['Price', 'Kilometers_Driven', 'Mileage', 'Engine']
clean_data = clean_data.astype({column: float for column in float_columns})
clean_data.dtypes

Name                  object
Location              object
Year                   int64
Kilometers_Driven    float64
Fuel_Type             object
Transmission          object
Owner_Type            object
Mileage              float64
Engine               float64
Power                 object
Seats                float64
Price                float64
dtype: object

In [12]:
# Discard rows whose Seats value is 0. Filtering leaves gaps in the row
# labels, so the index is renumbered to keep label-based and position-based
# row access in agreement for the cells that follow.
clean_data = clean_data[clean_data['Seats'] != 0].reset_index(drop=True)

In [15]:
# Power holds the literal string 'null' for some rows; remove those rows and
# renumber the index.
# A boolean mask is used rather than collecting list positions and handing
# them to drop(): drop() removes rows by index *label*, and labels only
# coincide with positions while the index is a gap-free 0..n-1 range.
# Otherwise the wrong rows are removed and 'null' entries can survive.
has_power = clean_data['Power'] != 'null'
clean_data = clean_data[has_power].reset_index(drop=True)

In [16]:
# Convert the Power column to float; any non-numeric string left in the
# column makes this raise.
clean_data = clean_data.astype({'Power': float})

In [17]:
# Categorical encoding (original note lists Name, Location, Owner_Type, Seats).

# Year and Seats are numeric, so mark them as categorical explicitly;
# get_dummies then encodes them alongside the string columns.
for column in ('Year', 'Seats'):
    clean_data[column] = pd.Categorical(clean_data[column])

# One-hot encode the frame; drop_first=True omits the first level of each
# encoded column.
clean_data = pd.get_dummies(clean_data, drop_first=True, prefix_sep='_')

In [18]:
# Preview the first rows of the frame after one-hot encoding.
clean_data.head()

Unnamed: 0,Kilometers_Driven,Mileage,Engine,Power,Price,Name_Audi,Name_BMW,Name_Bentley,Name_Chevrolet,Name_Datsun,...,Owner_Type_Fourth & Above,Owner_Type_Second,Owner_Type_Third,Seats_4.0,Seats_5.0,Seats_6.0,Seats_7.0,Seats_8.0,Seats_9.0,Seats_10.0
0,72000.0,26.6,998.0,58.16,1.75,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,41000.0,19.67,1582.0,126.2,12.5,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,46000.0,18.2,1199.0,88.7,4.5,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,87000.0,20.77,1248.0,88.76,6.0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,40670.0,15.2,1968.0,140.8,17.74,1,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0


In [19]:
# (rows, columns) of the encoded frame.
clean_data.shape

(5844, 79)