In [1]:
# imports and notebook-wide configuration
import warnings
# NOTE(review): this silences every warning for the whole session, which may
# hide pandas deprecation and chained-assignment warnings; consider narrowing it.
warnings.filterwarnings('ignore')
import pandas as pd
# lift pandas' display limits so frames are shown without row/column truncation
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
from sklearn.preprocessing import LabelEncoder

In [2]:
# load the training and test splits from CSV files in the working directory
train,test=pd.read_csv('training.csv'),pd.read_csv('test.csv')

In [3]:
# keep the SellingPrice column of each split aside before it is dropped from the features
y,test_pred=(frame['SellingPrice'] for frame in (train,test))

In [4]:
# joining train n test sets for easier preprocessing
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
# pd.concat stacks the rows the same way: train rows first, then test rows,
# with each frame's original index labels kept.
joined_data=pd.concat([train,test])

In [5]:
# drop the columns that are not model features: 'Unnamed: 0' (presumably a
# saved row index - verify against the CSVs) and the target 'SellingPrice'
columns_to_drop=['Unnamed: 0','SellingPrice']
joined_data=joined_data.drop(columns=columns_to_drop)

In [6]:
# separating train n test sets
# joined_data holds the training rows first, followed by the test rows, so the
# split is purely positional. The training row count is captured once so the
# second slice does not depend on the freshly reassigned `train`.
n_train=train.shape[0]
# .copy() gives each split its own data, so later column assignments on
# train/test do not write into a slice of joined_data (the situation that
# triggers pandas' SettingWithCopyWarning).
train=joined_data.iloc[:n_train].copy()
test=joined_data.iloc[n_train:].copy()

In [7]:
# column labels of the training feature frame
train.columns

Index(['Kilometeres', 'Fuel_Type', 'Doors', 'Automatic', 'HorsePower',
       'MetallicCol', 'CC', 'Wt', 'Age'],
      dtype='object')

In [8]:
# Summary statistics of the numeric training columns.
# Nothing looks obviously broken at first glance.
# NOTE(review): in the recorded output the minimum Kilometeres is 1 and the
# maximum Wt is 1615 against a 75th percentile of 1085 - worth confirming
# these are genuine records rather than outliers/entry errors.
train.describe()

Unnamed: 0,Kilometeres,Doors,Automatic,HorsePower,MetallicCol,CC,Wt,Age
count,958.0,958.0,958.0,958.0,958.0,958.0,958.0,958.0
mean,69067.598121,4.009395,0.056367,101.549061,0.669102,1566.651357,1072.701461,56.463466
std,37082.673742,0.955059,0.23075,14.556315,0.470782,184.850459,55.235835,18.332019
min,1.0,2.0,0.0,69.0,0.0,1300.0,1000.0,1.0
25%,43472.0,3.0,0.0,90.0,0.0,1400.0,1040.0,44.0
50%,63547.0,4.0,0.0,110.0,1.0,1600.0,1065.0,61.0
75%,87580.0,5.0,0.0,110.0,1.0,1600.0,1085.0,70.0
max,243000.0,5.0,1.0,192.0,1.0,2000.0,1615.0,80.0


In [9]:
# No missing values, which is good as the dataset is already quite small
missing_per_column=train.isnull().sum()
print("Number of missing values",missing_per_column)

Number of missing values Kilometeres    0
Fuel_Type      0
Doors          0
Automatic      0
HorsePower     0
MetallicCol    0
CC             0
Wt             0
Age            0
dtype: int64


In [10]:
# Skewness of the target; a positive value means the distribution is right-skewed
target_skew=y.skew()
print(target_skew)

1.9049268513861988


In [11]:
# Replace Fuel_Type with integer codes so it can be fed to an ML model.
# The encoder learns its classes from the training split only; the same
# mapping is then applied to both the training and the test split.
lb=LabelEncoder()
lb.fit(train['Fuel_Type'])
train['Fuel_Type']=lb.transform(train['Fuel_Type'])
test['Fuel_Type']=lb.transform(test['Fuel_Type'])

In [12]:
# write the prepared feature frames and their targets to CSV, without the index
feature_outputs={'prepped_test.csv':test,'prepped_train.csv':train}
target_outputs={'target.csv':y,'target_test.csv':test_pred}
for out_path,features in feature_outputs.items():
    features.to_csv(out_path,index=False)
for out_path,target in target_outputs.items():
    # targets are Series; convert to a one-column frame before saving
    target.to_frame().to_csv(out_path,index=False)