In [None]:
# **DATA PROCESSING**

import numpy as np # Array Processing
import pandas as pd # Data Processing

# **DATA ANALYSIS**

import matplotlib.pyplot as plt # Plots
import seaborn as sns # Graphs

# **PREPROCESSING**

from sklearn.preprocessing import StandardScaler # Scaling of Data

# **MACHINE LEARNING MODELS**

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeRegressor

# **METRICS**

from sklearn.metrics import r2_score

# **INPUT**

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

Let's load our data so we can start working with it.

In [None]:
# Load the car price CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv("/kaggle/input/car-price-prediction-challenge/car_price_prediction.csv")

It is a good practice to take a look at our dataset before processing it

In [None]:
# Display the frame exactly as it was loaded.
data

So we have $19237$ rows and $18$ columns, i.e. $19237 \times 18 = 346,266$ values in total. The `ID` column is unique for every row, so it carries no information for the model and we simply drop it.

In [None]:
# Remove the row identifier column; rebinding `data` gives the same result as an in-place drop.
data = data.drop(columns = "ID")

And now our dataset, looks like this 

In [None]:
# Display the frame after the `ID` column has been dropped.
data

Lets get a info of our dataset

In [None]:
# Print the DataFrame.info() summary of the current frame.
data.info()

Now we need to work on this dataset; let's go through the columns in order. `Price` is the target of our model and is already an integer. Next comes the `Levy` column, a kind of tax, which is stored with the object dtype, so we need to convert it into an integer type.

In [None]:
# `Levy` uses "-" as a placeholder; treat it as 0 and cast the column to int.
# The replacement is limited to `Levy` so a "-" in any other column is left
# untouched, and the column is converted where it stands instead of being
# dropped and concatenated back (so it keeps its original position).
data["Levy"] = data["Levy"].replace("-" , 0).astype(int)

Now we have managed with the `Levy` column

In [None]:
# Display the frame after the `Levy` conversion.
data

Next comes the `Leather interior` column, which holds `Yes` and `No` values. It would be great to encode `Yes` as `1` and `No` as `0`. We are using the `replace` method here. There are three more options: `get_dummies` from pandas, and `OneHotEncoder` and `OrdinalEncoder` from sklearn. They achieve the same thing, just in a different way. In case you want to use those, here is the code.
```
# *****************************REPLACE*****************************
import pandas as pd 

data.replace(to_replace , value  , inplace = True)

# *****************************PANDAS GET DUMMIES*****************************

import pandas as pd

# **************METHOD 1**************

data = pd.get_dummies(dataframe , columns , drop_first = True)

# **************METHOD 2**************

data = pd.get_dummies(dataframe , columns)

# *****************************ORDINAL ENCODER*****************************

import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

# **************METHOD 1**************

oe = OrdinalEncoder()
new_data = oe.fit_transform(data[cat])
new_data = pd.DataFrame(new_data)

sample_data = data.drop(cat , axis = 1)

data_proc = pd.concat([sample_data , new_data] , axis = 1 , join = "inner")
data_proc = pd.DataFrame(data_proc)

# **************METHOD 2**************

oe = OrdinalEncoder(categories = [["No" , "Yes"]])
new_data = oe.fit_transform(data[cat])
new_data = pd.DataFrame(new_data)

sample_data = data.drop(cat , axis = 1)

data_proc = pd.concat([sample_data , new_data] , axis = 1 , join = "inner")
data_proc = pd.DataFrame(data_proc)

# *****************************ONE HOT ENCODER*****************************

import pandas as pd 
from sklearn.preprocessing import OneHotEncoder

# **************METHOD 1**************

ohe = OneHotEncoder(drop = "first" , sparse = False)

new_data = ohe.fit_transform(data[cat])
new_data = pd.DataFrame(new_data)

sample_data = data.drop(cat , axis = 1)

data_proc = pd.concat([sample_data , new_data] , axis = 1 , join = "inner")
data_proc = pd.DataFrame(data_proc)

# **************METHOD 2**************

ohe = OneHotEncoder(drop = "first" , sparse = True)

new_data = ohe.fit_transform(data[cat])
new_data = new_data.toarray()
new_data = pd.DataFrame(new_data)

sample_data = data.drop(cat , axis = 1)

data_proc = pd.concat([sample_data , new_data] , axis = 1 , join = "inner")
data_proc = pd.DataFrame(data_proc)
```

I will be using `replace` for now

In [None]:
# Encode `Leather interior` as 1 ("Yes") / 0 ("No").
# Only this column is touched; a frame-wide replace would also rewrite any
# "Yes"/"No" value that happens to appear in another column.
# The final cast makes the dtype explicit and raises if a value other than
# "Yes"/"No" is present.
data["Leather interior"] = (
    data["Leather interior"]
    .replace({"Yes" : 1 , "No" : 0})
    .astype(int)
)

Now lets take look a data 

In [None]:
# Display the frame after encoding `Leather interior`.
data

Now lets work on the mileage column

In [None]:
# Strip the " km" suffix from `Mileage` and store the number as an integer.
# `pop` removes the text column from the frame, so the numeric column that is
# assigned afterwards ends up as the last column.
mileage_text = data.pop("Mileage")
mileage_parts = mileage_text.str.split(" km" , n = 0 , expand = True)
data["Mileage"] = mileage_parts[0].astype(int)

In [None]:
# Display the frame after the `Mileage` conversion.
data 

Now comes the `Engine volume` part 

In [None]:
# Split `Engine volume` at the " Turbo" marker:
#   part 0 -> the numeric volume, stored as float in `Engine volume1`
#   part 1 -> whatever follows the marker (missing when there is no marker),
#             stored as-is in `Turbo`
engine_parts = data["Engine volume"].str.split(" Turbo" , n = 1 , expand = True)
data["Engine volume1"] = engine_parts[0].astype(float)
data["Turbo"] = engine_parts[1]

In [None]:
# The raw text column is no longer needed once it has been split.
data = data.drop(columns = "Engine volume")

In [None]:
# Turn `Turbo` into a 0/1 flag.
# After the split, `Turbo` is non-null exactly where the " Turbo" marker was
# present in `Engine volume` and missing everywhere else, so "is not null" is
# the flag.
# This avoids two problems of the previous version: replacing "" across the
# whole frame (which touches every column), and calling fillna(inplace=True)
# on `data["Turbo"]`, a chained in-place update that pandas deprecates and
# that has no effect under Copy-on-Write.
data["Turbo"] = data["Turbo"].notna().astype(int)

In [None]:
# Display the frame after the `Engine volume1` / `Turbo` columns were created.
data

Now lets get a info of our data 

In [None]:
# Print the DataFrame.info() summary again, now that the columns above were converted.
data.info()

Lets now segregate the values and also plot them respectively 

In [None]:
# One figure per column:
#   object columns  -> pie chart of the value shares
#   other columns   -> histogram with a KDE overlay
# `sns.histplot(..., kde=True)` replaces the deprecated `sns.distplot`.
for col in data.columns:
    if data[col].dtype == object:
        data[col].value_counts().plot(kind = "pie" , autopct = "%.2f")
    else:
        sns.histplot(data[col] , kde = True)
    # Label every figure so it can be read on its own.
    plt.title(col)
    plt.show()

As we can see applying one hot encoding on all of the columns will make our dataset very big to compute, so we will combine the smaller values in one category that is `other `

In [None]:
def dummies(column , threshold , data):
    """One-hot encode `column`, pooling its infrequent values into one category.

    Every value of `data[column]` that occurs `threshold` times or fewer is
    relabelled as "<column>_other" before the column is one-hot encoded.

    Parameters
    ----------
    column : str
        Name of the column to encode.
    threshold : int
        Values whose count is less than or equal to this number are pooled.
    data : pandas.DataFrame
        Frame containing `column`. It is NOT modified; a new frame is returned.

    Returns
    -------
    pandas.DataFrame
        `data` without `column`, followed by one indicator column per
        remaining category (named after the category value).
    """
    other_label = column + "_other"

    # Count once and reuse: values at or below the threshold are the rare ones.
    counts = data[column].value_counts()
    rare_values = counts[counts <= threshold].index

    pooled = data[column].mask(data[column].isin(rare_values) , other_label)
    encoded = pd.get_dummies(pooled)

    # Drop on a copy so the caller's frame is left untouched.
    remaining = data.drop(columns = column)

    return pd.concat([remaining , encoded] , axis = 1 , join = "inner")

For the manufacturer, the threshold will be 800; the other high-cardinality columns get their own thresholds below.

In [None]:
# Rarity threshold for each high-cardinality column, applied in this order.
rare_thresholds = {
    "Manufacturer" : 800 ,
    "Model" : 400 ,
    "Category" : 500 ,
    "Fuel type" : 800 ,
    "Color" : 600 ,
}

for column_name , max_rare_count in rare_thresholds.items():
    data = dummies(column_name , max_rare_count , data)

Now come the ordinary categorical columns

In [None]:
# One-hot encode the remaining categorical columns, dropping the first level of each.
data = pd.get_dummies(
    data ,
    columns = [
        "Gear box type" ,
        "Drive wheels" ,
        "Doors" ,
        "Wheel" ,
    ] ,
    drop_first = True ,
)

Now fill the null values 

In [None]:
# Fill any remaining nulls in `Turbo` with 0.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `data["Turbo"]` is a chained in-place update, which pandas deprecates and
# which does not modify `data` under Copy-on-Write.
data["Turbo"] = data["Turbo"].fillna(0)

In [None]:
# Store the integer version of `Turbo` back in the frame; previously the
# converted series was assigned to an unused variable and then discarded.
data["Turbo"] = data["Turbo"].astype(int)

Now we will divide our data into train and test 

In [None]:
# Shuffle the rows once, then cut at the 80 % mark: first part -> train, rest -> test.
# A fixed seed makes the split (and every score below) reproducible, and
# positional slicing replaces np.split, which on a DataFrame depends on the
# deprecated DataFrame.swapaxes.
shuffled = data.sample(frac = 1 , random_state = 42)
split_at = int(0.8 * len(shuffled))

train = shuffled.iloc[:split_at]
test = shuffled.iloc[split_at:]

In [None]:
def pre(dataframe , scaler = None):
    """Split a frame into scaled features and the `Price` target.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing a `Price` column; every other column is a feature.
    scaler : object with a ``transform`` method, optional
        An already fitted scaler. When given, the features are only
        transformed with it, so held-out data can be scaled with the
        statistics of the training data. When omitted, a new
        ``StandardScaler`` is fitted on `dataframe` itself (the original
        behaviour).

    Returns
    -------
    x : array-like
        The scaled feature matrix.
    y : pandas.Series
        The `Price` column.
    """
    x = dataframe.drop("Price" , axis = 1)
    y = dataframe["Price"]

    if scaler is None:
        # No scaler supplied: fit a fresh one on this frame.
        x = StandardScaler().fit_transform(x)
    else:
        # Reuse the supplied scaler without refitting it.
        x = scaler.transform(x)

    return x , y

In [None]:
# Training features are scaled with statistics fitted on the training set.
X_train , Y_train = pre(train)

# The test features must be scaled with those SAME training statistics.
# Fitting a second scaler on the test set (as calling pre(test) does) would
# put the two sets in different feature spaces and leak test information.
feature_scaler = StandardScaler().fit(train.drop("Price" , axis = 1))
X_test = feature_scaler.transform(test.drop("Price" , axis = 1))
Y_test = test["Price"]

Now lets train our model and test it 

In [None]:
# k-nearest-neighbours regressor with default settings; fit() returns the estimator itself.
model_0 = KNeighborsRegressor().fit(X_train , Y_train)
model_0

In [None]:
# Ordinary least-squares linear regression; fit() returns the estimator itself.
model_1 = LinearRegression().fit(X_train , Y_train)
model_1

In [None]:
# `Price` is a continuous target, so use the regression forest. A classifier
# would treat every distinct price as its own class.
# The seed keeps the fitted forest reproducible between runs.
model_2 = RandomForestRegressor(random_state = 42)
model_2.fit(X_train , Y_train)

In [None]:
# `Price` is a continuous target, so use support vector regression (SVR)
# rather than the classifier SVC, which treats every distinct price as a class.
model_3 = SVR()
model_3.fit(X_train , Y_train)

In [None]:
# Single decision-tree regressor with default settings; fit() returns the estimator itself.
model_4 = DecisionTreeRegressor().fit(X_train , Y_train)
model_4

In [None]:
# Print the test-set R² of each fitted model, one per line, in model order (0 to 4).
fitted_models = (model_0 , model_1 , model_2 , model_3 , model_4)

for fitted in fitted_models:
    predictions = fitted.predict(X_test)
    print(r2_score(Y_test , predictions))

Look we upgraded it to $17$ percent. 

**THATS IT FOR TODAY GUYS**

**WE WILL BE MAKING IMPROVEMENTS IN FUTURE, STAY TUNED FOR THAT**

**DONT FORGET TO MAKE AN UPVOTE IF YOU LIKED IT, IT HELPS :)**

**PEACE OUT !!!**

# Version Info

* **Version 1 - Raw Code**
* **Version 2 - Documentation**
* **Version 3 - SC Addition**
* **Version 4 - Model Addition**

In [None]:
nan