## <span style="color:blue">Business Goal  </span>

You are required to model the price of cars with the available independent variables.

# Import library

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

warnings.filterwarnings('ignore')

pd.set_option('display.max.columns', None)

# Importing dataset and exploration

In [None]:
# Load the raw Toyota listings into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file exists. Consider a path relative to the repo.
TOYOTA_CSV = "/Users/deviyantiam/Documents/ML-Session2/Lab/toyota.csv"
data_tyt = pd.read_csv(TOYOTA_CSV)
print(data_tyt.shape)  # (number of rows, number of columns)
data_tyt.head()

# Check if there's any missing value

In [None]:
# Number of missing entries per column (`isna` is the canonical name of `isnull`).
data_tyt.isna().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data_tyt.describe()

In [None]:
# Column dtypes; the next cell uses this to find the `object` (categorical) columns.
data_tyt.dtypes

In [None]:
# List the distinct values of every object-typed (categorical) column.
for column in data_tyt.columns:
    if data_tyt[column].dtypes != 'object':
        continue  # numeric column: nothing to enumerate
    print(column, '=', data_tyt[column].unique(), '\n')
        


# Exploratory data analysis

In [None]:
# Distribution of the target variable (price).
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases;
# confirm the installed version before upgrading the environment.
plt.figure(figsize=(20, 8))
plt.subplot(1, 2, 1)  # left cell of a 1x2 grid; the right cell is left empty
plt.title('Car Price Distribution Plot')
sns.distplot(data_tyt['price'])
plt.show()

#### Inference :

The distribution appears right-skewed, meaning that most prices in the dataset are low (below 15,000).

In [None]:
# Number of cars per transmission type.
# The column is passed explicitly as `x=`: newer seaborn releases treat the
# first positional argument as `data`, so the positional form no longer plots
# the column on the x axis.
sns.countplot(x=data_tyt["transmission"])
plt.show()

Most of the cars on the dataset are with .... transmission

In [None]:
# Share of each model in the dataset, as a percentage of all rows,
# followed by the raw counts as a horizontal bar chart.
model_counts = data_tyt["model"].value_counts()
print('Model sold in %')
print(model_counts / len(data_tyt)*100)
sns.countplot(y=data_tyt["model"])
plt.show()

Top 3 cars are 
<br>
- 
- 
- 
<br>in the dataset constitute ....% of all the Toyota cars, with all other cars contributing ...%

In [None]:
# Number of cars per fuel type.
# Passed as `x=` explicitly: newer seaborn releases treat the first positional
# argument as `data`, so the positional form no longer works there.
sns.countplot(x=data_tyt["fuelType"])
plt.show()

Most Toyota cars use ...Engine

In [None]:
# Number of cars per model year (horizontal bars, one per year).
sns.countplot(data=data_tyt, y="year")
plt.show()

Most cars were produced in ...

In [None]:
# Mean price per model year (seaborn's barplot aggregates with the mean by default).
plt.figure(figsize=(15, 5), facecolor='w')
sns.barplot(data=data_tyt, x="year", y="price")
plt.show()

The recently manufactured cars (year = 2019, 2020) are sold at a higher average price than cars manufactured earlier.

In [None]:
# Mean price per transmission type.
sns.barplot(data=data_tyt, x="transmission", y="price")
plt.show()

.... cars were sold for the highest price

In [None]:
# Price against mileage, coloured by model year.
# x and y are passed by keyword: newer seaborn releases accept only `data`
# positionally, so the two-positional-argument form raises a TypeError there.
plt.figure(figsize=(15, 10), facecolor='w')
sns.scatterplot(x=data_tyt["mileage"], y=data_tyt["price"], hue=data_tyt["year"])
plt.show()

The newer the car and the lower its mileage, the higher the price it is worth.

In [None]:
# Price against mileage, coloured by fuel type.
# x and y are passed by keyword: newer seaborn releases accept only `data`
# positionally, so the two-positional-argument form raises a TypeError there.
plt.figure(figsize=(15, 5), facecolor='w')
sns.scatterplot(x=data_tyt["mileage"], y=data_tyt["price"], hue=data_tyt["fuelType"])
plt.show()

Petrol cars tend to be cheaper.

In [None]:
# Pairwise scatter plots (and per-variable distributions on the diagonal)
# for the columns of the frame that seaborn can plot.
sns.pairplot(data_tyt)

### Calculate how old each car is by subtracting the year field from 2020

In [None]:
# Year that car ages are measured against.
REFERENCE_YEAR = 2020

# Replace the model year with the car's age. The guard makes the cell safe to
# re-run: once `year` has been dropped, a second run would otherwise raise a
# KeyError when reading it.
if "year" in data_tyt.columns:
    data_tyt = (
        data_tyt
        .assign(age_of_car=REFERENCE_YEAR - data_tyt["year"])
        .drop(columns=["year"])
    )
data_tyt.sample(10)

# Pre-processing for modeling

I like to use pd.get_dummies option over OHE in SKLearn to get the one hot encoded variables for the categorical variables. It is usually tidy on the dataset and the column names are preserved.

In [None]:
# One-hot encode the categorical columns; numeric columns pass through unchanged.
data_tyt_expanded = pd.get_dummies(data=data_tyt)
data_tyt_expanded.head(5)

Applying StandardScaler to standardize all the variables in the dataset (including the target, `price`).

In [None]:
# Standardize every column of the encoded frame (target and dummies included).
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics leak into the training features. Fitting on
# the training split only would avoid this, but needs the split cell to move.
std = StandardScaler()
std.fit(data_tyt_expanded)
data_tyt_expanded_std = std.transform(data_tyt_expanded)

In [None]:
# Wrap the scaled values back in a DataFrame so the column names are available again.
data_tyt_expanded_std = pd.DataFrame(
    data=data_tyt_expanded_std,
    columns=data_tyt_expanded.columns,
)
print(data_tyt_expanded_std.shape)
data_tyt_expanded_std.head()

In [None]:
# Hold out 20% of the rows for testing. Features are every standardized column
# except `price`; the target is kept as a one-column DataFrame.
# `random_state` is fixed so the split, and therefore every score reported
# below, is the same on each Restart & Run All.
features_std = data_tyt_expanded_std.drop(columns=['price'])
target_std = data_tyt_expanded_std[['price']]
X_train, X_test, y_train, y_test = train_test_split(
    features_std, target_std, test_size=0.2, random_state=42
)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

# Modeling

In [None]:
def regression_model(model,xtrain,ytrain,xtest,ytest):
    """
    Fit ``model`` on the training data and evaluate it on the test data.

    Parameters: ``model`` is any object exposing ``fit(X, y)`` and
    ``score(X, y)``; ``xtrain``/``ytrain`` are passed to ``fit`` and
    ``xtest``/``ytest`` to ``score``.

    Returns a ``(model, score)`` tuple: the same object that was passed in
    (now fitted) and whatever its ``score`` method returned.
    """
    model.fit(xtrain, ytrain)
    return model, model.score(xtest, ytest)

In [None]:
# Baseline: ordinary least squares on all features.
model = LinearRegression()
regressor, score = regression_model(model, X_train, y_train, X_test, y_test)

# Start the results table with the baseline row. The frame is built directly
# from a list of records because `DataFrame.append` no longer exists in
# pandas 2.x.
model_performance = pd.DataFrame(
    [{"Features": "Linear", "Model": "Linear", "Score": score}],
    columns=["Features", "Model", "Score"],
)

# `None` means "do not truncate cell contents"; the older `-1` spelling is
# rejected by current pandas.
pd.set_option('display.max_colwidth', None)
model_performance

In [None]:
# First five test-set predictions. These are in standardized units because the
# target column was scaled together with the features.
regressor.predict(X_test)[:5]

In [None]:
# Undo the standardization for the test rows, with the predicted price in
# place of the true one. The scaler was fitted on `data_tyt_expanded`, so the
# columns are re-ordered by name to match it instead of assuming that `price`
# happens to be the first column.
test_scaled_with_pred = X_test.assign(price=np.ravel(regressor.predict(X_test)))
test_scaled_with_pred = test_scaled_with_pred[data_tyt_expanded.columns]
inverse_data_test = std.inverse_transform(test_scaled_with_pred.to_numpy())

In [None]:
# Label the un-scaled test rows with the column names of the encoded frame.
# The new frame gets a fresh 0..n-1 index, not the original row labels of X_test.
Inverse_test = pd.DataFrame(inverse_data_test, columns = data_tyt_expanded.columns)

In [None]:
# Preview the test rows in original units; `price` here holds the model's prediction.
Inverse_test.head()

## Selecting best features for model

Since there are 31 variables in the dataset after the one hot encoding, I am using SelectKBest option from sklearn to select the best features from the dataset for applying the regression.

For this, I run SelectKBest() with f_regression for odd values of k from 3 to 29 and compare the train and test scores to see where we get the best score.



In [None]:
# Feature names in the same order as the columns of X_train / X_test.
column_names = data_tyt_expanded.drop(columns = ['price']).columns

no_of_features = []
r_squared_train = []
r_squared_test = []

# Sweep k over odd values from 3 to 29 and record train/test R^2 for each.
# Loop-local names are used so the sweep does not overwrite `selector` or
# `X_train_transformed`, which later cells define for the chosen k.
for k in range(3, 31, 2):
    selector_k = SelectKBest(f_regression, k = k)
    X_train_k = selector_k.fit_transform(X_train, y_train)
    X_test_k = selector_k.transform(X_test)
    regressor_k = LinearRegression()
    regressor_k.fit(X_train_k, y_train)
    no_of_features.append(k)
    r_squared_train.append(regressor_k.score(X_train_k, y_train))
    r_squared_test.append(regressor_k.score(X_test_k, y_test))

# Label both curves and the axes so the figure can be read on its own;
# without labels the two lines cannot be told apart.
sns.lineplot(x = no_of_features, y = r_squared_train, label = 'train')
sns.lineplot(x = no_of_features, y = r_squared_test, label = 'test')
plt.xlabel('Number of selected features (k)')
plt.ylabel('R squared')
plt.legend()
plt.show()

We get a score of 0.88 at around 16 variables, before the curve stabilizes. Hence k is set to 16, selecting the 16 best variables from the dataset.

In [None]:
# Keep the 16 features with the highest f_regression scores, learned on the
# training split only, then apply the same selection to both splits.
selector = SelectKBest(score_func=f_regression, k=16)
selector.fit(X_train, y_train)
X_train_transformed = selector.transform(X_train)
X_test_transformed = selector.transform(X_test)
# Names of the features that were kept.
column_names[selector.get_support()]

In [None]:
# Linear regression on the 16 selected features.
model1 = LinearRegression()
regressor2, score2 = regression_model(model1, X_train_transformed, y_train, X_test_transformed, y_test)

# Add the result to the comparison table. `pd.concat` is used because
# `DataFrame.append` no longer exists in pandas 2.x. Any earlier row with the
# same label is dropped first, so re-running the cell does not duplicate it.
fs_label = "Linear+FeatureSelection"
fs_row = pd.DataFrame([{"Features": fs_label, "Model": "Linear", "Score": score2}])
model_performance = pd.concat(
    [model_performance[model_performance["Features"] != fs_label], fs_row],
    ignore_index=True,
)

# `None` means "do not truncate cell contents"; the older `-1` spelling is
# rejected by current pandas.
pd.set_option('display.max_colwidth', None)
model_performance

The score is ....

# Multilinear regression (scratch)

In [None]:
# Training design matrix: a leading column of ones (the intercept term)
# followed by the selected features.
ones = np.ones(len(X_train_transformed))
H = np.column_stack((ones, X_train_transformed))

In [None]:
# Calculate the parameter weights using the Closed Form solution of multiple linear regression
model_scratch = np.dot(np.linalg.pinv(np.dot(np.transpose(H),H)),np.dot(np.transpose(H),y_train))

In [None]:
# Apply the learned weights to unseen data.
# The test design matrix gets its own name so the training matrix `H` is not
# overwritten; otherwise re-running the weight-fitting cell afterwards would
# silently fit on the test rows.
H_test = np.column_stack((np.ones(len(X_test_transformed)), X_test_transformed))
prediction = np.dot(H_test, model_scratch)

In [None]:
# R^2 of the from-scratch model on the test split; it should match the
# sklearn model fitted on the same selected features.
r2_score(y_test, prediction)

In [None]:
# Test-set predictions of the all-features model.
y_=regressor.predict(X_test)

In [None]:
# Training-set predictions of the all-features model (to compare train vs. test error).
y_predtrain=regressor.predict(X_train)

In [None]:
# Training-set predictions of the feature-selected model.
y_fstrain=regressor2.predict(X_train_transformed)

In [None]:
# Test-set predictions of the feature-selected model.
y_fs=regressor2.predict(X_test_transformed)

In [None]:
def rms(y_act, y_pred,name):
    """
    Print the mean squared error and its square root for one set of predictions.

    ``y_act`` and ``y_pred`` are the true and predicted targets; ``name`` is a
    label printed alongside each metric. Nothing is returned.
    """
    # Compute the MSE once and reuse it for the RMSE.
    mse = mean_squared_error(y_act, y_pred)
    print('MSE',name,mse)
    print('RMSE',name,np.sqrt(mse))
def adjr2(R2,n,p):
    """
    Print the adjusted R squared for a model.

    ``R2`` is the plain R squared, ``n`` the number of observations and ``p``
    the number of predictors. The value is printed, not returned.
    """
    shrinkage = (1 - R2) * (n - 1) / (n - p - 1)
    print('adj R2', 1 - shrinkage)

In [None]:
# Error metrics of the all-features model on both splits.
rms(y_test, y_, 'test')
rms(y_train, y_predtrain, 'train')

# Adjusted R^2 on the test split: n test rows, p predictors in the model.
n, p = len(y_test), X_test.shape[1]
adjr2(score, n, p)

In [None]:
# Error metrics of the feature-selected model on both splits.
rms(y_test,y_fs,'FS+test')
rms(y_train,y_fstrain,'FS+train')

# Adjusted R^2 on the test split for the feature-selected model. This must use
# `score2` (the R^2 of `regressor2`); `score` belongs to the all-features model.
n=len(y_test)
p=X_test_transformed.shape[1]
adjr2(score2,n,p)

In [None]:
# Variance inflation factor for each numeric predictor, computed on the test
# split. The column subset is taken once and reused.
numeric_features = ['mileage','tax','mpg','engineSize','age_of_car']
X_test_numeric = X_test[numeric_features]

vif = pd.DataFrame()
vif['VIF'] = [
    variance_inflation_factor(X_test_numeric.values, i)
    for i in range(X_test_numeric.shape[1])
]
vif["features"] = X_test_numeric.columns
vif

In [None]:
# Variance inflation factor for each of the features kept by SelectKBest,
# computed on the test split.
vif = pd.DataFrame()
vif['VIF'] = [
    variance_inflation_factor(X_test_transformed, i)
    for i in range(np.shape(X_test_transformed)[1])
]
vif["features"] = column_names[selector.get_support()]
vif