In [27]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import fbeta_score
from sklearn.metrics import r2_score,mean_absolute_error
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv('./USA_Housing.csv')

#### Q1. K-Fold Cross Validation for Multiple Linear Regression (Least Square Error Fit)

Download the dataset regarding USA House Price Prediction from the following link:

https://drive.google.com/file/d/1O_NwpJT-8xGfU_-3llUl2sgPu0xllOrX/view?usp=sharing

Load the dataset and Implement 5- fold cross validation for multiple linear regression (using least square error fit).

Steps:
1. Divide the dataset into input features (all columns except price) and output variable
(price)
2. Scale the values of input features.
3. Divide input and output features into five folds.
4. Run five iterations, in each iteration consider one-fold as test set and remaining
four sets as training set. Find the beta (𝛽) matrix, predicted values, and R2_score
for each iteration using least square error fit.
5. Use the best value of (𝛽) matrix (for which R2_score is maximum), to train the
regressor for 70% of data and test the performance for remaining 30% data.

In [28]:
x = df[['Avg. Area Income','Avg. Area House Age','Avg. Area Number of Rooms','Avg. Area Number of Bedrooms','Area Population']]
y = df[['Price']]

In [29]:
scaler = MinMaxScaler()
x = pd.DataFrame(scaler.fit_transform(x), columns=x.columns)
y = pd.DataFrame(scaler.fit_transform(y), columns=y.columns)

In [30]:
beta_mat = 0
r2Score = 0
max = -1
for i in range(1,6):
  x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.30,random_state=i*10)
  reg = linear_model.LinearRegression()
  reg.fit(x_train,y_train)
  predict_data = reg.predict(x_test)
  score = r2_score(y_test,predict_data)
  if score > r2Score :
    r2Score = score
    beta_mat = reg.coef_
    max = i
print(r2Score)
print(beta_mat)

0.9186390166146279
[[0.78888977 0.46637128 0.36741804 0.00327103 0.43562802]]


In [31]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.30,random_state=max*10)
reg = linear_model.LinearRegression()
reg.fit(x_train,y_train)
predict_data = reg.predict(x_test)
score = r2_score(y_test,predict_data)
print(mean_absolute_error(y_test,predict_data))

0.03290858400261874


#### Q2. Concept of Validation set for Multiple Linear Regression (Gradient Descent Optimization)

Consider the same dataset of Q1, rather than dividing the dataset into five folds, divide the
dataset into training set (56%), validation set (14%), and test set (30%).
Consider four different values of learning rate i.e. {0.001,0.01,0.1,1}. Compute the values of
regression coefficients for each value of learning rate after 1000 iterations.
For each set of regression coefficients, compute R2_score for validation and test set and find
the best value of regression coefficients. 

In [32]:
import pandas as pd
import numpy as np
from random import randrange
from sklearn.metrics import r2_score

def train_test_split(dataset,split):
    train=pd.DataFrame()
    train_size = split*len(dataset)
    dataset_copy = dataset.copy()
    while len(train) < train_size:
        index = randrange(len(dataset_copy))
        train = train.append(dataset_copy.iloc[index],ignore_index=True)
        dataset_copy = dataset_copy.drop(labels=index,axis=0)
        temp = pd.DataFrame()
        temp = temp.append(dataset_copy,ignore_index=True)
        dataset_copy=temp
    return train,dataset_copy

# Regression model using gradient descent method
def model(X,Y,learning_rate,iterations):
    m=Y.size
    theta=np.zeros((X.shape[1],1))
    cost_list=[]
    for i in range(iterations):
        y_pred=np.dot(X,theta)
        cost=(1/(2*m))*np.sum(np.square(y_pred-Y))
        d_theta=(1/m)*np.dot(X.T,y_pred-Y)
        theta=theta-learning_rate*d_theta
        cost_list.append(cost)
    return theta,cost_list
  
df = pd.read_csv('./USA_Housing.csv')

# Normalization
df=(df-df.min())/(df.max()-df.min())

beta_mat = []
beta_mat = np.array(beta_mat)
r2Score_validate = -1000
r2Score_test = -1000
df_train_validate,df_test = train_test_split(df,0.7)
df_train,df_validate = train_test_split(df_train_validate,0.8)
x_train = df_train[['Avg. Area Income','Avg. Area House Age','Avg. Area Number of Rooms','Avg. Area Number of Bedrooms','Area Population']]
y_train = df_train[['Price']]
x_validate = df_validate[['Avg. Area Income','Avg. Area House Age','Avg. Area Number of Rooms','Avg. Area Number of Bedrooms','Area Population']]
y_validate = df_validate[['Price']]
x_test = df_test[['Avg. Area Income','Avg. Area House Age','Avg. Area Number of Rooms','Avg. Area Number of Bedrooms','Area Population']]
y_test = df_test[['Price']]
xnew=np.vstack((np.ones((x_train.shape[0],)),x_train.T)).T 
x_test=np.vstack((np.ones((x_test.shape[0],)),x_test.T)).T 
x_validate=np.vstack((np.ones((x_validate.shape[0],)),x_validate.T)).T
for i in range(1,5):
    iterations=1000
    learning_rate=0.0001*(10**i)
    theta,cost_list=model(xnew,y_train,learning_rate=learning_rate,iterations=iterations)
    y_pred_validate=np.dot(x_validate,theta)
    y_pred_test = np.dot(x_test,theta)

    r2_score_validate = r2_score(np.array(y_validate),np.array(y_pred_validate))
    r2_score_test = r2_score(np.array(y_test),np.array(y_pred_test))

    if r2_score_validate > r2Score_validate:
        r2Score_validate = r2_score_validate
        beta_mat = theta.copy()

    if r2_score_test > r2Score_test:
        r2Score_test = r2_score_test
        beta_mat = theta.copy()
        
print(r2Score_test)
print(r2Score_validate)
print(beta_mat)

0.812229589140265
0.8052658928201835
[[-0.21830241]
 [ 0.49780189]
 [ 0.3419852 ]
 [ 0.21325894]
 [ 0.02454439]
 [ 0.29697894]]


#### Q3. Pre-processing and Multiple Linear Regression

Download the dataset regarding Car Price Prediction from the following link:
https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data
1. Load the dataset with following column names ["symboling", "normalized_losses",
"make", "fuel_type", "aspiration","num_doors", "body_style", "drive_wheels",
"engine_location", "wheel_base", "length", "width", "height", "curb_weight",
"engine_type", "num_cylinders", "engine_size", "fuel_system", "bore", "stroke",
"compression_ratio", "horsepower", "peak_rpm", "city_mpg", "highway_mpg", "price"]
and replace all ? values with NaN
2. Replace all NaN values with central tendency imputation. Drop the rows with NaN
values in price column
3. There are 10 columns in the dataset with non-numeric values. Convert these values to
numeric values using following scheme:
(i) For “num_doors” and “num_cylinders”: convert words (number names) to figures
for e.g., two to 2
(ii) For "body_style", "drive_wheels": use dummy encoding scheme
(iii) For “make”, “aspiration”, “engine_location”,fuel_type: use label encoding
scheme
(iv) For fuel_system: replace values containing string pfi to 1 else all values to 0.
(v) For engine_type: replace values containing string ohc to 1 else all values to 0.
4. Divide the dataset into input features (all columns except price) and output variable
(price). Scale all input features.
5. Train a linear regressor on 70% of data (using inbuilt linear regression function of
Python) and test its performance on remaining 30% of data.
6. Reduce the dimensionality of the feature set using inbuilt PCA decomposition and then
again train a linear regressor on 70% of reduced data (using inbuilt linear regression
function of Python). Does it lead to any performance improvement on test set? 

In [33]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler,LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from word2number import w2n
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

df = pd.read_csv('./imports-85.data')
df.columns = ["symboling", "normalized_losses","make", "fuel_type", "aspiration","num_doors", "body_style", "drive_wheels","engine_location", "wheel_base", "length", "width", "height", "curb_weight","engine_type", "num_cylinders", "engine_size", "fuel_system", "bore", "stroke","compression_ratio", "horsepower", "peak_rpm", "city_mpg", "highway_mpg", "price"]
df = df.replace("?","NaN")
df = df[df.price!="NaN"]

imputer = SimpleImputer(missing_values = "NaN",strategy = "most_frequent")
imputer = imputer.fit(df)
df= imputer.transform(df)
df = pd.DataFrame(df)
df.columns = ["symboling", "normalized_losses","make", "fuel_type", "aspiration","num_doors", "body_style", "drive_wheels","engine_location", "wheel_base", "length", "width", "height", "curb_weight","engine_type", "num_cylinders", "engine_size", "fuel_system", "bore", "stroke","compression_ratio", "horsepower", "peak_rpm", "city_mpg", "highway_mpg", "price"]

for i in range(0,df.shape[0]):
    df["num_doors"][i]=w2n.word_to_num(df["num_doors"][i])
for i in range(0,df.shape[0]):
    df["num_cylinders"][i]=w2n.word_to_num(df["num_cylinders"][i])
    
encoded_body=pd.get_dummies(df['body_style'])
encoded_wheels=pd.get_dummies(df['drive_wheels'])
df=df.drop(['body_style'],axis=1)
df=df.drop(['drive_wheels'],axis=1)
frames = [df, encoded_body]
  
df = pd.concat(frames,axis=1)
label_encoder = LabelEncoder()
df["make"]=label_encoder.fit_transform(df["make"])
df["aspiration"]=label_encoder.fit_transform(df["aspiration"])
df["engine_location"]=label_encoder.fit_transform(df["engine_location"])
df["fuel_type"]=label_encoder.fit_transform(df["fuel_type"])

for i in range(0,df.shape[0]):
    if df["fuel_system"][i].find("pfi")!=-1:
        df["fuel_system"][i]=1
    else:
        df["fuel_system"][i]=0
        
for i in range(0,df.shape[0]):
    if df["engine_type"][i].find("ohc")!=-1:
        df["engine_type"][i]=1
    else:
        df["engine_type"][i]=0
        
x = df.drop(["price"],axis=1)
y = df[["price"]]

scaler = MinMaxScaler()

x = pd.DataFrame(scaler.fit_transform(x), columns=x.columns)
y = pd.DataFrame(scaler.fit_transform(y), columns=y.columns)

model=LinearRegression()
X_train,X_test,Y_train,Y_test=train_test_split(x,y,train_size=0.7,random_state=0)
final_model=model.fit(X_train,Y_train)
Y_predicted_final=model.predict(X_test)
print(mean_absolute_error(Y_test,Y_predicted_final))

pca = PCA(n_components = 20)
pca.fit(x)
x = pca.transform(x)
x = pd.DataFrame(x)
model=LinearRegression()
X_train,X_test,Y_train,Y_test=train_test_split(x,y,train_size=0.7,random_state=0)
final_model=model.fit(X_train,Y_train)
Y_predicted_final=model.predict(X_test)
print(mean_absolute_error(Y_test,Y_predicted_final))

0.07090671807263883
0.07206866300682549
