In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read by path.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [3]:
# Load the home-price dataset from the mounted Drive and preview its first rows.
# NOTE(review): this absolute Drive path matches one account's folder layout —
# adjust it before running the notebook anywhere else.
df=pd.read_csv('/content/drive/MyDrive/DS360 Machine Learning/Course DSML/ML/Regression/regression_home_prices.csv')
df.head()

Unnamed: 0,area_sqr_ft,price_lakhs,bedrooms
0,656.0,39.0,2
1,1260.0,83.2,2
2,1057.0,86.6,3
3,1259.0,59.0,2
4,1800.0,140.0,3


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(200, 3)

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Single scaler instance shared by data_splitting; every call that asks for
# scaling refits it on that call's training split.
stdscl=StandardScaler()

def data_splitting(scaling=None, test_size=0.20, random_state=42):
  """Split the module-level ``df`` into train/test features and target.

  ``price_lakhs`` is the target; every other column of ``df`` is a feature.

  Args:
    scaling: When truthy, standardize the features with the module-level
      ``stdscl``. The scaler is fitted on the training split only and then
      applied to the test split, so the test rows never influence the scaling.
      When falsy (the default ``None``), the features are returned unscaled.
    test_size: Forwarded to ``train_test_split``. Defaults to 0.20, the value
      that used to be hard-coded.
    random_state: Forwarded to ``train_test_split``. Defaults to 42, the value
      that used to be hard-coded.

  Returns:
    ``(X_train, X_test, y_train, y_test)``. When scaling is requested the two
    feature splits are whatever ``stdscl.fit_transform`` / ``stdscl.transform``
    return; otherwise they are the splits as returned by ``train_test_split``.
  """
  X=df.drop(columns="price_lakhs")
  y=df["price_lakhs"]

  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=test_size, random_state=random_state)

  # Plain truthiness instead of `== True`: True still scales, None/False still
  # do not, and other truthy flags are no longer silently ignored.
  if scaling:
    X_train=stdscl.fit_transform(X_train)
    X_test=stdscl.transform(X_test)

  return X_train, X_test, y_train, y_test

In [7]:
from sklearn.metrics import mean_squared_error, r2_score

def classifiers(classifier, X_train, X_test, y_train, y_test):
  """Fit ``classifier`` and print its train score plus held-out MSE and R2.

  Args:
    classifier: Estimator exposing ``fit``; the object ``fit`` returns must
      expose ``score`` and ``predict``.
    X_train, y_train: Data the estimator is fitted and train-scored on.
    X_test, y_test: Held-out data used for the two printed test metrics.

  Returns:
    None. All results are reported through ``print``.
  """
  fitted=classifier.fit(X_train, y_train)
  train_score=fitted.score(X_train, y_train)
  print(f"Train Score: {train_score}\n")

  # Predict once and reuse the predictions for both test metrics.
  y_pred=fitted.predict(X_test)
  test_mse=mean_squared_error(y_test, y_pred)
  print(f"Mean Squared Error: {test_mse}\n")
  test_r2=r2_score(y_test, y_pred)
  print(f"R2 Score: {test_r2}\n")

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [9]:
# One instance of each baseline regressor, all constructed with no arguments.
# NOTE(review): classifiers(...) calls .fit on whatever it is given, so these
# same objects are refit by each later evaluation cell — confirm nothing relies on an earlier fit.
# NOTE(review): DecisionTreeRegressor has no random_state; consider pinning one for repeatable runs.
lr=LinearRegression()
svm=SVR()
dt=DecisionTreeRegressor()

In [10]:
# Standardized split; the trailing expression displays the four shapes as a sanity check.
X_train, X_test, y_train, y_test = data_splitting(scaling=True)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((160, 2), (40, 2), (160,), (40,))

In [11]:
# Linear regression on the standardized features.
classifiers(lr, X_train, X_test, y_train, y_test)

Train Score: 0.8940849291817146

Mean Squared Error: 50.94469128703867

R2 Score: 0.8553362627729599



In [12]:
# Support vector regression on the standardized features.
classifiers(svm, X_train, X_test, y_train, y_test)

Train Score: 0.8252860938915973

Mean Squared Error: 66.80760182157867

R2 Score: 0.810291570907125



In [13]:
# Decision tree on the standardized features.
classifiers(dt, X_train, X_test, y_train, y_test)

Train Score: 0.9731180872935876

Mean Squared Error: 92.31376222363946

R2 Score: 0.7378636811141561



In [14]:
# Rebuild the split without scaling (same test_size and random_state) so the
# same three models can be compared on raw features.
X_train, X_test, y_train, y_test = data_splitting()
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((160, 2), (40, 2), (160,), (40,))

In [15]:
# Linear regression on the raw (unscaled) features.
classifiers(lr, X_train, X_test, y_train, y_test)

Train Score: 0.8940849291817146

Mean Squared Error: 50.94469128703868

R2 Score: 0.8553362627729599



In [16]:
# Support vector regression on the raw (unscaled) features.
classifiers(svm, X_train, X_test, y_train, y_test)

Train Score: 0.17988871572215237

Mean Squared Error: 383.1144430333644

R2 Score: -0.0879007353797252



In [17]:
# Decision tree on the raw (unscaled) features.
classifiers(dt, X_train, X_test, y_train, y_test)

Train Score: 0.9731180872935876

Mean Squared Error: 92.31376222363946

R2 Score: 0.7378636811141561



In [18]:
# Switch back to the standardized split for the ensemble models that follow.
X_train, X_test, y_train, y_test = data_splitting(scaling=True)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((160, 2), (40, 2), (160,), (40,))

In [19]:
from sklearn.ensemble import VotingRegressor

In [20]:
# Ensemble of five base regressors (one linear model, two SVRs, two decision
# trees) combined by VotingRegressor.
# NOTE(review): no weights are passed, so the library default applies —
# presumably a plain average of the base predictions; confirm in the sklearn docs.
# NOTE(review): the decision trees have no random_state, so reruns may differ.
voting_clf=VotingRegressor(
    estimators=[('lr', LinearRegression()), ('svm1', SVR()), ('svm2', SVR(kernel='linear')),
                ('dt1', DecisionTreeRegressor()), ('dt2', DecisionTreeRegressor(criterion='friedman_mse'))]
)

voting_clf.fit(X_train, y_train)
# Training-set score, displayed as the cell output.
voting_clf.score(X_train, y_train)

0.9413649209492645

In [21]:
# Held-out evaluation of the voting ensemble: test-set MSE and R2.
prediction=voting_clf.predict(X_test)
print(f"Mean Squared Error: {mean_squared_error(y_test, prediction)}\n")
print(f"R2 Score: {r2_score(y_test, prediction)}\n")

Mean Squared Error: 58.86052011935912

R2 Score: 0.8328582900303048



In [22]:
from sklearn.ensemble import RandomForestRegressor
# Small random forest (6 trees). random_state is pinned (42, matching the
# train/test split) so the forest's internal randomness — and therefore the
# scores printed when it is evaluated — is identical on every Restart & Run All.
rf=RandomForestRegressor(n_estimators=6, random_state=42)

In [23]:
# Random forest on the standardized features.
classifiers(rf, X_train, X_test, y_train, y_test)

Train Score: 0.9493489224436428

Mean Squared Error: 70.66845683998365

R2 Score: 0.7993281966723628



In [24]:
from xgboost import XGBRegressor

# XGBoost regressor constructed with no arguments (library defaults).
xgb=XGBRegressor()

In [25]:
# XGBoost on the standardized features.
classifiers(xgb, X_train, X_test, y_train, y_test)

Train Score: 0.9729240399893642

Mean Squared Error: 87.37235741133664

R2 Score: 0.7518954098230761



In [26]:
from lightgbm import LGBMRegressor

# LightGBM regressor with default hyperparameters. verbose=-1 stops the
# library's [Info] training log from flooding the cell output ahead of the
# metrics; note that it also hides LightGBM warnings, so drop it when debugging.
lgbm=LGBMRegressor(verbose=-1)

In [27]:
# LightGBM on the standardized features.
classifiers(lgbm, X_train, X_test, y_train, y_test)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000799 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 53
[LightGBM] [Info] Number of data points in the train set: 160, number of used features: 2
[LightGBM] [Info] Start training from score 79.130625
Train Score: 0.9008294431957051

Mean Squared Error: 53.824508819082396

R2 Score: 0.8471586655357893

