In [1]:
import os
from math import sqrt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split, cross_val_predict, KFold #Splitting the data to training & testing set.
from sklearn.impute import SimpleImputer #Handling Null values
from sklearn.compose import ColumnTransformer #Column Transformer.
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler #Data Preprocessing
from sklearn.linear_model import LinearRegression #Linear Regression.
from sklearn.preprocessing import PolynomialFeatures #Polynomial Regression
from sklearn.svm import SVR #Support Vector Machines
from sklearn.tree import DecisionTreeRegressor #Decision Trees
from sklearn.ensemble import RandomForestRegressor #Random Forest
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline

  from pandas import Int64Index as NumericIndex


In [50]:
# Location of the 50_Startups.csv dataset.
# Set the STARTUPS_CSV environment variable to point at your own copy of the
# file; when it is not set, the original author's local path is used so the
# notebook keeps working unchanged on that machine.
DATA_PATH = os.environ.get(
    "STARTUPS_CSV",
    r"D:\Data Science\Machine+Learning+A-Z+(Codes+and+Datasets)\Machine Learning A-Z (Codes and Datasets)\Part 2 - Regression\Section 5 - Multiple Linear Regression\Python\50_Startups.csv",
)

# Load the raw dataset and preview the first rows.
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [51]:
# List the distinct categories found in the "State" column.
data["State"].unique()

array(['New York', 'California', 'Florida'], dtype=object)

In [52]:
# Split the frame positionally into NumPy arrays:
#   X -> every column except the last one (the predictors)
#   y -> the last column only (the regression target)
X, y = (
    data.iloc[:, :-1].values,
    data.iloc[:, -1].values,
)

In [53]:
# One-hot encode the categorical column at position 3 and pass every other
# column through untouched. drop='first' removes one dummy column so the
# remaining dummies are not perfectly collinear.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(drop = 'first'), [3])],
    remainder='passthrough',
)

# Always encode the raw feature columns taken straight from `data` instead of
# the current value of X. This keeps the cell idempotent: re-running it no
# longer tries to one-hot encode a column of the already-encoded matrix.
X = ct.fit_transform(data.iloc[:, :-1].values)

In [54]:
# Peek at the first five rows of the encoded feature matrix.
X[:5]

array([[0.0, 1.0, 165349.2, 136897.8, 471784.1],
       [0.0, 0.0, 162597.7, 151377.59, 443898.53],
       [1.0, 0.0, 153441.51, 101145.55, 407934.54],
       [0.0, 1.0, 144372.41, 118671.85, 383199.62],
       [1.0, 0.0, 142107.34, 91391.77, 366168.42]], dtype=object)

In [55]:
# 5-fold cross-validation; the folds are shuffled with a fixed seed.
cv = KFold(n_splits = 5, shuffle = True, random_state = 42)

# Scale the features, then fit a small random forest.
# The forest is seeded as well: without random_state its bootstrap samples
# change on every run, so the metrics reported below would not be
# reproducible even though the folds themselves are fixed.
regressor_pipeline = make_pipeline(
    StandardScaler(),
    RandomForestRegressor(n_estimators = 10, random_state = 42),
)

# Out-of-fold predictions: each sample is predicted by a model that did not
# see it during fitting.
y_pred = cross_val_predict(regressor_pipeline, X, y, cv = cv)

In [56]:
# Summarise the cross-validated predictions, rounded to two decimals.
cv_rmse = round(sqrt(mean_squared_error(y, y_pred)), 2)
cv_r2 = round(r2_score(y, y_pred), 2)
print(f"RMSE: {cv_rmse}")
print(f"R_squared: {cv_r2}")

RMSE: 11153.89
R_squared: 0.92


In [57]:
# Show predictions (left column) next to the true targets (right column).
print(np.concatenate([y_pred[:, np.newaxis], y[:, np.newaxis]], axis=1))

[[189865.712 192261.83 ]
 [179330.714 191792.06 ]
 [169070.98  191050.39 ]
 [181635.204 182901.99 ]
 [169217.804 166187.94 ]
 [160238.775 156991.12 ]
 [147566.19  156122.51 ]
 [150688.425 155752.6  ]
 [146976.746 152211.77 ]
 [148767.257 149759.96 ]
 [140557.623 146121.95 ]
 [137635.494 144259.4  ]
 [128400.372 141585.52 ]
 [132176.028 134307.35 ]
 [148674.491 132602.65 ]
 [141244.054 129917.04 ]
 [116478.647 126992.93 ]
 [134742.572 125370.37 ]
 [134655.783 124266.9  ]
 [129441.189 122776.86 ]
 [110108.784 118474.03 ]
 [123699.836 111313.02 ]
 [110126.979 110352.25 ]
 [107037.844 108733.99 ]
 [113453.61  108552.04 ]
 [100374.046 107404.34 ]
 [113471.391 105733.54 ]
 [110664.363 105008.31 ]
 [103530.405 103282.38 ]
 [105265.023 101004.64 ]
 [ 98801.588  99937.59 ]
 [ 99467.215  97483.56 ]
 [ 96565.841  97427.84 ]
 [100481.614  96778.92 ]
 [ 92299.875  96712.8  ]
 [ 93565.583  96479.51 ]
 [ 69027.738  90708.19 ]
 [ 87485.291  89949.14 ]
 [ 66722.267  81229.06 ]
 [ 88448.752  81005.76 ]


In [58]:
# Hold out 20% of the rows as a test set; shuffling is seeded so the split
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    shuffle = True,
    random_state = 42,
)

In [60]:
# Sanity check: dimensions of the training features and training targets.
(X_train.shape, y_train.shape)

((40, 5), (40,))

In [62]:
# Ordinary least squares with an intercept, fitted on the training split.
linear_regressor = LinearRegression(fit_intercept = True)
linear_regressor.fit(X_train, y_train)

# Score the model on the held-out test split.
linear_y_pred = linear_regressor.predict(X_test)

print(f"RMSE       :  {sqrt(mean_squared_error(y_test, linear_y_pred))}")
print(f"R_squared  :  {r2_score(y_test, linear_y_pred)}")

RMSE       :  9055.957323497822
R_squared  :  0.8987266414319834


In [116]:
# Single decision tree fitted on the training split.
# max_features="sqrt" makes the tree consider a random subset of features at
# each split, so the estimator is seeded; without random_state the fitted
# tree (and the metrics below) would change from run to run.
tree_regressor = DecisionTreeRegressor(
    criterion = "squared_error",
    max_features = "sqrt",
    min_samples_split = 17,
    ccp_alpha = 0.00008,
    random_state = 42,
).fit(X_train, y_train)

# Score the model on the held-out test split.
tree_y_pred = tree_regressor.predict(X_test)
print("RMSE       : ",sqrt(mean_squared_error(y_test, tree_y_pred)))
print("R_squared  : ",r2_score(y_test, tree_y_pred))

RMSE       :  13318.218512699052
R_squared  :  0.7809623604460533


In [117]:
# Random forest of 10 trees fitted on the training split.
# The estimator is seeded: without random_state the bootstrap samples differ
# on every run, so the metrics below would not be reproducible.
forest_regressor = RandomForestRegressor(
    n_estimators = 10,
    random_state = 42,
).fit(X_train, y_train)

# Score the model on the held-out test split.
forest_y_pred = forest_regressor.predict(X_test)
print("RMSE       : ",sqrt(mean_squared_error(y_test, forest_y_pred)))
print("R_squared  : ",r2_score(y_test, forest_y_pred))

RMSE       :  10976.725893987155
R_squared  :  0.851210557295637
