In [1]:
import pandas as pd

dataset = pd.read_csv('Datasets/50_Startups.csv')
dataset.sample(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
29,65605.48,153032.06,107138.38,New York,101004.64
38,20229.59,65947.93,185265.1,New York,81229.06
11,100671.96,91790.61,249744.55,California,144259.4
32,63408.86,129219.61,46085.25,California,97427.84
6,134615.46,147198.87,127716.82,California,156122.51


In [2]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [3]:
dataset['State'].unique()

array(['New York', 'California', 'Florida'], dtype=object)

In [4]:
states = pd.get_dummies(dataset['State'], drop_first = True)

In [5]:
dataset.drop(['State'], axis = 1, inplace = True)

In [6]:
dataset = pd.concat([states, dataset], axis = 1)

In [7]:
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 0)

In [9]:
import statsmodels.api as sm

X = sm.add_constant(X)

In [10]:
multiple_regressor_OLS_1 = sm.OLS(endog = y_train, exog = X_train).fit()
multiple_regressor_OLS_1.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared (uncentered):,0.989
Model:,OLS,Adj. R-squared (uncentered):,0.988
Method:,Least Squares,F-statistic:,656.0
Date:,"Tue, 07 Jul 2020",Prob (F-statistic):,1.56e-33
Time:,13:04:52,Log-Likelihood:,-432.43
No. Observations:,40,AIC:,874.9
Df Residuals:,35,BIC:,883.3
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Florida,530.1647,5269.683,0.101,0.920,-1.02e+04,1.12e+04
New York,4999.3655,4661.687,1.072,0.291,-4464.362,1.45e+04
R&D Spend,0.6845,0.068,10.001,0.000,0.546,0.823
Administration,0.3418,0.036,9.625,0.000,0.270,0.414
Marketing Spend,0.0736,0.024,3.122,0.004,0.026,0.121

0,1,2,3
Omnibus:,2.176,Durbin-Watson:,1.961
Prob(Omnibus):,0.337,Jarque-Bera (JB):,1.675
Skew:,-0.501,Prob(JB):,0.433
Kurtosis:,2.963,Cond. No.,798000.0


In [None]:
X_features = X.columns.tolist()

In [29]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
import numpy as np

def get_vif_factors(X):
    X_matrix = np.matrix(X)
    vif = [variance_inflation_factor( X_matrix, i ) for i in range(X_matrix.shape[1])]
    vif_factors = pd.DataFrame()
    vif_factors['column'] = X_features
    vif_factors['VIF'] = vif
    return vif_factors

In [31]:
vif_factors = get_vif_factors(X.values)
vif_factors

Unnamed: 0,column,VIF
0,const,26.600153
1,Florida,1.387641
2,New York,1.335061
3,R&D Spend,2.495511
4,Administration,1.177766
5,Marketing Spend,2.416797


In [36]:
columns_with_large_vif = vif_factors[vif_factors['VIF'] > 4].column
columns_with_large_vif

0    const
Name: column, dtype: object

In [38]:
X.corr()

Unnamed: 0,const,Florida,New York,R&D Spend,Administration,Marketing Spend
const,,,,,,
Florida,,1.0,-0.492366,0.105711,0.010493,0.205685
New York,,-0.492366,1.0,0.039068,0.005145,-0.03367
R&D Spend,,0.105711,0.039068,1.0,0.241955,0.724248
Administration,,0.010493,0.005145,0.241955,1.0,-0.032154
Marketing Spend,,0.205685,-0.03367,0.724248,-0.032154,1.0
