# Day 3
1. Preprocess data
   * Import libraries
   * Import dataset
   * Check for missing data
   * Encode categorical data
   * Make dummy variables
   * Feature scaling


In [6]:
import numpy as np 
import pandas as pd 

# Load the 50-startups dataset and give the columns identifier-friendly names
# ('R&D Spend' -> 'R_D', 'Marketing Spend' -> 'Marketing') so they can be used
# directly inside the regression formulas in later cells.
data = pd.read_csv(
    "https://raw.githubusercontent.com/Avik-Jain/100-Days-Of-ML-Code/master/datasets/50_Startups.csv",
    delimiter=',',
).rename(columns={'R&D Spend': 'R_D', 'Marketing Spend': 'Marketing'})

# Enforce the "no missing data" expectation instead of computing the check
# and discarding its result.
assert not data.isnull().any().any(), "dataset contains missing values"

# Correlate numeric columns only: 'State' holds strings, and pandas >= 2.0
# raises on non-numeric columns instead of silently dropping them.
print(data.select_dtypes(include='number').corr())
data.head()

                     R_D  Administration  Marketing    Profit
R_D             1.000000        0.241955   0.724248  0.972900
Administration  0.241955        1.000000  -0.032154  0.200717
Marketing       0.724248       -0.032154   1.000000  0.747766
Profit          0.972900        0.200717   0.747766  1.000000


Unnamed: 0,R_D,Administration,Marketing,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [8]:
import statsmodels.formula.api as sm 

# Ordinary least squares of Profit on all three spend columns,
# followed by the tabular regression report.
model = sm.ols(
    formula='Profit ~ R_D + Marketing + Administration',
    data=data,
).fit()
print(model.summary2())

                    Results: Ordinary least squares
Model:                 OLS               Adj. R-squared:      0.948     
Dependent Variable:    Profit            AIC:                 1058.7715 
Date:                  2021-03-25 12:22  BIC:                 1066.4196 
No. Observations:      50                Log-Likelihood:      -525.39   
Df Model:              3                 F-statistic:         296.0     
Df Residuals:          46                Prob (F-statistic):  4.53e-30  
R-squared:             0.951             Scale:               8.5236e+07
------------------------------------------------------------------------
                 Coef.     Std.Err.    t    P>|t|    [0.025     0.975]  
------------------------------------------------------------------------
Intercept      50122.1930 6572.3526  7.6262 0.0000 36892.7333 63351.6526
R_D                0.8057    0.0451 17.8464 0.0000     0.7148     0.8966
Marketing          0.0272    0.0165  1.6551 0.1047    -0.0059     0.0603

In [10]:
# Single-predictor model: Profit explained by R&D spend alone.
model_rd = sm.ols(formula='Profit ~ R_D', data=data).fit()

# Residual sum of squares from the in-sample predictions.
residuals_rd = data.Profit - model_rd.predict()
print('R2 = ', model_rd.rsquared)
print('RSS = ', (residuals_rd ** 2).sum())

R2 =  0.9465353160804393
RSS =  4256046566.3534527


In [12]:
# Two-predictor model: Profit explained by R&D and marketing spend.
model_rd_mark = sm.ols(formula='Profit ~ R_D + Marketing ', data=data).fit()

# Residual sum of squares from the in-sample predictions.
residuals_rd_mark = data.Profit - model_rd_mark.predict()
print('R2 = ', model_rd_mark.rsquared)
print('RSS = ', (residuals_rd_mark ** 2).sum())

R2 =  0.9504503015559763
RSS =  3944394850.324378


In [26]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Replace every object-dtype column of `data` with integer codes.
# The same encoder instance is refit on each column in turn.
le = LabelEncoder()

cat_feature_mask = data.dtypes == object
categorical_columns = list(data.columns[cat_feature_mask])

encoded_columns = data[categorical_columns].apply(le.fit_transform)
data[categorical_columns] = encoded_columns
data[categorical_columns].head()

0
1
2
3
4


In [44]:
from sklearn.compose import ColumnTransformer
# Features are every column except the last; the target is the last column,
# kept two-dimensional by the `-1:` slice.
X, y = data.iloc[:, :-1].values, data.iloc[:, -1:].values

# Look the categorical column up by name rather than hard-coding its position.
state_col = data.columns.get_loc('State')

# One-hot encode the state column and pass the remaining columns through.
# Dense output is requested via `sparse_threshold=0` on the ColumnTransformer
# rather than `OneHotEncoder(sparse=False)`: the `sparse` keyword was renamed
# to `sparse_output` in scikit-learn 1.2 and removed in 1.4, whereas
# `sparse_threshold=0` yields a dense array on old and new releases alike.
ct = ColumnTransformer(
    [('oh_enc', OneHotEncoder(), [state_col])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = ct.fit_transform(X)

# 80/20 split with a fixed seed so the reported metrics are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=0)

In [45]:
from sklearn.linear_model import LinearRegression
# Fit a linear regression on the training split (fit() returns the estimator)
# and predict the held-out targets.
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)


In [46]:
from sklearn import metrics

# Error of the test-set predictions against the true targets.
mse = metrics.mean_squared_error(y_test, y_pred)
mae = metrics.mean_absolute_error(y_test, y_pred)

print('Mean Squared Error: ', mse)
print('Mean Absolute Error: ', mae)


Mean Squared Error:  83502864.03256583
Mean Absolute Error:  7514.2936596433165
