### Predicting Profit Using ANN

In [3]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import keras
from keras.models import Sequential
from keras.layers import Dense
plt.style.use('seaborn')

In [5]:
# loading data
data = pd.read_csv('50_Startups.csv')
data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [6]:
# data check for missing values
data.isna().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [8]:
# check for duplicated records
data.duplicated().sum()

0

In [9]:
# check for zero values in input features
data.isin([0]).sum()

R&D Spend          2
Administration     0
Marketing Spend    3
State              0
Profit             0
dtype: int64

In [10]:
data['State'].unique()

array(['New York', 'California', 'Florida'], dtype=object)

In [11]:
# split the data as X and y
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

In [12]:
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State
0,165349.2,136897.8,471784.1,New York
1,162597.7,151377.59,443898.53,California
2,153441.51,101145.55,407934.54,Florida
3,144372.41,118671.85,383199.62,New York
4,142107.34,91391.77,366168.42,Florida


In [13]:
y.head()

0    192261.83
1    191792.06
2    191050.39
3    182901.99
4    166187.94
Name: Profit, dtype: float64

In [14]:
# scaling the data
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X.iloc[:, :-1] = sc.fit_transform(X.iloc[:, :-1])
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State
0,2.016411,0.560753,2.153943,New York
1,1.95586,1.082807,1.9236,California
2,1.754364,-0.728257,1.626528,Florida
3,1.554784,-0.096365,1.42221,New York
4,1.504937,-1.079919,1.281528,Florida


In [15]:
# getting dummies 
X = pd.get_dummies(data= X, columns= ['State'], drop_first= True)
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_Florida,State_New York
0,2.016411,0.560753,2.153943,0,1
1,1.95586,1.082807,1.9236,0,0
2,1.754364,-0.728257,1.626528,1,0
3,1.554784,-0.096365,1.42221,0,1
4,1.504937,-1.079919,1.281528,1,0


In [16]:
# train and test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 10)

In [25]:
regressor = Sequential()
regressor.add(Dense(units = 8,kernel_initializer = 'normal', activation = 'relu', input_dim = 5))
regressor.add(Dense(units = 4,kernel_initializer = 'normal', activation = 'relu'))
regressor.add(Dense(units = 1, activation = 'linear'))
regressor.compile(loss = 'mean_squared_error', optimizer = 'adam')

In [26]:
regressor.fit(X_train, y_train, batch_size = 10, epochs = 50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.callbacks.History at 0x19ca0324c88>

In [27]:
y_pred = regressor.predict(X_test)

In [28]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
print('mse {}'.format(mean_absolute_error(y_test, y_pred)), '\n')
print('rmse {}'.format(np.sqrt(mean_absolute_error(y_test, y_pred))))

mse 109111.0985074663 

rmse 330.31969137105085


In [22]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [23]:
y_pred_lr = lr.predict(X_test)

In [24]:
print('mse {}'.format(mean_absolute_error(y_test, y_pred_lr)), '\n')
print('rmse {}'.format(np.sqrt(mean_absolute_error(y_test, y_pred_lr))))

mse 2969.052767737901 

rmse 54.48901511073494
