In [None]:
#Goal: Create a model that can predict the profit of the company based on company's spending pattern and company's location

In [None]:
import pandas as pd
import numpy as np


In [None]:
data = pd.read_csv("50_Startups.csv")

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [None]:
#Seperate data as features and label

features = data.iloc[:,[0,1,2,3]].values
label = data.iloc[:,[4]].values

In [None]:
#OHE for State column

from sklearn.preprocessing import OneHotEncoder
oheState = OneHotEncoder(sparse_output=False)
stateDummy = oheState.fit_transform(features[:,[3]])

In [None]:
finalFeatureSet = np.concatenate((stateDummy,features[:,[0,1,2]]), axis=1)
finalFeatureSet

array([[0.0, 0.0, 1.0, 165349.2, 136897.8, 471784.1],
       [1.0, 0.0, 0.0, 162597.7, 151377.59, 443898.53],
       [0.0, 1.0, 0.0, 153441.51, 101145.55, 407934.54],
       [0.0, 0.0, 1.0, 144372.41, 118671.85, 383199.62],
       [0.0, 1.0, 0.0, 142107.34, 91391.77, 366168.42],
       [0.0, 0.0, 1.0, 131876.9, 99814.71, 362861.36],
       [1.0, 0.0, 0.0, 134615.46, 147198.87, 127716.82],
       [0.0, 1.0, 0.0, 130298.13, 145530.06, 323876.68],
       [0.0, 0.0, 1.0, 120542.52, 148718.95, 311613.29],
       [1.0, 0.0, 0.0, 123334.88, 108679.17, 304981.62],
       [0.0, 1.0, 0.0, 101913.08, 110594.11, 229160.95],
       [1.0, 0.0, 0.0, 100671.96, 91790.61, 249744.55],
       [0.0, 1.0, 0.0, 93863.75, 127320.38, 249839.44],
       [1.0, 0.0, 0.0, 91992.39, 135495.07, 252664.93],
       [0.0, 1.0, 0.0, 119943.24, 156547.42, 256512.92],
       [0.0, 0.0, 1.0, 114523.61, 122616.84, 261776.23],
       [1.0, 0.0, 0.0, 78013.11, 121597.55, 264346.06],
       [0.0, 0.0, 1.0, 94657.16, 145077.58

In [None]:
#Model Exploration Phase -------- Dr AI Strange

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

CL = 0.9

for seed in range(1,100):
  X_train,X_test,y_train,y_test = train_test_split(finalFeatureSet,
                                                 label,
                                                 test_size=0.2,
                                                 random_state=seed)
  model = LinearRegression()
  model.fit(X_train,y_train)
  trainScore = model.score(X_train,y_train)
  testScore = model.score(X_test,y_test)

  if testScore > trainScore and testScore >= CL:
    print(f"Test Score is {testScore} and Train Score is {trainScore} for Random Seed {seed}")



Test Score is 0.9649618042060378 and Train Score is 0.942446542689397 for Random Seed 1
Test Score is 0.9783259006626532 and Train Score is 0.9398417195515446 for Random Seed 2
Test Score is 0.9560357304860488 and Train Score is 0.9473848999820091 for Random Seed 4
Test Score is 0.9669763022158507 and Train Score is 0.9438505226429931 for Random Seed 5
Test Score is 0.9901105113397705 and Train Score is 0.9385918220043519 for Random Seed 10
Test Score is 0.9726607102794014 and Train Score is 0.9411603359254431 for Random Seed 14
Test Score is 0.9633877651309604 and Train Score is 0.946138584319559 for Random Seed 21
Test Score is 0.975790639498154 and Train Score is 0.9425908513252553 for Random Seed 22
Test Score is 0.9687727807395896 and Train Score is 0.9464972114069966 for Random Seed 24
Test Score is 0.9602561948870856 and Train Score is 0.9454518446256155 for Random Seed 26
Test Score is 0.9500997612784601 and Train Score is 0.9482961316721963 for Random Seed 29
Test Score is 0.9

In [None]:
#Split the data as training set and test set
#In sklearn we have a class called train_test_split
#
# train_test_split() returns 4 variables
# X_train ---- training feature set
# X_test  ---- testing feature set
# y_train ---- training label set
# y_test  ---- testing label set

# training pair ---- (X_train,y_train)
# testing pair  ---- (X_test,y_test)

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(finalFeatureSet,
                                                 label,
                                                 test_size=0.2,
                                                 random_state=10)

#Total Records in my Dataset is 100
# 80% 20% split
# 80 records ----> training set
# 20 records ----> testing set

In [None]:
#Setup Algorithm -----> LinearRegression

from sklearn.linear_model import LinearRegression
model = LinearRegression()


In [None]:
#Training Starts
#fit(featureData,labelData)

model.fit(X_train,y_train)

In [None]:
#
# Check the quality of the model
#
# SL (Significance Level) ------- Standard Alpha Values --------- (0.05,0.01,0.1)
#
# SL = 0.05
# CL = 1 - SL = 1 - 0.05 = 0.95 (The project is expecting atleast 95% accuracy in prediction
#
# When it comes to approving the model, always go for GENERALIZED MODEL
#
# Definition: Generalized model is a trained model that not only performs BEST on KNOWN data, but also PERFORMS BEST on UNKNOWN DATA.
#
#=================================================================================
# Guideline to identify the Generalized model (By Prashant Nair)
#=================================================================================
#
# a. Calc the evaluation score of the model for both training data and testing data
#
#        trainScore (Score generated for training Data)
#        testScore  (Score generated for testing data)
#
# b. Criteria by PN
#
#        if testScore > trainScore and testScore >= CL:
#                              Approve the model
#        else:
#                              Reject the model and start the process of improving the model
#
#
# For Regression score ---> r2 (Metric function)   | MSE(Mean Squared Error) or MAE(Mean Absolute Error) ---- (Error Function)

In [None]:
trainScore = model.score(X_train,y_train)
testScore = model.score(X_test,y_test)

In [None]:
print(f"Test Score is {testScore} and Train Score is {trainScore}")

Test Score is 0.9901105113397705 and Train Score is 0.9385918220043519


In [None]:
data.State.unique()

array(['New York', 'California', 'Florida'], dtype=object)

In [None]:
oheState.categories_[0]

array(['California', 'Florida', 'New York'], dtype=object)

In [None]:
oheState.transform(np.array([["California"]]))

array([[1., 0., 0.]])

In [None]:
#Test APP

rdSpend = float(input("Enter RD Spend:"))
adminSpend = float(input("Enter Admin Spend: "))
markSpend = float(input("Enter Marketing Spend: "))
state = input("Enter State: ")


if state in oheState.categories_[0]:
  dummyState = oheState.transform(np.array([[state]]))

  finalFeatureInput = np.concatenate( (dummyState,np.array([[rdSpend,adminSpend,markSpend]])) , axis = 1)

  profit = model.predict(finalFeatureInput)

  print(f"Predicted profit is $ {profit}")
else:
  print(f"{state} state is not recognized by the model")




Enter RD Spend:454533
Enter Admin Spend: 54454545
Enter Marketing Spend: 232343435
Enter State: Florida
Predicted profit is $ [[4739400.02246392]]


In [None]:
import pickle
pickle.dump(model,open("ProfitPredictor.mdl","wb"))
pickle.dump(oheState, open("StateEncoderDecoder.mdl","wb"))