### A venture capitalist company has hired you as a ML engineer. Your role is to  create a model that can predict the profit of the company based on company's spending pattern and the location of the company

In [11]:
import numpy as np
import pandas as pd

In [12]:
data = pd.read_csv('50_Startups.csv')

In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
R&D Spend          50 non-null float64
Administration     50 non-null float64
Marketing Spend    50 non-null float64
State              50 non-null object
Profit             50 non-null float64
dtypes: float64(4), object(1)
memory usage: 2.0+ KB


In [14]:
data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [15]:
#Seperate data as features and label
features = data.iloc[:,:-1].values
label = data.iloc[:,-1].values.reshape(-1,1)

In [16]:
features

array([[165349.2, 136897.8, 471784.1, 'New York'],
       [162597.7, 151377.59, 443898.53, 'California'],
       [153441.51, 101145.55, 407934.54, 'Florida'],
       [144372.41, 118671.85, 383199.62, 'New York'],
       [142107.34, 91391.77, 366168.42, 'Florida'],
       [131876.9, 99814.71, 362861.36, 'New York'],
       [134615.46, 147198.87, 127716.82, 'California'],
       [130298.13, 145530.06, 323876.68, 'Florida'],
       [120542.52, 148718.95, 311613.29, 'New York'],
       [123334.88, 108679.17, 304981.62, 'California'],
       [101913.08, 110594.11, 229160.95, 'Florida'],
       [100671.96, 91790.61, 249744.55, 'California'],
       [93863.75, 127320.38, 249839.44, 'Florida'],
       [91992.39, 135495.07, 252664.93, 'California'],
       [119943.24, 156547.42, 256512.92, 'Florida'],
       [114523.61, 122616.84, 261776.23, 'New York'],
       [78013.11, 121597.55, 264346.06, 'California'],
       [94657.16, 145077.58, 282574.31, 'New York'],
       [91749.16, 114175.79, 29491

In [17]:
#Deal with categorical data
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
stateLabelEncoder = LabelEncoder()
features[:,3] = stateLabelEncoder.fit_transform(features[:,3])

In [19]:
stateOHE = OneHotEncoder(categorical_features=[3])
features = stateOHE.fit_transform(features).toarray()

In [21]:
#Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(features,
                                                label,
                                                test_size=0.2,
                                                random_state=1)

In [23]:
#Create the Model
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [24]:
#Quality
#1. Check for Generalization
print(model.score(X_train,y_train))
print(model.score(X_test,y_test))

0.9424465426893971
0.9649618042060633


In [28]:
#Deployment
#User Interaction Test
rdSpend = float(input("Enter R&D Spend: "))
admSpend = float(input("Enter Admin Spend: "))
markSpend = float(input("Enter Marketing Spend: "))
location = input("Enter Location: ")

if location in stateLabelEncoder.classes_:
    features = np.array([[rdSpend,admSpend,markSpend,location]])
    features[:,3] = stateLabelEncoder.transform(features[:,3])
    features= stateOHE.transform(features).toarray()
    profit = model.predict(features)
    print("Predicted Profit is {}".format(profit))
    
else:
    print("Model Cant predict for {} location ".format(location))
    


Enter R&D Spend: 234
Enter Admin Spend: 234
Enter Marketing Spend: 234
Enter Location: Mumbai
Model Cant predict for Mumbai location 


In [25]:
stateLabelEncoder.classes_

array(['California', 'Florida', 'New York'], dtype=object)