# Feature Engineering and Model Building - PredictStartupProfits

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#create data drame to read data set
df = pd.read_csv('50_Startups.csv')

In [3]:
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [4]:
# check the df structe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [5]:
# describe df numerical columns
df.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,50.0,50.0,50.0,50.0
mean,73721.6156,121344.6396,211025.0978,112012.6392
std,45902.256482,28017.802755,122290.310726,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,39936.37,103730.875,129300.1325,90138.9025
50%,73051.08,122699.795,212716.24,107978.19
75%,101602.8,144842.18,299469.085,139765.9775
max,165349.2,182645.56,471784.1,192261.83


# Feature Engineering

- Drop unwanted Features
- Handle Missing Values
- Handle Categorical Features
- Handle Feature Scalling
- Remove Outliers

As per Exploratory Data Analysis EDA, 
- these is no unwanted column present in given dataset to remove
- no missing value found
- only one feature that is State is categorical and it has 3 unique values
- it seems no outliers found

In [6]:
# Handle Categorical Features
state_dummies = pd.get_dummies(df['State'],prefix='state',drop_first=True)
df = pd.concat([df,state_dummies],axis=1)

In [7]:
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit,state_Florida,state_New York
0,165349.2,136897.8,471784.1,New York,192261.83,0,1
1,162597.7,151377.59,443898.53,California,191792.06,0,0
2,153441.51,101145.55,407934.54,Florida,191050.39,1,0
3,144372.41,118671.85,383199.62,New York,182901.99,0,1
4,142107.34,91391.77,366168.42,Florida,166187.94,1,0


In [8]:
df.drop(['State'], axis=1, inplace=True)

In [9]:
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,state_Florida,state_New York
0,165349.2,136897.8,471784.1,192261.83,0,1
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,1,0
3,144372.41,118671.85,383199.62,182901.99,0,1
4,142107.34,91391.77,366168.42,166187.94,1,0


# Split Dataset into Training set and Test set

In [10]:
X = df.drop(['Profit'],axis=1)
y = df['Profit']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=0)

In [11]:
X_train.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,state_Florida,state_New York
33,55493.95,103057.49,214634.81,1,0
35,46014.02,85047.44,205517.64,0,1
26,75328.87,144135.98,134050.07,1,0
34,46426.07,157693.92,210797.67,0,0
18,91749.16,114175.79,294919.57,1,0


In [14]:
len(X_train)

40

In [15]:
len(X_test)

10

# Model Selection

In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV

In [25]:
def find_best_model_using_gridsearchcv(X,y):
    algos = {
        'linear_regression' : {
            'model': LinearRegression(),
            'params': {
                'fit_intercept': [True, False],
                'normalize': [True, False],
                'copy_X': [True, False]
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion' : ['mse','friedman_mse'],
                'splitter': ['best','random']
            }
        }
    }
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in algos.items():
        gs =  GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

In [26]:
pd.set_option('display.max_colwidth', 100)
find_best_model_using_gridsearchcv(X,y)

Unnamed: 0,model,best_score,best_params
0,linear_regression,0.922076,"{'copy_X': True, 'fit_intercept': True, 'normalize': True}"
1,lasso,0.922115,"{'alpha': 2, 'selection': 'random'}"
2,decision_tree,0.888233,"{'criterion': 'friedman_mse', 'splitter': 'random'}"


# Model Building

In [27]:
model_lr = LinearRegression(copy_X=True, fit_intercept=True, normalize=True)

In [28]:
model_lr.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

In [29]:
model_lr.score(X_test,y_test)

0.9347068473282423

In [30]:
y_pred= model_lr.predict(X_test)

In [31]:
y_pred

array([103015.20159796, 132582.27760816, 132447.73845174,  71976.09851258,
       178537.48221055, 116161.24230165,  67851.69209676,  98791.73374687,
       113969.43533012, 167921.0656955 ])

In [32]:
y_test

28    103282.38
11    144259.40
10    146121.95
41     77798.83
2     191050.39
27    105008.31
38     81229.06
31     97483.56
22    110352.25
4     166187.94
Name: Profit, dtype: float64

In [36]:
X.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'state_Florida',
       'state_New York'],
      dtype='object')

In [38]:
np.where(X.columns=='state_'+str('New York'))[0][0]

4

In [40]:
def predict_profit(r_d_expenses,administration_expenses,marketing_expenses,state):    
    state_index = np.where(X.columns=='state_'+str(state))[0][0]

    x = np.zeros(len(X.columns))
    x[0] = r_d_expenses
    x[1] = administration_expenses
    x[2] = marketing_expenses
    if state_index >= 0:
        x[state_index] = 1

    return model_lr.predict([x])[0]

In [41]:
predict_profit(55493.95,103057.49,214634.81,'New York')

97423.07641753659

# Save Model

In [43]:
import pickle
pickle.dump(model_lr,open('profit_prediction_model.pkl','wb'))

In [44]:
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,state_Florida,state_New York
0,165349.2,136897.8,471784.1,0,1
1,162597.7,151377.59,443898.53,0,0
2,153441.51,101145.55,407934.54,1,0
3,144372.41,118671.85,383199.62,0,1
4,142107.34,91391.77,366168.42,1,0


In [45]:
#save the data columns
import json

columns = {'data_columns' : [col.lower() for col in X.columns]}

with open("columns.json","w") as f:
    f.write(json.dumps(columns))    