In [1]:
import pandas as pd

In [8]:
# Load the startup dataset from a CSV file (path is relative to the working directory).
DATA_PATH = '50_Startups.csv'
dataset = pd.read_csv(DATA_PATH)

In [9]:
# Preview the first rows to check that the file parsed as expected.
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [10]:
# Column dtypes and non-null counts: a quick check for missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
R&D Spend          50 non-null float64
Administration     50 non-null float64
Marketing Spend    50 non-null float64
State              50 non-null object
Profit             50 non-null float64
dtypes: float64(4), object(1)
memory usage: 2.0+ KB


In [11]:
# Filter-method feature selection: inspect the pairwise correlations with Profit.
# Restrict to numeric columns first: 'State' is a string column, and newer pandas
# releases raise on non-numeric data in corr() instead of silently dropping it.
dataset.select_dtypes(include='number').corr()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
R&D Spend,1.0,0.241955,0.724248,0.9729
Administration,0.241955,1.0,-0.032154,0.200717
Marketing Spend,0.724248,-0.032154,1.0,0.747766
Profit,0.9729,0.200717,0.747766,1.0


In [15]:
# List the column names exactly as spelled, for use when selecting columns.
dataset.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'State', 'Profit'], dtype='object')

In [17]:
# by looking at above correlation we can identify which columns are more useful for predicting
# it is an example of multi linear regression 

In [18]:
# Feature matrix: every column of the dataset except the target ('Profit').
feature_columns = ['R&D Spend', 'Administration', 'Marketing Spend', 'State']
X = dataset[feature_columns]

In [20]:
# Confirm X holds the four feature columns.
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State
0,165349.2,136897.8,471784.1,New York
1,162597.7,151377.59,443898.53,California
2,153441.51,101145.55,407934.54,Florida
3,144372.41,118671.85,383199.62,New York
4,142107.34,91391.77,366168.42,Florida


In [23]:
# Target vector: the Profit column.
y = dataset.loc[:, 'Profit']

In [24]:
# Confirm y holds the Profit values.
y.head()

0    192261.83
1    191792.06
2    191050.39
3    182901.99
4    166187.94
Name: Profit, dtype: float64

In [25]:
# we predict y with help of X

In [26]:
# Pull out the State column on its own so it can be encoded separately.
states = X.loc[:, 'State']

In [29]:
states.head() # State is a categorical (text) variable, so it has to be encoded as numbers

0      New York
1    California
2       Florida
3      New York
4       Florida
Name: State, dtype: object

In [31]:
# One-hot encode the state names: one indicator column per distinct state.
# dtype is pinned because the default indicator dtype differs between pandas
# releases (uint8 in older ones, bool in newer ones); uint8 keeps the columns 0/1.
state_dummy = pd.get_dummies(states, dtype='uint8')

In [33]:
# Each row has a 1 in the column of its own state and 0 elsewhere.
state_dummy.head()

Unnamed: 0,California,Florida,New York
0,0,0,1
1,1,0,0
2,0,1,0
3,0,0,1
4,0,1,0


In [34]:
# (number of rows, number of distinct states)
state_dummy.shape

(50, 3)

In [36]:
# Keep only two of the three indicator columns: a row with California == 0 and
# Florida == 0 is necessarily New York, so the third column is redundant.
# Select by name rather than by position so this stays in step with the
# ['California', 'Florida'] columns added to X later, and fails loudly if the
# state categories ever change.
final_states = state_dummy[['California', 'Florida']]

In [37]:
# Rows with 0 in both columns are the New York rows.
final_states.head()

Unnamed: 0,California,Florida
0,0,0
1,1,0
2,0,1
3,0,0
4,0,1


In [39]:
X.head() # X still has the text 'State' column; it will be replaced by the columns of final_states

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State
0,165349.2,136897.8,471784.1,New York
1,162597.7,151377.59,443898.53,California
2,153441.51,101145.55,407934.54,Florida
3,144372.41,118671.85,383199.62,New York
4,142107.34,91391.77,366168.42,Florida


In [49]:
# Drop the raw 'State' text column, keeping the three numeric spend columns.
# Columns are selected by name, and .copy() makes X an independent frame, so adding
# the state indicator columns next does not trigger pandas' SettingWithCopyWarning.
X = X[['R&D Spend', 'Administration', 'Marketing Spend']].copy()

In [50]:
# Only the three numeric spend columns should remain.
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,165349.2,136897.8,471784.1
1,162597.7,151377.59,443898.53
2,153441.51,101145.55,407934.54
3,144372.41,118671.85,383199.62
4,142107.34,91391.77,366168.42


In [51]:
# Attach the two state indicator columns. assign() returns a new frame rather than
# writing into X in place, so it is safe even when X is a slice of another frame,
# and the cell can be re-run without side effects.
X = X.assign(California=final_states['California'], Florida=final_states['Florida'])

In [52]:
X.head() # 'State' has been replaced by the California and Florida indicator columns

Unnamed: 0,R&D Spend,Administration,Marketing Spend,California,Florida
0,165349.2,136897.8,471784.1,0,0
1,162597.7,151377.59,443898.53,1,0
2,153441.51,101145.55,407934.54,0,1
3,144372.41,118671.85,383199.62,0,0
4,142107.34,91391.77,366168.42,0,1


In [54]:
y.head() # Profit is the target: we want to learn where spending more or less leads to higher profit

0    192261.83
1    191792.06
2    191050.39
3    182901.99
4    166187.94
Name: Profit, dtype: float64

In [56]:
# now we should divide data into train and test our mode

In [57]:
from sklearn.model_selection import train_test_split

In [58]:
# Hold out a quarter of the rows for testing; the fixed seed makes the split repeatable.
TEST_FRACTION = 0.25
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [59]:
from sklearn.linear_model import LinearRegression

In [60]:
# Linear regression estimator created with scikit-learn's default settings.
model = LinearRegression()

In [61]:
# Fit the regression on the training split only; the test rows are not used here.
model.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [62]:
# Predict profit for the held-out test features.
y_pred = model.predict(X_test)

In [63]:
# Predicted profit for each held-out test row.
y_pred

array([126464.76354541,  85538.22928102,  99367.07880808,  45864.36860436,
       128321.87763055,  51947.22582092, 108901.08466239, 100508.67082734,
        97668.72028978, 112423.7131146 , 128907.87800423, 174736.83458719,
        93516.90479133])

In [67]:
# Actual profit for the same held-out rows, for comparison with y_pred.
y_test

13    134307.35
39     81005.76
30     99937.59
45     64926.08
17    125370.37
48     35673.41
26    105733.54
25    107404.34
32     97427.84
19    122776.86
12    141585.52
4     166187.94
37     89949.14
Name: Profit, dtype: float64

In [None]:
# from above we can compare our values of y_pred(predicted profit) to y_test(actual profit)

In [65]:
# Fitted coefficients: one value per feature column, in the column order of X.
model.coef_

array([ 8.10782371e-01, -8.99234656e-02,  2.99416290e-02,  3.37277505e+02,
        6.12629585e+02])

In [66]:
# Show X again so each coefficient can be matched to its column.
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,California,Florida
0,165349.2,136897.8,471784.1,0,0
1,162597.7,151377.59,443898.53,1,0
2,153441.51,101145.55,407934.54,0,1
3,144372.41,118671.85,383199.62,0,0
4,142107.34,91391.77,366168.42,0,1


In [68]:
type(X) # confirm X is still a pandas DataFrame after the column changes

pandas.core.frame.DataFrame

In [69]:
# Inspect a single row (position 1) of the final feature matrix.
X.iloc[1]

R&D Spend          162597.70
Administration     151377.59
Marketing Spend    443898.53
California              1.00
Florida                 0.00
Name: 1, dtype: float64

In [70]:
# Confirm every feature column has a numeric dtype.
X.dtypes

R&D Spend          float64
Administration     float64
Marketing Spend    float64
California           uint8
Florida              uint8
dtype: object