In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression  
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Reading the dataset
The data is loaded from a .csv file using the pandas library.

In [None]:
# Load the startup records from the CSV file into a DataFrame and show them.
dataset = pd.read_csv(filepath_or_buffer='50_Startups.csv')
print(dataset)

    R&D Spend  Administration  Marketing Spend       State     Profit
0   165349.20       136897.80        471784.10    New York  192261.83
1   162597.70       151377.59        443898.53  California  191792.06
2   153441.51       101145.55        407934.54     Florida  191050.39
3   144372.41       118671.85        383199.62    New York  182901.99
4   142107.34        91391.77        366168.42     Florida  166187.94
5   131876.90        99814.71        362861.36    New York  156991.12
6   134615.46       147198.87        127716.82  California  156122.51
7   130298.13       145530.06        323876.68     Florida  155752.60
8   120542.52       148718.95        311613.29    New York  152211.77
9   123334.88       108679.17        304981.62  California  149759.96
10  101913.08       110594.11        229160.95     Florida  146121.95
11  100671.96        91790.61        249744.55  California  144259.40
12   93863.75       127320.38        249839.44     Florida  141585.52
13   91992.39       

# Converting the String type label to a numeric one
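This could also be done with one-hot encoding; here we use the pandas function `factorize`, which maps each distinct state to an integer code. Note that integer codes impose an arbitrary numeric order on the states, which some models (e.g. linear regression) will treat as meaningful.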

In [None]:
# Encode each distinct State string as an integer code and store the codes
# in a new column; the unique state names are returned alongside the codes.
state_codes, state_names = pd.factorize(dataset['State'])
dataset['State_Label'] = state_codes
print(dataset)

    R&D Spend  Administration  ...     Profit State_Label
0   165349.20       136897.80  ...  192261.83           0
1   162597.70       151377.59  ...  191792.06           1
2   153441.51       101145.55  ...  191050.39           2
3   144372.41       118671.85  ...  182901.99           0
4   142107.34        91391.77  ...  166187.94           2
5   131876.90        99814.71  ...  156991.12           0
6   134615.46       147198.87  ...  156122.51           1
7   130298.13       145530.06  ...  155752.60           2
8   120542.52       148718.95  ...  152211.77           0
9   123334.88       108679.17  ...  149759.96           1
10  101913.08       110594.11  ...  146121.95           2
11  100671.96        91790.61  ...  144259.40           1
12   93863.75       127320.38  ...  141585.52           2
13   91992.39       135495.07  ...  134307.35           1
14  119943.24       156547.42  ...  132602.65           2
15  114523.61       122616.84  ...  129917.04           0
16   78013.11 

# Our input data is X
It consists of the columns R&D Spend, Administration, Marketing Spend and State_Label.

In [None]:
# Feature matrix: the three spend columns plus the numeric state code.
feature_columns = [
    'R&D Spend',
    'Administration',
    'Marketing Spend',
    'State_Label',
]
data_X = pd.DataFrame(dataset, columns=feature_columns)
# Convert to a plain list of rows (one inner list per startup).
X = data_X.values.tolist()
print(X)

[[165349.2, 136897.8, 471784.1, 0.0], [162597.7, 151377.59, 443898.53, 1.0], [153441.51, 101145.55, 407934.54, 2.0], [144372.41, 118671.85, 383199.62, 0.0], [142107.34, 91391.77, 366168.42, 2.0], [131876.9, 99814.71, 362861.36, 0.0], [134615.46, 147198.87, 127716.82, 1.0], [130298.13, 145530.06, 323876.68, 2.0], [120542.52, 148718.95, 311613.29, 0.0], [123334.88, 108679.17, 304981.62, 1.0], [101913.08, 110594.11, 229160.95, 2.0], [100671.96, 91790.61, 249744.55, 1.0], [93863.75, 127320.38, 249839.44, 2.0], [91992.39, 135495.07, 252664.93, 1.0], [119943.24, 156547.42, 256512.92, 2.0], [114523.61, 122616.84, 261776.23, 0.0], [78013.11, 121597.55, 264346.06, 1.0], [94657.16, 145077.58, 282574.31, 0.0], [91749.16, 114175.79, 294919.57, 2.0], [86419.7, 153514.11, 0.0, 0.0], [76253.86, 113867.3, 298664.47, 1.0], [78389.47, 153773.43, 299737.29, 0.0], [73994.56, 122782.75, 303319.26, 2.0], [67532.53, 105751.03, 304768.73, 2.0], [77044.01, 99281.34, 140574.81, 0.0], [64664.71, 139553.16, 13796

# Our labels are Y
They represent the profit of each startup.

In [None]:
# Target values: the Profit column, kept as a one-column frame so that
# Y is a list of single-element rows.
target_columns = ['Profit']
data_Y = pd.DataFrame(dataset, columns=target_columns)
Y = data_Y.values.tolist()
print(Y)

[[192261.83], [191792.06], [191050.39], [182901.99], [166187.94], [156991.12], [156122.51], [155752.6], [152211.77], [149759.96], [146121.95], [144259.4], [141585.52], [134307.35], [132602.65], [129917.04], [126992.93], [125370.37], [124266.9], [122776.86], [118474.03], [111313.02], [110352.25], [108733.99], [108552.04], [107404.34], [105733.54], [105008.31], [103282.38], [101004.64], [99937.59], [97483.56], [97427.84], [96778.92], [96712.8], [96479.51], [90708.19], [89949.14], [81229.06], [81005.76], [78239.91], [77798.83], [71498.49], [69758.98], [65200.33], [64926.08], [49490.75], [42559.73], [35673.41], [14681.4]]


# Getting the train and test data ready
Splitting it into an 80:20 train/test ratio.

In [None]:
# Hold out 20% of the rows for testing. The split is shuffled, so the seed is
# fixed to make the partition (and every score computed from it) reproducible
# across kernel restarts.
X_Train, X_Test, Y_Train, Y_Test = train_test_split(
    X, Y, test_size=0.2, shuffle=True, random_state=42
)

print(X_Train)
print(Y_Train)
print(X_Test)
print(Y_Test)

[[1315.46, 115816.21, 297114.46, 2.0], [131876.9, 99814.71, 362861.36, 0.0], [22177.74, 154806.14, 28334.72, 1.0], [100671.96, 91790.61, 249744.55, 1.0], [55493.95, 103057.49, 214634.81, 2.0], [142107.34, 91391.77, 366168.42, 2.0], [114523.61, 122616.84, 261776.23, 0.0], [120542.52, 148718.95, 311613.29, 0.0], [86419.7, 153514.11, 0.0, 0.0], [65605.48, 153032.06, 107138.38, 0.0], [72107.6, 127864.55, 353183.81, 0.0], [27892.92, 84710.77, 164470.71, 2.0], [153441.51, 101145.55, 407934.54, 2.0], [76253.86, 113867.3, 298664.47, 1.0], [0.0, 135426.92, 0.0, 1.0], [130298.13, 145530.06, 323876.68, 2.0], [134615.46, 147198.87, 127716.82, 1.0], [91749.16, 114175.79, 294919.57, 2.0], [144372.41, 118671.85, 383199.62, 0.0], [77044.01, 99281.34, 140574.81, 0.0], [44069.95, 51283.14, 197029.42, 1.0], [38558.51, 82982.09, 174999.3, 1.0], [46426.07, 157693.92, 210797.67, 1.0], [28754.33, 118546.05, 172795.67, 1.0], [93863.75, 127320.38, 249839.44, 2.0], [67532.53, 105751.03, 304768.73, 2.0], [0.0, 1

# Decision Tree Regressor

In [None]:
# Shallow regression tree (depth 3). random_state is fixed because the tree
# builder permutes the features at each split, so equally good splits could
# otherwise be chosen differently from one run to the next.
DecisionTree = DecisionTreeRegressor(max_depth=3, random_state=42)
DecisionTree.fit(X_Train, Y_Train)
# score() reports R^2: first on the training data, then on the held-out test data.
print(DecisionTree.score(X_Train, Y_Train))
print(DecisionTree.score(X_Test, Y_Test))

0.9696350670713659
0.8667241956659785


# Random Forest Regressor

In [None]:
# Small random forest (3 trees). random_state is fixed so that the bootstrap
# samples and feature permutations, and therefore the scores, are reproducible.
RandomForest = RandomForestRegressor(n_estimators=3, random_state=42)
# Y_Train is a list of single-element rows; flatten it to 1-D for fitting,
# since the forest warns (DataConversionWarning) when given a column vector.
RandomForest.fit(X_Train, np.ravel(Y_Train))
# score() reports R^2: first on the training data, then on the held-out test data.
print(RandomForest.score(X_Train, Y_Train))
print(RandomForest.score(X_Test, Y_Test))

0.9720952878543307
0.9667135056205008


  


# Linear Regressor

In [None]:
# Ordinary least-squares baseline; fit() returns the estimator itself,
# so construction and fitting are chained.
LinearRegress = LinearRegression().fit(X_Train, Y_Train)

# R^2 on the training data, then on the held-out test data (one per line).
print(
    LinearRegress.score(X_Train, Y_Train),
    LinearRegress.score(X_Test, Y_Test),
    sep='\n',
)

0.9528204156338477
0.9285302168617353


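As we can see from the results above, the Random Forest regressor gives the best R² score on the test set in this run. Keep in mind that the dataset has only 50 samples, so the ranking of the models can change with a different train/test split.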