# Load Libraries

In [111]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Read Data

In [112]:
# Load the startup dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('50_Startups.csv')
# Preview the first rows to confirm the columns were parsed as expected.
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [113]:
# (rows, columns) of the loaded frame.
df.shape

(50, 5)

# Check for null values

In [114]:
# Count missing values per column.
df.isnull().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

# Managing Categorical Values

In [115]:
# One-hot encode the categorical "State" column and place the indicator
# columns in front of the remaining (numeric) columns.
#
# The guard keeps this cell idempotent: the previous version rebound `df` and
# dropped "State" in place, so executing the cell a second time raised
# KeyError('State').
if "State" in df.columns:
    # dtype=int keeps the indicators as 0/1; pandas >= 2.0 would otherwise
    # return boolean columns.
    dummy = pd.get_dummies(df["State"], dtype=int)
    # drop(columns=...) returns a new frame instead of mutating `df` in place.
    df = pd.concat([dummy, df.drop(columns=["State"])], axis=1)
df.head()

Unnamed: 0,California,Florida,New York,R&D Spend,Administration,Marketing Spend,Profit
0,0,0,1,165349.2,136897.8,471784.1,192261.83
1,1,0,0,162597.7,151377.59,443898.53,191792.06
2,0,1,0,153441.51,101145.55,407934.54,191050.39
3,0,0,1,144372.41,118671.85,383199.62,182901.99
4,0,1,0,142107.34,91391.77,366168.42,166187.94


# Correlation

In [116]:
# Pairwise correlation between all columns (state indicators included).
# In the recorded output, R&D Spend and Marketing Spend show the strongest
# correlation with Profit.
df.corr()

Unnamed: 0,California,Florida,New York,R&D Spend,Administration,Marketing Spend,Profit
California,1.0,-0.492366,-0.515152,-0.143165,-0.015478,-0.168875,-0.145837
Florida,-0.492366,1.0,-0.492366,0.105711,0.010493,0.205685,0.116244
New York,-0.515152,-0.492366,1.0,0.039068,0.005145,-0.03367,0.031368
R&D Spend,-0.143165,0.105711,0.039068,1.0,0.241955,0.724248,0.9729
Administration,-0.015478,0.010493,0.005145,0.241955,1.0,-0.032154,0.200717
Marketing Spend,-0.168875,0.205685,-0.03367,0.724248,-0.032154,1.0,0.747766
Profit,-0.145837,0.116244,0.031368,0.9729,0.200717,0.747766,1.0


In [117]:
# Build the modelling frame: discard the state indicators and Administration,
# leaving the two spend features plus the Profit target.
unused_columns = ['California', 'Florida', 'New York', 'Administration']

# drop(columns=...) returns a new frame, so `df` itself stays untouched.
x_copy = df.drop(columns=unused_columns)
x_copy.head()

Unnamed: 0,R&D Spend,Marketing Spend,Profit
0,165349.2,471784.1,192261.83
1,162597.7,443898.53,191792.06
2,153441.51,407934.54,191050.39
3,144372.41,383199.62,182901.99
4,142107.34,366168.42,166187.94


In [118]:
# Select features and target by column name rather than by position, so the
# split cannot silently pick the wrong columns if the column order changes.
feature_columns = ["R&D Spend", "Marketing Spend"]
target_column = "Profit"

# Feature matrix, shape (n_samples, 2).
X = x_copy[feature_columns].to_numpy()
# Double brackets keep the target two-dimensional, shape (n_samples, 1),
# matching what the positional slice `iloc[:, -1:]` produced before.
y = x_copy[[target_column]].to_numpy()
y.shape

(50, 1)

# Splitting the data

In [119]:
from sklearn.model_selection import train_test_split

In [120]:
# Hold out one third of the rows for testing; random_state=0 makes the split reproducible.
# NOTE(review): `Y_test` is capitalised while `y_train` is not — later cells rely on this exact spelling.
X_train, X_test, y_train, Y_test = train_test_split(X, y, test_size=1/3, random_state=0)

In [121]:
# Sanity-check the split sizes: training features first, then test targets.
for split in (X_train, Y_test):
    print(split.shape)

(33, 2)
(17, 1)


# Training the model

In [122]:
from sklearn.linear_model import LinearRegression
# Linear regression estimator created with scikit-learn's default settings.
regressor = LinearRegression()

In [123]:
# Training the model
# Fit on the training split only. The target was built as a 2-D column
# (y.shape is (50, 1)), so the predictions printed later are 2-D as well.
regressor.fit(X_train, y_train)

LinearRegression()

# Predict the data

In [124]:
# Predict Profit for the held-out test rows.
y_pred = regressor.predict(X_test)

# Print Real and Predicted Values

In [125]:
# Shape of the held-out targets (one column, one row per test sample).
Y_test.shape

(17, 1)

In [126]:
# Side-by-side listing of actual vs. predicted values for the test set.
banner = "=" * 40
print(banner)
print("Real Value ---------> Predicted Value")
print(banner)
print("\n")

# Each row of Y_test / y_pred is a one-element array, hence the brackets
# around every printed number.
for actual, predicted in zip(Y_test, y_pred):
    print(f"{actual!s} --------> {predicted!s}")

Real Value ---------> Predicted Value


[103282.38] --------> [103066.92650993]
[144259.4] --------> [134980.77031452]
[146121.95] --------> [135442.08287513]
[77798.83] --------> [73055.77302514]
[191050.39] --------> [182480.8206228]
[105008.31] --------> [114370.99086191]
[81229.06] --------> [67338.79085538]
[97483.56] --------> [98232.53852252]
[110352.25] --------> [114571.23018341]
[166187.94] --------> [172068.27025499]
[96778.92] --------> [97021.48065448]
[96479.51] --------> [89009.06807808]
[105733.54] --------> [111096.46809524]
[96712.8] --------> [89489.12418883]
[124266.9] --------> [128889.90886756]
[155752.6] --------> [161252.36179763]
[132602.65] --------> [150951.30453297]


In [127]:
from sklearn.metrics import r2_score

In [128]:
# Coefficient of determination (R^2) of the predictions on the test set.
r2_score(Y_test, y_pred)

0.928406870739455

In [131]:
# Shape of the held-out feature matrix (rows, features).
X_test.shape

(17, 2)

In [134]:
# Predict for two made-up inputs. Each row must follow the training column
# order: [R&D Spend, Marketing Spend].
new_array = np.array([[20000, 23456], [45678, 374855]])
regressor.predict(new_array)

array([[62784.13036156],
       [93303.51304046]])