In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer 
from sklearn.linear_model import LinearRegression

In [2]:
# Read the startup records from the CSV that sits next to this notebook.
dataset = pd.read_csv(filepath_or_buffer="startups.csv")

In [3]:
# Peek at the first ten rows to check the column layout before modelling.
print(dataset.iloc[:10])

   R&D Spend  Administration  Marketing Spend       State     Profit
0  165349.20       136897.80        471784.10    New York  192261.83
1  162597.70       151377.59        443898.53  California  191792.06
2  153441.51       101145.55        407934.54     Florida  191050.39
3  144372.41       118671.85        383199.62    New York  182901.99
4  142107.34        91391.77        366168.42     Florida  166187.94
5  131876.90        99814.71        362861.36    New York  156991.12
6  134615.46       147198.87        127716.82  California  156122.51
7  130298.13       145530.06        323876.68     Florida  155752.60
8  120542.52       148718.95        311613.29    New York  152211.77
9  123334.88       108679.17        304981.62  California  149759.96


In [4]:
# Split positionally: every column except the last is a feature, and the
# last column (Profit in the preview above) is the regression target.
X = dataset.iloc[:, :-1].to_numpy()
y = dataset.iloc[:, -1].to_numpy()

In [5]:
# One-hot encode the categorical feature at positional index 3 (the fourth
# column, State) and pass the remaining numeric columns through unchanged.
# In the transformed matrix the encoded columns come first, followed by the
# passthrough columns in their original order.
# NOTE: this overwrites X, so re-running this cell without first rebuilding X
# would apply the encoder to already-transformed data.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [3])],
    remainder='passthrough',
)
X = np.array(ct.fit_transform(X))

In [6]:
# Hold out 20% of the rows as a test set. Fixing random_state makes the
# shuffle, and therefore every prediction and metric computed afterwards,
# reproducible on a fresh kernel (Restart & Run All).
# NOTE(review): the outputs currently stored in this notebook were presumably
# produced by an unseeded split, so they will change on the next re-run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [7]:
# Build a linear regression estimator, then fit it on the training split
# only, so the held-out rows stay unseen until evaluation.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

In [8]:
# Predict the target for the held-out rows and print the predictions next to
# the true values: left column = predicted, right column = actual.
y_pred = regressor.predict(X_test)
# Display floats with two decimals; this setting is global, so it also
# affects every array printed later in the notebook.
np.set_printoptions(precision=2)
# column_stack turns each 1-D vector into a column and joins them side by
# side, i.e. the same result as reshaping both to (n, 1) and concatenating
# along axis=1 (column-wise).
print(np.column_stack((y_pred, y_test)))

[[102624.02 101004.64]
 [118071.66 111313.02]
 [126517.8  134307.35]
 [172604.51 166187.94]
 [ 64029.22  65200.33]
 [134122.08 144259.4 ]
 [192903.87 192261.83]
 [ 98076.83  97427.84]
 [111810.92 105733.54]
 [157248.77 156122.51]]


In [9]:
# Predict the target for a single hand-written sample. The feature order must
# match the transformed matrix: the one-hot State columns first, then the
# numeric columns in their original order.
# NOTE(review): [1, 0, 0] presumably selects the first State category in the
# encoder's ordering — confirm against the fitted encoder's categories_.
print(
    regressor.predict(
        [[
            1, 0, 0,                  # one-hot encoded State
            160000, 130000, 300000,   # R&D Spend, Administration, Marketing Spend
        ]]
    )
)

[182572.22]


In [10]:
# Show the fitted parameters on separate lines: first the coefficient vector
# (one weight per transformed feature column), then the intercept.
print(regressor.coef_, regressor.intercept_, sep='\n')

[-1.18e+03  8.41e+02  3.40e+02  8.04e-01 -1.61e-02  2.69e-02]
49101.17317767538


********EVALUATE THE ACCURACY OF THE REGRESSION MODEL********

In [11]:
# Score the model on the held-out rows with the coefficient of determination
# (R^2), comparing the true targets with the predictions made earlier.
# NOTE(review): by notebook convention this import belongs in the top import
# cell; it is kept here so the cell stays runnable on its own.
from sklearn.metrics import r2_score
print('r2 = ', r2_score(y_true=y_test, y_pred=y_pred))

r2 =  0.9775412233459057
