<a href="https://colab.research.google.com/github/peterlulu666/MachineLearningPython/blob/main/multiple%20linear%20regression/multiple_linear_regression_divide_whole_dataset_into_X_and_Y.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import libraries

In [None]:
import pandas as pd
import numpy as np

## Import data

In [None]:
# Load the startup data set from the CSV file in the working directory
# and preview its first rows.
DATA_PATH = "50_Startups.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


## Data Preparation
  - Encoding categorical data
    - The State column contains the name of the state
    - In order to fit a regression line, we have to convert this column into numerical values

In [None]:
# One-hot encode State: one 0/1 indicator column per state.
# dtype=int keeps the indicators numeric on every pandas version
# (pandas >= 2.0 would otherwise return booleans by default).
status = pd.get_dummies(df['State'], dtype=int)
status.head()

Unnamed: 0,California,Florida,New York
0,0,0,1
1,1,0,0
2,0,1,0
3,0,0,1
4,0,1,0


In [None]:
# A categorical variable with n levels needs only n-1 dummy columns:
# the dropped first level (California) is implied when all remaining
# dummies are 0, which avoids perfectly collinear predictors.
# dtype=int keeps the dummies as 0/1 on every pandas version
# (pandas >= 2.0 would otherwise return booleans by default).
status = pd.get_dummies(df['State'], drop_first=True, dtype=int)

# Append the dummies and remove the original text column in a single
# non-mutating expression rather than drop(..., inplace=True).
df = pd.concat([df, status], axis=1).drop(columns=['State'])
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,Florida,New York
0,165349.2,136897.8,471784.1,192261.83,0,1
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,1,0
3,144372.41,118671.85,383199.62,182901.99,0,1
4,142107.34,91391.77,366168.42,166187.94,1,0


## Splitting the dataset into the training set and test set

In [None]:
import sklearn
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; the fixed random_state
# makes the split reproducible across runs.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Preview the first rows of the training split.
df_train.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,Florida,New York
33,55493.95,103057.49,214634.81,96778.92,1,0
35,46014.02,85047.44,205517.64,96479.51,0,1
26,75328.87,144135.98,134050.07,105733.54,1,0
34,46426.07,157693.92,210797.67,96712.8,0,0
18,91749.16,114175.79,294919.57,124266.9,1,0


In [None]:
# Preview the first rows of the test split.
df_test.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,Florida,New York
28,66051.52,182645.56,118148.2,103282.38,1,0
11,100671.96,91790.61,249744.55,144259.4,0,0
10,101913.08,110594.11,229160.95,146121.95,1,0
41,27892.92,84710.77,164470.71,77798.83,1,0
2,153441.51,101145.55,407934.54,191050.39,1,0


In [None]:
# Split the training data into the target (y) and the features (X).
# Select/drop is used instead of pop(): pop() removes 'Profit' from
# df_train in place, so re-running this cell would raise a KeyError and
# X_train would only be an alias of the mutated df_train.
y_train = df_train['Profit']
X_train = df_train.drop(columns=['Profit'])

In [None]:
# Preview the training target values (Profit).
y_train.head()

33     96778.92
35     96479.51
26    105733.54
34     96712.80
18    124266.90
Name: Profit, dtype: float64

In [None]:
# Preview the training feature columns.
X_train.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Florida,New York
33,55493.95,103057.49,214634.81,1,0
35,46014.02,85047.44,205517.64,0,1
26,75328.87,144135.98,134050.07,1,0
34,46426.07,157693.92,210797.67,0,0
18,91749.16,114175.79,294919.57,1,0


In [None]:
# Split the test data into the target (y) and the features (X).
# Select/drop is used instead of pop(): pop() removes 'Profit' from
# df_test in place, so re-running this cell would raise a KeyError and
# X_test would only be an alias of the mutated df_test.
y_test = df_test['Profit']
X_test = df_test.drop(columns=['Profit'])

In [None]:
# Preview the test target values (Profit).
y_test.head()

28    103282.38
11    144259.40
10    146121.95
41     77798.83
2     191050.39
Name: Profit, dtype: float64

In [None]:
# Preview the test feature columns.
X_test.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Florida,New York
28,66051.52,182645.56,118148.2,1,0
11,100671.96,91790.61,249744.55,0,0
10,101913.08,110594.11,229160.95,1,0
41,27892.92,84710.77,164470.71,1,0
2,153441.51,101145.55,407934.54,1,0


## Training the multiple linear regression model on the training set

In [None]:
from sklearn.linear_model import LinearRegression
# Create the estimator first, then fit it on the training features and target.
regressor = LinearRegression()
regressor.fit(X_train, y_train)
regressor

LinearRegression()

## Predicting on the test set

In [None]:
# Predict Profit for the test features with the fitted model.
y_pred = regressor.predict(X_test)

In [None]:
# Display the predicted values.
y_pred

array([103015.20159796, 132582.27760816, 132447.73845174,  71976.09851258,
       178537.48221055, 116161.24230165,  67851.69209676,  98791.73374687,
       113969.43533012, 167921.0656955 ])

In [None]:
# Display the actual test-set Profit values for comparison with y_pred.
y_test

28    103282.38
11    144259.40
10    146121.95
41     77798.83
2     191050.39
27    105008.31
38     81229.06
31     97483.56
22    110352.25
4     166187.94
Name: Profit, dtype: float64

In [None]:
# Put predictions and actual values side by side in one table.
# y_pred has no index of its own, so both columns are paired by position
# and labelled with the row labels of y_test.
df_compare = pd.DataFrame(
    {
        'Predicted': y_pred,
        'Actual': y_test.to_numpy(),
    },
    index=y_test.index,
)
df_compare

Unnamed: 0,Predicted,Actual
28,103015.201598,103282.38
11,132582.277608,144259.4
10,132447.738452,146121.95
41,71976.098513,77798.83
2,178537.482211,191050.39
27,116161.242302,105008.31
38,67851.692097,81229.06
31,98791.733747,97483.56
22,113969.43533,110352.25
4,167921.065696,166187.94
