# Multiple Linear Regression

## Importing the libraries

In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Importing the dataset

In [10]:
# Read the startups CSV into a DataFrame.
dataset = pd.read_csv("50_Startups.csv")

# Every column except the last becomes the feature matrix; the last column is the target.
feature_frame = dataset.iloc[:, :-1]
target_column = dataset.iloc[:, -1]
x, y = feature_frame.values, target_column.values

In [11]:
# Display the raw feature matrix to inspect its column layout (three numeric columns, then the state name).
x

array([[165349.2, 136897.8, 471784.1, 'New York'],
       [162597.7, 151377.59, 443898.53, 'California'],
       [153441.51, 101145.55, 407934.54, 'Florida'],
       [144372.41, 118671.85, 383199.62, 'New York'],
       [142107.34, 91391.77, 366168.42, 'Florida'],
       [131876.9, 99814.71, 362861.36, 'New York'],
       [134615.46, 147198.87, 127716.82, 'California'],
       [130298.13, 145530.06, 323876.68, 'Florida'],
       [120542.52, 148718.95, 311613.29, 'New York'],
       [123334.88, 108679.17, 304981.62, 'California'],
       [101913.08, 110594.11, 229160.95, 'Florida'],
       [100671.96, 91790.61, 249744.55, 'California'],
       [93863.75, 127320.38, 249839.44, 'Florida'],
       [91992.39, 135495.07, 252664.93, 'California'],
       [119943.24, 156547.42, 256512.92, 'Florida'],
       [114523.61, 122616.84, 261776.23, 'New York'],
       [78013.11, 121597.55, 264346.06, 'California'],
       [94657.16, 145077.58, 282574.31, 'New York'],
       [91749.16, 114175.79, 29491

## Encoding categorical data

In [12]:
# Import the necessary libraries for column transformation and one-hot encoding.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Create a ColumnTransformer instance, 'ct', that applies 'OneHotEncoder' to the
# categorical column at index 3 (the state names, i.e. the last feature column)
# while leaving the remaining numeric columns unchanged ('passthrough').
# The one-hot columns come first in the output, followed by the passthrough columns.
# sparse_threshold=0 makes the transformer always return a dense array, which is
# what the np.array(...) wrapper below relies on.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [3])],
    remainder='passthrough',
    sparse_threshold=0,
)

# Encode from the raw feature columns of 'dataset' instead of from 'x' itself.
# This keeps the cell idempotent: re-running it always starts from the original
# four feature columns, rather than one-hot encoding whatever numeric column
# sits at index 3 of an already-encoded 'x'.
x = np.array(ct.fit_transform(dataset.iloc[:, :-1].values))

## Splitting the dataset into the Training set and Test set

In [13]:
# Bring in scikit-learn's helper for partitioning data into train and test subsets.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random state keeps the split reproducible.
split_options = {"test_size": 0.2, "random_state": 0}
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)


## Training the Multiple Linear Regression model on the Training set

In [14]:
# Bring in scikit-learn's ordinary least squares regressor.
from sklearn.linear_model import LinearRegression

# Instantiate the model and fit it on the training split in one step
# (fit returns the estimator itself).
lr = LinearRegression().fit(x_train, y_train)

# Show the fitted estimator as the cell's output.
lr

LinearRegression()

## Predicting the Test set results

In [15]:
# Show array values with two decimal places when printed.
np.set_printoptions(precision=2)

# Predict the target for every row of the held-out test features.
y_pred = lr.predict(x_test)

# Print predictions (left column) next to the actual test values (right column).
print(np.column_stack((y_pred, y_test)))

[[103015.2  103282.38]
 [132582.28 144259.4 ]
 [132447.74 146121.95]
 [ 71976.1   77798.83]
 [178537.48 191050.39]
 [116161.24 105008.31]
 [ 67851.69  81229.06]
 [ 98791.73  97483.56]
 [113969.44 110352.25]
 [167921.07 166187.94]]


In [16]:
# Bring in the regression metrics used to score the predictions.
from sklearn.metrics import mean_squared_error, r2_score

# Mean Squared Error and R-squared (coefficient of determination),
# both computed from the actual test values and the model's predictions.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report each metric on its own line.
for label, value in (("Mean Squared Error:", mse), ("R-squared:", r2)):
    print(label, value)

Mean Squared Error: 83502864.03263049
R-squared: 0.934706847328201
