## **Machine learning model to predict a startup company's profit using multiple linear regression**
  1. Importing needed libraries
  2. Data preprocessing
  3. Training multiple linear regression
  4. Predicting test results

In [1]:
# 1. importing needed libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# 2. Data preprocessing
# 2.1. Load the startups dataset and preview its first rows
csv_path = '50_Startups.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
# 2.2. Separate the independent variables from the dependent variable
# Every column except the last is a feature; the last column is the target.
features, target = df.iloc[:, :-1], df.iloc[:, -1]
X = features.to_numpy()
y = target.to_numpy()

In [4]:
# 2.3. Encoding categorical data
# The 'State' column (index 3 of the feature columns) holds categorical
# values, so it is one-hot encoded; the other columns pass through unchanged.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
# Encode from the untouched DataFrame features rather than from X itself, so
# re-running this cell does not one-hot encode an already-encoded X.
X = np.array(ct.fit_transform(df.iloc[:, :-1].values))

In [5]:
# 2.4. Splitting dataset into train & test set
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. A fixed random_state makes the shuffle
# (and therefore every downstream result) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# 3. Fit a multiple linear regression model on the training split
from sklearn.linear_model import LinearRegression
lr = LinearRegression().fit(X_train, y_train)
# Bare expression so the fitted estimator is still shown as the cell output.
lr

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [7]:
# 4. Predicting test results.
# Run the fitted model on the held-out X_test; the bare `y_pred` expression
# displays the predictions as the cell output.
y_pred = lr.predict(X_test)
y_pred

array([131314.10442079, 134956.8179492 , 165034.48348914,  52369.93239386,
       102087.70247816, 101447.90276632,  46095.32556763,  70004.66042286,
       116124.90784904, 151150.62311544])

In [8]:
# Side-by-side table of the original profit (y_test) and the predicted profit (y_pred).
pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})

Unnamed: 0,y_test,y_pred
0,125370.37,131314.104421
1,144259.4,134956.817949
2,156991.12,165034.483489
3,35673.41,52369.932394
4,99937.59,102087.702478
5,107404.34,101447.902766
6,42559.73,46095.325568
7,71498.49,70004.660423
8,110352.25,116124.907849
9,132602.65,151150.623115
