In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv(r"C:\Users\nisho\Documents\nishoak docs\Studies\Machine Learning\Coding_Part\03_Linear_Regression\02_Startups.csv")
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
df.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,50.0,50.0,50.0,50.0
mean,73721.6156,121344.6396,211025.0978,112012.6392
std,45902.256482,28017.802755,122290.310726,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,39936.37,103730.875,129300.1325,90138.9025
50%,73051.08,122699.795,212716.24,107978.19
75%,101602.8,144842.18,299469.085,139765.9775
max,165349.2,182645.56,471784.1,192261.83


In [4]:
df.isnull().sum() # No null values are present

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [5]:
df.value_counts()

R&D Spend  Administration  Marketing Spend  State       Profit   
0.00       116983.80       45173.06         California  14681.40     1
101913.08  110594.11       229160.95        Florida     146121.95    1
76253.86   113867.30       298664.47        California  118474.03    1
77044.01   99281.34        140574.81        New York    108552.04    1
78013.11   121597.55       264346.06        California  126992.93    1
78389.47   153773.43       299737.29        New York    111313.02    1
86419.70   153514.11       0.00             New York    122776.86    1
91749.16   114175.79       294919.57        Florida     124266.90    1
91992.39   135495.07       252664.93        California  134307.35    1
93863.75   127320.38       249839.44        Florida     141585.52    1
94657.16   145077.58       282574.31        New York    125370.37    1
100671.96  91790.61        249744.55        California  144259.40    1
114523.61  122616.84       261776.23        New York    129917.04    1
0.00       

In [6]:
X = df.iloc[ : , :-1]
y = df.iloc[: , -1]

In [7]:
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State
0,165349.2,136897.8,471784.1,New York
1,162597.7,151377.59,443898.53,California
2,153441.51,101145.55,407934.54,Florida
3,144372.41,118671.85,383199.62,New York
4,142107.34,91391.77,366168.42,Florida


In [8]:
y.head()

0    192261.83
1    191792.06
2    191050.39
3    182901.99
4    166187.94
Name: Profit, dtype: float64

### Encoding the categorical data
##### We can directly do that by,
##### from sklearn.compose import ColumnTransformer
##### from sklearn.preprocessing import OneHotEncoder
##### ct = ColumnTransformer( transformers = [( 'encoder' , OneHotEncoder() , [3] ) ]  , remainder = 'passthrough')
##### X = np.array(ct.fit_transform(X))

In [9]:
# Here state is a categorical feature. We can use one hot encoding to encode that feature.
states = pd.get_dummies(X['State'] , drop_first= True)
# To avoid dummy variable trap we use drop_first = True

In [10]:
X=X.drop('State',axis=1) # Dropping the state column
X=pd.concat([X,states],axis=1) # Concating the dummy variables

In [11]:
X.head() # We can see that 2 new columns have been added

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Florida,New York
0,165349.2,136897.8,471784.1,0,1
1,162597.7,151377.59,443898.53,0,0
2,153441.51,101145.55,407934.54,1,0
3,144372.41,118671.85,383199.62,0,1
4,142107.34,91391.77,366168.42,1,0


### Splitting the dataset 

In [12]:
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split ( X , y , test_size = 0.25 , random_state = 0 )

In [13]:
X_train.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Florida,New York
34,46426.07,157693.92,210797.67,0,0
18,91749.16,114175.79,294919.57,1,0
7,130298.13,145530.06,323876.68,1,0
14,119943.24,156547.42,256512.92,1,0
45,1000.23,124153.04,1903.93,0,1


In [14]:
X_test.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Florida,New York
28,66051.52,182645.56,118148.2,1,0
11,100671.96,91790.61,249744.55,0,0
10,101913.08,110594.11,229160.95,1,0
41,27892.92,84710.77,164470.71,1,0
2,153441.51,101145.55,407934.54,1,0


In [15]:
y_train.head()

34     96712.80
18    124266.90
7     155752.60
14    132602.65
45     64926.08
Name: Profit, dtype: float64

In [16]:
y_test.head()

28    103282.38
11    144259.40
10    146121.95
41     77798.83
2     191050.39
Name: Profit, dtype: float64

### Fitting multiple linear regression to training set

In [17]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train , y_train)

LinearRegression()

In [18]:
y_predicted = lr.predict(X_test)
y_predicted

array([104440.72582616, 132253.81567698, 132872.07174272,  71707.78771428,
       178678.9934985 , 115078.13124331,  66093.9297267 ,  98759.7276409 ,
       114113.5994088 , 167979.49411476,  95786.77295823,  87785.3497979 ,
       110455.98228344])

In [19]:
lr.predict([[66050 , 182650 , 118150 , 1 , 0]])



array([104439.80243751])

### Calculating the r2 score. If r2 score is nearer to 1 then the model is said to be a good model.

In [20]:
from sklearn.metrics import r2_score
score = r2_score(y_test , y_predicted)
score

0.931581519915206