# Import library

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Dataset import

In [3]:
df = np.round(pd.read_csv('50_Startups.csv')[['R&D Spend','Administration','Marketing Spend','Profit']]/10000)
np.random.seed(9)
df = df.sample(5)
df

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
21,8.0,15.0,30.0,11.0
37,4.0,5.0,20.0,9.0
2,15.0,10.0,41.0,19.0
14,12.0,16.0,26.0,13.0
44,2.0,15.0,3.0,7.0


## Select column

In [4]:
df = df.iloc[:,0:-1]
df

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,4.0,5.0,20.0
2,15.0,10.0,41.0
14,12.0,16.0,26.0
44,2.0,15.0,3.0


## Add some random missing values

In [5]:
df.iloc[1,0] = np.NaN
df.iloc[3,1] = np.NaN
df.iloc[-1,-1] = np.NaN

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.iloc[1,0] = np.NaN
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.iloc[3,1] = np.NaN
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.iloc[-1,-1] = np.NaN


In [6]:
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,,26.0
44,2.0,15.0,


# Step 1: Fill by mean

In [7]:
# Step 1 - Impute all missing values with mean of respective col

df0 = pd.DataFrame()

df0['R&D Spend'] = df['R&D Spend'].fillna(df['R&D Spend'].mean())
df0['Administration'] = df['Administration'].fillna(df['Administration'].mean())
df0['Marketing Spend'] = df['Marketing Spend'].fillna(df['Marketing Spend'].mean())

In [10]:
df0

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,9.25,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.25,26.0
44,2.0,15.0,29.25


# Step 2: Remove col1 imputed mean value

In [12]:
# Remove the col1 imputed value
df1 = df0.copy()

df1.iloc[1,0] = np.NaN

df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.25,26.0
44,2.0,15.0,29.25


# Step 3: extract train test split

In [13]:
# Use first 3 rows to build a model and use the last for prediction

X = df1.iloc[[0,2,3,4],1:3]
X

Unnamed: 0,Administration,Marketing Spend
21,15.0,30.0
2,10.0,41.0
14,11.25,26.0
44,15.0,29.25


In [14]:
y = df1.iloc[[0,2,3,4],0]
y

Unnamed: 0,R&D Spend
21,8.0
2,15.0
14,12.0
44,2.0


## Apply ML algorithm Linear Regression

In [16]:
lr = LinearRegression()
lr.fit(X,y)
lr.predict(df1.iloc[1,1:].values.reshape(1,2))



array([23.14158651])

# Step 3: Impute regression value into `nan` column

In [17]:
df1.iloc[1,0] = 23.14
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.25,26.0
44,2.0,15.0,29.25


# Step 4: Move to the next column 2

In [18]:
# Remove the col2 imputed value

df1.iloc[3,1] = np.NaN

df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,,26.0
44,2.0,15.0,29.25


# Step 5: Train test

In [20]:
# Use last 3 rows to build a model and use the first for prediction
X = df1.iloc[[0,1,2,4],[0,2]]
y = df1.iloc[[0,1,2,4],1]

# Step 6: ML algo for 2nd column

In [21]:
lr = LinearRegression()
lr.fit(X,y)
lr.predict(df1.iloc[3,[0,2]].values.reshape(1,2))



array([11.06331285])

## Predicted value impute

In [22]:
df1.iloc[3,1] = 11.06
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.06,26.0
44,2.0,15.0,29.25


# Step 7: start with new column

In [23]:
# Remove the col3 imputed value
df1.iloc[4,-1] = np.NaN
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.06,26.0
44,2.0,15.0,


#Step 8: train test split

In [24]:
# Use last 3 rows to build a model and use the first for prediction
X = df1.iloc[0:4,0:2]
y = df1.iloc[0:4,-1]
y

Unnamed: 0,Marketing Spend
21,30.0
37,20.0
2,41.0
14,26.0


# Step 9: Apply ML algo

In [25]:
lr = LinearRegression()
lr.fit(X,y)
lr.predict(df1.iloc[4,0:2].values.reshape(1,2))



array([31.56351448])

# Step 10: Impute ML algo's value

In [26]:
df1.iloc[4,-1] = 31.56

# Step 11: Subtract `Base df` and `Predicted df`

In [27]:
# Subtract 0th iteration from 1st iteration
df1 - df0

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,13.89,0.0,0.0
2,0.0,0.0,0.0
14,0.0,-0.19,0.0
44,0.0,0.0,2.31


# Fill values are not close to zero. so repeat same procedure