In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression              

In [2]:
df = np.round(pd.read_csv('50_Startups.csv')[['R&D Spend','Administration','Marketing Spend','Profit']]/10000)
np.random.seed(9)
df = df.sample(5)
df

# Youâ€™re asking pandas to randomly pick 5 rows.
# Because you fixed the seed (9), it will always pick the same 5 rows every time you run.
# Without the seed, every run would give a different set of 5 rows.   

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
21,8.0,15.0,30.0,11.0
37,4.0,5.0,20.0,9.0
2,15.0,10.0,41.0,19.0
14,12.0,16.0,26.0,13.0
44,2.0,15.0,3.0,7.0


In [3]:
df = df.iloc[:,0:-1]  # remove the last column    
df   

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,4.0,5.0,20.0
2,15.0,10.0,41.0
14,12.0,16.0,26.0
44,2.0,15.0,3.0


In [4]:
# now, Assignig some missing values(NA) to the column

# structure is like:-  df.iloc[row_no, column_no]

df.iloc[1,0] = np.NaN     # means assigning NaN to 2nd row and 1st column
df.iloc[3,1] = np.NaN     # means assigning NaN to 4th row and 2nd column
df.iloc[-1,-1] = np.NaN   # means assigning NaN to the last row and last column

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.iloc[1,0] = np.NaN     # means assigning NaN to 2nd row and 1st column
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.iloc[3,1] = np.NaN     # means assigning NaN to 4th row and 2nd column
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.iloc[-1,-1] = np.NaN   # means assigning NaN to the last row and last column


In [5]:
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,,26.0
44,2.0,15.0,


In [6]:
 # Step 1 - Impute all missing values with mean of respective col

df0 = pd.DataFrame() 

df0['R&D Spend'] = df['R&D Spend'].fillna(df['R&D Spend'].mean())
df0['Administration'] = df['Administration'].fillna(df['Administration'].mean())
df0['Marketing Spend'] = df['Marketing Spend'].fillna(df['Marketing Spend'].mean()) 


In [7]:
# 0th iteration 

df0

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,9.25,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.25,26.0
44,2.0,15.0,29.25


In [8]:
# remove the column 1 imputed value 

df1 = df0.copy()

df1.iloc[1,0] = np.NaN

df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.25,26.0
44,2.0,15.0,29.25


In [9]:
# use the first 3 rows to build a model and use the last for prediction 
# i.e.,we don't take that row and column where the 'NaN' value exists 
# now, we treat col2 and col3 as 'X' and col1 as 'y'

X = df1.iloc[[0,2,3,4],1:3]   
X 

Unnamed: 0,Administration,Marketing Spend
21,15.0,30.0
2,10.0,41.0
14,11.25,26.0
44,15.0,29.25


In [10]:
# Now, take the left column (i.e.,col1) and treat it as 'y'

y = df1.iloc[[0,2,3,4],0]  # means we take row1,3,4,5 (not row2 b/c there is NaN value exits) and col1
y

21     8.0
2     15.0
14    12.0
44     2.0
Name: R&D Spend, dtype: float64

In [11]:
# now, we train our model and ask from the model to predict our missing value 

lr = LinearRegression()         

lr.fit(X,y)       
lr.predict(df1.iloc[1,1:].values.reshape(1,2))  # this gives our predictive value for our missing values

# df.iloc[1,1:] --> mtlb ye 'NaN' ka value batayega with the help of values present in that rows(in X_train)         

# iloc[1,1:] --> takes the row at index 1 and columns from index 1 onward.

# .reshape(1,2) --> Machine learning models expect input as 2D arrays: (n_samples, n_features) 
# so, [5.0, 20.0] becomes [[5.0, 20.0]]      

# .values --> Converts that pandas Series into a NumPy array.   



array([23.14158651])

In [12]:
# Now, assign this value again to the col1

df1.iloc[1,0] = 23.14

In [13]:
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.25,26.0
44,2.0,15.0,29.25


In [14]:
# Now, it's the time for col2
# do the same operation for col2 and predict missing value of that column

# Remove the col2 imputed value

df1.iloc[3,1] = np.NaN

df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,,26.0
44,2.0,15.0,29.25


In [15]:
# Now, delete that cloumn and row where that 'NaN' value exits 

X = df1.iloc[[0,1,2,4],[0,2]]
X

Unnamed: 0,R&D Spend,Marketing Spend
21,8.0,30.0
37,23.14,20.0
2,15.0,41.0
44,2.0,29.25


In [16]:
# And, take the rest column (i.e.,col2) for prediction 

y = df1.iloc[[0,1,2,4],1]  
y

21    15.0
37     5.0
2     10.0
44    15.0
Name: Administration, dtype: float64

In [17]:
# Now, apply here Linear Regression to train your model and predict value

lr = LinearRegression()

lr.fit(X,y)
lr.predict(df.iloc[3,[0,2]].values.reshape(1,2))



array([11.06331285])

In [18]:
df1.iloc[3,1] = 11.06

In [19]:
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.06,26.0
44,2.0,15.0,29.25


In [20]:
df1.iloc[4,-1] = np.NaN

In [21]:
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.06,26.0
44,2.0,15.0,


In [22]:
X = df1.iloc[[0,1,2,3], [0,1]]
X

Unnamed: 0,R&D Spend,Administration
21,8.0,15.0
37,23.14,5.0
2,15.0,10.0
14,12.0,11.06


In [23]:
y = df.iloc[[0,1,2,3], 2]
y

21    30.0
37    20.0
2     41.0
14    26.0
Name: Marketing Spend, dtype: float64

In [24]:
# Apply linear regression to predict the missing value(NaN)

lr = LinearRegression()

lr.fit(X,y)
lr.predict(df.iloc[4,[0,1]].values.reshape(1,2))



array([31.56351448])

In [25]:
df1.iloc[4,-1] = 31.56

In [26]:
# After 1st derivation

df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.06,26.0
44,2.0,15.0,31.56


In [27]:
# Subtract (1st iteration - 0th iteration)

df1 - df0

# we are fining this iteration until all will not become equal to 0 (or approx to it)

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,13.89,0.0,0.0
2,0.0,0.0,0.0
14,0.0,-0.19,0.0
44,0.0,0.0,2.31


In [28]:
df2 = df1.copy()

In [29]:
# Now, start doing the same operation on 'df2' DataFrame

df2.iloc[1,0] = np.NaN
df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.06,26.0
44,2.0,15.0,31.56


In [30]:
X = df2.iloc[[0,2,3,4],1:]
y = df2.iloc[[0,2,3,4],0]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df.iloc[1,1:].values.reshape(1,2))



array([23.78627207])

In [31]:
df2.iloc[1,0] = 23.78
df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.78,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.06,26.0
44,2.0,15.0,31.56


In [32]:
df2.iloc[3,1] = np.NaN
df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.78,5.0,20.0
2,15.0,10.0,41.0
14,12.0,,26.0
44,2.0,15.0,31.56


In [33]:
X = df2.iloc[[0,1,2,4], [0,2]]
y = df2.iloc[[0,1,2,4], 1]

lr = LinearRegression()

lr.fit(X,y)
lr.predict(df2.iloc[3,[0,2]].values.reshape(1,2))



array([11.22020174])

In [34]:
df2.iloc[3,1] = 11.22

In [35]:
df2.iloc[4,-1] = np.NaN
df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.78,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.22,26.0
44,2.0,15.0,


In [36]:
X = df2.iloc[[0,1,2,3], [0,1]]
y = df2.iloc[[0,1,2,3], 2]

lr = LinearRegression()

lr.fit(X,y)
lr.predict(df2.iloc[4, [0,1]].values.reshape(1,2))



array([38.87979054])

In [37]:
df2.iloc[4,-1] = 38.87
df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.78,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.22,26.0
44,2.0,15.0,38.87


In [38]:
df2 - df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,0.64,0.0,0.0
2,0.0,0.0,0.0
14,0.0,0.16,0.0
44,0.0,0.0,7.31


In [39]:
df3 = df2.copy()
df3

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.78,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.22,26.0
44,2.0,15.0,38.87


In [40]:
df3.iloc[1,0] = np.NaN
df3

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.22,26.0
44,2.0,15.0,38.87
