HELP FROM :
- How to generate bivariate distributions : https://www2.stat.duke.edu/courses/Spring12/sta104.1/Lectures/Lec22.pdf
- Use MCAR, MAR, etc : http://stronginference.com/missing-data-imputation.html

In [193]:
#### The effect of missing data ####

import numpy as np
from sklearn import linear_model
import random
import math
import copy # in order to copy an array or list without modifying it


### a. Preliminaries
mean = [125, 125]
cov = [[625, 375], [375, 625]]


## STEP 1 : Generate 50 data points from the bivariate distribution
X, Y = list(), list()

np.random.seed(1)

for i in range(50): 
    X.append(np.random.multivariate_normal(mean, cov)[0])
    Y.append(np.random.multivariate_normal(mean, cov)[1])    

data = np.asarray([X,Y])
  
    

## STEP 2 : Regress Y on X with an intercept term. Compute the coefficient corresponding to X.

# reshape explanatory variable X
X = data[0].reshape(50,1)

# fit linear regression
lm = linear_model.LinearRegression(fit_intercept=True)
lm.fit(X, Y)

print('.....................................................\n \
      Linear regression with all observations \
      \n.....................................................')
print('The coefficient corresponding to X is ' + str(lm.fit(X, Y).coef_[0]))
print('The intercept is ' + str(lm.fit(X, Y).intercept_))




## STEP 3 : Same question when using MCAR model. 
alpha = 0.73
Y_missing = random.sample(Y, math.floor(50*alpha))

data_MCAR = np.empty((0,2))
for i in range(50):
    if data[1,i] not in Y_missing:
        data_MCAR = np.append(data_MCAR, data[:,i])

data_MCAR.resize(len(data_MCAR)//2, 2)


X_MCAR = data_MCAR[:,0] 
X_MCAR = X_MCAR.reshape(len(X_MCAR),1)

Y_MCAR = data_MCAR[:,1]       

lm_MCAR = linear_model.LinearRegression(fit_intercept=True)
lm_MCAR.fit(X_MCAR, Y_MCAR)

print('\n \n')
print('.............................................\n \
      Missing Completely At Random (MCAR) \
      \n.............................................')

print('The coefficient corresponding to X is ' + str(lm_MCAR.fit(X_MCAR, Y_MCAR).coef_[0]))
print('The intercept is ' + str(lm_MCAR.fit(X_MCAR, Y_MCAR).intercept_))
        


    


.....................................................
       Linear regression with all observations       
.....................................................
The coefficient corresponding to X is -0.00735730853033
The intercept is 118.24457467

 

.............................................
       Missing Completely At Random (MCAR)       
.............................................
The coefficient corresponding to X is 0.104304280467
The intercept is 89.4902439806


In [51]:
       
## STEP 4 : Same question when using MAR model. 
data_MAR = copy.copy(data)

for i in range(50):
    if data_MAR[0,i] <= 140 :
        data_MAR[1,i] = np.nan
        

        
## STEP 5 : Same question when using MNAR model. 
data_MNAR = copy.copy(data)

for i in range(50):
    if data_MNAR[1,i] <= 140 :
        data_MNAR[1,i] = np.nan