# Linear Regression Model of Wheat Yield Data with and without Differential Privacy 

In [1]:
from diffprivlib.models import LinearRegression
import diffprivlib.models as dp
import diffprivlib.tools as tl
from diffprivlib.mechanisms import Laplace
from sklearn.linear_model import LinearRegression as sk_LinearRegression
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as err
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Environment note: run `conda activate IntroML1` in a terminal before launching this notebook.


In [2]:
# Load the 2013 and 2014 wheat data sets and stack them row-wise.
data_1 = pd.read_csv("wheat-2013-supervised.csv")
data_2 = pd.read_csv("wheat-2014-supervised.csv")
data = pd.concat([data_1, data_2], axis=0)

# Derived feature: latitude scaled by 69.
# NOTE(review): presumably ~69 miles per degree of latitude -- confirm.
data['EstDistEquator'] = data['Latitude'] * 69

# Columns used for modelling; 'Yield' is the regression target.
selected_columns = [
    'humidity', 'cloudCover', 'pressure', 'DayInSeason', 'dewPoint',
    'windBearing', 'temperatureMax', 'EstDistEquator', 'NDVI', 'visibility',
    'Yield',
]

# Keep only the selected columns and discard any row with a missing value.
data_new = data[selected_columns].dropna(axis=0)

# Target stays a one-column DataFrame; features are every other column.
y = data_new[['Yield']]
x = data_new.drop(columns=['Yield'])

In [3]:
# Preprocessing pipeline with a single step: scikit-learn's StandardScaler.
num_pipeline = Pipeline(steps=[('std_scaler', StandardScaler())])

In [4]:
# Fit the preprocessing pipeline on the features and replace `x` with the
# transformed values.
# NOTE(review): the scaler is fitted on the full data set before the
# train/test split below, so test-set statistics influence the training
# features. Consider fitting on the training split only.
x = num_pipeline.fit(x).transform(x)

In [5]:
# Split into train and test sets.
# A fixed random_state makes the split, and therefore every score reported in
# the cells below, reproducible under Restart Kernel -> Run All.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)



# Regular Linear Regression

In [8]:
# Non-private reference model: ordinary scikit-learn linear regression.
regr = sk_LinearRegression().fit(x_train, y_train)

# R^2 on each split (scoring does not modify the fitted model).
tr_baseline = regr.score(x_train, y_train)
baseline = regr.score(x_test, y_test)

print("Non-privacy baseline R2 score on Test Data: %.2f" % baseline)
print("Non-privacy baseline R2 score on Train Data: %.2f" % tr_baseline)

Non-privacy baseline R2 score on Test Data: 0.22
Non-privacy baseline R2 score on Train Data: 0.21


In [9]:
# Test-set mean squared error of the model currently bound to `regr`
# (the non-private baseline when cells are run in order).
predictions = regr.predict(x_test)
MSE = err(y_true=y_test, y_pred=predictions)
print(MSE)

136.38457477032804


# Differentially Private Linear Regression with Epsilon = 1

In [10]:
# diffprivlib linear regression with privacy budget epsilon = 1.
# NOTE(review): bounds_X / bounds_y are not supplied; the recorded output of
# this cell shows diffprivlib warning that this results in additional privacy
# leakage. Supply data-independent bounds once the valid ranges are known.
regr = LinearRegression(epsilon=1)
regr.fit(x_train, y_train)

# Score both splits first, then report them in the same order as before.
test_r2 = regr.score(x_test, y_test)
train_r2 = regr.score(x_train, y_train)

print("R2 score for Test Data epsilon=%.2f: %.2f" % (regr.epsilon, test_r2))
print("R2 score for Train Data epsilon=%.2f: %.2f" % (regr.epsilon, train_r2))

R2 score for Test Data epsilon=1.00: 0.21
R2 score for Train Data epsilon=1.00: 0.20


This will result in additional privacy leakage. To ensure differential privacy with no additional privacy loss, specify `bounds_X` and `bounds_y`.


In [11]:
# Test-set mean squared error of the model currently bound to `regr`
# (the epsilon = 1 private model when cells are run in order).
predictions = regr.predict(x_test)
MSEPriv1 = err(y_true=y_test, y_pred=predictions)
print(MSEPriv1)

137.28549313414914


# Differentially Private Linear Regression with Epsilon = 0.9

In [12]:
# diffprivlib linear regression with privacy budget epsilon = 0.9.
# NOTE(review): bounds_X / bounds_y are not supplied; the recorded output of
# this cell shows diffprivlib warning that this results in additional privacy
# leakage. Supply data-independent bounds once the valid ranges are known.
regr = LinearRegression(epsilon=.9)
regr.fit(x_train, y_train)

# Score both splits first, then report them in the same order as before.
test_r2 = regr.score(x_test, y_test)
train_r2 = regr.score(x_train, y_train)

print("R2 score for Test Data epsilon=%.2f: %.2f" % (regr.epsilon, test_r2))
print("R2 score for Train Data epsilon=%.2f: %.2f" % (regr.epsilon, train_r2))

R2 score for Test Data epsilon=0.90: 0.20
R2 score for Train Data epsilon=0.90: 0.19


This will result in additional privacy leakage. To ensure differential privacy with no additional privacy loss, specify `bounds_X` and `bounds_y`.


In [13]:
# Test-set mean squared error of the model currently bound to `regr`
# (the epsilon = 0.9 private model when cells are run in order).
predictions = regr.predict(x_test)
MSEPriv9 = err(y_true=y_test, y_pred=predictions)
print(MSEPriv9)

139.69903650452972
