# Random Forest Classification of Wheat Yield Data with and without Differential Privacy 

In [13]:
from diffprivlib.models import LinearRegression
import diffprivlib.models as dp
import diffprivlib.tools as tl
from diffprivlib.mechanisms import Laplace
from sklearn.linear_model import LinearRegression as sk_LinearRegression
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as err
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier as sk_RandomForestClassifier
from sklearn.datasets import make_classification
from diffprivlib.models import RandomForestClassifier
%matplotlib inline
from sklearn import metrics

#conda activate IntroML1


# Random Forest Classification of Wheat Yield data with and without Differential Privacy (DP)
#Source: https://arxiv.org/abs/1606.03572



In [23]:
data_1 = pd.read_csv("wheat-2013-supervised.csv")
data_2 = pd.read_csv("wheat-2014-supervised.csv")
data = pd.concat([data_1, data_2],axis=0)

data['EstDistEquator'] = data['Latitude'] * 69

# Conditions for Yield Categories (Mean=32, Min=9, Max=78, STD=13)

conditions = [
    (data['Yield'] <= 25),
    (data['Yield'] > 25) & (data['Yield'] <= 39),
    (data['Yield'] > 39) 
    ]


# 0 = low, 1 = Average, 2 = High

values = [0,1,2]

data['Yield_Cat'] = np.select(conditions,values)


data_new = data[['humidity','cloudCover', 'pressure','DayInSeason','dewPoint','windBearing','temperatureMax','NDVI','visibility','EstDistEquator', 'Yield_Cat' ]]

data_new = data_new.dropna(axis=0)

y = data_new[['Yield_Cat']]
x = data_new.drop(['Yield_Cat'],axis=1)

In [24]:
num_pipeline = Pipeline([
    ('std_scaler', StandardScaler()) 
    
])

In [25]:
x = num_pipeline.fit_transform(x)

In [26]:
# Split Train and Test data 
x_train, x_test, y_train, y_test = train_test_split(x,y)



In [46]:
#Regular Random Forest 

clf=sk_RandomForestClassifier(n_estimators=100,n_jobs=2)
clf.fit(x_train, np.ravel(y_train))


RandomForestClassifier(n_estimators=250, n_jobs=2)

In [47]:
predictions = clf.predict(x_test)


In [48]:
print("Accuracy without DP:",metrics.accuracy_score(y_test, predictions))

Accuracy without DP: 0.8554481008713846


In [49]:
#Privacy Model epsilon = 1

clf_priv = RandomForestClassifier(n_estimators=250,epsilon=1)
clf_priv.fit(x_train, np.ravel(y_train))




This may result in additional privacy leakage. To ensure differential privacy with no additional privacy loss, specify `feature_domains` according to the documentation


RandomForestClassifier(accountant=BudgetAccountant(spent_budget=[(1.0, 0), (1, 0), (0.9, 0), (1, 0), (0.9, 0), ...]),
                       epsilon=1, n_estimators=250)

In [50]:
predictions_DP = clf_priv.predict(x_test)

In [51]:
print("Accuracy with Privacy Espilon 1:",metrics.accuracy_score(y_test, predictions_DP))

Accuracy with Privacy Espilon 1: 0.3869481509509554


In [33]:
#Privacy Model epsilon = .9

clf_priv9 = RandomForestClassifier(n_estimators=100,epsilon=.9)
clf_priv9.fit(x_train,np.ravel(y_train))


This may result in additional privacy leakage. To ensure differential privacy with no additional privacy loss, specify `feature_domains` according to the documentation


RandomForestClassifier(accountant=BudgetAccountant(spent_budget=[(1.0, 0), (1, 0), (0.9, 0), (1, 0), (0.9, 0)]),
                       epsilon=0.9, n_estimators=100)

In [34]:
predictions_DP9 = clf_priv9.predict(x_test)

print("Accuracy with Privacy Espilon .9:",metrics.accuracy_score(y_test, predictions_DP9))

Accuracy with Privacy Espilon .9: 0.3934918815451217


In [35]:
clf_priv9 = RandomForestClassifier(n_estimators=100,epsilon=.1)
clf_priv9.fit(x_train,np.ravel(y_train))

This may result in additional privacy leakage. To ensure differential privacy with no additional privacy loss, specify `feature_domains` according to the documentation


RandomForestClassifier(accountant=BudgetAccountant(spent_budget=[(1.0, 0), (1, 0), (0.9, 0), (1, 0), (0.9, 0), ...]),
                       epsilon=0.1, n_estimators=100)

In [36]:
clf_priv9.fit(x_train,np.ravel(y_train))

This may result in additional privacy leakage. To ensure differential privacy with no additional privacy loss, specify `feature_domains` according to the documentation


RandomForestClassifier(accountant=BudgetAccountant(spent_budget=[(1.0, 0), (1, 0), (0.9, 0), (1, 0), (0.9, 0), ...]),
                       epsilon=0.1, n_estimators=100)

In [37]:
predictions_DP9 = clf_priv9.predict(x_test)

print("Accuracy with Privacy Espilon .9:",metrics.accuracy_score(y_test, predictions_DP9))

Accuracy with Privacy Espilon .9: 0.3865475143839656
