## Performing Cloning with Same Black Box and White Box Classifiers

In [19]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
import pickle as pkl
from collections import Counter
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:
def generate_random(orig_data, feature_names, data_size):
    '''
    Input: Data, Feature Names, Size of Random Data
    Output: Random Data
    
    Generates random dataset of size data_size
    Random dataset is generated from normal
    distribution with specified high and low values
    '''
    
    df = {}
    for i in range(len(feature_names)):
        low = np.min(orig_data[:, i])
        high = np.max(orig_data[:, i])
        df[feature_names[i]] = np.random.uniform(low=low, high=high, size=data_size)
    random_dataset = pd.DataFrame(data=df, columns=feature_names)
    
    return random_dataset

## Random Forests

In [9]:
## Fitting black box to original data
print '========================== Heart Dataset ============================'
data = pd.read_csv('data/heart.csv')
data['famhist'] = data['famhist'].map({"Absent": 0, "Present":1})
y = data.pop("chd").values
X = data.values
feature_names = data.columns

blackbox = RandomForestClassifier()
blackbox.fit(X, y)
print 'Blackbox score on the random dataset: ', blackbox.score(X, y)

## Generating random samples
X_random = generate_random(X, feature_names, data_size=1000).values
y_random = blackbox.predict(X_random)
whitebox = RandomForestClassifier()
whitebox.fit(X_random, y_random)
print 'Whitebox score when trained with random dataset: ', whitebox.score(X, y)

Blackbox score on the random dataset:  0.963203463203
Whitebox score when trained with random dataset:  0.69696969697


In [14]:
from sklearn.cross_validation import train_test_split

In [16]:
# Breast Cancer Dataset
print '========================== Breast Cancer Dataset =========================='
data = datasets.load_breast_cancer()
X = data.data
y = data.target
feature_names = data.feature_names
print X.shape, y.shape

X_train, X_test, y_train,  y_test = train_test_split(X, y, test_size=0.3)

blackbox = RandomForestClassifier()
blackbox.fit(X_train, y_train)
print 'Blackbox score on the random dataset: ', blackbox.score(X_test, y_test)

## Generating random samples
X_random = generate_random(X, feature_names, data_size=1000).values
y_random = blackbox.predict(X_random)
whitebox = RandomForestClassifier()
whitebox.fit(X_random, y_random)
print 'Whitebox score when trained with random dataset: ', whitebox.score(X, y)

(569, 30) (569,)
(398, 30)
(171, 30)
Blackbox score on the random dataset:  0.947368421053
Whitebox score when trained with random dataset:  0.469244288225


In [25]:
## Adult Census dataset
print '========================== Adult Census Dataset ==========================='
with open('../../Dataset/Census/train.pkl', 'rb') as fp:
    data = pkl.load(fp)
with open('../../Dataset/Census/label.pkl', 'rb') as fp:
    label = pkl.load(fp)
feature_names = data.columns
X = data.values
y = label
print X.shape, y.shape

X_train, X_test, y_train,  y_test = train_test_split(X, y, test_size=0.3)

blackbox = RandomForestClassifier()
blackbox.fit(X_train, y_train)
print 'Blackbox score on the random dataset: ', blackbox.score(X_test, y_test)

## Generating random samples
X_random = generate_random(X, feature_names, data_size=1000).values
y_random = blackbox.predict(X_random)
whitebox = RandomForestClassifier()
whitebox.fit(X_random, y_random)
print 'Whitebox score when trained with random dataset: ', whitebox.score(X, y)

(32561, 107) (32561,)
Blackbox score on the random dataset:  0.843279762514
Whitebox score when trained with random dataset:  0.743896071988


In [29]:
 ## Bridges Dataset
print '========================= Bridges Dataset ============================='
with open('../../Dataset/Bridges/train.pkl', 'rb') as fp:
    data = pkl.load(fp)
with open('../../Dataset/Bridges/label.pkl', 'rb') as fp:
    label = pkl.load(fp)
feature_names = data.columns
X = data.values
y = label
print X.shape, y.shape

X_train, X_test, y_train,  y_test = train_test_split(X, y, test_size=0.3)

blackbox = RandomForestClassifier()
blackbox.fit(X_train, y_train)
print 'Blackbox score on the random dataset: ', blackbox.score(X_test, y_test)

## Generating random samples
X_random = generate_random(X, feature_names, data_size=1000).values
y_random = blackbox.predict(X_random)
whitebox = RandomForestClassifier()
whitebox.fit(X_random, y_random)
print 'Whitebox score when trained with random dataset: ', whitebox.score(X, y)

(106, 29) (106,)
Blackbox score on the random dataset:  0.9375
Whitebox score when trained with random dataset:  0.650943396226


In [32]:
print '======================== Mushroom Dataset ==========================='
with open('../../Dataset/Mushroom/train.pkl', 'rb') as fp:
    data = pkl.load(fp)
with open('../../Dataset/Mushroom/label.pkl', 'rb') as fp:
    label = pkl.load(fp)
feature_names = data.columns
X = data.values
y = label
print X.shape, y.shape

X_train, X_test, y_train,  y_test = train_test_split(X, y, test_size=0.3)

blackbox = RandomForestClassifier()
blackbox.fit(X_train, y_train)
print 'Blackbox score on the random dataset: ', blackbox.score(X_test, y_test)

## Generating random samples
X_random = generate_random(X, feature_names, data_size=1000).values
y_random = blackbox.predict(X_random)
whitebox = RandomForestClassifier()
whitebox.fit(X_random, y_random)
print 'Whitebox score when trained with random dataset: ', whitebox.score(X, y)

(8124, 112) (8124,)
Blackbox score on the random dataset:  1.0
Whitebox score when trained with random dataset:  0.482151649434


## Logistic Regression

In [57]:
## Fitting black box to original data
print '========================== Heart Dataset ============================'
data = pd.read_csv('data/heart.csv')
data['famhist'] = data['famhist'].map({"Absent": 0, "Present":1})
y = data.pop("chd").values
X = data.values
feature_names = data.columns

blackbox = LogisticRegression()
blackbox.fit(X, y)
print 'Blackbox score on the random dataset: ', blackbox.score(X, y)

## Generating random samples
X_random = generate_random(X, feature_names, data_size=1000).values
y_random = blackbox.predict(X_random)
whitebox = LogisticRegression()
whitebox.fit(X_random, y_random)
print 'Whitebox score when trained with random dataset: ', whitebox.score(X, y)

Blackbox score on the random dataset:  0.733766233766
Whitebox score when trained with random dataset:  0.722943722944


In [46]:
# Breast Cancer Dataset
print '========================== Breast Cancer Dataset =========================='
data = datasets.load_breast_cancer()
X = data.data
y = data.target
feature_names = data.feature_names
print X.shape, y.shape

X_train, X_test, y_train,  y_test = train_test_split(X, y, test_size=0.3)

blackbox = LogisticRegression()
blackbox.fit(X_train, y_train)
print 'Blackbox score on the random dataset: ', blackbox.score(X_test, y_test)

## Generating random samples
X_random = generate_random(X, feature_names, data_size=1000).values
y_random = blackbox.predict(X_random)
whitebox = LogisticRegression()
whitebox.fit(X_random, y_random)
print 'Whitebox score when trained with random dataset: ', whitebox.score(X, y)

(569, 30) (569,)
Blackbox score on the random dataset:  0.906432748538
Whitebox score when trained with random dataset:  0.93848857645


In [47]:
## Adult Census dataset
print '========================== Adult Census Dataset ==========================='
with open('../../Dataset/Census/train.pkl', 'rb') as fp:
    data = pkl.load(fp)
with open('../../Dataset/Census/label.pkl', 'rb') as fp:
    label = pkl.load(fp)
feature_names = data.columns
X = data.values
y = label
print X.shape, y.shape

X_train, X_test, y_train,  y_test = train_test_split(X, y, test_size=0.3)

blackbox = LogisticRegression()
blackbox.fit(X_train, y_train)
print 'Blackbox score on the random dataset: ', blackbox.score(X_test, y_test)

## Generating random samples
X_random = generate_random(X, feature_names, data_size=1000).values
y_random = blackbox.predict(X_random)
whitebox = LogisticRegression()
whitebox.fit(X_random, y_random)
print 'Whitebox score when trained with random dataset: ', whitebox.score(X, y)

(32561, 107) (32561,)
Blackbox score on the random dataset:  0.798239328488
Whitebox score when trained with random dataset:  0.792113264335


In [49]:
 ## Bridges Dataset
print '========================= Bridges Dataset ============================='
with open('../../Dataset/Bridges/train.pkl', 'rb') as fp:
    data = pkl.load(fp)
with open('../../Dataset/Bridges/label.pkl', 'rb') as fp:
    label = pkl.load(fp)
feature_names = data.columns
X = data.values
y = label
print X.shape, y.shape

X_train, X_test, y_train,  y_test = train_test_split(X, y, test_size=0.3)

blackbox = LogisticRegression()
blackbox.fit(X_train, y_train)
print 'Blackbox score on the random dataset: ', blackbox.score(X_test, y_test)

## Generating random samples
X_random = generate_random(X, feature_names, data_size=1000).values
y_random = blackbox.predict(X_random)
whitebox = LogisticRegression()
whitebox.fit(X_random, y_random)
print 'Whitebox score when trained with random dataset: ', whitebox.score(X, y)

(106, 29) (106,)
Blackbox score on the random dataset:  0.90625
Whitebox score when trained with random dataset:  0.905660377358


In [52]:
print '======================== Mushroom Dataset ==========================='
with open('../../Dataset/Mushroom/train.pkl', 'rb') as fp:
    data = pkl.load(fp)
with open('../../Dataset/Mushroom/label.pkl', 'rb') as fp:
    label = pkl.load(fp)
feature_names = data.columns
X = data.values
y = label
print X.shape, y.shape

X_train, X_test, y_train,  y_test = train_test_split(X, y, test_size=0.3)

blackbox = LogisticRegression()
blackbox.fit(X_train, y_train)
print 'Blackbox score on the random dataset: ', blackbox.score(X_test, y_test)

## Generating random samples
X_random = generate_random(X, feature_names, data_size=1000).values
y_random = blackbox.predict(X_random)
whitebox = LogisticRegression()
whitebox.fit(X_random, y_random)
print 'Whitebox score when trained with random dataset: ', whitebox.score(X, y)

(8124, 112) (8124,)
Blackbox score on the random dataset:  0.999589827728
Whitebox score when trained with random dataset:  0.655711472181


## KNearest Neighbor

In [58]:
## Fitting black box to original data
print '========================== Heart Dataset ============================'
data = pd.read_csv('data/heart.csv')
data['famhist'] = data['famhist'].map({"Absent": 0, "Present":1})
y = data.pop("chd").values
X = data.values
feature_names = data.columns

blackbox = KNeighborsClassifier()
blackbox.fit(X, y)
print 'Blackbox score on the random dataset: ', blackbox.score(X, y)

## Generating random samples
X_random = generate_random(X, feature_names, data_size=1000).values
y_random = blackbox.predict(X_random)
whitebox = KNeighborsClassifier()
whitebox.fit(X_random, y_random)
print 'Whitebox score when trained with random dataset: ', whitebox.score(X, y)

Blackbox score on the random dataset:  0.720779220779
Whitebox score when trained with random dataset:  0.731601731602


In [66]:
# Breast Cancer Dataset
print '========================== Breast Cancer Dataset =========================='
data = datasets.load_breast_cancer()
X = data.data
y = data.target
feature_names = data.feature_names
print X.shape, y.shape

X_train, X_test, y_train,  y_test = train_test_split(X, y, test_size=0.3)

blackbox = KNeighborsClassifier()
blackbox.fit(X_train, y_train)
print 'Blackbox score on the random dataset: ', blackbox.score(X_test, y_test)

## Generating random samples
X_random = generate_random(X, feature_names, data_size=1000).values
y_random = blackbox.predict(X_random)
whitebox = KNeighborsClassifier()
whitebox.fit(X_random, y_random)
print 'Whitebox score when trained with random dataset: ', whitebox.score(X, y)

(569, 30) (569,)
Blackbox score on the random dataset:  0.941520467836
Whitebox score when trained with random dataset:  0.903339191564


In [61]:
## Adult Census dataset
print '========================== Adult Census Dataset ==========================='
with open('../../Dataset/Census/train.pkl', 'rb') as fp:
    data = pkl.load(fp)
with open('../../Dataset/Census/label.pkl', 'rb') as fp:
    label = pkl.load(fp)
feature_names = data.columns
X = data.values
y = label
print X.shape, y.shape

X_train, X_test, y_train,  y_test = train_test_split(X, y, test_size=0.3)

blackbox = KNeighborsClassifier()
blackbox.fit(X_train, y_train)
print 'Blackbox score on the random dataset: ', blackbox.score(X_test, y_test)

## Generating random samples
X_random = generate_random(X, feature_names, data_size=1000).values
y_random = blackbox.predict(X_random)
whitebox = KNeighborsClassifier()
whitebox.fit(X_random, y_random)
print 'Whitebox score when trained with random dataset: ', whitebox.score(X, y)

(32561, 107) (32561,)
Blackbox score on the random dataset:  0.775104923738
Whitebox score when trained with random dataset:  0.264733884094


In [62]:
 ## Bridges Dataset
print '========================= Bridges Dataset ============================='
with open('../../Dataset/Bridges/train.pkl', 'rb') as fp:
    data = pkl.load(fp)
with open('../../Dataset/Bridges/label.pkl', 'rb') as fp:
    label = pkl.load(fp)
feature_names = data.columns
X = data.values
y = label
print X.shape, y.shape

X_train, X_test, y_train,  y_test = train_test_split(X, y, test_size=0.3)

blackbox = KNeighborsClassifier()
blackbox.fit(X_train, y_train)
print 'Blackbox score on the random dataset: ', blackbox.score(X_test, y_test)

## Generating random samples
X_random = generate_random(X, feature_names, data_size=1000).values
y_random = blackbox.predict(X_random)
whitebox = KNeighborsClassifier()
whitebox.fit(X_random, y_random)
print 'Whitebox score when trained with random dataset: ', whitebox.score(X, y)

(106, 29) (106,)
Blackbox score on the random dataset:  0.875
Whitebox score when trained with random dataset:  0.88679245283


In [63]:
print '======================== Mushroom Dataset ==========================='
with open('../../Dataset/Mushroom/train.pkl', 'rb') as fp:
    data = pkl.load(fp)
with open('../../Dataset/Mushroom/label.pkl', 'rb') as fp:
    label = pkl.load(fp)
feature_names = data.columns
X = data.values
y = label
print X.shape, y.shape

X_train, X_test, y_train,  y_test = train_test_split(X, y, test_size=0.3)

blackbox = KNeighborsClassifier()
blackbox.fit(X_train, y_train)
print 'Blackbox score on the random dataset: ', blackbox.score(X_test, y_test)

## Generating random samples
X_random = generate_random(X, feature_names, data_size=1000).values
y_random = blackbox.predict(X_random)
whitebox = KNeighborsClassifier()
whitebox.fit(X_random, y_random)
print 'Whitebox score when trained with random dataset: ', whitebox.score(X, y)

(8124, 112) (8124,)
Blackbox score on the random dataset:  1.0
Whitebox score when trained with random dataset:  0.697193500739
