In [6]:
import os
import pandas as pd
# Locate the Adult dataset under the current user's home directory:
#   <home>/Data/Adult/adult.data
home_dir = os.path.expanduser("~")
data_folder = os.path.join(home_dir, "Data", "Adult")
adult_filename = os.path.join(data_folder, "adult.data")

In [7]:
# The file is read with header=None, so the column labels are supplied
# explicitly, in file order.
adult_columns = ["Age", "Work-Class", "fnlwgt", "Education",
                 "Education-Num", "Marital-Status", "Occupation",
                 "Relationship", "Race", "Sex", "Capital-gain",
                 "Capital-loss", "Hours-per-week", "Native-Country",
                 "Earnings-Raw"]
adult = pd.read_csv(adult_filename, header=None, names=adult_columns)

In [8]:
# Drop rows in which every column is missing (how='all'); rows that are only
# partly missing are kept. The frame is modified in place.
adult.dropna(how='all', inplace=True)

In [9]:
# Show the column labels of the loaded frame.
adult.columns

Index(['Age', 'Work-Class', 'fnlwgt', 'Education', 'Education-Num',
       'Marital-Status', 'Occupation', 'Relationship', 'Race', 'Sex',
       'Capital-gain', 'Capital-loss', 'Hours-per-week', 'Native-Country',
       'Earnings-Raw'],
      dtype='object')

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) of the
# Hours-per-week column.
adult["Hours-per-week"].describe()

count    32561.000000
mean        40.437456
std         12.347429
min          1.000000
25%         40.000000
50%         40.000000
75%         45.000000
max         99.000000
Name: Hours-per-week, dtype: float64

In [11]:
# Median of the Education-Num column.
adult["Education-Num"].median()

10.0

In [12]:
# Distinct Work-Class labels. As the output shows, every label carries a
# leading space, and ' ?' appears as a value (presumably a missing-value
# marker; confirm against the dataset documentation).
adult["Work-Class"].unique()

array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay', ' Never-worked'], dtype=object)

In [13]:
import numpy as np
# Toy matrix for the variance-threshold demo: the integers 0..29 laid out
# row by row as 10 rows of 3 columns.
X = np.reshape(np.arange(30), (10, 3))

In [14]:
X

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14],
       [15, 16, 17],
       [18, 19, 20],
       [21, 22, 23],
       [24, 25, 26],
       [27, 28, 29]])

In [15]:
# Overwrite the whole second column with the constant 1, so that this column
# has zero variance.
X[:,1] = 1

In [16]:
X

array([[ 0,  1,  2],
       [ 3,  1,  5],
       [ 6,  1,  8],
       [ 9,  1, 11],
       [12,  1, 14],
       [15,  1, 17],
       [18,  1, 20],
       [21,  1, 23],
       [24,  1, 26],
       [27,  1, 29]])

In [17]:
from sklearn.feature_selection import VarianceThreshold

In [18]:
# Fit a variance threshold with its default settings on X, then keep only the
# columns that pass it.
vt = VarianceThreshold()
vt.fit(X)
Xt = vt.transform(X)

In [19]:
Xt

array([[ 0,  2],
       [ 3,  5],
       [ 6,  8],
       [ 9, 11],
       [12, 14],
       [15, 17],
       [18, 20],
       [21, 23],
       [24, 26],
       [27, 29]])

In [20]:
# Per-column variances computed during fit; the constant column shows 0.
print(vt.variances_)

[ 74.25   0.    74.25]


In [21]:
# Feature matrix: the five columns selected below, taken as a plain array.
numeric_columns = ["Age", "Education-Num", "Capital-gain", "Capital-loss", "Hours-per-week"]
X = adult[numeric_columns].values
# Boolean target: True where the raw earnings label equals ' >50K' exactly
# (the leading space is part of the compared value).
earnings_raw = adult["Earnings-Raw"]
y = (earnings_raw == ' >50K').values
print(X)

[[   39    13  2174     0    40]
 [   50    13     0     0    13]
 [   38     9     0     0    40]
 ..., 
 [   58     9     0     0    40]
 [   22     9     0     0    20]
 [   52     9 15024     0    40]]


In [22]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Univariate selector that keeps the 3 columns with the highest chi2 score.
transformer = SelectKBest(chi2, k=3)

In [23]:
# Score every column against y, reduce X to the selected columns, and show
# the per-column scores.
transformer.fit(X, y)
Xt_chi2 = transformer.transform(X)
print(transformer.scores_)

[  8.60061182e+03   2.40142178e+03   8.21924671e+07   1.37214589e+06
   6.47640900e+03]


In [24]:
from scipy.stats import pearsonr

def multivariate_pearsonr(X, y):
    """Score each column of X by its absolute Pearson correlation with y.

    Parameters
    ----------
    X : 2-D array; every column is correlated against ``y`` separately.
    y : 1-D sequence with one entry per row of ``X``.

    Returns
    -------
    (scores, pvalues) : tuple of two 1-D numpy arrays with one entry per
        column of ``X``. ``scores`` holds the absolute value of the
        correlation coefficient; ``pvalues`` holds the p-value that
        ``pearsonr`` reports for the same column.
    """
    per_column = [pearsonr(X[:, idx], y) for idx in range(X.shape[1])]
    # The sign is dropped: a strong negative correlation ranks as high as a
    # strong positive one.
    scores = np.array([abs(r) for r, _ in per_column])
    pvalues = np.array([p for _, p in per_column])
    return (scores, pvalues)

In [25]:
# Same top-3 selection as before, but with columns ranked by
# multivariate_pearsonr instead of chi2.
transformer = SelectKBest(multivariate_pearsonr, k=3)
transformer.fit(X, y)
Xt_pearson = transformer.transform(X)
print(transformer.scores_)

[ 0.2340371   0.33515395  0.22332882  0.15052631  0.22968907]


In [26]:
from sklearn.tree import DecisionTreeClassifier
# cross_val_score lives in sklearn.model_selection; the older
# sklearn.cross_validation module has been removed from scikit-learn. Fall
# back to the old location only on installations that predate model_selection.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# One seeded tree configuration is used for both feature sets so that the two
# scores differ only because of the selected columns.
clf = DecisionTreeClassifier(random_state=14)
# cv=3 is spelled out: it was the implicit default of the old
# cross_val_score, while newer scikit-learn releases default to 5 folds.
scores_chi2 = cross_val_score(clf, Xt_chi2, y, scoring='accuracy', cv=3)
scores_pearson = cross_val_score(clf, Xt_pearson, y, scoring='accuracy', cv=3)



In [27]:
# Report the mean cross-validated accuracy of each feature set.
chi2_accuracy = scores_chi2.mean()
pearson_accuracy = scores_pearson.mean()
print("Chi2 performance: {0:.3f}".format(chi2_accuracy))
print("Pearson performance: {0:.3f}".format(pearson_accuracy))

Chi2 performance: 0.829
Pearson performance: 0.771


In [36]:
from sklearn.base import TransformerMixin
from sklearn.utils import as_float_array

class MeanDiscrete(TransformerMixin):
    """Binarise features by comparing each value with its column mean.

    ``fit`` learns the per-column mean of the data it is given; ``transform``
    returns a boolean array that is True wherever a value is strictly greater
    than the learned mean of its column.
    """

    def fit(self, X, y=None):
        """Store the column means of ``X`` in ``self.mean`` and return self.

        ``y`` is accepted but not used.
        """
        data = as_float_array(X)
        self.mean = np.mean(data, axis=0)
        return self

    def transform(self, X):
        """Return a boolean array marking entries above the fitted column mean.

        Asserts that ``X`` has as many columns as the data passed to ``fit``.
        """
        data = as_float_array(X)
        n_columns = data.shape[1]
        assert n_columns == self.mean.shape[0]
        return data > self.mean

In [37]:
mean_discrete = MeanDiscrete()

In [38]:
# Learn the column means of X and binarise X against them in one call.
X_mean = mean_discrete.fit_transform(X)

In [43]:
#%%file adult_tests.py
import numpy as np
from numpy.testing import assert_array_equal

def test_meandiscrete():
    """Check MeanDiscrete on a small matrix whose column means are known.

    Verifies that ``fit`` stores the means [13.5, 15.5] and that ``transform``
    marks exactly the last five rows as above the mean in both columns.
    """
    # Columns 0 and 2 of the 10x3 matrix 0..29:
    # [[0, 2], [3, 5], [6, 8], ..., [27, 29]].
    X_test = np.arange(30).reshape((10, 3))[:, [0, 2]]

    mean_discrete = MeanDiscrete()
    mean_discrete.fit(X_test)
    print(mean_discrete.mean)
    assert_array_equal(mean_discrete.mean, np.array([13.5, 15.5]))

    # The first five rows lie below both column means, the last five above.
    X_expected = np.repeat([[0, 0], [1, 1]], 5, axis=0)
    X_transformed = mean_discrete.transform(X_test)
    assert_array_equal(X_transformed, X_expected)

In [44]:
test_meandiscrete()

[ 13.5  15.5]


In [100]:
from sklearn.pipeline import Pipeline

# Chain the mean-threshold binarisation and a seeded decision tree into one
# estimator, so that cross_val_score fits both steps together on each
# training split.
pipeline = Pipeline([('mean_discrete', MeanDiscrete()),
                     ('classifier', DecisionTreeClassifier(random_state=14))])
# cv=3 is spelled out: it was the implicit default of the old
# cross_val_score, while newer scikit-learn releases default to 5 folds.
scores_mean_discrete = cross_val_score(pipeline, X, y, scoring='accuracy', cv=3)

[[  5.40000000e+01   1.00000000e+01   0.00000000e+00   0.00000000e+00
    4.00000000e+01]
 [  2.50000000e+01   5.00000000e+00   0.00000000e+00   0.00000000e+00
    4.00000000e+01]
 [  2.10000000e+01   1.20000000e+01   0.00000000e+00   0.00000000e+00
    2.50000000e+01]
 ..., 
 [  5.80000000e+01   9.00000000e+00   0.00000000e+00   0.00000000e+00
    4.00000000e+01]
 [  2.20000000e+01   9.00000000e+00   0.00000000e+00   0.00000000e+00
    2.00000000e+01]
 [  5.20000000e+01   9.00000000e+00   1.50240000e+04   0.00000000e+00
    4.00000000e+01]] [   38.64988253    10.07716405  1076.19961303    87.22011333    40.38973603]
[[   39.    13.  2174.     0.    40.]
 [   50.    13.     0.     0.    13.]
 [   38.     9.     0.     0.    40.]
 ..., 
 [   38.     9.     0.  1977.    99.]
 [   60.    13.  7298.     0.    40.]
 [   27.    10.     0.     0.    40.]] [   38.64988253    10.07716405  1076.19961303    87.22011333    40.38973603]
[[  3.90000000e+01   1.30000000e+01   2.17400000e+03   0.00000

In [113]:
# Report the mean cross-validated accuracy of the MeanDiscrete pipeline.
mean_discrete_accuracy = scores_mean_discrete.mean()
print("Mean Discrete performance: {0:.2f}".format(mean_discrete_accuracy))

Mean Discrete performance: 0.80
