In [2]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split

from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import confusion_matrix



In [3]:
df = pd.read_csv('classification_challenge_training.csv') # read in the census-derived data; mind the separator
df.head()

Unnamed: 0.1,Unnamed: 0,age,type_employer,education,education_num,marital,occupation,relationship,race,sex,capital_gain,capital_loss,hr_per_week,country,income
0,4,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
1,14656,32,Private,HS-grad,9,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,40,United-States,1
2,26677,29,Private,10th,6,Married-spouse-absent,Adm-clerical,Unmarried,White,Female,0,0,40,Mexico,0
3,4281,30,Private,Assoc-voc,11,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0,0,40,Mexico,0
4,15668,66,Private,HS-grad,9,Widowed,Priv-house-serv,Not-in-family,White,Female,0,0,8,United-States,0


In [10]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,age,type_employer,education,education_num,marital,occupation,relationship,race,sex,capital_gain,capital_loss,hr_per_week,country,income
0,4,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0


In [5]:
df['marital'].unique()

array(['Married-civ-spouse', 'Married-spouse-absent', 'Widowed',
       'Never-married', 'Divorced', 'Separated', 'Married-AF-spouse'], dtype=object)

In [9]:
df['relationship'].unique()

array(['Husband', 'Wife', 'Unmarried', 'Not-in-family', 'Own-child',
       'Other-relative'], dtype=object)

In [15]:
print list(df.columns)

['Unnamed: 0', 'age', 'type_employer', 'education', 'education_num', 'marital', 'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hr_per_week', 'country', 'income']


In [19]:
def get_features(dataframe):
   features = patsy.dmatrix('~ C(type_employer) + C(marital) +\
   C(occupation) + C(relationship) + C(race) + C(sex) + C(country)', dataframe)
   X_df = pd.DataFrame(features, columns=features.design_info.column_names)
   X_df.drop('Intercept', axis=1, inplace=True)
   X_df = pd.concat([X_df, dataframe[['age', 'education_num', 'capital_gain', 'capital_loss', 'hr_per_week']]], axis=1)
   return X_df

In [22]:
import patsy

df1 = get_features(df)

In [23]:
df1.dtypes

C(type_employer)[T.Federal-gov]             float64
C(type_employer)[T.Local-gov]               float64
C(type_employer)[T.Never-worked]            float64
C(type_employer)[T.Private]                 float64
C(type_employer)[T.Self-emp-inc]            float64
C(type_employer)[T.Self-emp-not-inc]        float64
C(type_employer)[T.State-gov]               float64
C(type_employer)[T.Without-pay]             float64
C(marital)[T.Married-AF-spouse]             float64
C(marital)[T.Married-civ-spouse]            float64
C(marital)[T.Married-spouse-absent]         float64
C(marital)[T.Never-married]                 float64
C(marital)[T.Separated]                     float64
C(marital)[T.Widowed]                       float64
C(occupation)[T.Adm-clerical]               float64
C(occupation)[T.Armed-Forces]               float64
C(occupation)[T.Craft-repair]               float64
C(occupation)[T.Exec-managerial]            float64
C(occupation)[T.Farming-fishing]            float64
C(occupation

In [4]:
# Check values and their counts
pd.set_option('display.max_rows', 500)
df.apply(lambda c: c.value_counts()).T.stack()

Unnamed: 0    1                                 1.0
              2                                 1.0
              4                                 1.0
              5                                 1.0
              6                                 1.0
              7                                 1.0
              8                                 1.0
              9                                 1.0
              10                                1.0
              12                                1.0
              14                                1.0
              15                                1.0
              17                                1.0
              18                                1.0
              19                                1.0
              20                                1.0
              21                                1.0
              22                                1.0
              23                                1.0
            

In [145]:
"""Clean/munge your data."""

X_train, X_test, Y_train, Y_test = train_test_split(df, target) ## create train-test out of the data given

In [158]:
""" Fit a binary classification predictor."""

y_pred = ?.predict(X_test)
y_score = ?.decision_function(X_test) # Submit these response, the output of model.decision_function

LogisticRegressionCV(Cs=10, class_weight=None, cv=5, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [12]:
""" Check your performance so far."""

conmat = np.array(confusion_matrix(Y_test, y_pred, labels=[1,0]))
confusion = pd.DataFrame(conmat, index=['over_50k', 'under_50k'],
                            columns=['predicted_over50k','predicted_under50k'])

print(confusion)
print classification_report(Y_test,y_pred)
roc_auc_score(Y_test, y_score)

""" Plot AUC"""

FPR = dict()
TPR = dict()
ROC_AUC = dict()

# For class 1, find the area under the curve
FPR[1], TPR[1], _ = roc_curve(Y_test, y_score)
ROC_AUC[1] = auc(FPR[1], TPR[1])

# Plot of a ROC curve for class 1 (has_cancer)
plt.figure(figsize=[11,9])
plt.plot(FPR[1], TPR[1], label='ROC curve (area = %0.2f)' % ROC_AUC[1], linewidth=4)
plt.plot([0, 1], [0, 1], 'k--', linewidth=4)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=18)
plt.ylabel('True Positive Rate', fontsize=18)
plt.title('Receiver operating characteristic for high/low income', fontsize=18)
plt.legend(loc="lower right")
plt.show()

NameError: name 'Y_test' is not defined