In [375]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr
from rpy2.robjects.conversion import localconverter
import numpy as np
from sklearn.linear_model import LogisticRegression
from statsmodels.discrete.discrete_model import Logit
from statsmodels.tools.tools import add_constant
from sklearn.linear_model import SGDClassifier
import random
from sklearn.metrics import accuracy_score
import statsmodels.api as sm
from sklearn.metrics import log_loss
from statsmodels.tools.tools import add_constant

In [233]:
#Load Data
features = [i.replace(' ', '_') for i in load_breast_cancer().feature_names.tolist()]

breast_cancer_df = pd.DataFrame(load_breast_cancer().data,columns=features)
target_df = pd.DataFrame(load_breast_cancer().target, columns=['y'])

df = pd.concat([target_df,breast_cancer_df],axis=1)

In [234]:
#Create small data set and rare event data set
num_successes_for_rare = int(((((1-df.y.mean())*df.shape[0])/.95)-((1-df.y.mean())*df.shape[0]))//1)
rare_inds = sorted(list(df[df.y==0].index) + random.sample(list(df[df.y==1].index),num_successes_for_rare))
small_inds = random.sample(sorted(list(df.index)),50)
rare_df = df.iloc[rare_inds,:]
small_df = df.iloc[small_inds,:]

In [289]:
def DAP(df,y_var_name):
    '''Perform log-f(1,1) data augmentation
       Returns augmented df and observation weights'''
    
    num_rows = 2*(df.shape[1]-1)
    y_ind = df.columns.get_loc(y_var_name)
    
    aug = pd.DataFrame(0,columns=df.columns,index=(range(num_rows)))
    
    #augment y variable
    aug.iloc[range(0,num_rows,2),y_ind]=1
    y = aug[y_var_name]
    
    #augment X variables
    X = aug.drop(y_var_name,axis=1)
    for ind, rows in enumerate(range(0,X.shape[0],2)):
         X.iloc[rows:rows+2,ind]=1
    
    #bring it all together
    aug = pd.concat([y,X],axis=1)
    f_df = df.append(aug)
    
    #add offset
    f_df['real_data']=1
    f_df['real_data'][-aug.shape[0]:]=0
    
    #reseparate
    X = f_df.drop(y_var_name,axis=1)
    y = f_df[y_var_name]
    #Calculate weights
    weights = f_df['real_data'].apply(lambda x: 0.5 if x == 0 else 1)
    return X, y, weights      

In [290]:
all_X,all_y, all_weights = DAP(df,'y')
small_X,small_y, small_weights = DAP(small_df,'y')
rare_X,rare_y, rare_weights = DAP(rare_df,'y')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [333]:
#Convert data to R
with localconverter(ro.default_converter + pandas2ri.converter):
    #for firth
    firth_all_r = ro.conversion.py2ri(df)
    firth_small_r = ro.conversion.py2ri(small_df)
    firth_rare_r = ro.conversion.py2ri(rare_df)
    
    #for logf
    logf_all_X_r = ro.conversion.py2ri(all_X)
    logf_all_y_r = ro.vectors.IntVector(all_y)
    all_weights_r = ro.vectors.FloatVector(all_weights)
    logf_small_X_r = ro.conversion.py2ri(small_X)
    logf_small_y_r = ro.vectors.IntVector(small_y)
    small_weights_r = ro.vectors.FloatVector(small_weights)
    logf_rare_X_r = ro.conversion.py2ri(rare_X)
    logf_rare_y_r = ro.vectors.IntVector(rare_y)
    rare_weights_r = ro.vectors.FloatVector(rare_weights)

In [93]:
base = importr('base')
d = {'package.dependencies': 'package_dot_dependencies',
     'package_dependencies': 'package_uscore_dependencies'}
brglm = importr('brglm',robject_translations=d)

In [94]:
firth_formula = 'y ~ ' + " + ".join(features)
logf_formula = 'y ~ ' +' + '.join(list(small_aug.columns)[1:])

In [249]:
model = brglm.brglm('y ~ .', data = logf_small_r, family='binomial',pl=True)
small_summary = base.summary(model)
small_summary_dic = {}
for i in range(len(small_summary.names)):
    try:
        small_summary_dic[small_summary.names[i]]=pandas2ri.converter.ri2py(list(small_summary)[i])
    except:
        pass

small_coefs = pd.DataFrame(small_summary_dic['coefficients'],columns=(['Coef','SE','Z','P']),index=all_aug.columns)
# small_preds_r = ro.r.predict(model,firth_all_r)

In [348]:
def get_r_firth_results(df, test_df, formula):
    model = brglm.brglm(formula, data = df, family='binomial',pl=True)
    summary = base.summary(model)
    summary_dic = {}
    for i in range(len(summary.names)):
        try:
            summary_dic[summary.names[i]]=pandas2ri.converter.ri2py(list(summary)[i])
        except:
            pass
    columns = list(df.colnames)
    columns[0]='Intercept'
    coefs = pd.DataFrame(summary_dic['coefficients'],columns=(['Coef','SE','Z','P']),index=columns)
    preds = ro.r.predict(model,firth_all_r)
    preds = list(preds)             
    return preds, coefs


def logf(X,y,len_orig_data):
    X = X.append(X.iloc[:len_orig_data,:])
    y = y.append(y.iloc[:len_orig_data])
    model = sm.Logit(y,X).fit()
    return model

def get_r_logf_results(X, y, test_X, weights):
    columns = X.colnames
    model = brglm.brglm_fit(X,y,intercept=False,weights=weights)
#     summary = base.summary(test)
#     print(summary)
    summary_dic = {}
    for i in range(len(model.names)):
        try:
            summary_dic[model.names[i]]=pandas2ri.converter.ri2py(list(model)[i])
        except:
            pass
    print(summary_dic['coefficients'])
    coefs = pd.DataFrame(summary_dic['coefficients'],index=columns,columns=(['Coef']))
    preds = ro.r.predict(model,test_X)
    preds = list(preds)
    return preds, coefs

In [None]:
def Sigmoid_Pred(X, weights):
    z = np.dot(X,weights)
    sig =  (1 + np.exp(-1*z))**-1
    sig = np.clip(sig,.000001,.999999)
    return sig

In [392]:
#Small Comparison
control = sm.Logit(small_y,add_constant(small_X)).fit()
logf_model = logf(small_X,small_y,df.shape[0])
firth_small_pred, firth_small_coef = get_r_firth_results(firth_small_r,firth_all_r,formula)
firth = Sigmoid_Pred(add_constant(breast_cancer_df),firth_small_coef.Coef)

print(accuracy_score(all_y, control.predict(add_constant(all_X)).round()))
print(accuracy_score(all_y,logf_model.predict(all_X).round()))
print(accuracy_score(target_df,firth.round()))


print(log_loss(all_y, control.predict(add_constant(all_X))))
print(log_loss(all_y,logf_model.predict(all_X)))
print(log_loss(target_df,firth))

Optimization terminated successfully.
         Current function value: 0.381690
         Iterations 16
Optimization terminated successfully.
         Current function value: 0.382146
         Iterations 20
0.8918918918918919
0.8871224165341812
0.8506151142355008
0.6793652151854367
1.003324088513801
0.42975553593490895


  return ptp(axis=axis, out=out, **kwargs)


In [393]:
#rare Comparison
control = sm.Logit(rare_y,add_constant(rare_X)).fit()
logf_model = logf(rare_X,rare_y,df.shape[0])
firth_rare_pred, firth_rare_coef = get_r_firth_results(firth_rare_r,firth_all_r,formula)
firth = Sigmoid_Pred(add_constant(breast_cancer_df),firth_rare_coef.Coef)

print(accuracy_score(all_y, control.predict(add_constant(all_X)).round()))
print(accuracy_score(all_y,logf_model.predict(all_X).round()))
print(accuracy_score(target_df,firth.round()))


print(log_loss(all_y, control.predict(add_constant(all_X))))
print(log_loss(all_y,logf_model.predict(all_X)))
print(log_loss(target_df,firth))

Optimization terminated successfully.
         Current function value: 0.158914
         Iterations 16
Optimization terminated successfully.
         Current function value: 0.159815
         Iterations 15
0.821939586645469
0.8171701112877583
0.804920913884007
0.8695867064765834
0.7995288142219448
0.5576147241742745


  return ptp(axis=axis, out=out, **kwargs)


In [266]:
# def get_r_logf_results(df, test_df, formula, weights):
#     model = brglm.brglm(formula, data = df, family='binomial', weights=weights)
#     summary = base.summary(model)
#     summary_dic = {}
#     for i in range(len(summary.names)):
#         try:
#             summary_dic[summary.names[i]]=pandas2ri.converter.ri2py(list(summary)[i])
#         except:
#             pass
#     summary_dic['coefficients']
#     coefs = pd.DataFrame(summary_dic['coefficients'],index=df.colnames,columns=(['Coef','SE','Z','P']))
#     preds = ro.r.predict(model,firth_all_r)                  
#     return preds, coefs