In [3]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr
from rpy2.robjects.conversion import localconverter
import numpy as np
from sklearn.linear_model import LogisticRegression
from statsmodels.discrete.discrete_model import Logit
from statsmodels.tools.tools import add_constant
from sklearn.linear_model import SGDClassifier
import random
from sklearn.metrics import accuracy_score
import statsmodels.api as sm
from sklearn.metrics import log_loss
from statsmodels.tools.tools import add_constant
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from statsmodels.genmod.generalized_linear_model import GLM

  data_klasses = (pandas.Series, pandas.DataFrame, pandas.Panel)


In [583]:
#Load Data
# Fetch the dataset a single time and reuse the result; the original cell
# called load_breast_cancer() three separate times.
cancer = load_breast_cancer()

# Feature names contain spaces; swap them for underscores so the column names
# can be joined into a regression formula string (see formula_from_df).
features = [name.replace(' ', '_') for name in cancer.feature_names.tolist()]

breast_cancer_df = pd.DataFrame(cancer.data, columns=features)
target_df = pd.DataFrame(cancer.target, columns=['y'])
X = breast_cancer_df
y = target_df

# One frame with the target 'y' in the first column followed by every feature.
df = pd.concat([target_df, breast_cancer_df], axis=1)

# 80/20 split; random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=0)

In [750]:
# Assemble the four datasets used below: full, rare-event, perfectly
# separated and small-sample. Each keeps only the first six columns of its
# source frame and is then split into a feature frame (X) and a target (y).
# NOTE(review): `rare_inds` is not assigned in any visible cell, and the two
# CSV files must already exist in the working directory - confirm both before
# relying on Restart & Run All.

all_df = df.iloc[:, :6]
all_X, all_y = all_df.drop(columns='y'), all_df['y']

rare_df = df.iloc[rare_inds, :6]
rare_X, rare_y = rare_df.drop(columns='y'), rare_df['y']

separation_df = pd.read_csv('separation_df.csv', index_col=0).iloc[:, :6]
separation_X, separation_y = separation_df.drop(columns='y'), separation_df['y']

small_df = pd.read_csv('small_df.csv', index_col=0).iloc[:, :6]
small_X, small_y = small_df.drop(columns='y'), small_df['y']


In [895]:
#Helper functions
def formula_from_df(df,y_var_name):
    """Build a formula string of the form 'y~x1 + x2 + ...'.

    Every column of `df` other than `y_var_name` becomes a right-hand-side
    term, in column order. Raises KeyError (from DataFrame.drop) when
    `y_var_name` is not a column of `df`.
    """
    predictors = df.drop(columns=y_var_name).columns
    right_hand_side = ' + '.join(list(predictors))
    return '~'.join([y_var_name, right_hand_side])

def Sigmoid_Pred(X, weights):
    """Return logistic-sigmoid values of the linear predictor np.dot(X, weights).

    The result is clipped to the interval [1e-6, 1 - 1e-6], so it never
    reaches exactly 0 or 1.
    """
    linear_predictor = np.dot(X, weights)
    probabilities = (1 + np.exp(-1 * linear_predictor)) ** -1
    return np.clip(probabilities, .000001, .999999)

In [None]:
#import R package brglm
# Handle on R's `base` package; the functions below call base.summary() on fitted models.
base = importr('base')
# Explicit R-name -> Python-name translations handed to importr for brglm.
# NOTE(review): presumably required because both R names would otherwise map
# to the same Python identifier once '.' is translated to '_' - confirm
# against the installed brglm / rpy2 versions.
d = {'package.dependencies': 'package_dot_dependencies',
     'package_dependencies': 'package_uscore_dependencies'}
brglm = importr('brglm',robject_translations=d)

In [909]:
def firth_logit(df,y_var_name,  r_df=True, weights='none',all_coef_summary=False):
    """Fit a penalised-likelihood (Firth-type) logistic regression with R's brglm.

    Parameters
    ----------
    df : R data.frame or pandas.DataFrame
        Data holding the response and the predictors. Every column other than
        `y_var_name` enters the model as a predictor.
    y_var_name : str
        Name of the response column.
    r_df : bool, default True
        True when `df` is already an R data.frame; False when `df` is a pandas
        DataFrame that has to be converted first.
    weights : R numeric vector, 'none' or None, default 'none'
        Observation weights forwarded to brglm. 'none' / None fits the model
        without a `weights` argument.
    all_coef_summary : bool, default False
        True returns the whole coefficient table (Coef, SE, Z, P); False
        returns only the Coef column.

    Returns
    -------
    preds : numpy.ndarray
        Sigmoid of R's predict() output for the rows of `df`, clipped by
        Sigmoid_Pred.
    coefs : pandas.DataFrame or pandas.Series
        Coefficients indexed by 'Intercept' followed by the predictor names.
    """
    # Read the column names before converting: pandas and R data frames
    # expose them through different attributes.
    column_names = list(df.colnames) if r_df else list(df.columns)
    features = [name for name in column_names if name != y_var_name]
    # Reuse the shared helper on an empty frame so both input kinds produce
    # exactly the same formula string (fixes the `forumla` typo, which left
    # `formula` undefined, and the call on an R object without `.drop`).
    formula = formula_from_df(pd.DataFrame(columns=column_names), y_var_name)

    #convert data frame to R df
    if not r_df:
        with localconverter(ro.default_converter + pandas2ri.converter):
            df = ro.conversion.py2ri(df)

    #create firth logit model
    # Only forward `weights` when real weights were supplied; the isinstance
    # guard avoids comparing a vector with a string.
    fit_kwargs = dict(data=df, family='binomial', pl=True)
    unweighted = weights is None or (isinstance(weights, str) and weights == 'none')
    if not unweighted:
        fit_kwargs['weights'] = weights
    model = brglm.brglm(formula, **fit_kwargs)

    #extract coefficients
    # Pull just the coefficient matrix out of the summary instead of
    # converting every element under a bare `except: pass`.
    summary = base.summary(model)
    coef_matrix = pandas2ri.converter.ri2py(summary.rx2('coefficients'))
    # Label rows from the formula terms so the labels stay correct even when
    # the response is not the first column of `df`.
    coef_table = pd.DataFrame(coef_matrix,
                              columns=['Coef','SE','Z','P'],
                              index=['Intercept'] + features)
    coefs = coef_table if all_coef_summary==True else coef_table.Coef

    #get raw output and apply sigmoid
    # predict() output is already a linear predictor, so a weight of 1.0
    # makes Sigmoid_Pred apply only the sigmoid and the clipping.
    raw_preds = np.asarray(list(ro.r.predict(model,df)), dtype=float)
    preds = Sigmoid_Pred(raw_preds, 1.0)
    return preds, coefs

In [None]:
def FLIC(df, formula):
    """Firth-type logistic regression with intercept correction (FLIC).

    The slopes come from a brglm penalised-likelihood fit. The intercept is
    then re-estimated by an intercept-only binomial GLM that uses the slope
    part of the linear predictor as an offset.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding the response and every predictor named in `formula`.
    formula : str
        Additive formula of the form 'y~x1 + x2 + ...' (the format produced by
        formula_from_df). Terms are recovered by splitting on '+', so
        interactions or transformed terms are not supported.

    Returns
    -------
    preds : numpy.ndarray
        Fitted probabilities for the rows of `df` (clipped by Sigmoid_Pred),
        matching what the other model helpers in this notebook return.
    coefs : pandas.Series
        Slope coefficients followed by the corrected 'intercept'.
    """
    y_var_name, right_hand_side = [part.strip() for part in formula.split('~', 1)]
    features = [term.strip() for term in right_hand_side.split('+')]

    with localconverter(ro.default_converter + pandas2ri.converter):
        df_r = ro.conversion.py2ri(df)
    model = brglm.brglm(formula, data = df_r, family='binomial',pl=True)

    # Extract only the coefficient matrix; rows are labelled from the formula
    # terms rather than from the unrelated global `firth_small_r`.
    summary = base.summary(model)
    coef_matrix = pandas2ri.converter.ri2py(summary.rx2('coefficients'))
    coef_table = pd.DataFrame(coef_matrix,
                              columns=['Coef','SE','Z','P'],
                              index=['intercept'] + features)
    slopes = coef_table.Coef[1:]

    y = df[y_var_name].values
    # No constant column here: the slopes exclude the intercept, so the design
    # matrix must have exactly one column per slope.
    X = df[features].values
    eta = np.dot(X, slopes.values)

    # Re-estimate the intercept on the log-odds scale, holding the slopes
    # fixed through the offset. The earlier OLS on (y - eta) subtracted
    # log-odds from 0/1 outcomes.
    b0_model = sm.GLM(y, np.ones((y.shape[0], 1)),
                      family=sm.families.Binomial(), offset=eta).fit()
    b0 = float(np.asarray(b0_model.params)[0])

    coefs = slopes.copy()
    coefs['intercept'] = b0
    # eta + b0 is the full linear predictor; a weight of 1.0 makes
    # Sigmoid_Pred apply only the sigmoid and the clipping.
    preds = Sigmoid_Pred(eta + b0, 1.0)
    return preds,coefs

In [904]:
def FLAC(df,y_var_name):
    """Fit a FLAC-style model (Firth-type fit on hat-weighted augmented data).

    Steps performed here:
      1. fit an ordinary logistic regression and take the diagonal h of its
         hat matrix W^0.5 X (X'WX)^-1 X' W^0.5;
      2. stack the original rows (weight 1), a copy of them (weight h/2) and
         a copy with the response flipped to 1 - y (weight h/2);
      3. add the indicator column 'real_data', which is 0 for the original
         rows and 1 for both augmentation copies;
      4. fit the weighted model through firth_logit.

    NOTE(review): the published FLAC procedure is usually described as taking
    h from the Firth fit and fitting the augmented data by plain maximum
    likelihood - confirm that the ML-hat / brglm combination used here is
    intended.

    `df` is not modified. Returns (preds, coefs): `preds` are probabilities
    for the original rows, and `coefs` is the full coefficient table from
    firth_logit, including the 'real_data' row.
    """
    init_rows = df.shape[0]
    X = add_constant(df.drop(y_var_name,axis=1))
    y = df[y_var_name]
    X_mat = np.asarray(X, dtype=float)

    #Build Hat Matrix diagonal = diag((W**0.5)*X*((XtWX)^-1)*Xt*W**0.5)
    model = sm.Logit(y,X).fit()
    y_pred = np.asarray(model.predict(X))
    w = y_pred*(1-y_pred)
    invXtWX = np.linalg.inv(np.dot(X_mat.T, w[:, None]*X_mat))
    # h_i = w_i * x_i' (X'WX)^-1 x_i, computed row-wise so the n x n hat
    # matrix is never materialised.
    hat = w*np.einsum('ij,jk,ik->i', X_mat, invXtWX, X_mat)

    #Copy of the data with y changed to 1-y
    # An explicit copy is required: the previous alias (`pseudo_y_df = df`)
    # flipped the response in the caller's frame.
    pseudo_y_df = df.copy()
    pseudo_y_df[y_var_name] = 1-pseudo_y_df[y_var_name]

    #Original rows, duplicated rows, pseudo rows
    aug_df = pd.concat([df, df, pseudo_y_df], ignore_index=True)

    #Create dummy for real vs. duplicated/pseudo data (0 = original rows)
    aug_df['real_data'] = np.concatenate([np.zeros(init_rows, dtype=int),
                                          np.ones(2*init_rows, dtype=int)])

    #Create vector of weights = 1 for real data, hi/2 for augmentation data
    aug_sample_weights = np.concatenate([np.ones(init_rows),hat/2,hat/2])

    #convert to R
    with localconverter(ro.default_converter + pandas2ri.converter):
        aug_df_r = ro.conversion.py2ri(aug_df)
    aug_sample_weights_r = ro.vectors.FloatVector(aug_sample_weights)

    #Get coefficients from the weighted fit on the augmented data
    _, coefs = firth_logit(aug_df_r, y_var_name, r_df=True,
                           weights=aug_sample_weights_r, all_coef_summary=True)

    # 'real_data' is the last column of aug_df, so its coefficient is last.
    # Drop it to score the original rows, for which the indicator is 0.
    preds = Sigmoid_Pred(X_mat,coefs.Coef.values[:-1])
    return preds, coefs
    

In [911]:
def logF11(df,y_var_name,intercept=False):
    '''Fit a logistic regression penalised by log-F(1,1) data augmentation.

       For every predictor two pseudo-rows are appended (one with y=1, one
       with y=0) in which that predictor is 1 and all other predictors are 0.
       Pseudo-rows get weight 0.5 and observed rows weight 1. The column
       'real_data' is 1 for observed rows and 0 for pseudo-rows; no separate
       constant is added, so it acts as the model's intercept column.

       Parameters
       ----------
       df : pandas.DataFrame with the response and the predictors
       y_var_name : name of the response column
       intercept : if True, add an extra 'intercept' coefficient equal to the
           mean of (y - linear predictor) over the augmented rows.
           NOTE(review): that residual mixes 0/1 outcomes with log-odds;
           confirm this correction is what is wanted.

       Returns
       -------
       preds : numpy.ndarray of probabilities for EVERY row of the augmented
           data (observed rows first, then the 2*p pseudo-rows)
       coefs : pandas.Series of coefficients (predictors, 'real_data' and,
           if requested, 'intercept')'''

    feature_cols = list(df.drop(y_var_name,axis=1).columns)
    n_features = len(feature_cols)

    #augment X variables: rows 2j and 2j+1 have predictor j set to 1
    aug = pd.DataFrame(np.repeat(np.eye(n_features, dtype=int), 2, axis=0),
                       columns=feature_cols)
    #augment y variable: alternate 1, 0 within each pair
    aug[y_var_name] = np.tile([1, 0], n_features)

    #bring it all together (pd.concat replaces the removed DataFrame.append)
    f_df = pd.concat([df, aug[list(df.columns)]], ignore_index=True)

    #flag observed (1) vs pseudo (0) rows without chained assignment
    f_df['real_data'] = np.concatenate([np.ones(df.shape[0], dtype=int),
                                        np.zeros(aug.shape[0], dtype=int)])

    #reseparate
    X = f_df.drop(y_var_name,axis=1)
    y = f_df[y_var_name]

    #Calculate weights
    weights = np.where(f_df['real_data'].values == 0, 0.5, 1.0)
    # sm.Logit takes no observation weights, so the half-weights were never
    # applied; a binomial GLM with freq_weights fits the same model with them.
    model = sm.GLM(y, X, family=sm.families.Binomial(), freq_weights=weights).fit()
    coefs = model.params.copy()

    design = X.values
    if intercept==True:
        eta = np.dot(X,coefs)
        # Same value the earlier OLS-on-a-constant fit of (y - eta) produced.
        b0 = float(np.mean(y-eta))
        coefs['intercept']=b0
        # Add a matching column of ones so the design and coefficient
        # lengths agree.
        design = np.column_stack([design, np.ones(design.shape[0])])
    preds = Sigmoid_Pred(design,coefs)
    return preds, coefs

Intermediate versions that may be worth keeping 

In [790]:
#log-F(1,1) just augmentation
def logF11_aug(df,y_var_name,R=False):
    '''Perform log-f(1,1) data augmentation.

       For every predictor two pseudo-rows are appended (one with y=1, one
       with y=0) in which that predictor is 1 and all other predictors are 0.
       A 'real_data' column marks observed rows with 1 and pseudo-rows with 0.

       Parameters
       ----------
       df : pandas.DataFrame with the response and the predictors
       y_var_name : name of the response column
       R : if True, convert both results to R objects through rpy2

       Returns
       -------
       f_df : augmented data (columns of `df` plus 'real_data')
       weights : observation weights, 1 for observed rows and 0.5 for
           pseudo-rows (pandas.Series, or an R FloatVector when R=True)'''

    feature_cols = list(df.drop(y_var_name,axis=1).columns)
    n_features = len(feature_cols)

    #augment X variables: rows 2j and 2j+1 have predictor j set to 1
    aug = pd.DataFrame(np.repeat(np.eye(n_features, dtype=int), 2, axis=0),
                       columns=feature_cols)
    #augment y variable: alternate 1, 0 within each pair
    aug[y_var_name] = np.tile([1, 0], n_features)
    aug = aug[list(df.columns)]

    #bring it all together (pd.concat replaces the removed DataFrame.append)
    f_df = pd.concat([df, aug])

    #flag observed (1) vs pseudo (0) rows; built in one step instead of the
    #chained assignment that raised SettingWithCopyWarning
    f_df['real_data'] = np.concatenate([np.ones(df.shape[0], dtype=int),
                                        np.zeros(aug.shape[0], dtype=int)])

    #Calculate weights
    weights = pd.Series(np.where(f_df['real_data'].values == 0, 0.5, 1.0),
                        index=f_df.index, name='real_data')
    if R==True:
        with localconverter(ro.default_converter + pandas2ri.converter):
            f_df = ro.conversion.py2ri(f_df)
            weights = ro.vectors.FloatVector(weights)
    return f_df, weights

In [787]:
#FLAC just augmentation
def FLAC_aug(df,y_var_name,R=False):
    '''Perform FLAC data augmentation.

       The data are stacked three times: the original rows (weight 1), a copy
       (weight h/2) and a copy with the response flipped to 1 - y (weight
       h/2), where h is the hat-matrix diagonal of a binomial GLM fitted to
       `df`. A 'pseudo_data' column is 0 for the original rows and 1 for both
       copies. `df` itself is not modified.

       Parameters
       ----------
       df : pandas.DataFrame with the response and the predictors
       y_var_name : name of the response column
       R : if True, convert both results to R objects through rpy2

       Returns
       -------
       aug_df : augmented data (columns of `df` plus 'pseudo_data')
       aug_sample_weights : observation weights (pandas.Series, or an R
           FloatVector when R=True)'''

    init_rows = df.shape[0]
    X = add_constant(df.drop(y_var_name,axis=1))
    y = df[y_var_name]
    X_mat = np.asarray(X, dtype=float)

    glm = GLM(y,X,family=sm.families.Binomial()).fit()
    # Use the design matrix built above; the earlier code referenced an
    # undefined `test_X`.
    y_pred = np.asarray(glm.predict(X))
    w = y_pred*(1-y_pred)
    info_inv = np.linalg.inv(np.dot(X_mat.T, w[:, None]*X_mat))
    # h_i = w_i * x_i' (X'WX)^-1 x_i, i.e. the diagonal of
    # W^0.5 X (X'WX)^-1 X' W^0.5 without building the n x n matrix.
    hat = w*np.einsum('ij,jk,ik->i', X_mat, info_inv, X_mat)

    # Flip the response on a copy, then stack; this avoids chained assignment
    # on a frame whose index holds duplicate labels.
    flipped_df = df.copy()
    flipped_df[y_var_name] = 1-flipped_df[y_var_name]
    aug_df = pd.concat([df,df,flipped_df])
    aug_df['pseudo_data'] = np.concatenate([np.zeros(init_rows, dtype=int),
                                            np.ones(2*init_rows, dtype=int)])

    aug_sample_weights = pd.Series(np.concatenate([np.ones(init_rows),hat/2,hat/2]))
    if R==True:
        with localconverter(ro.default_converter + pandas2ri.converter):
            aug_df = ro.conversion.py2ri(aug_df)
            aug_sample_weights = ro.vectors.FloatVector(aug_sample_weights)
    return aug_df, aug_sample_weights
    
    #Now run this through brglm

In [785]:
#created augmented datasets
# The helper is named logF11_aug (capital F); the earlier calls to
# `logf11_aug` referred to a name that no visible cell defines.

#log-f(1,1)
# all_f, all_weights_f = logF11_aug(all_df,'y')
small_f, small_weights_f = logF11_aug(small_df,'y')
rare_f, rare_weights_f = logF11_aug(rare_df,'y')
separation_f, separation_weights_f = logF11_aug(separation_df,'y')

#FLAC
# all_FLAC, all_weights_FLAC = FLAC_aug(all_df,'y')
small_FLAC, small_weights_FLAC = FLAC_aug(small_df,'y')
rare_FLAC, rare_weights_FLAC = FLAC_aug(rare_df,'y')
separation_FLAC, separation_weights_FLAC = FLAC_aug(separation_df,'y')

#created R augmented datasets
# R=True is required for these to be R objects; without it the *_r variables
# were plain pandas duplicates of the ones above.

#log-f(1,1)
# all_f_r, all_weights_f_r = logF11_aug(all_df,'y',R=True)
small_f_r, small_weights_f_r = logF11_aug(small_df,'y',R=True)
rare_f_r, rare_weights_f_r = logF11_aug(rare_df,'y',R=True)
separation_f_r, separation_weights_f_r = logF11_aug(separation_df,'y',R=True)

#FLAC
# all_FLAC_r, all_weights_FLAC_r = FLAC_aug(all_df,'y',R=True)
small_FLAC_r, small_weights_FLAC_r = FLAC_aug(small_df,'y',R=True)
rare_FLAC_r, rare_weights_FLAC_r = FLAC_aug(rare_df,'y',R=True)
separation_FLAC_r, separation_weights_FLAC_r = FLAC_aug(separation_df,'y',R=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return ptp(axis=axis, out=out, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  n_endog_mu = self._clean((1. - endog) / (1. - mu))


Testing Round One

In [343]:
#baseline scores
# Unpenalised logistic regression on the 80/20 split, used as the reference.
sklogit = LogisticRegression(penalty='none',solver='newton-cg')
# y_train is a single-column DataFrame; flatten it to 1-D so scikit-learn
# does not emit a DataConversionWarning (column_or_1d) on fit.
baseline = sklogit.fit(X_train,y_train.values.ravel())
baseline_preds = baseline.predict(X_test)
baseline_proba = baseline.predict_proba(X_test)

print(accuracy_score(y_test,baseline_preds))
print(log_loss(y_test,baseline_proba))
print(classification_report(y_test,baseline_preds))
print(confusion_matrix(y_test,baseline_preds))

0.956140350877193
0.11430574610890282
              precision    recall  f1-score   support

           0       0.96      0.94      0.95        47
           1       0.96      0.97      0.96        67

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114

[[44  3]
 [ 2 65]]


  y = column_or_1d(y, warn=True)


In [338]:
#rare Comparison
# Fits four models on the rare-event subset and prints accuracy, log-loss,
# classification report and confusion matrix for each, scored against X / y.
# NOTE(review): this cell does not run from a fresh kernel as written:
#   * get_r_firth_results, firth_rare_r, firth_all_r, firth_formula and
#     all_X_f are not defined in any visible cell;
#   * logF11 as defined in this notebook returns a (preds, coefs) tuple, so
#     logf_model.predict / .predict_proba below would fail on it;
#   * the models are fitted on rare_X (a column subset) but scored on X -
#     confirm which frame X referred to when the recorded output was produced.

sklogit = LogisticRegression(penalty='none',solver='newton-cg')
# NOTE(review): `baseline` and `control` come from the same estimator object,
# which is refitted on the next line - presumably `baseline` is not meant to
# be used after this point; verify.
baseline = sklogit.fit(X,y)
control = sklogit.fit(rare_X,rare_y)
# Default LogisticRegression() settings (L2-penalised, per the variable name).
l2 = LogisticRegression()
l2_model = l2.fit(rare_X,rare_y)
logf_model = logF11(rare_df,'y')
firth_rare_pred, firth_rare_coef = get_r_firth_results(firth_rare_r,firth_all_r,firth_formula)
# Firth probabilities for the full data set, computed from the coefficients.
firth = Sigmoid_Pred(add_constant(breast_cancer_df),firth_rare_coef.Coef)

logf_preds = logf_model.predict(all_X_f[all_X_f.real_data==1])
logf_proba = logf_model.predict_proba(all_X_f[all_X_f.real_data==1])

# Print order within every metric below: control, L2, log-F, Firth.
print(accuracy_score(y, control.predict(X)))
print(accuracy_score(y, l2_model.predict(X)))
print(accuracy_score(y,logf_preds))
print(accuracy_score(target_df,firth.round()))


print(log_loss(y, control.predict_proba(X)))
print(log_loss(y, l2_model.predict_proba(X)))
print(log_loss(y,logf_proba))
print(log_loss(target_df,firth))

print(classification_report(y, control.predict(X)))
print(classification_report(y, l2_model.predict(X)))
print(classification_report(y,logf_preds))
print(classification_report(target_df,firth.round()))


print(confusion_matrix(y, control.predict(X)))
print(confusion_matrix(y, l2_model.predict(X)))
print(confusion_matrix(y,logf_preds))
print(confusion_matrix(target_df,firth.round()))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0.9121265377855887
0.8523725834797891
0.8822495606326889
0.7978910369068541
1.386944549367799
0.5237417849244933
0.7192496484948084
0.5509427897257464
              precision    recall  f1-score   support

           0       0.81      1.00      0.89       212
           1       1.00      0.86      0.92       357

    accuracy                           0.91       569
   macro avg       0.90      0.93      0.91       569
weighted avg       0.93      0.91      0.91       569

              precision    recall  f1-score   support

           0       0.72      1.00      0.83       212
           1       1.00      0.77      0.87       357

    accuracy                           0.85       569
   macro avg       0.86      0.88      0.85       569
weighted avg       0.89      0.85      0.85       569

              precision    recall  f1-score   support

           0       0.76      1.00      0.86       212
           1       1.00      0.81      0.90       357

    accuracy                   

  return ptp(axis=axis, out=out, **kwargs)


In [337]:
#small Comparison
# Same comparison as the rare-event cell, with the models fitted on the
# small-sample subset and scored against X / y.
# NOTE(review): this cell does not run from a fresh kernel as written:
#   * get_r_firth_results, firth_small_r, firth_all_r, firth_formula and
#     all_X_f are not defined in any visible cell;
#   * logF11 as defined in this notebook returns a (preds, coefs) tuple, so
#     logf_model.predict / .predict_proba below would fail on it;
#   * the models are fitted on small_X (a column subset) but scored on X -
#     confirm which frame X referred to when the recorded output was produced.
sklogit = LogisticRegression(penalty='none',solver='newton-cg')
control = sklogit.fit(small_X,small_y)
# Default LogisticRegression() settings (L2-penalised, per the variable name).
l2 = LogisticRegression()
l2_model = l2.fit(small_X,small_y)
logf_model = logF11(small_df,'y')
firth_small_pred, firth_small_coef = get_r_firth_results(firth_small_r,firth_all_r,firth_formula)
# Firth probabilities for the full data set, computed from the coefficients.
firth = Sigmoid_Pred(add_constant(breast_cancer_df),firth_small_coef.Coef)

logf_preds = logf_model.predict(all_X_f[all_X_f.real_data==1])
logf_proba = logf_model.predict_proba(all_X_f[all_X_f.real_data==1])

# Print order within every metric below: control, L2, log-F, Firth.
print(accuracy_score(y, control.predict(X)))
print(accuracy_score(y, l2_model.predict(X)))
print(accuracy_score(y,logf_preds))
print(accuracy_score(target_df,firth.round()))


print(log_loss(y, control.predict_proba(X)))
print(log_loss(y, l2_model.predict_proba(X)))
print(log_loss(y,logf_proba))
print(log_loss(target_df,firth))

print(classification_report(y, control.predict(X)))
print(classification_report(y, l2_model.predict(X)))
print(classification_report(y,logf_preds))
print(classification_report(target_df,firth.round()))


print(confusion_matrix(y, control.predict(X)))
print(confusion_matrix(y, l2_model.predict(X)))
print(confusion_matrix(y,logf_preds))
print(confusion_matrix(target_df,firth.round()))

0.9033391915641477
0.9314586994727593
0.9525483304042179
0.8840070298769771
3.338568533003464
2.3673523264772225
0.12799869366056194
0.36682604194498686
              precision    recall  f1-score   support

           0       0.83      0.92      0.88       212
           1       0.95      0.89      0.92       357

    accuracy                           0.90       569
   macro avg       0.89      0.91      0.90       569
weighted avg       0.91      0.90      0.90       569

              precision    recall  f1-score   support

           0       0.89      0.93      0.91       212
           1       0.96      0.93      0.94       357

    accuracy                           0.93       569
   macro avg       0.92      0.93      0.93       569
weighted avg       0.93      0.93      0.93       569

              precision    recall  f1-score   support

           0       0.93      0.95      0.94       212
           1       0.97      0.96      0.96       357

    accuracy                 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return ptp(axis=axis, out=out, **kwargs)
