In [3]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr
from rpy2.robjects.conversion import localconverter
import numpy as np
from sklearn.linear_model import LogisticRegression
from statsmodels.discrete.discrete_model import Logit
from statsmodels.tools.tools import add_constant
from sklearn.linear_model import SGDClassifier
import random
from sklearn.metrics import accuracy_score
import statsmodels.api as sm
from sklearn.metrics import log_loss
from statsmodels.tools.tools import add_constant
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

  data_klasses = (pandas.Series, pandas.DataFrame, pandas.Panel)


In [26]:
#Load Data
features = [i.replace(' ', '_') for i in load_breast_cancer().feature_names.tolist()]

breast_cancer_df = pd.DataFrame(load_breast_cancer().data,columns=features)
target_df = pd.DataFrame(load_breast_cancer().target, columns=['y'])
X = breast_cancer_df
y = target_df

df = pd.concat([target_df,breast_cancer_df],axis=1)

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.2,random_state=0)

In [334]:
#Create small data set and rare event data set
num_successes_for_rare = int(((((1-df.y.mean())*df.shape[0])/.95)-((1-df.y.mean())*df.shape[0]))//1)
rare_inds = sorted(list(df[df.y==0].index) + random.sample(list(df[df.y==1].index),num_successes_for_rare))
small_inds = random.sample(sorted(list(df.index)),50)
rare_df = df.iloc[rare_inds,:]
rare_X = rare_df.drop('y',axis=1)
rare_y = rare_df['y']
small_df = df.iloc[small_inds,:]
small_X = small_df.drop('y',axis=1)
small_y = small_df['y']

In [6]:
def DAP(df,y_var_name):
    '''Perform log-f(1,1) data augmentation
       Returns augmented df and observation weights'''
    
    num_rows = 2*(df.shape[1]-1)
    y_ind = df.columns.get_loc(y_var_name)
    
    aug = pd.DataFrame(0,columns=df.columns,index=(range(num_rows)))
    
    #augment y variable
    aug.iloc[range(0,num_rows,2),y_ind]=1
    y = aug[y_var_name]
    
    #augment X variables
    X = aug.drop(y_var_name,axis=1)
    for ind, rows in enumerate(range(0,X.shape[0],2)):
         X.iloc[rows:rows+2,ind]=1
    
    #bring it all together
    aug = pd.concat([y,X],axis=1)
    f_df = df.append(aug)
    
    #add offset
    f_df['real_data']=1
    f_df['real_data'][-aug.shape[0]:]=0
    
    #Calculate weights
    weights = f_df['real_data'].apply(lambda x: 0.5 if x == 0 else 1)
    return f_df, weights      

In [155]:
all_X_f,all_y_f, all_weights_f = DAP(df,'y')
small_X_f,small_y_f, small_weights_f = DAP(small_df,'y')
rare_X_f,rare_y_f, rare_weights_f = DAP(rare_df,'y')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [8]:
#Convert data for firth to R
with localconverter(ro.default_converter + pandas2ri.converter):
    
    firth_all_r = ro.conversion.py2ri(df)
    firth_small_r = ro.conversion.py2ri(small_df)
    firth_rare_r = ro.conversion.py2ri(rare_df)

In [9]:
base = importr('base')
d = {'package.dependencies': 'package_dot_dependencies',
     'package_dependencies': 'package_uscore_dependencies'}
brglm = importr('brglm',robject_translations=d)

In [15]:
def get_r_firth_results(df,formula):
    model = brglm.brglm(formula, data = df, family='binomial',pl=True)
    summary = base.summary(model)
    summary_dic = {}
    for i in range(len(summary.names)):
        try:
            summary_dic[summary.names[i]]=pandas2ri.converter.ri2py(list(summary)[i])
        except:
            pass
    columns = list(df.colnames)
    columns[0]='Intercept'
    coefs = pd.DataFrame(summary_dic['coefficients'],columns=(['Coef','SE','Z','P']),index=columns)
    preds = ro.r.predict(model,firth_all_r)
    preds = list(preds)             
    return preds, coefs

In [16]:
def Sigmoid_Pred(X, weights):
    z = np.dot(X,weights)
    sig =  (1 + np.exp(-1*z))**-1
    sig = np.clip(sig,.000001,.999999)
    return sig

In [348]:
firth_formula = 'y ~ ' + " + ".join(features)
flic_formula = firth_formula + ' - 1'

In [343]:
#baseline scores
sklogit = LogisticRegression(penalty='none',solver='newton-cg')
baseline = sklogit.fit(X_train,y_train)
baseline_preds = baseline.predict(X_test)
baseline_proba = baseline.predict_proba(X_test)

print(accuracy_score(y_test,baseline_preds))
print(log_loss(y_test,baseline_proba))
print(classification_report(y_test,baseline_preds))
print(confusion_matrix(y_test,baseline_preds))

0.956140350877193
0.11430574610890282
              precision    recall  f1-score   support

           0       0.96      0.94      0.95        47
           1       0.96      0.97      0.96        67

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114

[[44  3]
 [ 2 65]]


  y = column_or_1d(y, warn=True)


In [338]:
#rare Comparison

sklogit = LogisticRegression(penalty='none',solver='newton-cg')
baseline = sklogit.fit(X,y)
control = sklogit.fit(rare_X,rare_y)
l2 = LogisticRegression()
l2_model = l2.fit(rare_X,rare_y)
logf_model = logF11(rare_df,'y')
firth_rare_pred, firth_rare_coef = get_r_firth_results(firth_rare_r,firth_all_r,firth_formula)
firth = Sigmoid_Pred(add_constant(breast_cancer_df),firth_rare_coef.Coef)

logf_preds = logf_model.predict(all_X_f[all_X_f.real_data==1])
logf_proba = logf_model.predict_proba(all_X_f[all_X_f.real_data==1])

print(accuracy_score(y, control.predict(X)))
print(accuracy_score(y, l2_model.predict(X)))
print(accuracy_score(y,logf_preds))
print(accuracy_score(target_df,firth.round()))


print(log_loss(y, control.predict_proba(X)))
print(log_loss(y, l2_model.predict_proba(X)))
print(log_loss(y,logf_proba))
print(log_loss(target_df,firth))

print(classification_report(y, control.predict(X)))
print(classification_report(y, l2_model.predict(X)))
print(classification_report(y,logf_preds))
print(classification_report(target_df,firth.round()))


print(confusion_matrix(y, control.predict(X)))
print(confusion_matrix(y, l2_model.predict(X)))
print(confusion_matrix(y,logf_preds))
print(confusion_matrix(target_df,firth.round()))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0.9121265377855887
0.8523725834797891
0.8822495606326889
0.7978910369068541
1.386944549367799
0.5237417849244933
0.7192496484948084
0.5509427897257464
              precision    recall  f1-score   support

           0       0.81      1.00      0.89       212
           1       1.00      0.86      0.92       357

    accuracy                           0.91       569
   macro avg       0.90      0.93      0.91       569
weighted avg       0.93      0.91      0.91       569

              precision    recall  f1-score   support

           0       0.72      1.00      0.83       212
           1       1.00      0.77      0.87       357

    accuracy                           0.85       569
   macro avg       0.86      0.88      0.85       569
weighted avg       0.89      0.85      0.85       569

              precision    recall  f1-score   support

           0       0.76      1.00      0.86       212
           1       1.00      0.81      0.90       357

    accuracy                   

  return ptp(axis=axis, out=out, **kwargs)


In [337]:
#small Comparison
sklogit = LogisticRegression(penalty='none',solver='newton-cg')
control = sklogit.fit(small_X,small_y)
l2 = LogisticRegression()
l2_model = l2.fit(small_X,small_y)
logf_model = logF11(small_df,'y')
firth_small_pred, firth_small_coef = get_r_firth_results(firth_small_r,firth_all_r,firth_formula)
firth = Sigmoid_Pred(add_constant(breast_cancer_df),firth_small_coef.Coef)

logf_preds = logf_model.predict(all_X_f[all_X_f.real_data==1])
logf_proba = logf_model.predict_proba(all_X_f[all_X_f.real_data==1])

print(accuracy_score(y, control.predict(X)))
print(accuracy_score(y, l2_model.predict(X)))
print(accuracy_score(y,logf_preds))
print(accuracy_score(target_df,firth.round()))


print(log_loss(y, control.predict_proba(X)))
print(log_loss(y, l2_model.predict_proba(X)))
print(log_loss(y,logf_proba))
print(log_loss(target_df,firth))

print(classification_report(y, control.predict(X)))
print(classification_report(y, l2_model.predict(X)))
print(classification_report(y,logf_preds))
print(classification_report(target_df,firth.round()))


print(confusion_matrix(y, control.predict(X)))
print(confusion_matrix(y, l2_model.predict(X)))
print(confusion_matrix(y,logf_preds))
print(confusion_matrix(target_df,firth.round()))

0.9033391915641477
0.9314586994727593
0.9525483304042179
0.8840070298769771
3.338568533003464
2.3673523264772225
0.12799869366056194
0.36682604194498686
              precision    recall  f1-score   support

           0       0.83      0.92      0.88       212
           1       0.95      0.89      0.92       357

    accuracy                           0.90       569
   macro avg       0.89      0.91      0.90       569
weighted avg       0.91      0.90      0.90       569

              precision    recall  f1-score   support

           0       0.89      0.93      0.91       212
           1       0.96      0.93      0.94       357

    accuracy                           0.93       569
   macro avg       0.92      0.93      0.93       569
weighted avg       0.93      0.93      0.93       569

              precision    recall  f1-score   support

           0       0.93      0.95      0.94       212
           1       0.97      0.96      0.96       357

    accuracy                 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return ptp(axis=axis, out=out, **kwargs)


In [None]:
def get_r_firth_results(df, test_df, formula):
    model = brglm.brglm(formula, data = df, family='binomial',pl=True)
    summary = base.summary(model)
    summary_dic = {}
    for i in range(len(summary.names)):
        try:
            summary_dic[summary.names[i]]=pandas2ri.converter.ri2py(list(summary)[i])
        except:
            pass
    columns = list(df.colnames)
    columns[0]='Intercept'
    coefs = pd.DataFrame(summary_dic['coefficients'],columns=(['Coef','SE','Z','P']),index=columns)
    preds = ro.r.predict(model,firth_all_r)
    preds = list(preds)             
    return preds, coefs

In [None]:
def flic_step_1(df, test_df, formula):
    model = brglm.brglm(flic_formula, data = firth_small_r, family='binomial',pl=True)
    summary = base.summary(model)
    summary_dic = {}
    for i in range(len(summary.names)):
        try:
            summary_dic[summary.names[i]]=pandas2ri.converter.ri2py(list(summary)[i])
        except:
            pass
    columns = list(firth_small_r.colnames)
    coefs = pd.DataFrame(summary_dic['coefficients'],columns=(['Coef','SE','Z','P']),index=columns[1:])
    return coefs

In [390]:
test_X = add_constant(small_X)
weights = coefs.iloc[:,0]
weights['Int'] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [418]:
for step in range(10000):
    y_pred = Sigmoid_Pred(test_X, weights)
    error = small_y - y_pred
    weights['Int']+= (np.dot(test_X.const,error)*0.1)/small_X.shape[0]
    if step % 1000 == 0:
        print((-small_y * np.log(y_pred) - (1 - small_y) * np.log(1 - y_pred)).mean())
        print(y_pred)
        orint

4.973584440857107
[0.999999 0.999999 0.999999 0.999999 0.999999 0.999999 0.999999 0.999999
 0.999999 0.999999 0.999999 0.999999 0.999999 0.999999 0.999999 0.999999
 0.999999 0.999999 0.999999 0.999999 0.999999 0.999999 0.999999 0.999999
 0.999999 0.999999 0.999999 0.999999 0.999999 0.999999 0.999999 0.999999
 0.999999 0.999999 0.999999 0.999999 0.999999 0.999999 0.999999 0.999999
 0.999999 0.999999 0.999999 0.999999 0.999999 0.999999 0.999999 0.999999
 0.999999 0.999999]
4.973584440857107
[0.999999 0.999999 0.999999 0.999999 0.999999 0.999999 0.999999 0.999999
 0.999999 0.999999 0.999999 0.999999 0.999999 0.999999 0.999999 0.999999
 0.999999 0.999999 0.999999 0.999999 0.999999 0.999999 0.999999 0.999999
 0.999999 0.999999 0.999999 0.999999 0.999999 0.999999 0.999999 0.999999
 0.999999 0.999999 0.999999 0.999999 0.999999 0.999999 0.999999 0.999999
 0.999999 0.999999 0.999999 0.999999 0.999999 0.999999 0.999999 0.999999
 0.999999 0.999999]
4.973584440857107
[0.999999 0.999999 0.999999 0.

In [414]:
y_pred

array([0.999999, 0.999999, 0.999999, 0.999999, 0.999999, 0.999999,
       0.999999, 0.999999, 0.999999, 0.999999, 0.999999, 0.999999,
       0.999999, 0.999999, 0.999999, 0.999999, 0.999999, 0.999999,
       0.999999, 0.999999, 0.999999, 0.999999, 0.999999, 0.999999,
       0.999999, 0.999999, 0.999999, 0.999999, 0.999999, 0.999999,
       0.999999, 0.999999, 0.999999, 0.999999, 0.999999, 0.999999,
       0.999999, 0.999999, 0.999999, 0.999999, 0.999999, 0.999999,
       0.999999, 0.999999, 0.999999, 0.999999, 0.999999, 0.999999,
       0.999999, 0.999999])

In [374]:
def FLIC(X, y, weights, num_steps, alpha):
    
    X = add_constant(X)
    weights['Intercept'] = 1
    for step in range(num_steps):
        y_pred = Sigmoid_Pred(X, weights)
        error = y - y_pred
        weights['Intercept']-= ((np.matmul(X.const,error))*alpha)/X.shape[0]
        # Print log-likelihood every so often
        if step % 1000 == 0:
            print(weights['Intercept'])
            print((-y * np.log(y_pred) - (1 - y) * np.log(1 - y_pred)).mean())
    return weights

In [375]:
FLIC(small_X,small_y,coefs.Coef,100000,alpha=0.01)

1.00359999
4.973584440857107
4.603589990000034
4.973584440857107
8.2035799899997
4.973584440857107
11.803569989999364
4.973584440857107
15.403559989999028
4.973584440857107
19.003549990000174
4.973584440857107
22.603539990001615
4.973584440857107
26.203529990003055
4.973584440857107
29.803519990004496
4.973584440857107
33.40350999000455
4.973584440857107
37.00349999000244
4.973584440857107
40.60348999000033
4.973584440857107
44.203479989998215
4.973584440857107
47.8034699899961
4.973584440857107
51.40345998999399
4.973584440857107
55.00344998999188
4.973584440857107
58.603439989989766
4.973584440857107
62.203429989987654
4.973584440857107
65.80341998998554
4.973584440857107
69.40340998998343
4.973584440857107
73.00339998998132
4.973584440857107
76.6033899899792
4.973584440857107
80.2033799899771
4.973584440857107
83.80336998997498
4.973584440857107
87.40335998997287
4.973584440857107
91.00334998997076
4.973584440857107
94.60333998996865
4.973584440857107
98.20332998996653
4.97358444085

mean_radius                  1.917563
mean_texture                -0.274275
mean_perimeter              -0.296357
mean_area                   -0.011428
mean_smoothness            -14.196157
mean_compactness             2.830915
mean_concavity             -18.785380
mean_concave_points         47.760462
mean_symmetry               22.138644
mean_fractal_dimension       6.489450
radius_error               -18.513415
texture_error               -1.378898
perimeter_error              1.530872
area_error                   0.044727
smoothness_error          -122.537273
compactness_error           18.670292
concavity_error            -29.494201
concave_points_error        18.803263
symmetry_error              66.043840
fractal_dimension_error    589.913428
worst_radius                 1.891662
worst_texture                0.224481
worst_perimeter             -0.110844
worst_area                  -0.006215
worst_smoothness            32.321766
worst_compactness           12.898039
worst_concav

In [304]:
def logF11(df,y_var_name):
    '''Perform log-f(1,1) data augmentation
       Returns augmented df and observation weights'''
    
    num_rows = 2*(df.shape[1]-1)
    y_ind = df.columns.get_loc(y_var_name)
    
    aug = pd.DataFrame(0,columns=df.columns,index=(range(num_rows)))
    
    #augment y variable
    aug.iloc[range(0,num_rows,2),y_ind]=1
    y = aug[y_var_name]
    
    #augment X variables
    X = aug.drop(y_var_name,axis=1)
    for ind, rows in enumerate(range(0,X.shape[0],2)):
         X.iloc[rows:rows+2,ind]=1
    
    #bring it all together
    aug = pd.concat([y,X],axis=1)
    f_df = df.append(aug)
    
    #add offset
    f_df['real_data']=1
    f_df['real_data'][-aug.shape[0]:]=0
    
    #reseparate
    X = f_df.drop(y_var_name,axis=1)
    y = f_df[y_var_name]
    #Calculate weights
    weights = f_df['real_data'].apply(lambda x: 0.5 if x == 0 else 1)
    logit = LogisticRegression(penalty='none',solver='newton-cg',fit_intercept=False)
    model = sklog.fit(X,y,sample_weight=weights)
    return model

In [None]:
#exact logit
#get all permutations of y values for given length
def get_y_combos(length)
    perms = []
    step_1 = list(combinations_with_replacement([0,1],length))
    for step in step_1:
        perms += list(set(permutations(step,5)))
    return perms
#multiply each pair of X values by its corresponding y value
ts = []
for perm in perms:
    t = [0,0]
    for i in range(len(perm)):
        t[0]+=perm[i]*ex_X.iloc[i][0].round()
        t[1]+=perm[i]*ex_X.iloc[i][1].round()
    ts.append(t)
    
#find a pair of ts where t1 = actual distributions t1
actual_y_values_ind = perms.index((1,1,0,0,1))
[t for t in ts if t[1] == ts[actual_y_values_ind][1]]