In [1]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import os
import pickle
import patsy
import numpy as np
import pickle

In [2]:
# Load the simulation results; the file is tab-separated despite its .csv extension.
data = pd.read_csv("../data/PYD_simulation_results.csv", sep="\t")

In [3]:
# List the column names of the loaded table.
data.columns

Index(['genome', 'readlength', 'damage', 'simuCov', 'simuContigLength',
       'reference', 'contiglength', 'GCcontent', 'medianRL', 'null_model_p0',
       'null_model_p0_stdev', 'damage_model_p', 'damage_model_p_stdev',
       'damage_model_pmin', 'damage_model_pmin_stdev', 'damage_model_pmax',
       'damage_model_pmax_stdev', 'pvalue', 'qvalue', 'RMSE',
       'nb_reads_aligned', 'coverage', 'actualCov', 'CtoT-0', 'CtoT-1',
       'CtoT-2', 'CtoT-3', 'CtoT-4', 'GtoA-0', 'GtoA-1', 'GtoA-2', 'GtoA-3',
       'GtoA-4'],
      dtype='object')

In [4]:
# Inspect the dtype pandas inferred for each column.
data.dtypes

genome                      object
readlength                  object
damage                     float64
simuCov                     object
simuContigLength            object
reference                   object
contiglength                 int64
GCcontent                  float64
medianRL                     int64
null_model_p0              float64
null_model_p0_stdev        float64
damage_model_p             float64
damage_model_p_stdev       float64
damage_model_pmin          float64
damage_model_pmin_stdev    float64
damage_model_pmax          float64
damage_model_pmax_stdev    float64
pvalue                     float64
qvalue                     float64
RMSE                       float64
nb_reads_aligned             int64
coverage                   float64
actualCov                  float64
CtoT-0                     float64
CtoT-1                     float64
CtoT-2                     float64
CtoT-3                     float64
CtoT-4                     float64
GtoA-0              

In [5]:
# Same column listing as the earlier `data.columns` cell, printed as plain text.
print(data.columns)

Index(['genome', 'readlength', 'damage', 'simuCov', 'simuContigLength',
       'reference', 'contiglength', 'GCcontent', 'medianRL', 'null_model_p0',
       'null_model_p0_stdev', 'damage_model_p', 'damage_model_p_stdev',
       'damage_model_pmin', 'damage_model_pmin_stdev', 'damage_model_pmax',
       'damage_model_pmax_stdev', 'pvalue', 'qvalue', 'RMSE',
       'nb_reads_aligned', 'coverage', 'actualCov', 'CtoT-0', 'CtoT-1',
       'CtoT-2', 'CtoT-3', 'CtoT-4', 'GtoA-0', 'GtoA-1', 'GtoA-2', 'GtoA-3',
       'GtoA-4'],
      dtype='object')


In [6]:
# Restrict the table to the q-value and the three columns used as predictors.
model_columns = ['qvalue', 'damage_model_pmax', 'coverage', 'contiglength']
data = data[model_columns]

In [7]:
# Keep only the rows that have a q-value. The explicit .copy() makes `data` an
# independent frame, so the `sig` column assigned in a later cell does not
# trigger pandas' SettingWithCopyWarning.
data = data.loc[data['qvalue'].notna(), :].copy()

## Defining the `sig` categorical variable
`True` if `qvalue` < 0.05  
`False` if `qvalue` >= 0.05

In [8]:
# Label each row as significant (True) when qvalue < 0.05 and False otherwise.
# Left-closed bins [0, 0.05) and [0.05, inf) place qvalue == 0.05 in the False
# class, as the definition above states (right-closed bins would label it True).
# The upper edge is +inf because a left-closed bin ending at 1 would turn
# qvalue == 1 into NaN. The category order [True, False] is unchanged.
data['sig'] = pd.cut(data['qvalue'], [0, 0.05, np.inf], labels=[True, False], right=False)

In [9]:
# Display the filtered frame together with the new `sig` column.
data

Unnamed: 0,qvalue,damage_model_pmax,coverage,contiglength,sig
1896,0.46821,0.17678,3.23761,3068,False
2186,1.00000,0.00007,4.04900,22469,False
2671,1.00000,0.17678,7.22360,3068,False
2985,1.00000,0.01588,7.58392,22469,False
3483,1.00000,0.17678,13.59224,3068,False
...,...,...,...,...,...
701995,0.00000,0.30366,201.63903,346944,True
701996,0.00000,0.30444,260.31416,273518,True
701997,0.00000,0.30184,310.00277,232161,True
701998,0.00000,0.30351,209.11812,287978,True


In [10]:
# Check the dtypes again; `sig` should be reported as a category.
data.dtypes

qvalue                float64
damage_model_pmax     float64
coverage              float64
contiglength            int64
sig                  category
dtype: object

In [11]:
# The q-value has been encoded into `sig`; remove the raw column.
data = data.drop(columns=['qvalue'])

In [12]:
# Expose `damage_model_pmax` under the shorter name `damage` used in the formula.
data = data.rename(columns={'damage_model_pmax': 'damage'})

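### Reordering the categories
No explicit reordering is done here: `sig` was created with `labels=[True, False]`, so `True` is already the first category (the model summary below lists `sig[True]` first in `Dep. Variable`).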

## Model formula

In [13]:
# Model formula: response `sig`, explained by coverage, damage and contig length.
formula = 'sig ~ coverage + damage + contiglength'

## Creating the GLM logistic model

In [14]:
# Specify (without fitting yet) a binomial-family GLM for `formula` over `data`.
model_call = smf.glm(
    formula=formula,
    data=data,
    family=sm.families.Binomial(),
)

In [15]:
# Fit the model specified in the previous cell and keep the results object.
model = model_call.fit()

In [16]:
# Print the fit summary as plain text.
print(model.summary())

                      Generalized Linear Model Regression Results                      
Dep. Variable:     ['sig[True]', 'sig[False]']   No. Observations:               610730
Model:                                     GLM   Df Residuals:                   610726
Model Family:                         Binomial   Df Model:                            3
Link Function:                           logit   Scale:                          1.0000
Method:                                   IRLS   Log-Likelihood:            -2.0981e+05
Date:                         Mon, 09 Nov 2020   Deviance:                   4.1963e+05
Time:                                 16:23:35   Pearson chi2:                 8.62e+11
No. Iterations:                              9                                         
Covariance Type:                     nonrobust                                         
                   coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------

### Saving the model

In [17]:
# Persist the fitted results to disk. The output directory is created first so
# that the save does not fail with FileNotFoundError on a fresh checkout.
os.makedirs("../models", exist_ok=True)
# NOTE(review): per the statsmodels docs, remove_data=True strips the
# observation-length arrays before pickling and also removes them from the
# in-memory `model`; prediction in later cells still works — confirm that no
# other training-data-dependent statistics are needed afterwards.
model.save("../models/accuracy_model_v2_python.pickle", remove_data=True)



### Test data

In [18]:
# Build a one-row test frame, indexed by the contig name, holding three float
# values under the default integer column labels (named in the next cell).
d = pd.DataFrame([[200, 1000, 0.03]], index=['NZ_JHCB02000014.1'], dtype=float)

In [19]:
# Name the columns after the model's predictors; the values in `d` were given
# in this order (coverage, contiglength, damage).
d.columns = ['coverage','contiglength','damage']

In [20]:
# Display the single-row test frame.
d

Unnamed: 0,coverage,contiglength,damage
NZ_JHCB02000014.1,200.0,1000.0,0.03


### Making inference on test data

In [21]:
# Predict for the test row with the in-memory model.
# NOTE(review): presumably this is the probability that `sig` is True, given
# that the summary lists `sig[True]` first in Dep. Variable — confirm.
model.predict(d)

NZ_JHCB02000014.1    0.979393
dtype: float64

In [22]:
# Reload the results object that was pickled by the save cell above.
# pickle.load can execute arbitrary code, so only use it on trusted files such
# as this one written by the notebook itself.
with open("../models/accuracy_model_v2_python.pickle", mode='rb') as pickled_model:
    model2 = pickle.load(pickled_model)

In [23]:
# Predict with the reloaded model; the result should match the in-memory
# model's prediction for the same row.
model2.predict(d)

NZ_JHCB02000014.1    0.979393
dtype: float64