In [1]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import os
import pickle
import patsy
import numpy as np

In [2]:
rootdir = "../data/gc/"

In [3]:
tsvs = [i for i in os.listdir(rootdir) if i.endswith(".tsv")]

In [4]:
data = pd.read_csv(f"{rootdir}{tsvs[0]}", sep="\t")

In [5]:
for i in tsvs[1:]:
    tmp = pd.read_csv(f"{rootdir}{i}", sep="\t")
    data.append(tmp)

In [6]:
print(data.columns)

Index(['genome', 'readlength', 'damage', 'repeat', 'simuCov',
       'simuContigLength', 'reference', 'null_model_p0', 'null_model_p0_stdev',
       'damage_model_p', 'damage_model_p_stdev', 'damage_model_pmin',
       'damage_model_pmin_stdev', 'damage_model_pmax',
       'damage_model_pmax_stdev', 'pvalue', 'qvalue', 'RMSE',
       'nb_reads_aligned', 'coverage', 'actualCov', 'CtoT-0', 'CtoT-1',
       'CtoT-2', 'CtoT-3', 'CtoT-4', 'GtoA-0', 'GtoA-1', 'GtoA-2', 'GtoA-3',
       'GtoA-4', 'seed', 'GCcontent'],
      dtype='object')


In [7]:
data = data[['qvalue','damage','simuCov','simuContigLength','GCcontent']]

In [8]:
data = data.loc[data['qvalue'].notna(),:]

## Defining `sig` categorial variable
`True` if `qvalue`<0.05  
`False` if `qvalue`>= 0.05

In [9]:
data['sig'] = pd.cut(data['qvalue'],[0, 0.05, 1], labels=[True,False], include_lowest=True)

In [10]:
data = data.astype({'simuCov':'category','simuContigLength':'category'})
data

Unnamed: 0,qvalue,damage,simuCov,simuContigLength,GCcontent,sig
274,1.0,0.0,1-2,2000-5000,0.4181,False
326,1.0,0.0,1-2,5000-10000,0.3747,False
473,1.0,0.0,1-2,10000-20000,0.4440,False
502,1.0,0.0,1-2,20000-50000,0.4589,False
606,1.0,0.0,1-2,50000-100000,0.4828,False
...,...,...,...,...,...,...
80995,0.0,0.2,200-500,200000-500000,0.4766,True
80996,0.0,0.2,200-500,200000-500000,0.4770,True
80997,0.0,0.2,200-500,200000-500000,0.4788,True
80998,0.0,0.2,200-500,200000-500000,0.4692,True


In [11]:
data.dtypes

qvalue               float64
damage               float64
simuCov             category
simuContigLength    category
GCcontent            float64
sig                 category
dtype: object

In [12]:
data = data.drop('qvalue', axis=1)

### Reordering the categories

In [13]:
data['simuContigLength'].cat.reorder_categories(['500-1000',
                                                 '1000-2000',
                                                 '2000-5000',
                                                 '5000-10000', 
                                                 '10000-20000',
                                                 '20000-50000',
                                                 '50000-100000',
                                                 '100000-200000', 
                                                 '200000-500000'], inplace=True)

In [14]:
data['simuCov'].cat.reorder_categories(['1-2',
                                        '2-3',
                                        '3-5',
                                        '5-10',
                                        '10-20',
                                        '20-50',
                                        '50-100',
                                        '100-200', 
                                        '200-500'], inplace=True)

## Model formula

In [15]:
formula = 'sig ~ damage + C(simuCov) + C(simuContigLength) + GCcontent'

## Creating the GLM logistic model

In [16]:
model_call = smf.glm(formula= formula, data=data, family=sm.families.Binomial())

In [17]:
model = model_call.fit()

In [18]:
print(model.summary())

                      Generalized Linear Model Regression Results                      
Dep. Variable:     ['sig[True]', 'sig[False]']   No. Observations:                70645
Model:                                     GLM   Df Residuals:                    70626
Model Family:                         Binomial   Df Model:                           18
Link Function:                           logit   Scale:                          1.0000
Method:                                   IRLS   Log-Likelihood:                -15525.
Date:                         Fri, 18 Sep 2020   Deviance:                       31050.
Time:                                 08:24:45   Pearson chi2:                 1.73e+05
No. Iterations:                              8                                         
Covariance Type:                     nonrobust                                         
                                           coef    std err          z      P>|z|      [0.025      0.975]
---------------

### Saving the model

In [19]:
with open("../models/accuracy_model_python.pickle", 'wb') as mod:
    pickle.dump(model, mod)

### Test data

In [20]:
d = pd.Series(['200-500','1000-2000','0.03','0.56']).to_frame(name='NZ_JHCB02000014.1').transpose()

In [21]:
d.columns = ['simuCov','simuContigLength','damage','GCcontent']
d = d.astype({'simuCov':'category','simuContigLength':'category', 'damage':float, 'GCcontent':float})

In [22]:
d['simuCov'] = d['simuCov'].cat.set_categories(['1-2',
                                        '2-3',
                                        '3-5',
                                        '5-10',
                                        '10-20',
                                        '20-50',
                                        '50-100',
                                        '100-200', 
                                        '200-500'])
d['simuContigLength'] = d['simuContigLength'].cat.set_categories(['500-1000',
                                                 '1000-2000',
                                                 '2000-5000',
                                                 '5000-10000', 
                                                 '10000-20000',
                                                 '20000-50000',
                                                 '50000-100000',
                                                 '100000-200000', 
                                                 '200000-500000'])

In [23]:
d

Unnamed: 0,simuCov,simuContigLength,damage,GCcontent
NZ_JHCB02000014.1,200-500,1000-2000,0.03,0.56


### Making inference on test data

In [25]:
model.predict(d)

NZ_JHCB02000014.1    0.947411
dtype: float64