# Preprocess 

* Gather info from *.pdb* and *.cif* files.
* Build a *pandas* ```DataFrame```.
* Save data externally, naming according to the sample of files that originated it.

In [1]:
import os

import pandas as pd

from xtalphases.data.preprocess import *
from xtalphases import __userpath__ as user_path

In [2]:
import numpy as np

In [8]:
mw_data = user_path + '/exploration/data/pdb/mw-ext.xlsx'

In [9]:
# NOTE(review): this rebinds the `user_path` imported from `xtalphases` in the
# first cell to a hard-coded local path. `mw_data` is assigned before this
# line, so on a top-to-bottom run it is built from the imported value, not
# from this one -- confirm which location is intended.
# The trailing '/' produces a double slash when joined with '/exploration/...'.
user_path = 'D:/USP/IC/Repositório/xtalphases/'

## Tests

### Sample file 

In [10]:
cif_sample = user_path + '/exploration/data/phases/4was_RCSB_phases.cif'
pdb_sample = user_path + '/exploration/data/pdb/4was_RCSB.pdb'

In [11]:
cifparser = CIFParser(cif_sample)

In [12]:
cifparser.parse()

In [13]:
pdbparser = PDBParser(pdb_sample)

In [14]:
pdbparser.parse()

**Molecular Weight**

In [15]:
mwext = mw_ext_parse(mw_data)

**PDB**

In [16]:
pdb_header = pdbparser.header_to_series()

In [17]:
pdb_header

SYNCHROTRON        Y
SOLV           60.18
WILSON          NULL
MATTHEWS        3.09
dtype: object

**CIF**

In [18]:
cifheader_cols = ['crystal_system', 'name_H-M_alt', 'length_a', 
                  'length_b', 'length_c', 'angle_alpha', 
                  'angle_beta', 'angle_gamma', 'volume']

In [19]:
cif_df = cifparser.header_refln_df(headercols=cifheader_cols, phierror=True)

In [20]:
cif_df.head()

Unnamed: 0,index_h,index_k,index_l,FOBS,SIGFOBS,FMODEL,PHIMODEL,FOM,RESOL,pdbx_r_free_flag,crystal_system,name_H-M_alt,length_a,length_b,length_c,angle_alpha,angle_beta,angle_gamma,volume,PHI_ERROR
0,-87,1,8,20.1788,10.4634,32.7004,44.0551,0.740392,1.70001,0,monoclinic,C 1 2 1,147.92,106.14,94.41,90.0,98.41,90.0,1466319.626,42.235181
1,-86,0,1,24.5824,17.643,1.98733,0.0,0.084655,1.70577,0,monoclinic,C 1 2 1,147.92,106.14,94.41,90.0,98.41,90.0,1466319.626,85.143791
2,-86,0,2,22.6266,16.7682,0.701489,180.0,0.027563,1.7095,0,monoclinic,C 1 2 1,147.92,106.14,94.41,90.0,98.41,90.0,1466319.626,88.420539
3,-86,0,3,33.1398,23.5155,2.87998,0.0,0.150623,1.71267,0,monoclinic,C 1 2 1,147.92,106.14,94.41,90.0,98.41,90.0,1466319.626,81.336968
4,-86,0,4,33.7438,23.821,1.08301,180.0,0.058051,1.71529,1,monoclinic,C 1 2 1,147.92,106.14,94.41,90.0,98.41,90.0,1466319.626,86.672063


In [21]:
cif_df.columns

Index(['index_h', 'index_k', 'index_l', 'FOBS', 'SIGFOBS', 'FMODEL',
       'PHIMODEL', 'FOM', 'RESOL', 'pdbx_r_free_flag', 'crystal_system',
       'name_H-M_alt', 'length_a', 'length_b', 'length_c', 'angle_alpha',
       'angle_beta', 'angle_gamma', 'volume', 'PHI_ERROR'],
      dtype='object')

**CIF + PDB**

In [17]:
master_df = cif_df.copy()

In [18]:
# Broadcast every PDB header field to all reflection rows as a constant column.
for header_name, header_value in pdb_header.items():
    master_df[header_name] = header_value

In [19]:
master_df.columns

Index(['index_h', 'index_k', 'index_l', 'FOBS', 'SIGFOBS', 'FMODEL',
       'PHIMODEL', 'FOM', 'RESOL', 'pdbx_r_free_flag', 'crystal_system',
       'name_H-M_alt', 'length_a', 'length_b', 'length_c', 'angle_alpha',
       'angle_beta', 'angle_gamma', 'volume', 'PHI_ERROR', 'SYNCHROTRON',
       'SOLV', 'WILSON', 'MATTHEWS'],
      dtype='object')

In [20]:
master_df.head()

Unnamed: 0,index_h,index_k,index_l,FOBS,SIGFOBS,FMODEL,PHIMODEL,FOM,RESOL,pdbx_r_free_flag,...,length_c,angle_alpha,angle_beta,angle_gamma,volume,PHI_ERROR,SYNCHROTRON,SOLV,WILSON,MATTHEWS
0,-87,1,8,20.1788,10.4634,32.7004,44.0551,0.740392,1.70001,0,...,94.41,90.0,98.41,90.0,1466319.626,42.235181,Y,60.18,,3.09
1,-86,0,1,24.5824,17.643,1.98733,0.0,0.084655,1.70577,0,...,94.41,90.0,98.41,90.0,1466319.626,85.143791,Y,60.18,,3.09
2,-86,0,2,22.6266,16.7682,0.701489,180.0,0.027563,1.7095,0,...,94.41,90.0,98.41,90.0,1466319.626,88.420539,Y,60.18,,3.09
3,-86,0,3,33.1398,23.5155,2.87998,0.0,0.150623,1.71267,0,...,94.41,90.0,98.41,90.0,1466319.626,81.336968,Y,60.18,,3.09
4,-86,0,4,33.7438,23.821,1.08301,180.0,0.058051,1.71529,1,...,94.41,90.0,98.41,90.0,1466319.626,86.672063,Y,60.18,,3.09


## Building samples 

**Class ```SampleBuilder```**

In [21]:
class SampleBuilder():
    """Assemble a single DataFrame from paired PDB and CIF files.

    For each (pdb, cif) pair the CIF reflection table is extended with the
    PDB header fields, a molecular weight (``MW``) and the structure ``ID``;
    the per-structure tables are then stacked row-wise.
    """

    def __init__(self, pdbs, cifs, mw):
        """
        Build sample with all information available on files.
        Let the processing and training step decide whether to include
        or not some features.

        Parameters
        ----------
        pdbs : sequence
            Paths to PDB files.
        cifs : sequence
            Paths to CIF files. Paired with ``pdbs`` by position (``zip``),
            so both sequences must be in the same structure order.
        mw : pandas.DataFrame
            Molecular-weight table with an ``'ID'`` column; the value stored
            as ``MW`` is read from the second column of the first row whose
            ``'ID'`` matches the structure.
        """
        self.pdbs = pdbs
        self.cifs = cifs
        self.mw = mw

    def _build_one(self, pdb, cif):
        """Return the combined CIF + PDB + MW table for one structure."""
        pdbparser = PDBParser(pdb)  # parse PDB
        pdbparser.parse()
        pdb_header = pdbparser.header_to_series()  # get header as Series

        cifparser = CIFParser(cif)  # parse CIF
        cifparser.parse()

        # compile CIF and PDB, including PHIERROR
        cif_pdb_df = cifparser.header_refln_df(phierror=True)
        for key, value in pdb_header.items():
            cif_pdb_df[key] = value

        # get PDB name (it's not working for some files!)
        pdb_id = cifparser.get_pdb_name().upper()
        mw_rows = self.mw[self.mw['ID'] == pdb_id]
        if mw_rows.empty:
            # Same exception type as the positional lookup would raise,
            # but naming the structure that has no molecular weight.
            raise IndexError(
                "no molecular weight found for ID %r (files: %r, %r)"
                % (pdb_id, pdb, cif)
            )
        # A scalar assignment is broadcast to every row.
        cif_pdb_df['MW'] = mw_rows.iloc[0, 1]
        cif_pdb_df['ID'] = pdb_id
        return cif_pdb_df

    def build_sample(self):
        """Parse every (pdb, cif) pair and return the stacked DataFrame.

        Returns
        -------
        pandas.DataFrame
            Row-wise concatenation of the per-structure tables; each table
            keeps its own row index. An empty DataFrame is returned when
            there are no pairs.

        Raises
        ------
        IndexError
            If a structure ID is absent from the molecular-weight table.
        """
        # Collect the pieces and concatenate once: concatenating inside the
        # loop re-copies all previously accumulated rows on every iteration.
        frames = [self._build_one(pdb, cif)
                  for (pdb, cif) in zip(self.pdbs, self.cifs)]
        if not frames:
            return pd.DataFrame()
        return pd.concat(frames)
     

### Sample 1 

* Contains 10 of 27 structures that are stored in our *git* repo. As no more structures will be added, there is no need to keep track of the structures used.

**Load data**

In [22]:
import glob

In [23]:
# glob returns matches in arbitrary order, while SampleBuilder pairs the two
# lists by position. Sorting makes the order deterministic and name-based.
# NOTE(review): positional pairing still assumes every PDB has exactly one
# matching '*_phases.cif' -- confirm.
pdbs = sorted(glob.glob(user_path + '/exploration/data/pdb/*RCSB.pdb'))
cifs = sorted(glob.glob(user_path + '/exploration/data/phases/*RCSB_phases.cif'))

**Molecular Weight**

In [24]:
mwext = mw_ext_parse(mw_data)

**Sample 1**

In [25]:
s1 = SampleBuilder(pdbs[:10], cifs[:10], mwext)

In [26]:
s1_df = s1.build_sample()

In [27]:
s1_df.head()

Unnamed: 0,index_h,index_k,index_l,FOBS,SIGFOBS,FMODEL,PHIMODEL,FOM,RESOL,pdbx_r_free_flag,...,angle_beta,angle_gamma,volume,PHI_ERROR,SYNCHROTRON,SOLV,WILSON,MATTHEWS,MW,ID
0,0,0,2,2875.68,27.1741,2655.7,180.0,1.0,38.723,0,...,90.0,90.0,358641.741,0.0,N,47.35,14.32,2.34,38400.2,4WA0
1,0,0,4,370.026,3.63324,380.329,0.0,0.99862,19.3615,0,...,90.0,90.0,358641.741,3.01042,N,47.35,14.32,2.34,38400.2,4WA0
2,0,0,6,220.973,3.39928,173.168,180.0,0.757121,12.9077,0,...,90.0,90.0,358641.741,40.788955,N,47.35,14.32,2.34,38400.2,4WA0
3,0,0,8,262.233,4.45675,18.4543,180.0,0.124482,9.68075,0,...,90.0,90.0,358641.741,82.849157,N,47.35,14.32,2.34,38400.2,4WA0
4,0,0,10,253.624,5.92648,390.35,0.0,0.988117,7.7446,0,...,90.0,90.0,358641.741,8.841617,N,47.35,14.32,2.34,38400.2,4WA0


In [28]:
s1_df.shape

(676484, 31)

**Saving as** ```.csv```

In [29]:
s1_df.to_csv(user_path + '/exploration/data/processed/sample1.csv')

### Sample 2

* Contains 27 of 27 structures that are stored in our *git* repo.

In [32]:
s2 = SampleBuilder(pdbs, cifs, mwext)

In [33]:
s2_df = s2.build_sample()

In [34]:
s2_df.head()

Unnamed: 0,index_h,index_k,index_l,FOBS,SIGFOBS,FMODEL,PHIMODEL,FOM,RESOL,pdbx_r_free_flag,...,length_a,length_b,length_c,angle_alpha,angle_beta,angle_gamma,volume,PHI_ERROR,MW,ID
0,0,0,2,2875.68,27.1741,2655.7,180.0,1.0,38.723,0,...,61.507,75.29,77.446,90.0,90.0,90.0,358641.741,0.0,38400.2,4WA0
1,0,0,4,370.026,3.63324,380.329,0.0,0.99862,19.3615,0,...,61.507,75.29,77.446,90.0,90.0,90.0,358641.741,3.01042,38400.2,4WA0
2,0,0,6,220.973,3.39928,173.168,180.0,0.757121,12.9077,0,...,61.507,75.29,77.446,90.0,90.0,90.0,358641.741,40.788955,38400.2,4WA0
3,0,0,8,262.233,4.45675,18.4543,180.0,0.124482,9.68075,0,...,61.507,75.29,77.446,90.0,90.0,90.0,358641.741,82.849157,38400.2,4WA0
4,0,0,10,253.624,5.92648,390.35,0.0,0.988117,7.7446,0,...,61.507,75.29,77.446,90.0,90.0,90.0,358641.741,8.841617,38400.2,4WA0


In [35]:
s2_df.shape

(1195772, 27)

**Saving as** ```.csv```

In [36]:
s2_df.to_csv(user_path + '/exploration/data/processed/sample2.csv')

### Sample 3

* Contains $79$ structures: $100$ PDB files are sampled uniformly from the first $10^5$, and only those with a matching phases CIF ($79$) are kept. I will save it on my computer to keep a backup.

In [30]:
MAX_INDEX = 10**5

In [31]:
s3_size = 100

In [32]:
# Sort before slicing: glob order is arbitrary, so "the first MAX_INDEX files"
# is only well defined (and repeatable) on a sorted listing.
s3_rcsb = np.array(sorted(glob.glob('F:/INPUT/*RCSB.pdb')))[:MAX_INDEX]

**Sampling PDB**

In [55]:
# `np.random.seed = 4` only rebinds the name `seed` to an int; it never seeds
# the generator (and breaks later `np.random.seed(...)` calls). Draw from a
# dedicated generator seeded with 4 so the sample is reproducible.
s3_rng = np.random.RandomState(4)
s3_sampled_pdb_files = s3_rng.choice(s3_rcsb, size=s3_size, replace=False)

**Get CIFs and pair with PDBs**

In [57]:
s3_cif = np.array(glob.glob('F:/OUTPUT/*/2-PHASES_CIF/*RCSB_phases.cif'))

In [58]:
s3_cif_files = []
s3_pdb_files = []

In [59]:
# Start from empty lists in this cell so that re-running it does not append
# the same pairs a second time.
s3_cif_files = []
s3_pdb_files = []
for pdb in s3_sampled_pdb_files:
    # '<name>.pdb' -> '<name>_phases.cif'
    pdb_stem = os.path.splitext(os.path.basename(pdb))[0]
    filename = pdb_stem + '_phases.cif'
    # Keep the first CIF path containing the expected file name; sampled
    # PDBs without a matching CIF are dropped.
    matching_cif = next((cif for cif in s3_cif if filename in cif), None)
    if matching_cif is not None:
        s3_cif_files.append(matching_cif)
        s3_pdb_files.append(pdb)

In [60]:
len(s3_cif_files)

79

In [61]:
len(s3_pdb_files)

79

**Using build sample**

In [62]:
s3 = SampleBuilder(s3_pdb_files, s3_cif_files, mwext)

In [63]:
s3_df = s3.build_sample()

**Saving as** ```.csv```

In [64]:
s3_df.to_csv(user_path + '/exploration/data/processed/sample3.csv')

### Sample 4

In [72]:
s4_size = 50

In [73]:
# Sort before slicing: glob order is arbitrary, so "the first MAX_INDEX files"
# is only well defined (and repeatable) on a sorted listing.
s4_rcsb = np.array(sorted(glob.glob('F:/INPUT/*RCSB.pdb')))[:MAX_INDEX]

**Sampling PDB**

In [74]:
# `np.random.seed = 4` only rebinds the name `seed` to an int; it never seeds
# the generator. Draw from a dedicated generator seeded with 4 so the sample
# is reproducible.
s4_rng = np.random.RandomState(4)
s4_sampled_pdb_files = s4_rng.choice(s4_rcsb, size=s4_size, replace=False)

**Get CIFs and pair with PDBs**

In [76]:
s4_cif = np.array(glob.glob('F:/OUTPUT/*/2-PHASES_CIF/*RCSB_phases.cif'))

In [77]:
s4_cif_files = []
s4_pdb_files = []

In [78]:
# Start from empty lists in this cell so that re-running it does not append
# the same pairs a second time.
s4_cif_files = []
s4_pdb_files = []
for pdb in s4_sampled_pdb_files:
    # '<name>.pdb' -> '<name>_phases.cif'
    pdb_stem = os.path.splitext(os.path.basename(pdb))[0]
    filename = pdb_stem + '_phases.cif'
    # Keep the first CIF path containing the expected file name; sampled
    # PDBs without a matching CIF are dropped.
    matching_cif = next((cif for cif in s4_cif if filename in cif), None)
    if matching_cif is not None:
        s4_cif_files.append(matching_cif)
        s4_pdb_files.append(pdb)

In [79]:
len(s4_cif_files)

34

In [80]:
len(s4_pdb_files)

34

**Using build sample**

In [81]:
s4 = SampleBuilder(s4_pdb_files, s4_cif_files, mwext)

In [82]:
s4_df = s4.build_sample()

**Saving as** ```.csv```

In [83]:
s4_df.to_csv(user_path + '/exploration/data/processed/sample4.csv')

## Evaluating performance

Using the ```timeit``` library to check whether building the dataframe from scratch is faster than loading it from a saved ```.csv```.

* Each measurement is repeated over 5 executions (```repeat=5, number=1```).

In [84]:
import timeit 

In [85]:
def build_df(pdbs, cifs, mwext):
    """Build the sample DataFrame from scratch via ``SampleBuilder``."""
    builder = SampleBuilder(pdbs, cifs, mwext)
    sample = builder.build_sample()
    return sample

In [86]:
def load_df(filename):
    """Read a previously saved sample back from ``filename`` with ``pd.read_csv``."""
    loaded = pd.read_csv(filename)
    return loaded

### Sample 1

**Building**

In [87]:
timeit.repeat('build_df(s1_pdbs, s1_cifs, mw)', setup='from __main__ import build_df, pdbs, cifs, mwext; s1_pdbs = pdbs[:10]; s1_cifs = cifs[:10]; mw = mwext',
             repeat=5, number=1)

[17.842001334000088,
 17.360007184000096,
 17.71039513599999,
 17.253738903000112,
 17.201121773000068]

**Loading**

In [88]:
s1_filename = user_path + '/exploration/data/processed/sample1.csv'

In [89]:
timeit.repeat('load_df(file)', setup='from __main__ import load_df, s1_filename; file = s1_filename',
             repeat=5, number=1)

[12.84332631999996,
 12.670949864000022,
 12.89070098000002,
 13.347422897000115,
 12.584156616999962]

### Sample 2

**Building**

In [90]:
timeit.repeat('build_df(s2_pdbs, s2_cifs, mw)', setup='from __main__ import build_df, pdbs, cifs, mwext; s2_pdbs = pdbs; s2_cifs = cifs; mw = mwext',
             repeat=5, number=1)

[43.92777441399994,
 42.76588648799998,
 41.66368912600001,
 41.67722481599981,
 41.349960999999894]

**Loading**

In [91]:
s2_filename = user_path + '/exploration/data/processed/sample2.csv'

In [92]:
timeit.repeat('load_df(file)', setup='from __main__ import load_df, s2_filename; file = s2_filename',
             repeat=5, number=1)

[23.32826961600017,
 21.041737568000144,
 20.13232000799985,
 19.771335762000035,
 19.382442653999988]

### Sample 3

**Building**

In [93]:
timeit.repeat('build_df(s3_pdbs, s3_cifs, mw)', setup='from __main__ import build_df, s3_pdb_files, s3_cif_files, mwext; s3_pdbs = s3_pdb_files; s3_cifs = s3_cif_files; mw = mwext',
             repeat=5, number=1)

[245.11618412200005,
 243.9052316929999,
 244.4624888389999,
 231.70197567600007,
 230.30998614500004]

In [24]:
a = np.array([245.11618412200005,
 243.9052316929999,
 244.4624888389999,
 231.70197567600007,
 230.30998614500004])

In [25]:
a.mean()

239.09917329499999

In [26]:
a.std()

6.633793991585731

**Loading**

In [94]:
s3_filename = user_path + '/exploration/data/processed/sample3.csv'

In [95]:
timeit.repeat('load_df(file)', setup='from __main__ import load_df, s3_filename; file = s3_filename',
             repeat=5, number=1)

[87.17082362600013,
 80.62643711200008,
 74.85143942100012,
 74.65908657099999,
 73.83560732800015]

In [28]:
a = np.array([87.17082362600013,
 80.62643711200008,
 74.85143942100012,
 74.65908657099999,
 73.83560732800015])

In [29]:
a.mean(), a.std()

(78.22867881160009, 5.082479915836953)

### Sample 4

**Building**

In [96]:
timeit.repeat('build_df(s4_pdbs, s4_cifs, mw)', setup='from __main__ import build_df, s4_pdb_files, s4_cif_files, mwext; s4_pdbs = s4_pdb_files; s4_cifs = s4_cif_files; mw = mwext',
             repeat=5, number=1)

[50.57140144799996,
 47.47726330400019,
 46.16879509399996,
 45.926410428000054,
 45.1438846420001]

In [32]:
# NOTE(review): this cell sits under "Sample 4 / Building", but these five
# values are identical to the Sample 4 *loading* timings printed further down,
# not the building timings shown just above (50.57, 47.48, 46.17, 45.93, 45.14).
# The mean/std computed from `a` therefore appear to describe loading -- confirm.
a = np.array([24.54291168499958,
 24.504560475999824,
 24.17345871600037,
 23.89132242000005,
 23.90044491200024])

In [33]:
a.mean(), a.std()

(24.202539641800012, 0.2814365993418935)

**Loading**

In [97]:
s4_filename = user_path + '/exploration/data/processed/sample4.csv'

In [98]:
timeit.repeat('load_df(file)', setup='from __main__ import load_df, s4_filename; file = s4_filename',
             repeat=5, number=1)

[24.54291168499958,
 24.504560475999824,
 24.17345871600037,
 23.89132242000005,
 23.90044491200024]

---

## Conclusions 

* Keep PHIERROR in ```preprocessing.py```. No need to include this after loading (it would increase our processing time; moreover, if it's not needed, just drop that column).
* Some PDB and CIF files contain no useful information and are formatted in a way that cannot be parsed by our classes. 
    * Problems in getting the CIF name (raising an ```Out of index error``` because no name was found).
    * Problems in interpreting uncommon characters (raising encoding errors). Tried to fix by manually setting the encoding in ```_gather_data```. 
* Saving the dataframes as ```.csv``` improves loading time significantly.


**Future Objectives in this step**

* Think about number of decimal places and the data representation (```float64, int32```)
* Improve regular expressions (e.g., verify if ```findall``` or loading entire file as string is truly needed).
* Search for new storage libraries (e.g., optimized pandas for large datasets or another library)