# Preprocessing and Models Exploration

New models using unitary structure factor, its estimated error and symmetry expansion.

In [1]:
from xtalphases.data.preprocess import *
from xtalphases import __userpath__ as user_path

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas

In [3]:
a = CIFParser(filename='F:/ic-exp-uobs/dens-0/1p27_exp_uobs.cif')

In [7]:
import glob 

In [9]:
pdbs = glob.glob('F:/ic-exp-uobs/dens-035/*.pdb')

In [113]:
cifs = glob.glob('F:/ic-exp-uobs/dens-035/*.cif')

In [114]:
mw_data = user_path + '/exploration/data/pdb/mw-ext.xlsx'

In [115]:
mwext = mw_ext_parse(mw_data)

In [116]:
def big_df(pdbs, cifs, pdbheadercols, cifheadercols, phierror=True):
    big_df = pd.DataFrame()
    for (pdb, cif) in zip(pdbs, cifs):
        pdbparser = PDBParser(pdb)
        pdbparser.parse()
        pdbheader_series = pdbparser.header_to_series(columns=pdbheadercols)
        
        cifparser = CIFParser(cif)
        cifparser.parse()
        if phierror:
            cif_df = cifparser.header_refln_df(headercols=cifheader_cols, phierror=True)
        else:
            cif_df = cifparser.header_refln_df(headercols=cifheader_cols)
        cifpdb = cifpdb_df(cif_df, pdbheader_series)
        mw_sample = mwext[mwext['ID'] == cifparser.get_pdb_name().upper()].iloc[0,1]
        cifpdb['MW'] = [mw_sample]*len(cifpdb)
        cifpdb['ID'] = [cifparser.get_pdb_name()]*len(cifpdb)
        big_df = pd.concat([big_df, cifpdb])
    return big_df

In [117]:
cifheader_cols = ['crystal_system', 'name_H-M_alt', 'length_a', 
                  'length_b', 'length_c', 'angle_alpha',
                  'angle_beta', 'angle_gamma', 'volume']

In [118]:
pdbheader_cols = ['SYNCHROTRON', 'WILSON', 'MATTHEWS', 'SOLV']

In [119]:
sample_df = big_df(pdbs[:5], cifs[:5], pdbheader_cols,
                   cifheader_cols,
                   phierror=True)

In [120]:
sample_df.shape

(689629, 28)

In [123]:
sample_df['SIGUOBS'] = sample_df['SIGFOBS']/sample_df['FOBS'] * sample_df['UOBS']

In [131]:
sample_df['ID'].value_counts()

1fb5    298254
1ckq    174306
1lmq    126461
1g7g     72676
1faa     17932
Name: ID, dtype: int64

In [132]:
sample_df['MATTHEWS'].value_counts()

NULL    298254
2.84    174306
3.21    126461
2.78     72676
2.05     17932
Name: MATTHEWS, dtype: int64

In [133]:
sample_df.drop(['SYNCHROTRON', 'WILSON'], axis=1, inplace=True)

In [134]:
sample_df.columns

Index(['index_h', 'index_k', 'index_l', 'FOBS', 'SIGFOBS', 'UOBS', 'SIGUOBS',
       'FC', 'PHI', 'FOM', 'RESOL', 'pdbx_r_free_flag', 'crystal_system',
       'name_H-M_alt', 'length_a', 'length_b', 'length_c', 'angle_alpha',
       'angle_beta', 'angle_gamma', 'volume', 'PHI_ERROR', 'MATTHEWS', 'SOLV',
       'MW', 'ID'],
      dtype='object')

In [35]:
from sklearn.model_selection import StratifiedShuffleSplit

In [43]:
from sklearn.preprocessing import LabelEncoder

In [45]:
encoder = LabelEncoder()

In [135]:
cs_cat = sample_df['crystal_system']

In [136]:
cs_cat_enc = encoder.fit_transform(cs_cat)

In [137]:
sample_df['cs_enc'] = cs_cat_enc

In [138]:
split = StratifiedShuffleSplit(n_splits=1, random_state=42, test_size=0.2)

In [139]:
for train_index, test_index in split.split(sample_df, sample_df['cs_enc']):
    train_set = sample_df.iloc[train_index]
    test_set = sample_df.iloc[test_index]
    

In [140]:
crystal = train_set.drop(['PHI'], axis=1)

In [141]:
phases = train_set['PHI']

In [143]:
crystal.columns

Index(['index_h', 'index_k', 'index_l', 'FOBS', 'SIGFOBS', 'UOBS', 'SIGUOBS',
       'FC', 'FOM', 'RESOL', 'pdbx_r_free_flag', 'crystal_system',
       'name_H-M_alt', 'length_a', 'length_b', 'length_c', 'angle_alpha',
       'angle_beta', 'angle_gamma', 'volume', 'PHI_ERROR', 'MATTHEWS', 'SOLV',
       'MW', 'ID', 'cs_enc'],
      dtype='object')

In [144]:
crystal.drop(['FC', 'pdbx_r_free_flag', 'FOM'], axis=1, inplace=True)

In [145]:
crystal.drop(['crystal_system', 'name_H-M_alt'], axis=1, inplace=True)

In [177]:
crystal[crystal['MATTHEWS'] == 'NULL'] = 0.0

In [162]:
crystal.drop(['ID'], axis=1, inplace=True)

In [163]:
crystal.head()

Unnamed: 0,index_h,index_k,index_l,FOBS,SIGFOBS,UOBS,SIGUOBS,RESOL,volume,PHI_ERROR,MATTHEWS,SOLV,MW,cs_enc,False
103159,9,-37,5,233.734,3.82056,0.002399,3.9e-05,2.9237,597406.96,15.991204,2.84,58.0,34937.77,0,0.0
43516,-21,40,15,2.35643,52.6984,4.5e-05,0.001009,2.19897,597406.96,88.128244,2.84,58.0,34937.77,0,0.0
136,-60,36,3,70.1936,7.30353,0.002014,0.00021,1.94189,597406.96,13.513473,2.84,58.0,34937.77,0,0.0
247804,25,27,7,652.136,19.5103,0.000505,1.5e-05,4.93103,6300872.111,17.110327,,0.0,36005.36,0,0.0
110055,12,-16,24,75.1361,8.5402,0.002015,0.000229,1.97861,597406.96,16.628169,2.84,58.0,34937.77,0,0.0


In [164]:
crystal.drop(['angle_alpha', 'angle_beta', 'angle_gamma', 'length_a', 'length_b', 'length_c'], axis=1, inplace=True)

KeyError: "['angle_alpha' 'angle_beta' 'angle_gamma' 'length_a' 'length_b' 'length_c'] not found in axis"

In [178]:
crystal['MATTHEWS'].value_counts()

0.00    238616
2.84    139303
3.21    101365
2.78     58056
2.05     14363
Name: MATTHEWS, dtype: int64

#### Linear Regression

In [179]:
from sklearn.metrics import mean_squared_error

In [180]:
from sklearn.linear_model import LinearRegression

In [181]:
lin_reg = LinearRegression()

In [182]:
crystal_1h_arr = crystal.values

In [183]:
crystal_labels = phases

In [184]:
from sklearn.model_selection import cross_val_score

In [185]:
lin_reg_scores = cross_val_score(lin_reg, crystal_1h_arr, crystal_labels,
                                scoring='neg_mean_squared_error', cv=10)

In [186]:
rmse_score = np.sqrt(-lin_reg_scores)

In [187]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

In [188]:
display_scores(np.sqrt(-lin_reg_scores))

Scores: [104.83851945 104.14448808 104.17932663 104.43443717 104.33858859
 104.4419532  104.52094748 104.50773594 104.28188747 104.60454695]
Mean: 104.42924309457382
Standard deviation: 0.1965892924380805


In [189]:
from sklearn.ensemble import RandomForestRegressor

In [190]:
forest_reg = RandomForestRegressor(n_estimators=20, max_depth=200)

In [191]:
rfg_scores = cross_val_score(forest_reg, crystal_1h_arr, crystal_labels,
                            scoring='neg_mean_squared_error', cv=2)

In [194]:
display_scores(np.sqrt(-rfg_scores))

Scores: [109.97417132 109.59769251]
Mean: 109.78593191582513
Standard deviation: 0.18823940868062294


In [195]:
crystal.head()

Unnamed: 0,index_h,index_k,index_l,FOBS,SIGFOBS,UOBS,SIGUOBS,RESOL,volume,PHI_ERROR,MATTHEWS,SOLV,MW,cs_enc,False
103159,9.0,-37.0,5.0,233.734,3.82056,0.002399,3.9e-05,2.9237,597406.96,15.991204,2.84,58.0,34937.77,0.0,0.0
43516,-21.0,40.0,15.0,2.35643,52.6984,4.5e-05,0.001009,2.19897,597406.96,88.128244,2.84,58.0,34937.77,0.0,0.0
136,-60.0,36.0,3.0,70.1936,7.30353,0.002014,0.00021,1.94189,597406.96,13.513473,2.84,58.0,34937.77,0.0,0.0
247804,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
110055,12.0,-16.0,24.0,75.1361,8.5402,0.002015,0.000229,1.97861,597406.96,16.628169,2.84,58.0,34937.77,0.0,0.0


In [197]:
cif = CIFParser(filename='')

TypeError: __init__() missing 1 required positional argument: 'filename'