# Models

Test some models on the dataframe to get a performance reference for further studies.

In [3]:
from xtalphases.data.preprocess import *
from xtalphases import __userpath__ as user_path

ModuleNotFoundError: No module named 'xtalphases'

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Data

Getting data using ```CIFParser``` and ```PDBParser```.

In [None]:
import glob

In [None]:
n_exp = 27

In [None]:
# glob.glob returns paths in arbitrary order, and big_df pairs the two lists
# positionally with zip. Sorting both listings makes the pairing deterministic.
# NOTE(review): assumes each PDB file and its CIF file share the same leading
# structure identifier, so that sorted order lines them up -- confirm.
pdbs = sorted(glob.glob(user_path + '/exploration/data/pdb/*RCSB.pdb'))[:n_exp]
cifs = sorted(glob.glob(user_path + '/exploration/data/phases/*RCSB_phases.cif'))[:n_exp]
mw_data = user_path + '/exploration/data/pdb/mw-ext.xlsx'

In [None]:
mwext = mw_ext_parse(mw_data)

In [None]:
len(pdbs)

In [None]:
def big_df(pdbs, cifs, pdbheadercols, cifheadercols, phierror=True):
    """Build a single DataFrame from positionally paired PDB and CIF files.

    For each ``(pdb, cif)`` pair the PDB header is turned into a series with
    ``PDBParser.header_to_series`` and the CIF file into a frame with
    ``CIFParser.header_refln_df``; both are combined by ``cifpdb_df`` and the
    per-structure frames are stacked row-wise.

    Parameters
    ----------
    pdbs, cifs : sequences of file paths
        Paired with ``zip``; surplus entries of the longer sequence are ignored.
    pdbheadercols : list
        Forwarded as ``columns`` to ``PDBParser.header_to_series``.
    cifheadercols : list
        Forwarded as ``headercols`` to ``CIFParser.header_refln_df``.
    phierror : bool, default True
        When true, ``phierror=True`` is forwarded to ``header_refln_df``;
        otherwise the keyword is omitted.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-structure frames, each extended with an
        ``'MW'`` and an ``'ID'`` column. Empty when no pairs are given.

    Notes
    -----
    Reads the notebook-level ``mwext`` table: the first row whose ``'ID'``
    equals the upper-cased PDB name supplies the ``'MW'`` value (taken from
    column position 1). An ``IndexError`` is raised if no row matches.
    """
    # Use the argument, not a notebook global, for the CIF header columns.
    refln_kwargs = {'headercols': cifheadercols}
    if phierror:
        refln_kwargs['phierror'] = True

    # Collect the frames and concatenate once instead of growing a frame
    # inside the loop.
    frames = []
    for pdb, cif in zip(pdbs, cifs):
        pdbparser = PDBParser(pdb)
        pdbparser.parse()
        pdbheader_series = pdbparser.header_to_series(columns=pdbheadercols)

        cifparser = CIFParser(cif)
        cifparser.parse()
        cif_df = cifparser.header_refln_df(**refln_kwargs)

        cifpdb = cifpdb_df(cif_df, pdbheader_series)
        pdb_name = cifparser.get_pdb_name()
        mw_sample = mwext[mwext['ID'] == pdb_name.upper()].iloc[0, 1]
        cifpdb['MW'] = [mw_sample] * len(cifpdb)
        cifpdb['ID'] = [pdb_name] * len(cifpdb)
        frames.append(cifpdb)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [None]:
cifheader_cols = ['crystal_system', 'name_H-M_alt', 'length_a', 
                  'length_b', 'length_c', 'angle_alpha',
                  'angle_beta', 'angle_gamma', 'volume']

In [None]:
pdbheadercols = ['SYNCHROTRON', 'WILSON', 'MATTHEWS', 'SOLV']

In [None]:
sample_df = big_df(pdbs, cifs, pdbheadercols, cifheader_cols, phierror=True)

In [None]:
sample_df.columns

## Visualizing

For each crystal system in the sample, plot:
* Indices distribution (take care of the negative indices too!)
* Model phases and errors (plot centered on centric reflections angles).

Other useful plots and information:
* Scatter plot of the Wilson coefficient against the Matthews coefficient (to help decide whether PCA is needed)
* For each crystal system, calculate correlation coefficients.
* Find the fraction of centric reflections inside the dataset

Perhaps we only need a fraction of the reflections with well-defined phases to obtain an electron density map.

In [None]:
list(sample_df.crystal_system.unique())

In [None]:
test = sample_df.loc[lambda sample_df: sample_df['crystal_system']=='orthorhombic']

In [None]:
def crystalsys_plot_indexes(df):
    """Plot normalized histograms of the h, k and l indices per crystal system.

    One figure with three panels sharing the y axis is created for every
    distinct value of ``df['crystal_system']``; the figure title is the
    capitalized crystal system name. All figures are shown at the end.
    """
    # (column, panel-specific hist options) for the three side-by-side panels.
    panels = (
        ('index_h', {'color': 'crimson', 'alpha': 0.8}),
        ('index_k', {'color': 'k'}),
        ('index_l', {'color': 'darkolivegreen'}),
    )
    for system in list(df.crystal_system.unique()):
        subset = df.loc[df['crystal_system'] == system]
        fig, axes = plt.subplots(1, 3, sharey=True)
        for axis, (column, look) in zip(axes, panels):
            axis.hist(subset[column], bins=10, rwidth=0.85, density=True, **look)
        fig.suptitle(system.capitalize(), fontsize=14)
    plt.show()

**Indexes Distribution**

In [None]:
crystalsys_plot_indexes(sample_df)

**Model Phases**

In [None]:
def crystalsys_plot_phases(df):
    """Plot a normalized histogram of ``PHIMODEL`` for every crystal system.

    A single figure is created with one panel per distinct value of
    ``df['crystal_system']``; panels share the y axis and are titled with the
    capitalized crystal system name.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the ``'crystal_system'`` and ``'PHIMODEL'`` columns.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. Nothing is drawn when ``df``
        contains no crystal system.
    """
    crystal_systems = list(df.crystal_system.unique())
    if not crystal_systems:
        # plt.subplots(1, 0) would raise; there is nothing to plot.
        return

    # squeeze=False always returns a 2-D array of axes, so the single-system
    # case (where subplots would otherwise return a bare Axes) also works.
    fig, axes = plt.subplots(1, len(crystal_systems), sharey=True,
                             figsize=(12, 8), squeeze=False)
    for ax, cs in zip(axes[0], crystal_systems):
        new_df = df.loc[df['crystal_system'] == cs]
        ax.hist(new_df['PHIMODEL'], bins=10, rwidth=0.85, color='#607c8e', alpha=0.7, density=True)
        ax.set_title(cs.capitalize())
        ax.set_xlabel('PHIMODEL')
    plt.show()

In [None]:
crystalsys_plot_phases(sample_df)

**Wilson and Matthews Correlation**

In [None]:
wilson = sample_df.loc[:, 'WILSON']

In [None]:
mathews = sample_df.loc[:, 'MATTHEWS']

In [None]:
matthews_wilson_df = pd.DataFrame({'MATTHEWS':mathews, 'WILSON':wilson})

In [None]:
matthews_wilson_df.drop_duplicates(inplace=True)

In [None]:
matthews_wilson_df.replace(to_replace='NULL', value=np.nan, inplace=True)

In [None]:
matthews_wilson_df.dropna(inplace=True)

In [None]:
from pandas.plotting import scatter_matrix

In [None]:
plt.plot(matthews_wilson_df['WILSON'], matthews_wilson_df['MATTHEWS'], 'o', color='k')
ax = plt.gca()
ax.set_xlabel('WILSON')
ax.set_ylabel('MATTHEWS')
plt.show()

### Train and Test 

* Sampling strategy: get the same amount of structures of each crystal system.

**Wilson Coefficient**

In [None]:
sample_df.replace(to_replace='NULL', value=np.nan, inplace=True)

Many structures don't have a WILSON coefficient available in their ```.pdb``` file!

In [None]:
sample_df.drop('WILSON', axis=1, inplace=True)

In [None]:
sample_df.isnull().sum().sum()

**Sampling**

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
encoder = LabelEncoder()

In [None]:
cs_cat = sample_df['crystal_system']

In [None]:
cs_cat_enc = encoder.fit_transform(cs_cat); cs_cat_enc

In [None]:
encoder.classes_

In [None]:
sample_df['cs_encoded'] = cs_cat_enc

In [None]:
sample_df.cs_encoded.value_counts()/len(sample_df)

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
split = StratifiedShuffleSplit(n_splits=1, random_state=42, test_size=0.2)

In [None]:
# Split on the encoded crystal system so the label is stratified. The yielded
# index arrays are positional (used with .iloc). If the splitter yields more
# than one split, only the last one is kept in train_set / test_set.
for train_index, test_index in split.split(sample_df, sample_df['cs_encoded']):
    train_set = sample_df.iloc[train_index]
    test_set = sample_df.iloc[test_index]

In [None]:
# Compare the share of each crystal system in the full dataset with its share
# in the train and test sets.
columns = ['Dataset', 'Train', 'Test']
prop = pd.DataFrame(columns=columns)
# Each column is the fraction of rows per crystal system in that set.
prop['Dataset'] = sample_df['crystal_system'].value_counts()/len(sample_df)
prop['Train'] = train_set['crystal_system'].value_counts()/len(train_set)
prop['Test'] = test_set['crystal_system'].value_counts()/len(test_set)
prop

In [None]:
len(test_set)/len(sample_df), len(train_set)/len(sample_df)

## Processing


In [None]:
crystal = train_set.drop('PHIMODEL', axis=1)

In [None]:
crystal_labels = train_set['PHIMODEL'].copy()

### Encoding

In [None]:
crystal.drop(['FOM', 'pdbx_r_free_flag', 'cs_encoded'], axis=1, inplace=True)

In [None]:
crystal.drop(['ID'], axis=1, inplace=True)

In [None]:
to_encode = ['SYNCHROTRON', 'crystal_system', 'name_H-M_alt']

In [None]:
crystal_1h =  pd.get_dummies(crystal, columns=to_encode)

In [None]:
crystal_1h.head(10)

In [None]:
crystal_labels

### Scaling

I won't scale the data here, mainly because of the sparseness (gaps in the data) associated with the small sample we have.

## Training Models

* Metric: RMSE of phases (perhaps not the best option to evaluate error in this case).

#### Linear Regression

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
lin_reg = LinearRegression()

In [None]:
crystal_1h_arr = crystal_1h.values

In [None]:
lin_reg.fit(crystal_1h_arr, crystal_labels)

In [None]:
lin_reg.coef_

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
lin_reg_scores = cross_val_score(lin_reg, crystal_1h_arr, crystal_labels,
                                scoring='neg_mean_squared_error', cv=10)

In [None]:
rmse_score = np.sqrt(-lin_reg_scores)

In [None]:
def display_scores(scores):
    """Print ``scores`` followed by its mean and its standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods (e.g. a NumPy
    array). Each value is printed on its own line after a short label.
    """
    # Each getter is evaluated right before its line is printed.
    report = (
        ("Scores:", lambda s: s),
        ("Mean:", lambda s: s.mean()),
        ("Standard deviation:", lambda s: s.std()),
    )
    for label, getter in report:
        print(label, getter(scores))

In [None]:
# Report the RMSE values; lin_reg_scores holds the raw negative-MSE scores
# returned by cross_val_score, not the metric announced for this section.
display_scores(rmse_score)

#### Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
tree_reg = DecisionTreeRegressor()

In [None]:
tree_reg.fit(crystal_1h_arr, crystal_labels)

In [None]:
# Predict on the same array the tree was fitted on (crystal_1h_arr), not the
# DataFrame it was derived from, so fit and predict inputs are consistent.
crystal_pred = tree_reg.predict(crystal_1h_arr)

In [None]:
tree_mse = mean_squared_error(crystal_labels, crystal_pred)

In [None]:
tree_rmse = np.sqrt(tree_mse)

In [None]:
tree_rmse

**Remark:** overfitting!

In [None]:
scores = cross_val_score(tree_reg, crystal_1h_arr, 
                        crystal_labels, scoring='neg_mean_squared_error',
                        cv=5)

In [None]:
rmse_tree_reg_scores = np.sqrt(-scores)

In [None]:
display_scores(rmse_tree_reg_scores)

In [None]:
a = crystal_1h_arr[500]

In [None]:
tree_reg.predict([a])

In [None]:
crystal_labels.iloc[500]

#### Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
forest_reg = RandomForestRegressor()

In [None]:
forest_reg.fit(crystal_1h_arr, crystal_labels)

In [None]:
forest_reg_pred = forest_reg.predict(crystal_1h_arr)

In [None]:
forest_mse = mean_squared_error(forest_reg_pred, crystal_labels)

In [None]:
forest_scores_rmse = np.sqrt(forest_mse)

In [None]:
display_scores(forest_scores_rmse)

In [None]:
n = 19090

In [None]:
a = crystal_1h_arr[n]

In [None]:
forest_reg.predict([a])

In [None]:
crystal_labels.iloc[n]

**Cross-validation with big dataset in RFG**

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
rfg_scores = cross_val_score(forest_reg, crystal_1h_arr, crystal_labels,
                            scoring='neg_mean_squared_error', cv=3)

In [None]:
rmse_scores_rfg = np.sqrt(-rfg_scores)

In [None]:
display_scores(rmse_scores_rfg)

**Feature Importances in the RFG**

In [None]:
sorted(zip(forest_reg.feature_importances_, crystal_1h.columns), reverse=True)

### Saving the trained models

In [None]:
# sklearn.externals.joblib is no longer shipped with recent scikit-learn
# releases; prefer the standalone joblib package and keep the old import as a
# fallback for older environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [None]:
path_to_models = user_path + '/models/'

In [None]:
import os

In [None]:
def save_model(model, filename, path=path_to_models):
    """Serialize ``model`` with joblib to ``os.path.join(path, filename)``.

    Parameters
    ----------
    model : object
        Object handed to ``joblib.dump``.
    filename : str
        Name of the file to write inside ``path``.
    path : str
        Target directory; defaults to the value ``path_to_models`` had when
        this function was defined. It is created if it does not exist yet.
    """
    if path:
        # A fresh checkout may not contain the models directory yet.
        os.makedirs(path, exist_ok=True)
    joblib.dump(model, os.path.join(path, filename))

In [None]:
save_model(forest_reg, 'forest_reg_phases_test.pkl')

In [None]:
joblib.load(path_to_models + 'forest_reg_phases_test.pkl')

In [None]:
joblib.dump(tree_reg, 'tree_reg_phases.pkl')

In [None]:
joblib.dump(lin_reg, 'lin_reg_phases.pkl')

In [None]:
# Load the forest that was saved above via save_model; no cell in this
# notebook writes a 'forest_reg_phases.pkl' file to the working directory.
# NOTE(review): confirm this is the intended artifact.
model = joblib.load(os.path.join(path_to_models, 'forest_reg_phases_test.pkl'))

**Improved Model Saving**

Improve model saving to name files explicitly.

In [None]:
path_to_models = path_to_models

In [None]:
joblib.dump(forest_reg, path_to_models + 'forest_reg.pkl')

## Final Model Test