In [14]:
import pandas as pd
from ase.visualize import view as view_molecule
from ase.io import read as read_molecule
import ase
import numpy as np
import random


from ase.cell import Cell
from dscribe.descriptors import CoulombMatrix, SineMatrix, EwaldSumMatrix, MBTR

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy.linalg as LA
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor


import warnings
warnings.filterwarnings('ignore')


In [3]:
DATA_PATH = './nomad2018-predict-transparent-conductors'


# PCA, basic linear regression

In [15]:
def custom_converter(entry):
    return np.array([float(x) for x in entry[1:-1].split(',')])

In [102]:
train_all_data = pd.read_csv(
    f'{DATA_PATH}/train_extrainfo.csv',
    converters={
        'CoulombMatrix':custom_converter,
        'SineMatrix':custom_converter,
        'EwaldSumMatrix':custom_converter
    }
)
test_all_data = pd.read_csv(
    f'{DATA_PATH}/test_extrainfo.csv',
    converters={
        'CoulombMatrix':custom_converter,
        'SineMatrix':custom_converter,
        'EwaldSumMatrix':custom_converter
    }
)

In [17]:
train_all_data['CoulombMatrix'][0].shape

(6400,)

In [18]:
def get_eigenspectrum(matrix):
    spectrum = LA.eigvalsh(matrix)
    spectrum = np.sort(spectrum)[::-1]
    return spectrum

In [94]:
def ewald_compute_values_fe(data, train=True, reg=None, pca_components=None):
    ewald_spectrum_list = []
    for m in data['EwaldSumMatrix']:
        ewald_spectrum_list.append(
            get_eigenspectrum(
                np.reshape(m, (80, 80))
            )
        )
    ewald_spectrum_df = pd.DataFrame(ewald_spectrum_list).astype(float)
    ewald_spectrum_df = ewald_spectrum_df.fillna(0)
    x = ewald_spectrum_df.loc[:, :].values
    x = StandardScaler().fit_transform(x)
    #y = data.loc[:, ['formation_energy_ev_natom']].values
    pca = PCA(n_components=15).fit(x)

    # PCA n_components calculation
    rolling_sum = 0
    n_components = 1
    for i, num in enumerate(pca.explained_variance_ratio_):
        rolling_sum += num
        if rolling_sum > 0.95:
            n_components = i
            break
    if not train:
        #n_components = pca_components
        n_components = 80
    print(f'Performing PCA with {n_components} components')
    pca = PCA(n_components=80)
    principalComponents = pca.fit_transform(x)
    principalDf = pd.DataFrame(data=principalComponents)

    if train:
        to_drop = ['id', 'formation_energy_ev_natom', 'bandgap_energy_ev',
               'CoulombMatrix', 'SineMatrix', 'EwaldSumMatrix']
        dfcombined = pd.concat([data, principalDf], axis=1).drop(to_drop, axis=1)
        rf = RandomForestRegressor(n_estimators=1000, random_state=2).fit(
            dfcombined, data['formation_energy_ev_natom'])
        score = rf.score(dfcombined, data['formation_energy_ev_natom'])
        print(f'Score: {score}')
        return rf, n_components

    # ideally should not be able to run below component since the data does not have form energy and bg energy

    
    else:
        to_drop = ['id',
               'CoulombMatrix', 'SineMatrix', 'EwaldSumMatrix']
        dfcombined = pd.concat([data, principalDf], axis=1).drop(to_drop, axis=1)
        rf, n_components = ewald_compute_values_fe(train_all_data)
        predicted = rf.predict(dfcombined)
        #score = rf.score(dfcombined, data['formation_energy_ev_natom'])
        #print(f'Score: {score}')
        print('Test complete')
        return predicted
    


print('train')
rf, n_components = ewald_compute_values_fe(train_all_data)
print('test')
predicted = ewald_compute_values_fe(test_all_data, train=False,
               reg=rf, pca_components=n_components)



    

train
Performing PCA with 4 components
Score: 0.9772055599282553
test
Performing PCA with 80 components
Performing PCA with 4 components
Score: 0.9772055599282553
Test complete


In [95]:
predicted = ewald_compute_values_fe(test_all_data, train=False,
               reg=rf, pca_components=80)
prediction_df = pd.DataFrame(predicted, 
             columns=['formation_energy_ev_natom'])
prediction_df

Performing PCA with 80 components
Performing PCA with 4 components
Score: 0.9772055599282553
Test complete


Unnamed: 0,formation_energy_ev_natom
0,0.189084
1,0.081008
2,0.148447
3,0.048366
4,0.162532
...,...
595,0.105243
596,0.241889
597,0.166039
598,0.234563


In [101]:
def ewald_compute_bandgap_values(data, train=True, reg=None, pca_components=None):
    ewald_spectrum_list = []
    for m in data['EwaldSumMatrix']:
        ewald_spectrum_list.append(
            get_eigenspectrum(
                np.reshape(m, (80, 80))
            )
        )
    ewald_spectrum_df = pd.DataFrame(ewald_spectrum_list).astype(float)
    ewald_spectrum_df = ewald_spectrum_df.fillna(0)
    x = ewald_spectrum_df.loc[:, :].values
    x = StandardScaler().fit_transform(x)
    #y = data.loc[:, ['formation_energy_ev_natom']].values
    pca = PCA(n_components=15).fit(x)

    # PCA n_components calculation
    rolling_sum = 0
    n_components = 1
    for i, num in enumerate(pca.explained_variance_ratio_):
        rolling_sum += num
        if rolling_sum > 0.95:
            n_components = i
            break
    if not train:
        #n_components = pca_components
        n_components = 80
    #print(f'Performing PCA with {n_components} components')
    print(f'Performing PCA with 80 components')
    pca = PCA(n_components=80)
    principalComponents = pca.fit_transform(x)
    principalDf = pd.DataFrame(data=principalComponents)

    if train:
        to_drop = ['id', 'formation_energy_ev_natom', 'bandgap_energy_ev',
               'CoulombMatrix', 'SineMatrix', 'EwaldSumMatrix']
        dfcombined = pd.concat([data, principalDf], axis=1).drop(to_drop, axis=1)
        rf = RandomForestRegressor(n_estimators=1000, random_state=2).fit(
            dfcombined, data['bandgap_energy_ev'])
        score = rf.score(dfcombined, data['bandgap_energy_ev'])
        print(f'Score: {score}')
        return rf, n_components

    # ideally should not be able to run below component since the data does not have form energy and bg energy

    
    else:
        to_drop = ['id',
               'CoulombMatrix', 'SineMatrix', 'EwaldSumMatrix']
        dfcombined = pd.concat([data, principalDf], axis=1).drop(to_drop, axis=1)
        # rf, n_components = ewald_compute_bandgap_values(train_all_data)
        predicted = reg.predict(dfcombined)
        #score = rf.score(dfcombined, data['formation_energy_ev_natom'])
        #print(f'Score: {score}')
        print('Test complete')
        return predicted
    


print('train')
rf, n_components = ewald_compute_bandgap_values(train_all_data)
print('test')
predicted_bg = ewald_compute_bandgap_values(test_all_data, train=False,
               reg=rf, pca_components=80)

train
Performing PCA with 80 components
Score: 0.9926475909060076
test
Performing PCA with 80 components
Test complete


In [97]:
predicted_bg = ewald_compute_bandgap_values(test_all_data, train=False,
               reg=rf, pca_components=80)
prediction_df_2 = pd.DataFrame(predicted_bg, 
             columns=['bandgap_energy_ev'])

Performing PCA with 80 components
Performing PCA with 80 components
Score: 0.9926475909060076
Test complete


In [98]:
id_1 = np.arange(1, 601, 1, dtype=int)
df_from_arr = pd.DataFrame({'id':id_1,'formation_energy_ev_natom':predicted,'bandgap_energy_ev':predicted_bg})
df_from_arr.head()

Unnamed: 0,id,formation_energy_ev_natom,bandgap_energy_ev
0,1,0.189084,1.656796
1,2,0.081008,3.774391
2,3,0.148447,3.423251
3,4,0.048366,2.970057
4,5,0.162532,1.615722


In [99]:
df_from_arr.to_csv("trial_submission_df_3.csv", index=False)