In [2]:
import pandas as pd
from ase.visualize import view as view_molecule
from ase.io import read as read_molecule
import ase
import numpy as np
import random


from ase.cell import Cell
from dscribe.descriptors import CoulombMatrix, SineMatrix, EwaldSumMatrix, MBTR

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy.linalg as LA
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor


import warnings
warnings.filterwarnings('ignore')


In [3]:
DATA_PATH = './nomad2018-predict-transparent-conductors'


In [4]:
def custom_converter(entry):
    return np.array([float(x) for x in entry[1:-1].split(',')])

In [5]:
train_all_data = pd.read_csv(
    f'{DATA_PATH}/train_extrainfo.csv',
    converters={
        'CoulombMatrix':custom_converter,
        'SineMatrix':custom_converter,
        'EwaldSumMatrix':custom_converter
    }
)
test_all_data = pd.read_csv(
    f'{DATA_PATH}/test_extrainfo.csv',
    converters={
        'CoulombMatrix':custom_converter,
        'SineMatrix':custom_converter,
        'EwaldSumMatrix':custom_converter
    }
)

In [6]:
train_all_data['CoulombMatrix'][0].shape

(6400,)

# Basic linear regression

In [7]:
def get_eigenspectrum(matrix):
    spectrum = LA.eigvalsh(matrix)
    spectrum = np.sort(spectrum)[::-1]
    return spectrum

In [8]:
def create_matrix_df(data,pca_components=None,train=True):
    ewald_spectrum_list = []
    for m in data['EwaldSumMatrix']:
        ewald_spectrum_list.append(
            get_eigenspectrum(
                np.reshape(m, (80, 80))
            )
        )
    ewald_spectrum_df = pd.DataFrame(ewald_spectrum_list).astype(float)
    ewald_spectrum_df = ewald_spectrum_df.fillna(0)
    x = ewald_spectrum_df.loc[:, :].values
    x = StandardScaler().fit_transform(x)
    #y = data.loc[:, ['formation_energy_ev_natom']].values
    pca = PCA(n_components=15).fit(x)

    # PCA n_components calculation
    rolling_sum = 0
    n_components = 1
    for i, num in enumerate(pca.explained_variance_ratio_):
        rolling_sum += num
        if rolling_sum > 0.95:
            n_components = i
            break
    
    if not train:
        n_components = pca_components

    print(f'Performing PCA with {n_components} components')
    pca = PCA(n_components)
    principalComponents = pca.fit_transform(x)
    principalDf = pd.DataFrame(data=principalComponents)
    if train:
        to_drop = ['id', 'formation_energy_ev_natom', 'bandgap_energy_ev',
               'CoulombMatrix', 'SineMatrix', 'EwaldSumMatrix']
        dfcombined = pd.concat([data, principalDf], axis=1).drop(to_drop, axis=1)
    else:
        to_drop = ['id',
               'CoulombMatrix', 'SineMatrix', 'EwaldSumMatrix']
        dfcombined = pd.concat([data, principalDf], axis=1).drop(to_drop, axis=1)
    return dfcombined, n_components



In [22]:

def compute_final_values(train_all_data, test_all_data, target_column, model):
    # train
    dfcombined, n_components = create_matrix_df(train_all_data)
    rf = model.fit(dfcombined, train_all_data[target_column])
    score = rf.score(dfcombined, train_all_data[target_column])
    print(f'Training score: {score}')
    # test
    dfcombined, n_components = create_matrix_df(test_all_data, train=False, pca_components=n_components)
    predicted = rf.predict(dfcombined)
    return predicted


## Experiment 1: Random forest

In [None]:

pred_fe = compute_final_values(
    train_all_data, 
    test_all_data, 
    target_column='formation_energy_ev_natom',
    model = RandomForestRegressor(n_estimators=1000, random_state=2)
)
pred_bandgap = compute_final_values(
    train_all_data, 
    test_all_data, 
    target_column='bandgap_energy_ev',
    model = RandomForestRegressor(n_estimators=1000, random_state=2)
)

In [None]:
pred_fe = compute_final_values(
    train_all_data, 
    test_all_data, 
    target_column='formation_energy_ev_natom',
    model = RandomForestRegressor(n_estimators=1000, random_state=2)
)
pred_bandgap = compute_final_values(
    train_all_data, 
    test_all_data, 
    target_column='bandgap_energy_ev',
    model = RandomForestRegressor(n_estimators=1000, random_state=2)
)

In [19]:
id_1 = np.arange(1, len(pred_fe)+1, 1, dtype=int)
submission_df = pd.DataFrame({'id':id_1,'formation_energy_ev_natom':pred_fe,'bandgap_energy_ev':pred_bandgap})
submission_df.head()


Unnamed: 0,id,formation_energy_ev_natom,bandgap_energy_ev
0,1,0.190086,1.6364
1,2,0.068089,3.826807
2,3,0.146527,3.60357
3,4,0.032716,3.026442
4,5,0.129898,1.609366


In [20]:
submission_df.to_csv("trial_submission_df_3.csv", index=False)