In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib notebook

In [None]:
# Load the coupling-pair table and the per-atom coordinate table from the working directory.
train, structures = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'structures.csv')
)

### Explore all the default features

In [None]:
# Preview the first five rows of the training table.
train.head(n=5)

In [None]:
# Preview the first five rows of the atom-coordinate table.
structures.head(n=5)

In [None]:
# Histogram of the target column over all training rows (default binning).
fig, ax = plt.subplots()
ax.hist(train['scalar_coupling_constant'])
plt.show()

In [None]:
# Number of distinct molecule names appearing in the training table.
len(train['molecule_name'].unique())

In [None]:
# Rows per coupling type, obtained by counting the non-null `id` values in each group.
type_count = train.groupby('type')[['id']].count()

fig, ax = plt.subplots()
ax.bar(type_count.index, type_count['id'])
plt.show()

In [None]:
# Distribution of the target for each coupling type.
# x/y are passed by keyword: seaborn >= 0.12 only accepts `data` positionally and
# raises TypeError for positional x/y.
fig, ax = plt.subplots()
sns.violinplot(x='type', y='scalar_coupling_constant', data=train, ax=ax)
ax.set_title('scalar_coupling_constant by type')
plt.show()

### Make Distance Feature

In [None]:
# Attach the structure row of the first atom of each pair (matched on molecule + atom_index_0).
# A left join keeps every training row even when no matching structure row exists.
mrg1 = train.merge(
    structures,
    how='left',
    left_on=['molecule_name', 'atom_index_0'],
    right_on=['molecule_name', 'atom_index'],
)
mrg1.head()

In [None]:
# Attach the structure row of the second atom of each pair (matched on molecule + atom_index_1).
#
# `mrg1` still carries the `atom_index` join key from the first merge. If it is left in
# place, the suffixes rename it to `atom_index_0` (and the right-hand key to
# `atom_index_1`), duplicating the columns of those names that `mrg1` already has.
# pandas >= 2.0 raises MergeError for that; older versions silently produce duplicate
# column labels. Dropping the helper key on both sides avoids the collision while the
# overlapping atom/x/y/z columns still receive the `_0` / `_1` suffixes.
mrg2 = pd.merge(mrg1.drop(columns='atom_index'), structures, how='left',
                left_on=['molecule_name', 'atom_index_1'],
                right_on=['molecule_name', 'atom_index'],
                suffixes=('_0', '_1')).drop(columns='atom_index')
mrg2.head()

In [None]:
# The atom index columns are not needed once both atoms' coordinates are attached.
index_columns = ['atom_index_0', 'atom_index_1']
df = mrg2.drop(index_columns, axis='columns')
df.head()

In [None]:
# Euclidean distance between the two atoms of every pair, from their x/y/z coordinates.
delta_x = df['x_1'] - df['x_0']
delta_y = df['y_1'] - df['y_0']
delta_z = df['z_1'] - df['z_0']
d = np.sqrt(delta_x**2 + delta_y**2 + delta_z**2)
d.head()

In [None]:
# Add the distance as column `d` on a copy, leaving `df` itself untouched.
df1 = df.copy()
df1['d'] = d
df1.head()

In [None]:
# (rows, columns) of the feature table.
(len(df1.index), len(df1.columns))

In [None]:
# Work on a prefix of the table so the exploratory scatter plots stay responsive.
# Positional slicing is used because `.loc[0:100000]` is label-based and inclusive:
# it returns 100001 rows and only works as intended with a default integer index.
# NOTE(review): this takes the first rows, not a random sample - confirm that is intended.
N_SAMPLE_ROWS = 100_000
df1_s = df1.iloc[:N_SAMPLE_ROWS].copy()

### Explore Relationship Between Distance and SCC

In [None]:
# Scatter of the target (x axis) against the pair distance (y axis) for the subsample.
# A fresh figure is created explicitly: with the `%matplotlib notebook` backend a bare
# `plt.scatter` would draw onto whichever earlier figure is still current.
fig, ax = plt.subplots()
ax.scatter(df1_s['scalar_coupling_constant'], df1_s['d'])
ax.set_xlabel('scalar_coupling_constant')
ax.set_ylabel('d')
plt.show()

In [None]:
# Distance vs. target for the subsample, coloured by coupling type.
# x/y/hue are passed by keyword (seaborn >= 0.12 raises TypeError for positional
# x/y/hue) and the plot gets its own figure instead of reusing the current one.
fig, ax = plt.subplots()
sns.scatterplot(x='d', y='scalar_coupling_constant', hue='type', data=df1_s, ax=ax)
ax.set_title('scalar_coupling_constant vs. d by type')
plt.show()

In [None]:
def plot_distance_vs_scc(frame, coupling_type):
    """Scatter `d` against `scalar_coupling_constant` for one coupling type.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table with `type`, `d` and `scalar_coupling_constant` columns.
    coupling_type : str
        Value of the `type` column to keep (e.g. '1JHC').

    Returns
    -------
    matplotlib.axes.Axes
        The axes of the newly created figure.
    """
    subset = frame.loc[frame['type'] == coupling_type]
    # Each type gets its own figure; without this every plot would be drawn onto
    # the figure that happens to be current.
    fig, ax = plt.subplots()
    # Keyword x/y: seaborn >= 0.12 raises TypeError for positional x/y.
    sns.scatterplot(x='d', y='scalar_coupling_constant', data=subset, ax=ax)
    ax.set_title(coupling_type)
    return ax


# Same eight coupling types, in the same order, as the original one-cell-per-type plots.
COUPLING_TYPES = ('1JHC', '1JHN', '2JHH', '2JHN', '2JHC', '3JHH', '3JHN', '3JHC')

for coupling_type in COUPLING_TYPES:
    plot_distance_vs_scc(df1, coupling_type)
plt.show()

### Explore Other Features

In [None]:
# Load the scalar-coupling contributions table and preview its first rows.
sc_contribs = pd.read_csv(filepath_or_buffer='scalar_coupling_contributions.csv')
sc_contribs.head(n=5)

In [None]:
# Summary statistics of the contributions table (default quartiles spelled out).
sc_contribs.describe(percentiles=[0.25, 0.5, 0.75])

### What's our error if we use only the `fc` contribution as the prediction?

In [None]:
# Pair every training row with its contributions row; the inner join (pandas' default)
# keeps only pairs present in both tables.
pair_keys = ['molecule_name', 'atom_index_0', 'atom_index_1']
fc_merge = train.merge(sc_contribs, how='inner', on=pair_keys)
fc_merge.head()

In [None]:
# Signed relative error of using `fc` alone in place of the full target value.
target = fc_merge['scalar_coupling_constant']
fc_err = (fc_merge['fc'] - target) / target

In [None]:
# Box plot of the relative error of the fc-only estimate.
# Drawn on its own figure: with the `%matplotlib notebook` backend a bare seaborn call
# would be added to whichever earlier figure is still current.
fig, ax = plt.subplots()
sns.boxplot(data=fc_err, ax=ax)
ax.set_title('Relative error of fc vs. scalar_coupling_constant')
plt.show()

## Other data files

In [None]:
# Load the dipole-moments file and preview its first rows.
dipoles = pd.read_csv(filepath_or_buffer='dipole_moments.csv')
dipoles.head(n=5)

In [None]:
# Load the Mulliken-charges file and preview its first rows.
mulliken = pd.read_csv(filepath_or_buffer='mulliken_charges.csv')
mulliken.head(n=5)

In [None]:
# Load the magnetic-shielding-tensors file and preview its first rows.
shielding = pd.read_csv(filepath_or_buffer='magnetic_shielding_tensors.csv')
shielding.head(n=5)

In [None]:
# Load the potential-energy file and preview its first rows.
potential = pd.read_csv(filepath_or_buffer='potential_energy.csv')
potential.head(n=5)

### Plotting Single Molecule in 3D

In [None]:
# Name of the molecule in the row labelled 0; its .xyz file is parsed further below.
m_name = df1['molecule_name'].loc[0]
m_name

In [None]:
from collections import namedtuple

In [None]:
# Lightweight record for one atom: element symbol plus x/y/z coordinates.
Atom = namedtuple('Atom', 'elt x y z')

In [None]:
# Parse the molecule's .xyz file into column lists.
# Layout read here: first line = number of atom records, second line = skipped,
# then one whitespace-separated "<element> <x> <y> <z>" record per atom.
atoms = {'elt': [], 'x': [], 'y': [], 'z': []}

with open(f'structures/{m_name}.xyz') as xyz_file:
    atom_count = int(xyz_file.readline())
    xyz_file.readline()  # second line is not used

    for _ in range(atom_count):
        # Exactly four fields are expected; anything else raises ValueError here.
        symbol, x_text, y_text, z_text = xyz_file.readline().split()
        atoms['elt'].append(symbol)
        for axis, text in zip('xyz', (x_text, y_text, z_text)):
            atoms[axis].append(float(text))

In [None]:
# Turn the column lists into a table; note this rebinds `atoms` from a dict to a DataFrame.
atoms = pd.DataFrame(data=atoms, columns=['elt', 'x', 'y', 'z'])

In [None]:
# Projection of the parsed atoms onto the y-z plane, coloured by element.
# Author's observation: this looks like a methane molecule.
fig, ax = plt.subplots()
sns.scatterplot(data=atoms, x='y', y='z', hue='elt', ax=ax)

### Idea for dealing with molecules

Molecules have different numbers and types of atoms. Is there a way to reduce each molecule to a fixed-length vector of latent features?

### Ideas for new features

1. Get the nearest atoms to the pair being compared.
2. Do something with the magnetic shielding tensors. What does each entry of the tensor represent?
3. Get the vector that points to the other molecule. Can this be used with the magnetic shielding tensor?
4. Are there closed-form equations for some types of J-coupling?
5. What other attempts have been made at this problem?