In [None]:
import pandas as pd
import numpy as np
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
import sklearn
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import plotly.express as px

In [None]:
import sys

# Report the versions of the core libraries used by this notebook so the
# analysis environment can be reproduced.
libs = [sns, np, sklearn, pd, matplotlib]
print(*(lib.__version__ for lib in libs), sep='\n')

In [None]:
# Display the version of the Python interpreter running this notebook.
from platform import python_version
python_version()

## Exploratory data analysis

In [None]:
# Load the semicolon-separated QTAIM descriptor table.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is replaced by a path relative to the repository.
dg = pd.read_csv(
    filepath_or_buffer=r'C:\Users\evg_ondar\Prediction-of-Pt-NMR-chem-shifts\dataset\qtaim_pt_dataset.csv',
    sep=';',
)

# Set the bookkeeping columns aside (they are re-attached later) and keep only
# the remaining columns in the working frame.
dois, smiles = dg['Source_doi'], dg['Smiles']
dg = dg.drop(columns=['Source_doi', 'Smiles'])
dg.info()

In [None]:
# Keep the 'Shift' column aside; it is dropped from `dg` before the
# correlation analysis and re-attached afterwards.
shifts = dg.loc[:, 'Shift']

In [None]:
# Global seaborn look used by the following figures.
sns.set_style(style="white")
sns.set_context("notebook")

# Horizontal bar chart with the number of rows per charge class.
_, ax = plt.subplots(figsize=(7, 5))
sns.countplot(ax=ax, y='Charge_class', data=dg, palette='bright')
sns.despine(offset=10, trim=True)
# Depending on the seaborn version the bars live in one container or in one
# container per category; labelling only containers[0] can therefore annotate
# just the first bar. Label every container so all bars get their count.
for container in ax.containers:
    ax.bar_label(container)
ax.set(xlabel='Count', ylabel='Charge')
# plt.savefig('charge_count.png', dpi=300)
plt.show()

In [None]:
# One count plot per low-cardinality (< 10 distinct values) object column.
for column in dg.select_dtypes(include='object'):
    if dg[column].nunique() < 10:
        _, ax = plt.subplots(figsize=(7, 5))
        sns.countplot(ax=ax, y=column, data=dg, palette='bright')
        sns.despine(offset=10, trim=True)
        # Depending on the seaborn version the bars live in one container or
        # in one container per category; label all of them so that every bar
        # is annotated, not only the first one.
        for container in ax.containers:
            ax.bar_label(container)
        ax.set(xlabel='Count', ylabel=column)
#         plt.savefig('{}.png'.format(column), dpi=300, bbox_inches='tight')
        plt.show()

In [None]:
sns.set_style("white")
sns.set_context("notebook")

# Object columns with fewer than 10 distinct values get one violin plot each,
# showing how 'Shift' is distributed within every category.
low_cardinality = [
    name for name in dg.select_dtypes(include='object')
    if dg[name].nunique() < 10
]

for column in low_cardinality:
    fig, ax = plt.subplots(figsize=(6, 6))
    swarm_plot = sns.violinplot(data=dg, x='Shift', y=column, palette='bright', ax=ax)
#     sns.stripplot(y=column, x='Shift', data=dg, color='gray', size=3)
    sns.despine(offset=10, trim=True)
    swarm_plot.set(xlabel='\u03B4, ppm', ylabel=column)

    plt.show()
    #fig.savefig('{}.png'.format(column), dpi=300, bbox_inches='tight')

In [None]:
# Histogram of the 'Shift' column.
# The figure size is set when the figure is created: pandas ignores the
# `figsize` argument of DataFrame.hist when an existing `ax` is supplied, so
# the original (7, 5) request never took effect.
fig, ax = plt.subplots(figsize=(7, 5))
dg.hist('Shift', ax=ax, bins=15, grid=False, zorder=2, rwidth=0.9)
# Label through the explicit axes object rather than the pyplot state machine.
ax.set_xlabel('\u03B4, ppm')
ax.set_ylabel('Count')
# plt.savefig('shift_hist.png', dpi=300)
plt.show()

In [None]:
# Overview grid of histograms for the columns of `dg`, on a large canvas with
# rotated x tick labels to keep them readable.
dg.hist(xrot=45, figsize=(30, 30))
plt.show()

In [None]:
# Remove 'Shift' so it takes no part in the correlation filtering below
# (it was saved in `shifts` and is re-attached later).
dg = dg.drop(columns='Shift')

In [None]:
# Pairwise correlation matrix of the descriptors.
# `dg` still holds object (categorical) columns; since pandas 2.0
# DataFrame.corr() no longer silently skips them and raises instead, so the
# frame is explicitly restricted to numeric/bool columns first. This form
# also works on older pandas versions that lack the `numeric_only` argument.
corrs = dg.select_dtypes(include=['number', 'bool']).corr()

plt.figure(figsize=(40, 40))
sns.heatmap(corrs, cmap='RdBu_r', annot=True)
plt.savefig('qtaim_corr_matrix.png', dpi=200)
plt.show()

In [None]:
# Detect overly correlated features (threshold = 0.95).
# Only the strict upper triangle of the correlation matrix is inspected, so
# each pair is considered once and the earlier column of a pair is kept.
# NOTE(review): the comparison is on the signed coefficient, so strongly
# negative correlations (< -0.95) are not flagged — confirm this is intended.
upper_mask = np.triu(np.ones(corrs.shape, dtype=bool), k=1)
upper = corrs.where(upper_mask)
exceeds_threshold = (upper > 0.95).any(axis=0)
strong_corr = upper.columns[exceeds_threshold].tolist()
dg = dg.drop(columns=strong_corr)
strong_corr

In [None]:
# Number of missing values per remaining column.
dg.isna().sum()

In [None]:
# Re-attach the columns that were set aside before the correlation filtering,
# appended in the order Shift, Smiles, Source_doi.
dg = dg.assign(Shift=shifts, Smiles=smiles, Source_doi=dois)
dg.info()

In [None]:
# Persist the de-correlated dataset (without the index column) to the current
# working directory.
dg.to_csv(path_or_buf='no_corr_qtaim_pt_dataset.csv', index=False)

## Visualization

In [None]:
# List the columns that remain after the correlation filter.
dg.columns

### PCA

In [None]:
# Columns fed to the scaler, PCA and (later) t-SNE.
# NOTE(review): 'Shift' is part of this list, so it contributes to the
# projections — confirm this is intended.
features = ['Rho at Nucleus', 'DelSqRho', 'V', 'G', 'L', 'Vnuc', 'Ven', 'Vrep',
       'DelSqVen', 'DelSqVrep', 'DelSqK', 'ESP', 'ESPe', 'q(A)', 'L(A)',
       'K(A)','|Mu_Intra(A)|', 'Molecular dipole', 'Shift']

# Work on a copy: `df = dg` only created an alias, so the in-place scaling
# below also overwrote the raw values held in `dg`.
df = dg.copy()
scaler = StandardScaler()

# Standardize the features, then fit a PCA that keeps every component.
df[features] = scaler.fit_transform(df[features])
pca = PCA()
components = pca.fit_transform(df[features])

# Axis labels keyed by component index (as a string), annotated with the
# percentage of variance each component explains.
labels = {
    str(i): f"PC {i+1} ({var:.1f}%)"
    for i, var in enumerate(pca.explained_variance_ratio_ * 100)
}

# For each of the first n components, report the feature with the largest
# absolute loading.
n = 3
most_important = [np.abs(pca.components_[i]).argmax() for i in range(n)]
initial_feature_names = features
print('Initial', initial_feature_names)
most_important_names = [initial_feature_names[most_important[i]] for i in range(n)]
print('PCA', most_important_names)

# Pairwise scatter plots of the first n components, coloured by geometry.
fig = px.scatter_matrix(
    components,
    labels=labels,
    dimensions=range(n),
    color=df['Geometry']
)
fig.update_traces(diagonal_visible=False)
fig.show()

In [None]:
# Variance explained by the three plotted components only. The PCA was fitted
# with all components, so summing the whole ratio vector always gave 100 % and
# made the title meaningless.
total_var = pca.explained_variance_ratio_[:3].sum() * 100

# First three principal components, coloured by geometry.
fig = px.scatter_3d(
    components, x=0, y=1, z=2, color=df['Geometry'],
    title=f'Total Explained Variance: {total_var:.2f}%',
    labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3'}
)
fig.show()

In [None]:
# Variance explained by the three plotted components only. The PCA was fitted
# with all components, so summing the whole ratio vector always gave 100 % and
# made the title meaningless.
total_var = pca.explained_variance_ratio_[:3].sum() * 100

# First three principal components, coloured by ligand.
fig = px.scatter_3d(
    components, x=0, y=1, z=2, color=df['Ligand'],
    title=f'Total Explained Variance: {total_var:.2f}%',
    labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3'}
)
fig.show()

In [None]:
# Variance explained by the three plotted components only. The PCA was fitted
# with all components, so summing the whole ratio vector always gave 100 % and
# made the title meaningless.
total_var = pca.explained_variance_ratio_[:3].sum() * 100

# First three principal components, coloured by charge class.
fig = px.scatter_3d(
    components, x=0, y=1, z=2, color=df['Charge_class'],
    title=f'Total Explained Variance: {total_var:.2f}%',
    labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3'}
)
fig.show()

### t-SNE

In [None]:
# Embed the standardized feature columns into three dimensions with t-SNE;
# the fixed random_state keeps the embedding reproducible between runs.
tsne = TSNE(random_state=0, n_components=3)
projections = tsne.fit_transform(df[features])

# 3-D scatter of the embedding, coloured by ligand.
fig = px.scatter_3d(projections, x=0, y=1, z=2, color=df['Ligand'])
fig.update_traces(marker=dict(size=8))
fig.show()

In [None]:
# Reuse the t-SNE embedding (`projections`) computed in the first t-SNE cell.
# Refitting here with the same data and random_state=0 reproduced exactly the
# same embedding, so the costly fit was pure overhead.
# If `features` or `df` change, re-run the first t-SNE cell before this one.
fig = px.scatter_3d(
    projections, x=0, y=1, z=2,
    color=df['Geometry']
)
fig.update_traces(marker_size=8)
fig.show()

In [None]:
# Reuse the t-SNE embedding (`projections`) computed in the first t-SNE cell.
# Refitting here with the same data and random_state=0 reproduced exactly the
# same embedding, so the costly fit was pure overhead.
# If `features` or `df` change, re-run the first t-SNE cell before this one.
fig = px.scatter_3d(
    projections, x=0, y=1, z=2,
    color=df['Charge_class']
)
fig.update_traces(marker_size=8)
fig.show()