In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Colab-only step: mount the drive at /content/drive, the root of every data path used below.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the structural-descriptor table from Drive; the first CSV column becomes the row index.
file_path = '/content/drive/My Drive/ML_piezo/descriptors_template.csv'
descriptors = pd.read_csv(filepath_or_buffer=file_path, index_col=0)

# Plain-text dump of the loaded table
print(descriptors)

In [None]:
# Read the atomic-property workbook from Drive (read with the default read_excel settings).
file_path_2 = '/content/drive/My Drive/ML_piezo/atomic_properties_uncorr_prova.xlsx'
props = pd.read_excel(io=file_path_2)

# Bare last expression -> shown by the notebook as the cell output
props

In [None]:
# Merge the two sets, creating two sets (A- and B-sites) of atomic properties for each entry in Bastien's database
#
# For every row of `descriptors`, the atomic properties of the element named in 'A-site'
# are appended as 'A_<property>' columns and those of the element named in 'B-site' as
# 'B_<property>' columns. The 'symbol' and 'name' columns of `props` are never copied.
#
# The lookup is done once per site with a symbol-indexed table instead of cell-by-cell
# `.at` writes: this keeps the dtypes of `props`, is aligned by row position (so duplicated
# row labels in `descriptors` cannot overwrite each other) and reports unknown symbols by name.

# One row per symbol; when a symbol is listed several times the first row wins.
atomic_lookup = (
    props.dropna(subset=['symbol'])
         .drop_duplicates(subset='symbol')
         .set_index('symbol')
         .drop(columns=['name'], errors='ignore')
)

site_blocks = []
for col_prefix, site_col in (('A_', 'A-site'), ('B_', 'B-site')):
    site_symbols = descriptors[site_col]

    # Fail early and name the offending symbols (missing values count as unknown).
    unknown = site_symbols[~site_symbols.isin(atomic_lookup.index)].unique()
    if len(unknown) > 0:
        raise KeyError(
            f"{site_col} symbols missing from the atomic property table: {list(unknown)}"
        )

    # Pull one property row per descriptor row, then re-label the rows so the block
    # lines up with `descriptors` position by position.
    block = atomic_lookup.reindex(site_symbols).add_prefix(col_prefix)
    block.index = descriptors.index
    site_blocks.append(block)

site_features = pd.concat(site_blocks, axis=1)

# Drop any same-named columns left by a previous run of this cell so that re-running
# replaces them instead of duplicating them.
descriptors = pd.concat(
    [descriptors.drop(columns=site_features.columns, errors='ignore'), site_features],
    axis=1,
)

print(descriptors.columns)
print()
print(descriptors)

In [None]:
# Create dict out of Bastien's list of descriptors
#
# Each usable line is split at its first ':' into a key (double quotes removed) and a
# value (newline and '#' characters removed). Only the first colon is used as separator,
# so a value that itself contains ':' is kept whole. Lines without any ':' (e.g. blank
# lines) are skipped instead of aborting the cell with an IndexError.
descdetails = {}
# `with` guarantees the file is closed even if a line fails to parse.
with open("/content/drive/My Drive/ML_piezo/list_structural_descriptors.txt", "r") as fp:
    for l in fp:
        k, separator, v = l.partition(":")
        if not separator:
            continue
        descdetails[k.replace("\"", "")] = v.replace("\n", "").replace("#", "")

In [None]:
# Columns excluded from the numeric analysis; for each one show how often every
# value occurs, followed by the array of distinct values.
nonvalues = ['label_structure', 'composition_pretty', 'A-site', 'B-site']
for label_col in nonvalues:
    column_values = descriptors[label_col]
    print(column_values.value_counts())
    print(column_values.unique())

In [None]:
# Remove constant columns
# A column is constant when it holds exactly one distinct value (NaN counted as a value).
constant_columns = [
    name for name in descriptors.columns
    if descriptors[name].nunique(dropna=False) == 1
]
for name in constant_columns:
    print("Removing constant column: ", name)

# Single in-place drop of everything reported above
descriptors.drop(columns=constant_columns, inplace=True)

In [None]:
# Threshold on |correlation| used by the filtering loop that follows
corrcut = 0.90

# float64 copy of every column not listed in `nonvalues`; built from the raw value
# arrays, so the resulting frame gets a fresh default row index.
tocrrolate = pd.DataFrame({
    name: descriptors[name].astype(np.float64).values
    for name in descriptors.columns
    if name not in nonvalues
})

# Filled by the filtering loop that follows
toremove = set()
basicdescr = set()

print(tocrrolate)
print()

# Compute |correlation matrix| after removal of constant values, to determine what to remove at this stage
corr = tocrrolate.corr().abs()

# Show the full matrix: every column, no line wrapping, automatic width
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.expand_frame_repr', False),
    ('display.width', None),
):
    pd.set_option(option_name, option_value)

print(corr)
print("\n")

# Greedy correlation filter over the |correlation| matrix `corr`.
# Columns are visited in order. A visited column that has not been discarded is kept
# (added to `basicdescr`) and every not-yet-kept column whose |r| with it exceeds
# `corrcut` is marked for removal (added to `toremove`).
for c in corr.columns:
    if c in toremove:
        # Already discarded because of an earlier kept descriptor: it must neither be
        # reported as kept nor be used to eliminate further descriptors, otherwise
        # columns whose only strong correlate is itself dropped would be lost too.
        continue
    print(c)
    basicdescr.add(c)
    for cc in corr.columns:
        if cc == c or cc in basicdescr:
            continue
        strength = corr.loc[cc, c]
        if strength > corrcut:
            print("\t %20s %7.3f"%(cc, strength))
            toremove.add(cc)

# After the loop the two sets are disjoint: kept descriptors vs. descriptors to drop
print("basicdescr: ", basicdescr)
print("toremove: ", toremove)

# Signed correlation matrix (constant columns already removed), drawn as an annotated heat map
corr = tocrrolate.corr()
fig, ax = plt.subplots(figsize=(18, 12))
heatmap = sns.heatmap(
    corr,
    annot=True,
    cmap='coolwarm',
    fmt=".1f",
    annot_kws={"size": 10},
    ax=ax,
)
# Vertical tick labels, placed along the top edge, keep long property names readable
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax.xaxis.set_ticks_position('top')
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Before further removal: rank the properties by their largest absolute correlation
# coefficient with any other property (the diagonal entry is left out).
tocrrolate_numeric = tocrrolate.apply(pd.to_numeric, errors='coerce')
property_names = list(tocrrolate_numeric.columns)

correlation_df = pd.DataFrame(corr)
correlation_df_numeric = correlation_df.apply(pd.to_numeric, errors='coerce')
size = int(correlation_df.shape[0])

results = []

# Walks the matrix one row at a time; entry i of row i is the self-correlation.
for i, row_values in enumerate(correlation_df_numeric.values):
    off_diagonal = np.delete(row_values, i)
    max_abs_corr = np.max(np.abs(off_diagonal))
    print(f"Property {i}, that is {property_names[i]}, has a max correlation coefficient of {max_abs_corr:.5f}")
    results.append((max_abs_corr, i))

# Ascending by coefficient; ties fall back to the property position
sorted_results = list(results)
sorted_results.sort()

print("\n\n")
for max_abs_corr, i in sorted_results:
    print(f"{property_names[i]}, {max_abs_corr:.3f}")

In [None]:
# Before further removal: rank the properties by their mean absolute correlation
# coefficient with all other properties (the diagonal entry is left out).
# Uses `property_names` as already defined in the notebook.
correlation_df = pd.DataFrame(corr)
correlation_df_numeric = correlation_df.apply(pd.to_numeric, errors='coerce')
size = int(correlation_df.shape[0])

results = []

# Walks the matrix one row at a time; entry i of row i is the self-correlation.
for i, row_values in enumerate(correlation_df_numeric.values):
    off_diagonal = np.delete(row_values, i)
    avg_abs_corr = np.mean(np.abs(off_diagonal))
    print(f"Property {i}, that is {property_names[i]}, has an avg correlation coefficient of {avg_abs_corr:.5f}")
    results.append((avg_abs_corr, i))

# Ascending by coefficient; ties fall back to the property position
sorted_results = list(results)
sorted_results.sort()

print("\n\n")
for avg_abs_corr, i in sorted_results:
    print(f"{property_names[i]}, {avg_abs_corr:.3f}")

In [None]:
# Remove strongly correlated variables
print("Removing columns: ", toremove)
# One in-place drop for the whole set. errors='ignore' makes the cell safe to re-run:
# columns that were already dropped by a previous execution are simply skipped
# instead of raising KeyError.
tocrrolate.drop(columns=list(toremove), errors='ignore', inplace=True)

# Recompute the signed correlation matrix on the reduced frame and redraw the annotated heat map
corr = tocrrolate.corr()
fig, ax = plt.subplots(figsize=(18, 12))
heatmap = sns.heatmap(
    corr,
    annot=True,
    cmap='coolwarm',
    fmt=".1f",
    annot_kws={"size": 10},
    ax=ax,
)
# Vertical tick labels, placed along the top edge, keep long property names readable
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax.xaxis.set_ticks_position('top')

ax.set_title('Correlation Heatmap')
plt.show()