<a href="https://colab.research.google.com/github/lstorchi/materialml/blob/main/correlations_atomic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Load the Excel file into a DataFrame.
# The path is relative to the notebook's working directory; no sheet_name is
# passed, so pandas reads its default sheet.
# NOTE(review): when opened through the Colab badge, ./data/alldata.xlsx
# presumably has to be uploaded or the repository cloned first -- confirm.
file_path = './data/alldata.xlsx'
props = pd.read_excel(file_path)

In [None]:
# Bare expression as the last line of the cell: the notebook renders the
# loaded DataFrame so its contents can be inspected.
props

In [None]:
# Pairwise correlation between all columns of the loaded table
# (pandas default method, i.e. Pearson).
correlation_matrix = props.corr()

# Widen pandas' text output so the full matrix is printed without being
# truncated or wrapped:
#   max_columns=None         -> show every column
#   expand_frame_repr=False  -> keep each row on a single line
#   width=None               -> let pandas pick the display width
display_options = {
    'display.max_columns': None,
    'display.expand_frame_repr': False,
    'display.width': None,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

print("Correlation Matrix:", correlation_matrix, sep="\n")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import numpy as np

# Open a new 25x20 inch figure; the heatmap drawn later in this cell lands on it.
plt.figure(figsize=(25, 20))

def custom_color_function(x):
    """Return the RGB tuple of a blue -> white -> red ramp at position ``x``.

    The ramp has three segments:

    * ``x < 0.2``: linear blend from blue (at 0) to white (at 0.2);
    * ``0.2 <= x < 0.8``: plain white;
    * otherwise: linear blend from white (at 0.8) to red (at 1).

    Args:
        x: Position on the ramp, nominally in ``[0, 1]``. Values outside
            that range are extrapolated linearly, not clamped.

    Returns:
        A 3-tuple ``(r, g, b)`` of colour components.
    """
    blue, white, red = (0, 0, 1), (1, 1, 1), (1, 0, 0)
    white_start, white_end = 0.2, 0.8

    if x < white_start:
        # Lower segment: blue at 0, reaching white at white_start.
        start, end = blue, white
        numerator, denominator = x, white_start
    elif x < white_end:
        # Flat middle band.
        return tuple(white)
    else:
        # Upper segment: white at white_end, reaching red at 1.
        start, end = white, red
        numerator, denominator = x - white_end, 1 - white_end

    # Same arithmetic for every channel: start + (end - start) * fraction.
    return tuple(
        lo + (hi - lo) * numerator / denominator
        for lo, hi in zip(start, end)
    )

# Build a colormap by sampling custom_color_function at 256 evenly spaced
# positions between 0 and 1.
cmap = LinearSegmentedColormap.from_list(
    'custom',
    [custom_color_function(i) for i in np.linspace(0, 1, 256)],
)

# The matrix is symmetric, so hide everything strictly below the diagonal
# and draw only the upper triangle (diagonal included).
mask = np.logical_not(np.triu(np.ones_like(correlation_matrix, dtype=bool)))

# Annotated heatmap of the correlation matrix, one decimal per cell.
heatmap = sns.heatmap(correlation_matrix, mask=mask, annot=True, cmap=cmap, fmt=".1f", annot_kws={"size": 10})

# Rotate the x tick labels for readability and show them on BOTH the top and
# the bottom edge of the plot.
# This is done with tick_params on the heatmap's own axes: the previous
# twinx() approach could not work, because twinx() makes the twin's x-axis
# invisible (so no bottom labels were ever drawn) and it adds an unrelated
# 0-1 y-axis on the right-hand side.
heatmap.set_xticklabels(heatmap.get_xticklabels(), rotation=90)
heatmap.tick_params(
    axis='x',
    top=True, labeltop=True,
    bottom=True, labelbottom=True,
    labelrotation=90,
)

plt.title('Correlation Heatmap')
plt.show()


In [None]:
# Same upper-triangle correlation heatmap as before, this time with the
# built-in 'coolwarm' colormap, on a new 25x20 inch figure.
plt.figure(figsize=(25, 20))

# The matrix is symmetric, so hide everything strictly below the diagonal
# and draw only the upper triangle (diagonal included).
mask = np.logical_not(np.triu(np.ones_like(correlation_matrix, dtype=bool)))

# Annotated heatmap of the correlation matrix, one decimal per cell.
heatmap = sns.heatmap(correlation_matrix, mask=mask, annot=True, cmap='coolwarm', fmt=".1f", annot_kws={"size": 10})

# Rotate the x tick labels for readability and show them on BOTH the top and
# the bottom edge of the plot.
# This is done with tick_params on the heatmap's own axes: the previous
# twinx() approach could not work, because twinx() makes the twin's x-axis
# invisible (so no bottom labels were ever drawn) and it adds an unrelated
# 0-1 y-axis on the right-hand side.
heatmap.set_xticklabels(heatmap.get_xticklabels(), rotation=90)
heatmap.tick_params(
    axis='x',
    top=True, labeltop=True,
    bottom=True, labelbottom=True,
    labelrotation=90,
)

plt.title('Correlation Heatmap')
plt.show()


In [None]:
# List every property whose strongest absolute correlation with any OTHER
# property does not exceed the threshold.

# Numeric view of the raw data (non-convertible entries become NaN).
# NOTE(review): not used below any more; kept in case other cells read it.
props_numeric = props.apply(pd.to_numeric, errors='coerce')

correlation_df = pd.DataFrame(correlation_matrix)

# Make sure every entry of the correlation matrix is numeric
correlation_df_numeric = correlation_df.apply(pd.to_numeric, errors='coerce')

# Take the names from the correlation matrix itself, not from the raw data:
# if corr() left out any column of `props`, positions in the matrix would no
# longer line up with props' column order and wrong names would be printed.
property_names = list(correlation_df_numeric.columns)

# Set the threshold for the maximum correlation coefficient
threshold = 0.8
print("Threshold: ",threshold)
print("Property, max corr ")

# The matrix is symmetric; iterate over its columns.
for i, col in enumerate(correlation_df_numeric.values.T):
    others = np.abs(col[np.arange(len(col)) != i])  # Exclude diagonal element
    if others.size == 0:
        # Single-property matrix: there is nothing to compare against.
        continue
    max_corr = np.max(others)
    # A NaN maximum means at least one correlation is undefined; such
    # properties are skipped.
    if not np.isnan(max_corr) and max_corr <= threshold:
        print(f"{property_names[i]}, {max_corr:.2f}")

In [None]:
import numpy as np
import pandas as pd

# Rank the properties by their average absolute correlation with all OTHER
# properties (lowest first).

# Numeric view of the raw data (non-convertible entries become NaN).
# NOTE(review): not used below any more; kept in case other cells read it.
props_numeric = props.apply(pd.to_numeric, errors='coerce')

correlation_df = pd.DataFrame(correlation_matrix)
correlation_df_numeric = correlation_df.apply(pd.to_numeric, errors='coerce')
size = int(correlation_df.shape[0])

# Take the names from the correlation matrix itself, not from the raw data:
# if corr() left out any column of `props`, positions in the matrix would no
# longer line up with props' column order and wrong names would be printed.
property_names = list(correlation_df_numeric.columns)

results = []

# Iterate over the rows of the matrix (it is symmetric, so rows and columns
# hold the same values).
for i, row in enumerate(correlation_df_numeric.values):
    others = np.abs(row[np.arange(len(row)) != i])  # Exclude diagonal element
    # With a single property there is nothing to average over.
    avg_abs_corr = np.mean(others) if others.size else float('nan')
    print(f"Property {i}, that is {property_names[i]}, has an average correlation coefficient of {avg_abs_corr:.5f}")
    results.append((avg_abs_corr, i))

# Sort ascending by (value, index). NaN never compares as smaller or larger,
# so a plain sorted() would give an arbitrary order as soon as one average is
# undefined; NaN entries are therefore placed explicitly at the end.
sorted_results = sorted(
    results,
    key=lambda item: (1, 0.0, item[1]) if np.isnan(item[0]) else (0, item[0], item[1]),
)
print()
print()
print()
for avg_abs_corr, i in sorted_results:
    property_name = property_names[i]
    print(f"{property_name}, {avg_abs_corr:.3f}")



In [None]:
# Rank the properties by their maximum absolute correlation with any OTHER
# property (lowest first).

# Numeric view of the raw data (non-convertible entries become NaN).
# NOTE(review): not used below any more; kept in case other cells read it.
props_numeric = props.apply(pd.to_numeric, errors='coerce')

correlation_df = pd.DataFrame(correlation_matrix)
correlation_df_numeric = correlation_df.apply(pd.to_numeric, errors='coerce')
size = int(correlation_df.shape[0])

# Take the names from the correlation matrix itself, not from the raw data:
# if corr() left out any column of `props`, positions in the matrix would no
# longer line up with props' column order and wrong names would be printed.
property_names = list(correlation_df_numeric.columns)

results = []

# Iterate over the rows of the matrix (it is symmetric, so rows and columns
# hold the same values).
for i, row in enumerate(correlation_df_numeric.values):
    others = np.abs(row[np.arange(len(row)) != i])  # Exclude diagonal element
    # With a single property there is nothing to take the maximum of.
    max_abs_corr = np.max(others) if others.size else float('nan')
    print(f"Property {i}, that is {property_names[i]}, has a max correlation coefficient of {max_abs_corr:.5f}")
    results.append((max_abs_corr, i))

# Sort ascending by (value, index). NaN never compares as smaller or larger,
# so a plain sorted() would give an arbitrary order as soon as one maximum is
# undefined; NaN entries are therefore placed explicitly at the end.
sorted_results = sorted(
    results,
    key=lambda item: (1, 0.0, item[1]) if np.isnan(item[0]) else (0, item[0], item[1]),
)
print()
print()
print()
for max_abs_corr, i in sorted_results:
    property_name = property_names[i]
    print(f"{property_name}, {max_abs_corr:.3f}")