! pip install pandas-profiling<br>
! pip install scikit-learn<br>
! pip install CBFV<br>
! pip install numpy

    importing all the essential libraries

In [None]:
import pandas as pd
import pandas_profiling
import os                        
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('TkAgg')
import numpy as np              
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from CBFV.composition import generate_features 
from time import time 
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error        
import joblib
from joblib import load 

    Read the file

In [None]:
# Load '../Files/C2DB_full.csv' into a DataFrame and echo it for a quick
# visual check of the raw table.
c2db_csv = '../Files/C2DB_full.csv'
df = pd.read_csv(c2db_csv)
print(df)

    Selecting only the formula and band gap columns

In [None]:
# Keep only the composition and band-gap columns; .copy() detaches the
# result from the full table so later edits do not touch the original frame.
wanted_columns = ['Formula', 'Band gap']
df = df.loc[:, wanted_columns].copy()
print(df)
print(df.shape)
# Persist the two-column table for the following cells.
df.to_csv('../Files/for_bg.csv', index=False)

    Selecting only the TMCs

In [None]:
# Label every formula as a transition-metal chalcogenide ('TMC') or 'Other'
# and write the TMC rows to '../Files/uncleaned_TMC.csv'.
df = pd.read_csv('../Files/for_bg.csv')

# Element symbols used for the selection (oxygen is counted as a chalcogen).
CHALCOGENS = {'O', 'S', 'Se', 'Te'}
TRANSITION_METALS = {
    'Sc', 'Y', 'La', 'Ti', 'Zr', 'Hf', 'V', 'Nb', 'Ta', 'Cr', 'Mo', 'W',
    'Mn', 'Tc', 'Re', 'Fe', 'Ru', 'Os', 'Co', 'Rh', 'Ir', 'Ni', 'Pd', 'Pt',
    'Cu', 'Ag', 'Au', 'Zn', 'Cd', 'Hg',
}

# Split each formula into whole element symbols (one capital letter plus an
# optional lowercase letter). Comparing whole symbols instead of substrings
# avoids false positives such as 'S' matching inside 'Sc'/'Si'/'Sn', 'O'
# matching inside 'Os', or 'Y' matching inside 'Yb'.
element_sets = df['Formula'].astype(str).str.findall(r'[A-Z][a-z]?').apply(set)

# A TMC must contain at least one chalcogen AND at least one transition metal.
is_tmc = element_sets.apply(
    lambda elems: bool(elems & CHALCOGENS) and bool(elems & TRANSITION_METALS)
)
df['Class'] = is_tmc.map({True: 'TMC', False: 'Other'})

# Create a new dataframe containing only the rows labelled 'TMC' in the "Class" column
tm_df = df[df['Class'] == 'TMC']
tm_df.to_csv('../Files/uncleaned_TMC.csv', index=False)

    to drop the class column

In [None]:
# Reload the TMC subset and strip the helper 'Class' label, leaving only the
# columns that were stored alongside it.
df = pd.read_csv('../Files/uncleaned_TMC.csv').drop(columns='Class')
# csv file with only TMC and Band gap (uncleaned)
df.to_csv('../Files/uncleaned_TMCs.csv', index=False)
print(df.shape)

    Pandas profiling to analyse the uncleaned dataframe

In [None]:
# Build a pandas-profiling report of the uncleaned TMC table, save it as HTML
# and open it in the default browser.
import webbrowser
from pathlib import Path

report_file = 'PandasProfilingReport_bandgap.html'

profile = pandas_profiling.ProfileReport(df)
profile.to_file(report_file)
profile.to_widgets()

# NOTE(review): this titled/explorative report is constructed but never
# written or displayed by this cell; the saved HTML comes from the default
# report above. Confirm which variant is actually wanted.
profile = pandas_profiling.ProfileReport(df, title='Pandas Profiling Report', explorative=True)

# Passing the .html file name to os.system only opens it where the shell can
# launch documents directly (e.g. Windows); webbrowser.open with a file URI
# works across platforms.
webbrowser.open(Path(report_file).resolve().as_uri())
df.dtypes

    to clean the data

In [None]:
# Clean the table: '-' placeholders become 0, rows whose band gap is not
# strictly positive are dropped, and only the first row per formula is kept.
df = df.replace('-', 0)
print(df)
print(df.shape)

positive_gap = df['Band gap'].astype(float) > 0
df = df[positive_gap]
df = df.drop_duplicates(subset=['Formula'], keep='first')

df.to_csv('../Files/cleaned_TMC.csv', index=False)

    Pandas profiling to analyse the cleaned dataframe

In [None]:
# Profile the cleaned TMC table, save the report as HTML and open it in the
# default browser.
import webbrowser
from pathlib import Path

cleaned_report_file = 'PandasProfilingReport_cleaned_bandgap.html'

df = pd.read_csv('../Files/cleaned_TMC.csv')
print(df.shape)
profile = pandas_profiling.ProfileReport(df)
profile.to_file(cleaned_report_file)

# Passing the .html file name to os.system only opens it where the shell can
# launch documents directly (e.g. Windows); webbrowser.open with a file URI
# works across platforms.
webbrowser.open(Path(cleaned_report_file).resolve().as_uri())

    to rename the columns

In [None]:
# Rename the columns to 'formula' / 'target' (the names the feature
# generation cell passes on to generate_features) and overwrite the cleaned csv.
df = pd.read_csv('../Files/cleaned_TMC.csv')
print(df)

column_names = {'Band gap': 'target', 'Formula': 'formula'}
df = df.rename(columns=column_names)

df.to_csv('../Files/cleaned_TMC.csv', index=False)
print(df)

    to generate the features and splitting the data into train and test

In [None]:
# Featurize the cleaned data, split it 80/20, standardize the features and
# fit an ExtraTreesRegressor on the training part.
SEED = 42  # one seed for both the split and the model, so runs are repeatable

# Composition-based features for every formula, using the 'oliynyk'
# element-property set. The result covers the WHOLE dataset; it is split below.
X_all_unscaled, y_all, formulae_train, skipped_train = generate_features(
    df, elem_prop='oliynyk', drop_duplicates=False, extend_features=False, sum_feat=True
)

X_train_unscaled, X_test_unscaled, y_train, y_test = train_test_split(
    X_all_unscaled, y_all, test_size=0.20, random_state=SEED
)

# Fit the scaler on the training features only, then apply the same
# transformation to the test features.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_unscaled)
X_test = scaler.transform(X_test_unscaled)

# Fit once and time it. A second, unseeded fit would replace the timed model
# with a different one, so the model is seeded and fitted a single time.
ti = time()
model = ExtraTreesRegressor(random_state=SEED)
model.fit(X_train, y_train)
dt = time() - ti
print(f'Finished fitting the model, total time: {dt:0.2f} s')

    to save the model

In [None]:
# Predict on both splits with the fitted model and persist the model to disk.
y_pred_train = model.predict(X_train)
y_pred = model.predict(X_test)

model_path = "../Files/"
model_filename = "Bandgap.joblib"
joblib.dump(model, f"{model_path}{model_filename}")

    to plot the predicted values and the actual values of the target from training data

In [None]:
# Parity plot for the training split: predicted vs. actual target values,
# with the ideal y = x diagonal and (optionally) a first-degree fit line.
reg_line = True  # to plot the regression line
xy_max = np.max([np.max(y_train), np.max(y_pred_train)])

plot = plt.figure(figsize=(6, 6))
plt.plot(y_train, y_pred_train, 'o', ms=9, mec='k', mfc='silver', alpha=0.4)
plt.plot([0, xy_max], [0, xy_max], 'k--', label='ideal')

if reg_line:
    polyfit = np.polyfit(y_train, y_pred_train, deg=1)
    # Evaluate the fit on the sorted unique actual values so the line is
    # drawn left to right.
    unique_actual = np.unique(y_train)
    reg_ys = np.poly1d(polyfit)(unique_actual)
    # Legend label spelled 'linear fit', matching the testing plot.
    plt.plot(unique_actual, reg_ys, alpha=0.8, label='linear fit')

plt.axis('scaled')
label = 'gap(ev)'
plt.xlabel(f'Actual {label}')
plt.ylabel(f'Predicted {label}')
plt.title(f'Training {type(model).__name__}, r2: {r2_score(y_train, y_pred_train):0.4f}')
plt.legend(loc='upper left')
plt.savefig('../Files/TrainingEtr.png', dpi=300, bbox_inches='tight')
plt.show()

    to save the predicted values and the actual values of the target from testing data

In [None]:
# Score the model on the test split, then store the actual/predicted pairs.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

test_results = {'Actual': y_test, 'Predicted': y_pred}
df = pd.DataFrame(test_results)
df.to_csv('../Files/pred_test.csv', index=False)

print(f"Mean squared error (MSE test): {mse:.2f}")
print(f"R-squared (R2 test): {r2:.2f}")
print(df)

    to plot the predicted values and the actual values of the target from testing data

In [None]:
# Parity plot for the test split: predicted vs. actual target values, with
# the ideal y = x diagonal and (optionally) a first-degree fit line.
reg_line = True
xy_max = np.max([np.max(y_test), np.max(y_pred)])
diagonal = [0, xy_max]

plot = plt.figure(figsize=(6, 6))
plt.plot(y_test, y_pred, 'o', ms=9, mec='k', mfc='silver', alpha=0.4)
plt.plot(diagonal, diagonal, 'k--', label='ideal')

if reg_line:
    polyfit = np.polyfit(y_test, y_pred, deg=1)
    fit_line = np.poly1d(polyfit)
    # Sorted unique actual values serve as the x-coordinates of the fit line.
    unique_actual = np.unique(y_test)
    reg_ys = fit_line(unique_actual)
    plt.plot(unique_actual, reg_ys, alpha=0.8, label='linear fit')

plt.axis('scaled')
label = 'gap(ev)'
plt.xlabel(f'Actual {label}')
plt.ylabel(f'Predicted {label}')
test_r2 = r2_score(y_test, y_pred)
plt.title(f'Testing {type(model).__name__}, r2: {test_r2:0.4f}')
plt.legend(loc='upper left')
plt.savefig('../Files/TestingEtr.png', dpi=300, bbox_inches='tight')
plt.show()

    to generate candidate new TMCs<br>
    each line below has the form: gp = [elements]  # oxidation states

In [None]:
# Candidate metals, three per list; the trailing comment on each line lists
# the oxidation states noted for that list.
# NOTE(review): by IUPAC numbering these four lists are groups 3-6; the
# gp4..gp7 names are kept unchanged.
gp4 = ['Sc', 'Y', 'La']    # oxidation states: 1, 2, 3
gp5 = ['Ti', 'Zr', 'Hf']   # oxidation states: 1, 2, 3, 4
gp6 = ['V', 'Nb', 'Ta']    # oxidation states: 1, 2, 3, 4, 5
gp7 = ['Cr', 'Mo', 'W']    # oxidation states: 1, 2, 3, 4, 5, 6

# All twelve metals in list order (presumably "valence 1 and 2" - TODO confirm).
val1_2 = [*gp4, *gp5, *gp6, *gp7]

In [None]:
# Build every M2X formula (metal from val1_2, X from the chalcogen list) and
# save the list as a one-column csv.
# Original annotation on the chalcogen list: "+2" - presumably the magnitude
# of the chalcogen oxidation state; TODO confirm.
chal = ['O', 'S', 'Se', 'Te']

# Metal-major order: all chalcogens for the first metal, then the next metal.
combinations = [f'{metal}2{chalcogen}' for metal in val1_2 for chalcogen in chal]
print(combinations)

# No column name is given, so the csv header is the default label 0.
df = pd.DataFrame(combinations)
df.to_csv('../Files/M2X.csv', index=False)
print(df.shape)

    to load the saved  model

In [None]:
# Restore the saved regressor and read the candidate M2X formulas.
model = load('../Files/Bandgap.joblib')

PATH = os.getcwd()
data_path = os.path.join(PATH, '../Files/M2X.csv')

# Add a placeholder 'target' column of zeros, then name the two columns
# 'formula' and 'target'.
df = pd.read_csv(data_path)
df['target'] = 0
df.columns = ['formula', 'target']

print(df)
print(df.shape)

    to generate features for the new TMCs and using the saved model to predict the bandgap

In [None]:
# Featurize the candidate formulas, predict their band gaps with the loaded
# model and save the results sorted from largest to smallest prediction.
X_unscaled, y, formulae, skipped = generate_features(
    df, elem_prop='oliynyk', drop_duplicates=False, extend_features=False, sum_feat=True
)

# Reuse the StandardScaler that was fitted on the training features: the
# model was trained in that scaled space, so re-fitting a new scaler on the
# candidates would put their features in a different space.
# NOTE: this relies on `scaler` from the training cell still being in memory;
# to run this cell in a fresh session, persist and reload the scaler together
# with the model.
X = scaler.transform(X_unscaled)
y_predict = model.predict(X)

df['target'] = y_predict
# sort_values returns a new frame; assign it so the sorted order is what
# gets saved and printed.
df = df.sort_values(by=['target'], ascending=False)
df.columns = ['formula', 'Bandgap']
df.to_csv('../Files/M2X_bg.csv', index=False)
print(df)
print(df.shape)