In [2]:
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Append the parent of the current working directory ('./..') to the module
# search path so that imports can also be resolved from there.
sys.path.append('./..')

In [3]:
# Folder holding the 3DSC dataset, and the CSV inside it that is loaded below.
data_path, file = r"D:\tfg\data\3DSC", '3DSC_ICSD_only_IDs.csv'

# Full path to the CSV; echoed so the resolved location is visible in the output.
file_path = os.path.join(data_path, file)
print(file_path)


D:\tfg\data\3DSC\3DSC_ICSD_only_IDs.csv


In [4]:
# Load the 3DSC table (the first line of the file is skipped), give the four
# columns explicit names, and strip the 'ICSD-' prefix from the identifiers.
supercon_data = (
    pd.read_csv(file_path, skiprows=1)
    .set_axis(
        ['chemical_formula', 'critical_temperature_k', 'ICSD', 'synth_doped'],
        axis=1,
    )
    .assign(ICSD=lambda frame: frame['ICSD'].str.replace('ICSD-', ''))
)
supercon_data

Unnamed: 0,chemical_formula,critical_temperature_k,ICSD,synth_doped
0,Ag0.002Al0.998,1.128000,604645,True
1,Ag0.005Zn0.995,0.763000,107744,True
2,Ag0.02Ge2Pd1.98Sr1,2.640000,165985,True
3,Ag0.07Sn0.9Te1,1.730000,19128,True
4,Ag0.153Ba1.99Cu2.847Y1O6.7,86.000000,202498,True
...,...,...,...,...
86485,Zr1,0.996667,76154,False
86486,Zr1,0.996667,43700,False
86487,Zr1,0.996667,164572,False
86488,Zr1,0.996667,253515,False


In [5]:
data_raw_path = r"D:\tfg\data\data_raw"


def _parse_aflowlib_filename(filename):
    """Split a raw-data file name into (bravais_lattice, formula, icsd).

    The name is expected to be underscore-separated, with the lattice in the
    first field, the formula in the second and the ICSD number in the fourth
    (index 3).

    Parameters
    ----------
    filename : str
        Bare file name (no directory component).

    Returns
    -------
    tuple[str, str, str] or None
        ``(lattice, formula, icsd)``; ``icsd`` is ``'unknown'`` when the name
        has fewer than four fields. ``None`` when the file is not a ``.txt``
        file or its name has fewer than two fields.
    """
    suffix = '.txt'
    if not filename.endswith(suffix):
        return None

    # Drop only the trailing extension; str.replace would also remove any
    # '.txt' that happens to appear in the middle of the name.
    parts = filename[:-len(suffix)].split('_')

    # A name without an underscore cannot provide both lattice and formula;
    # skip it instead of letting tuple unpacking abort the whole scan.
    if len(parts) < 2:
        return None

    lattice, formula = parts[:2]
    icsd = parts[3] if len(parts) >= 4 else 'unknown'
    return lattice, formula, icsd


# One (lattice, formula, icsd) record per parsed file.
datos_archivos = []

for carpeta in os.listdir(data_raw_path):
    ruta_carpeta = os.path.join(data_raw_path, carpeta)

    # Only sub-directories are scanned; loose files at the top level are ignored.
    if not os.path.isdir(ruta_carpeta):
        continue

    for archivo in os.listdir(ruta_carpeta):
        registro = _parse_aflowlib_filename(archivo)
        if registro is not None:
            datos_archivos.append(registro)

# Build the DataFrame once from the collected records.
aflowlib_data = pd.DataFrame(datos_archivos, columns=['bravais_lattice', 'material_formula', 'ICSD'])

aflowlib_data

Unnamed: 0,bravais_lattice,material_formula,ICSD
0,BCC,Bi4Ge3O12,260560
1,BCC,Fe5O12Y3,28561
2,BCC,P3Rh1,43724
3,BCC,Mn1,44932
4,BCC,Rh7Sb6Yb4,409885
...,...,...,...
60336,TRI,Br2Co1O3Sb2,418858
60337,TRI,H8Mg1O12P2Zr1,183237
60338,TRI,H40Ho6N8O49,418822
60339,TRI,Fe2H3Na1O8S2,61211


In [29]:
# Number of 3DSC rows attached to each ICSD identifier.
supercon_data['ICSD'].value_counts()

ICSD
78209     367
78252     367
78230     367
56526     366
78234     366
         ... 
253430      1
100666      1
102109      1
102285      1
653531      1
Name: count, Length: 9950, dtype: int64

In [6]:
# Number of aflowlib files found for each ICSD value (including 'unknown').
aflowlib_data['ICSD'].value_counts()

ICSD
unknown    116
000          3
39242        3
76670        3
74520        2
          ... 
164964       1
107122       1
102016       1
648440       1
421599       1
Name: count, Length: 59966, dtype: int64

In [13]:
# Rows whose file name did not carry an ICSD number.
aflowlib_data.loc[aflowlib_data['ICSD'].eq('unknown')]

Unnamed: 0,bravais_lattice,material_formula,ICSD
633,BCC,Eu0,unknown
756,BCC,Hf0,unknown
826,BCC,Re0,unknown
934,BCC,H4,unknown
1125,BCC,B2N4Na0,unknown
...,...,...,...
57580,TET,I1Nb3,unknown
57613,TET,Nd1Se1,unknown
58011,TET,Mn1O2,unknown
58342,TET,Cu1F1Na0,unknown


In [27]:
# Cross-check: does any aflowlib entry without an ICSD number share an exact
# formula with a 3DSC superconductor?
# The row mask must come from the 'ICSD' column. Comparing the whole DataFrame
# with 'unknown' masks element-wise and turns every other cell (including
# 'material_formula') into NaN, so the join below could never match anything.
merged_data_test = pd.merge(
    supercon_data,
    aflowlib_data[aflowlib_data['ICSD'] == 'unknown'],
    left_on='chemical_formula',
    right_on='material_formula',
    how='inner',
)
merged_data_test

Unnamed: 0,chemical_formula,critical_temperature_k,ICSD_x,synth_doped,bravais_lattice,material_formula,ICSD_y


In [23]:
# Keep only the 3DSC rows whose ICSD identifier also appears in aflowlib,
# attaching the lattice and formula parsed from the aflowlib file names.
merged_data = supercon_data.merge(aflowlib_data, how='inner', on='ICSD')

merged_data

Unnamed: 0,chemical_formula,critical_temperature_k,ICSD,synth_doped,bravais_lattice,material_formula
0,Ag0.02Ge2Pd1.98Sr1,2.640000,165985,True,BCT,Ge2Pd2Sr1
1,Ag0.15Sn0.85Te1,2.150000,652741,True,FCC,Sn1Te1
2,Ag0.1Ge2Pd1.9Sr1,2.620000,165985,True,BCT,Ge2Pd2Sr1
3,Ag0.1In0.9Te1,1.200000,640614,True,FCC,In1Te1
4,Ag0.2Ba1Si1.8,3.200000,602228,True,CUB,Ba1Si2
...,...,...,...,...,...,...
14781,Zr1,0.996667,76168,False,BCC,Zr1
14782,Zr1,0.996667,76154,False,HEX,Zr1
14783,Zr1,0.996667,43700,False,HEX,Zr1
14784,Zr1,0.996667,164572,False,HEX,Zr1


In [8]:
# Number of merged rows per ICSD identifier.
merged_data.loc[:, 'ICSD'].value_counts()

ICSD
74145     189
169741    139
169555    139
180479    139
180478    139
         ... 
629666      1
41835       1
35134       1
600471      1
653531      1
Name: count, Length: 4900, dtype: int64

In [15]:
# Inspect every merged row attached to ICSD entry 74145.
merged_data.loc[merged_data['ICSD'].eq('74145')]

Unnamed: 0,chemical_formula,critical_temperature_k,ICSD,synth_doped,bravais_lattice,material_formula
6192,Ba0.004Cu1La1.996O4,0.0,74145,True,ORCC,Cu1La2O4
6193,Ba0.014Cu1La1.986O4,0.0,74145,True,ORCC,Cu1La2O4
6194,Ba0.016Cu1La1.984O4,0.0,74145,True,ORCC,Cu1La2O4
6195,Ba0.01Cu1La1.99O4,0.0,74145,True,ORCC,Cu1La2O4
6196,Ba0.024Cu1La1.976O4,0.0,74145,True,ORCC,Cu1La2O4
...,...,...,...,...,...,...
9114,Cu1La2O4.13,31.0,74145,True,ORCC,Cu1La2O4
9115,Cu1La2O4.18,0.0,74145,True,ORCC,Cu1La2O4
9116,Cu1La2O4.2,0.0,74145,True,ORCC,Cu1La2O4
9117,Cu1La2O4.32,37.5,74145,True,ORCC,Cu1La2O4


In [9]:
# Distinct ICSD identifiers that survived the merge.
icsd_supercon_merged_unique = merged_data['ICSD'].unique()

# n_supercon counts unique identifiers; n_data counts aflowlib rows.
n_data = len(aflowlib_data)
n_supercon = len(icsd_supercon_merged_unique)

# Emit the three summary lines with a single call, one per line.
print(
    f'- Superconducotores: {n_supercon}',
    f'- Materiales: {n_data}',
    f'- Porcentaje supercond: {(n_supercon/n_data*100):.2f}%',
    sep='\n',
)

- Superconducotores: 4900
- Materiales: 60341
- Porcentaje supercond: 8.12%


Para algunos materiales de aflowlib tengo varias posibles fórmulas químicas. Tengo entonces 4900 superconductores (identificadores ICSD únicos) en mi base de datos original de 60341 materiales (entradas de aflowlib).