<a href="https://colab.research.google.com/github/mipypf/practical-mi-guide/blob/develop/chapter4/src/feature_generation_cif.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 無機材料の特徴量をCIFから生成する

### 右上の「接続」をクリックし、ランタイムに接続

## ライブラリをインストール

In [1]:
! pip install matminer==0.9.3

Collecting matminer==0.9.3
  Downloading matminer-0.9.3-py3-none-any.whl.metadata (4.9 kB)
Collecting pymongo~=4.5 (from matminer==0.9.3)
  Downloading pymongo-4.11.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting monty>=2023 (from matminer==0.9.3)
  Downloading monty-2025.3.3-py3-none-any.whl.metadata (3.6 kB)
Collecting pymatgen>=2023 (from matminer==0.9.3)
  Downloading pymatgen-2025.3.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting ruamel.yaml (from monty>=2023->matminer==0.9.3)
  Downloading ruamel.yaml-0.18.10-py3-none-any.whl.metadata (23 kB)
Collecting palettable>=3.3.3 (from pymatgen>=2023->matminer==0.9.3)
  Downloading palettable-3.3.3-py2.py3-none-any.whl.metadata (3.3 kB)
Collecting pybtex>=0.24.0 (from pymatgen>=2023->matminer==0.9.3)
  Downloading pybtex-0.24.0-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting spglib>=2.5 (from pymatgen>=2023->matminer==0.9.3)
  Downloading spglib-2.6.0-cp3

### 「ランタイム」タブから「セッションを再起動する」を選択し、「はい」をクリック

## Google Colabの準備

In [1]:
# Set to True when running on Google Colab, False otherwise.
colab = True

In [2]:
# On Colab: open the Files pane and drag & drop material_projects_api_cif_1000.zip
# and material_projects_api_cif_1000.csv to upload them.
# Colab reads from and writes to the working directory; otherwise ../input/ and ../output/ are used.
INPUT_FILE_PATH, OUTPUT_FILE_PATH = ("./", "./") if colab else ("../input/", "../output/")

In [3]:
# For local runs, place material_projects_api_cif_1000.zip under INPUT_FILE_PATH.
import zipfile

# Extract next to the archive. Later cells read the CIFs from
# INPUT_FILE_PATH + "material_projects_api_cif_1000", so extracting into "./"
# only works when INPUT_FILE_PATH is "./" (the Colab setting).
# NOTE(review): assumes the archive contains a top-level
# material_projects_api_cif_1000/ folder - confirm.
with zipfile.ZipFile(INPUT_FILE_PATH + "material_projects_api_cif_1000.zip", "r") as zip_ref:
    zip_ref.extractall(INPUT_FILE_PATH)

## ライブラリをインポート

In [4]:
# NOTE(review): gc, glob and plt are not referenced in the cells visible here.
import gc
import glob
import os

import pandas as pd
from tqdm.notebook import tqdm

# Register tqdm's progress_* methods on pandas objects.
tqdm.pandas()

import warnings

# Suppress all warnings for the rest of the session.
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
from matminer.featurizers.structure import DensityFeatures
from pymatgen.core.structure import Structure

## データを読み込み、CIFからの特徴量生成の準備を行う

In [5]:
# Load material_projects_api_cif_1000.csv from INPUT_FILE_PATH into df.
csv_path = INPUT_FILE_PATH + "material_projects_api_cif_1000.csv"
df = pd.read_csv(csv_path)
df

Unnamed: 0,formula,id,band_gap,formation_energy,energy_above_hull
0,Cs3Nd(MoO4)3,mp-573369,3.7419,-2.414818,0.000000
1,Ba(PdS2)2,mp-28967,0.7792,-1.124079,0.000000
2,Na7Ta3Zn(SiO4)6,mp-2712904,2.5286,-2.912354,0.049678
3,TiZnBiO5,mp-1042447,0.0000,-2.236384,0.077777
4,Na2YNb3Si4(SO12)2,mp-2715697,1.8105,-2.837049,0.079832
...,...,...,...,...,...
995,NaHfMg14,mp-1026645,0.0000,0.123330,0.123330
996,CeGa3Ni,mp-1226580,0.0000,-0.499538,0.035687
997,ReRuCl2,mp-631417,0.0000,1.034857,1.818772
998,Li2NiH8(CO5)2,mp-771916,4.1397,-1.559949,0.052192


In [6]:
# List the entries of the material_projects_api_cif_1000 directory and count them.
cif_dir = INPUT_FILE_PATH + "material_projects_api_cif_1000"
files = os.listdir(cif_dir)
len(files)

1000

In [7]:
# Build files_df: one row per entry of `files`, stored in a single "file_name" column.
files_df = pd.DataFrame({"file_name": files})
files_df

Unnamed: 0,file_name
0,cif_mp-2715587_NaNbSi2SnSO12.cif
1,cif_mp-1190493_NiH5CN(ClO)2.cif
2,cif_mp-2218111_MgTi(WO4)2.cif
3,cif_mp-1194227_H3CO3.cif
4,cif_mp-759602_Sb4O5F2.cif
...,...
995,cif_mp-20750_Tb3SnC.cif
996,cif_mp-1099261_CsMg6GaO8.cif
997,cif_mp-10792_DyBO3.cif
998,cif_mp-22893_PbI2.cif


In [8]:
# Add an "id" column: the digits following "mp-" in each file name, re-prefixed with "mp-".
# File names without a match end up with a missing id.
mp_number = files_df["file_name"].str.extract(r"mp-(\d+)", expand=False)
files_df["id"] = "mp-" + mp_number
files_df

Unnamed: 0,file_name,id
0,cif_mp-2715587_NaNbSi2SnSO12.cif,mp-2715587
1,cif_mp-1190493_NiH5CN(ClO)2.cif,mp-1190493
2,cif_mp-2218111_MgTi(WO4)2.cif,mp-2218111
3,cif_mp-1194227_H3CO3.cif,mp-1194227
4,cif_mp-759602_Sb4O5F2.cif,mp-759602
...,...,...
995,cif_mp-20750_Tb3SnC.cif,mp-20750
996,cif_mp-1099261_CsMg6GaO8.cif,mp-1099261
997,cif_mp-10792_DyBO3.cif,mp-10792
998,cif_mp-22893_PbI2.cif,mp-22893


In [9]:
# Inner-join df with files_df on the shared "id" column.
merged_df = df.merge(files_df, how="inner", on="id")
merged_df


Unnamed: 0,formula,id,band_gap,formation_energy,energy_above_hull,file_name
0,Cs3Nd(MoO4)3,mp-573369,3.7419,-2.414818,0.000000,cif_mp-573369_Cs3Nd(MoO4)3.cif
1,Ba(PdS2)2,mp-28967,0.7792,-1.124079,0.000000,cif_mp-28967_Ba(PdS2)2.cif
2,Na7Ta3Zn(SiO4)6,mp-2712904,2.5286,-2.912354,0.049678,cif_mp-2712904_Na7Ta3Zn(SiO4)6.cif
3,TiZnBiO5,mp-1042447,0.0000,-2.236384,0.077777,cif_mp-1042447_TiZnBiO5.cif
4,Na2YNb3Si4(SO12)2,mp-2715697,1.8105,-2.837049,0.079832,cif_mp-2715697_Na2YNb3Si4(SO12)2.cif
...,...,...,...,...,...,...
995,NaHfMg14,mp-1026645,0.0000,0.123330,0.123330,cif_mp-1026645_NaHfMg14.cif
996,CeGa3Ni,mp-1226580,0.0000,-0.499538,0.035687,cif_mp-1226580_CeGa3Ni.cif
997,ReRuCl2,mp-631417,0.0000,1.034857,1.818772,cif_mp-631417_ReRuCl2.cif
998,Li2NiH8(CO5)2,mp-771916,4.1397,-1.559949,0.052192,cif_mp-771916_Li2NiH8(CO5)2.cif


In [10]:
# Add a "file_path" column: the CIF directory prefix followed by each row's file name.
cif_dir_prefix = INPUT_FILE_PATH + "material_projects_api_cif_1000/"
merged_df["file_path"] = cif_dir_prefix + merged_df["file_name"]
merged_df


Unnamed: 0,formula,id,band_gap,formation_energy,energy_above_hull,file_name,file_path
0,Cs3Nd(MoO4)3,mp-573369,3.7419,-2.414818,0.000000,cif_mp-573369_Cs3Nd(MoO4)3.cif,./material_projects_api_cif_1000/cif_mp-573369...
1,Ba(PdS2)2,mp-28967,0.7792,-1.124079,0.000000,cif_mp-28967_Ba(PdS2)2.cif,./material_projects_api_cif_1000/cif_mp-28967_...
2,Na7Ta3Zn(SiO4)6,mp-2712904,2.5286,-2.912354,0.049678,cif_mp-2712904_Na7Ta3Zn(SiO4)6.cif,./material_projects_api_cif_1000/cif_mp-271290...
3,TiZnBiO5,mp-1042447,0.0000,-2.236384,0.077777,cif_mp-1042447_TiZnBiO5.cif,./material_projects_api_cif_1000/cif_mp-104244...
4,Na2YNb3Si4(SO12)2,mp-2715697,1.8105,-2.837049,0.079832,cif_mp-2715697_Na2YNb3Si4(SO12)2.cif,./material_projects_api_cif_1000/cif_mp-271569...
...,...,...,...,...,...,...,...
995,NaHfMg14,mp-1026645,0.0000,0.123330,0.123330,cif_mp-1026645_NaHfMg14.cif,./material_projects_api_cif_1000/cif_mp-102664...
996,CeGa3Ni,mp-1226580,0.0000,-0.499538,0.035687,cif_mp-1226580_CeGa3Ni.cif,./material_projects_api_cif_1000/cif_mp-122658...
997,ReRuCl2,mp-631417,0.0000,1.034857,1.818772,cif_mp-631417_ReRuCl2.cif,./material_projects_api_cif_1000/cif_mp-631417...
998,Li2NiH8(CO5)2,mp-771916,4.1397,-1.559949,0.052192,cif_mp-771916_Li2NiH8(CO5)2.cif,./material_projects_api_cif_1000/cif_mp-771916...


## CIFから特徴量を生成する

In [11]:
# Parse one pymatgen Structure per row of merged_df and store it in a new
# "structure" column. The loop walks the "file_path" column itself rather than
# range(len(files)): the directory listing and the inner-joined merged_df are
# not guaranteed to have the same length, and a mismatch would raise an
# IndexError or a length error on assignment.
crystal_train = []

for cif_path in merged_df["file_path"]:
  with open(cif_path, "r") as f:
    cif_content = f.read()
  crystal_train.append(Structure.from_str(cif_content, fmt="cif"))

merged_df["structure"] = crystal_train

In [12]:
# Display merged_df, which now carries the parsed "structure" column.
merged_df

Unnamed: 0,formula,id,band_gap,formation_energy,energy_above_hull,file_name,file_path,structure
0,Cs3Nd(MoO4)3,mp-573369,3.7419,-2.414818,0.000000,cif_mp-573369_Cs3Nd(MoO4)3.cif,./material_projects_api_cif_1000/cif_mp-573369...,"[[ 1.63145925 0.23570193 10.77716588] Cs, [1...."
1,Ba(PdS2)2,mp-28967,0.7792,-1.124079,0.000000,cif_mp-28967_Ba(PdS2)2.cif,./material_projects_api_cif_1000/cif_mp-28967_...,"[[4.55327196 5.11963834 2.01929037] Ba, [1.798..."
2,Na7Ta3Zn(SiO4)6,mp-2712904,2.5286,-2.912354,0.049678,cif_mp-2712904_Na7Ta3Zn(SiO4)6.cif,./material_projects_api_cif_1000/cif_mp-271290...,"[[5.74553181 0.18487825 5.49029458] Na, [1.846..."
3,TiZnBiO5,mp-1042447,0.0000,-2.236384,0.077777,cif_mp-1042447_TiZnBiO5.cif,./material_projects_api_cif_1000/cif_mp-104244...,"[[4.62933005 3.74678864 6.11501235] Ti, [3.152..."
4,Na2YNb3Si4(SO12)2,mp-2715697,1.8105,-2.837049,0.079832,cif_mp-2715697_Na2YNb3Si4(SO12)2.cif,./material_projects_api_cif_1000/cif_mp-271569...,"[[ 4.45385419 7.65516804 11.47379283] Na, [ 4..."
...,...,...,...,...,...,...,...,...
995,NaHfMg14,mp-1026645,0.0000,0.123330,0.123330,cif_mp-1026645_NaHfMg14.cif,./material_projects_api_cif_1000/cif_mp-102664...,"[[-0.00959284 1.87776076 1.28696084] Na, [-1..."
996,CeGa3Ni,mp-1226580,0.0000,-0.499538,0.035687,cif_mp-1226580_CeGa3Ni.cif,./material_projects_api_cif_1000/cif_mp-122658...,"[[0. 0. 0.] Ce, [ 2.08237107 2.39787734 -5.56..."
997,ReRuCl2,mp-631417,0.0000,1.034857,1.818772,cif_mp-631417_ReRuCl2.cif,./material_projects_api_cif_1000/cif_mp-631417...,"[[0. 0. 0.] Re, [2.5076145 1.77315122 4.34331..."
998,Li2NiH8(CO5)2,mp-771916,4.1397,-1.559949,0.052192,cif_mp-771916_Li2NiH8(CO5)2.cif,./material_projects_api_cif_1000/cif_mp-771916...,"[[ 1.63225055 4.37071098 11.62767295] Li, [4...."


In [13]:
# As an example, display the structure stored in the row labelled 0 of merged_df.
merged_df.loc[0, "structure"]

Structure Summary
Lattice
    abc : 6.525837 9.536025 26.422264
 angles : 90.0 90.0 90.0
 volume : 1644.2718829861558
      A : np.float64(6.525837) np.float64(0.0) np.float64(3.995922696903683e-16)
      B : np.float64(1.5335097790382647e-15) np.float64(9.536025) np.float64(5.839131246419569e-16)
      C : np.float64(0.0) np.float64(0.0) np.float64(26.422264)
    pbc : True True True
PeriodicSite: Cs0 (Cs) (1.631, 0.2357, 10.78) [0.25, 0.02472, 0.4079]
PeriodicSite: Cs1 (Cs) (1.631, 5.004, 2.434) [0.25, 0.5247, 0.09212]
PeriodicSite: Cs2 (Cs) (4.894, 9.3, 15.65) [0.75, 0.9753, 0.5921]
PeriodicSite: Cs3 (Cs) (4.894, 4.532, 23.99) [0.75, 0.4753, 0.9079]
PeriodicSite: Cs4 (Cs) (1.631, 0.9904, 21.67) [0.25, 0.1039, 0.8201]
PeriodicSite: Cs5 (Cs) (1.631, 5.758, 17.96) [0.25, 0.6039, 0.6799]
PeriodicSite: Cs6 (Cs) (4.894, 8.546, 4.753) [0.75, 0.8961, 0.1799]
PeriodicSite: Cs7 (Cs) (4.894, 3.778, 8.458) [0.75, 0.3961, 0.3201]
PeriodicSite: Cs8 (Cs) (1.631, 1.797, 6.368) [0.25, 0.1884, 0.241]

In [14]:
# Run matminer's DensityFeatures featurizer on the "structure" column. The
# returned frame gains the "density", "vpa" and "packing fraction" columns.
# NOTE(review): ignore_errors=True presumably leaves rows that fail to
# featurize with missing values instead of raising - confirm in the matminer docs.
density_featurizer = DensityFeatures()
merged_df = density_featurizer.featurize_dataframe(
    merged_df,
    col_id="structure",
    ignore_errors=True,
)
merged_df

Unnamed: 0,formula,id,band_gap,formation_energy,energy_above_hull,file_name,file_path,structure,density,vpa,packing fraction
0,Cs3Nd(MoO4)3,mp-573369,3.7419,-2.414818,0.000000,cif_mp-573369_Cs3Nd(MoO4)3.cif,./material_projects_api_cif_1000/cif_mp-573369...,"[[ 1.63145925 0.23570193 10.77716588] Cs, [1....",4.131559,21.635156,0.721428
1,Ba(PdS2)2,mp-28967,0.7792,-1.124079,0.000000,cif_mp-28967_Ba(PdS2)2.cif,./material_projects_api_cif_1000/cif_mp-28967_...,"[[4.55327196 5.11963834 2.01929037] Ba, [1.798...",5.022717,22.595815,0.514463
2,Na7Ta3Zn(SiO4)6,mp-2712904,2.5286,-2.912354,0.049678,cif_mp-2712904_Na7Ta3Zn(SiO4)6.cif,./material_projects_api_cif_1000/cif_mp-271290...,"[[5.74553181 0.18487825 5.49029458] Na, [1.846...",4.084346,13.105962,0.511377
3,TiZnBiO5,mp-1042447,0.0000,-2.236384,0.077777,cif_mp-1042447_TiZnBiO5.cif,./material_projects_api_cif_1000/cif_mp-104244...,"[[4.62933005 3.74678864 6.11501235] Ti, [3.152...",6.095516,13.697723,0.396792
4,Na2YNb3Si4(SO12)2,mp-2715697,1.8105,-2.837049,0.079832,cif_mp-2715697_Na2YNb3Si4(SO12)2.cif,./material_projects_api_cif_1000/cif_mp-271569...,"[[ 4.45385419 7.65516804 11.47379283] Na, [ 4...",2.984554,15.054064,0.302596
...,...,...,...,...,...,...,...,...,...,...,...
995,NaHfMg14,mp-1026645,0.0000,0.123330,0.123330,cif_mp-1026645_NaHfMg14.cif,./material_projects_api_cif_1000/cif_mp-102664...,"[[-0.00959284 1.87776076 1.28696084] Na, [-1...",2.417495,23.257461,0.639440
996,CeGa3Ni,mp-1226580,0.0000,-0.499538,0.035687,cif_mp-1226580_CeGa3Ni.cif,./material_projects_api_cif_1000/cif_mp-122658...,"[[0. 0. 0.] Ce, [ 2.08237107 2.39787734 -5.56...",7.521893,18.013128,0.715436
997,ReRuCl2,mp-631417,0.0000,1.034857,1.818772,cif_mp-631417_ReRuCl2.cif,./material_projects_api_cif_1000/cif_mp-631417...,"[[0. 0. 0.] Re, [2.5076145 1.77315122 4.34331...",10.266085,14.484023,0.481329
998,Li2NiH8(CO5)2,mp-771916,4.1397,-1.559949,0.052192,cif_mp-771916_Li2NiH8(CO5)2.cif,./material_projects_api_cif_1000/cif_mp-771916...,"[[ 1.63225055 4.37071098 11.62767295] Li, [4....",2.198468,8.691208,0.241578


In [15]:
# Show the formula and structure next to the generated feature columns.
feature_view_columns = ["formula", "structure", "density", "vpa", "packing fraction"]
merged_df[feature_view_columns]

Unnamed: 0,formula,structure,density,vpa,packing fraction
0,Cs3Nd(MoO4)3,"[[ 1.63145925 0.23570193 10.77716588] Cs, [1....",4.131559,21.635156,0.721428
1,Ba(PdS2)2,"[[4.55327196 5.11963834 2.01929037] Ba, [1.798...",5.022717,22.595815,0.514463
2,Na7Ta3Zn(SiO4)6,"[[5.74553181 0.18487825 5.49029458] Na, [1.846...",4.084346,13.105962,0.511377
3,TiZnBiO5,"[[4.62933005 3.74678864 6.11501235] Ti, [3.152...",6.095516,13.697723,0.396792
4,Na2YNb3Si4(SO12)2,"[[ 4.45385419 7.65516804 11.47379283] Na, [ 4...",2.984554,15.054064,0.302596
...,...,...,...,...,...
995,NaHfMg14,"[[-0.00959284 1.87776076 1.28696084] Na, [-1...",2.417495,23.257461,0.639440
996,CeGa3Ni,"[[0. 0. 0.] Ce, [ 2.08237107 2.39787734 -5.56...",7.521893,18.013128,0.715436
997,ReRuCl2,"[[0. 0. 0.] Re, [2.5076145 1.77315122 4.34331...",10.266085,14.484023,0.481329
998,Li2NiH8(CO5)2,"[[ 1.63225055 4.37071098 11.62767295] Li, [4....",2.198468,8.691208,0.241578


## 実行環境のライブラリverを保存

In [16]:
# Check the Python version.
!python3 -V

Python 3.11.11


In [17]:
# Write the installed package versions to requirements_feature_generation_cif.txt.
!pip freeze > requirements_feature_generation_cif.txt

In [18]:
# Download the requirements file through the browser (Google Colab only).
# google.colab is importable only on Colab, so the import is guarded by the
# `colab` flag. The module is aliased so it does not overwrite the `files`
# list of CIF file names defined earlier in the notebook.
if colab:
  from google.colab import files as colab_files

  colab_files.download('requirements_feature_generation_cif.txt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>