## 1 用[numpy](https://www.runoob.com/numpy/numpy-tutorial.html)数组表示数据

In [2]:
import numpy as np

# np.array accepts a list or a tuple and builds a NumPy array from it.
a = [1, 2, 3]
b = (0.1, 0.2, 0.3)
c = [1]
d = [a, b]  # two length-3 sequences -> a 2*3 two-dimensional array, i.e. a matrix
e = [d, d]  # nesting once more -> a 2*2*3 three-dimensional array; deeper nesting adds dimensions
for sample in (a, b, c, d, e):
    print(np.array(sample))

[1 2 3]
[0.1 0.2 0.3]
[1]
[[1.  2.  3. ]
 [0.1 0.2 0.3]]
[[[1.  2.  3. ]
  [0.1 0.2 0.3]]

 [[1.  2.  3. ]
  [0.1 0.2 0.3]]]


## 2 读取原始光谱数据

### 用下面的方法读取txt文件

In [3]:
# Iterating an open text file yields its lines one by one, newline included.
with open('data/0.txt', 'r') as f:
    data = list(f)
data  # a list; each element is one (wavenumber, intensity) point of the spectrum

['4000.000000,0.012245\n',
 '3997.379913,0.012245\n',
 '3994.759825,0.012245\n',
 '3992.139738,0.012245\n',
 '3989.519651,0.012245\n',
 '3986.899563,0.012245\n',
 '3984.279476,0.012245\n',
 '3981.659389,0.012245\n',
 '3979.039301,0.012245\n',
 '3976.419214,0.012245\n',
 '3973.799127,0.012245\n',
 '3971.179039,0.012245\n',
 '3968.558952,0.012245\n',
 '3965.938865,0.012245\n',
 '3963.318777,0.012245\n',
 '3960.698690,0.012245\n',
 '3958.078603,0.012245\n',
 '3955.458515,0.012245\n',
 '3952.838428,0.012245\n',
 '3950.218341,0.010204\n',
 '3947.598253,0.012245\n',
 '3944.978166,0.012245\n',
 '3942.358079,0.012245\n',
 '3939.737991,0.012245\n',
 '3937.117904,0.012245\n',
 '3934.497817,0.012245\n',
 '3931.877729,0.012245\n',
 '3929.257642,0.012245\n',
 '3926.637555,0.012245\n',
 '3924.017467,0.012245\n',
 '3921.397380,0.012245\n',
 '3918.777293,0.012245\n',
 '3916.157205,0.012245\n',
 '3913.537118,0.010204\n',
 '3910.917031,0.010204\n',
 '3908.296943,0.012245\n',
 '3905.676856,0.012245\n',
 

In [4]:
 # 用split方法切片得到波数或者强度值
# Each entry of `data` is 'wavenumber,intensity\n'; split on the comma to separate the fields.
wavenum = [i.split(',')[0] for i in data]
# strip() removes the trailing newline and keeps each intensity a plain string.
# split() would wrap every value in a one-element list and turn `spec` into an
# (N, 1) column instead of a 1-D array matching `wavenum`.
spec = [j.split(',')[1].strip() for j in data]
# float16 only resolves steps of 2 between 2048 and 4096, so a wavenumber such as
# 3997.379913 would be stored as 3998.0; keep the wavenumber axis in float64.
wavenum = np.array(wavenum, dtype=np.float64)
spec = np.array(spec, dtype=np.float16)

### 从```data```文件夹下获取除了```inchi.txt```以外的所有文件名称，存储在列表里，并且按照编号从小到大顺序排列

In [5]:
import os
data_dir = os.path.join('.', 'data')
# Keep regular files only, leave out the InChI list, and order the rest by the
# integer value of the file name without its extension.
filelist = sorted(
    (
        x
        for x in os.listdir(data_dir)
        if x != 'inchi.txt' and os.path.isfile(os.path.join(data_dir, x))
    ),
    key=lambda x: int(os.path.splitext(x)[0]),
)
print(filelist)

['0.txt', '1.txt', '2.txt', '3.txt', '4.txt', '5.txt', '6.txt', '7.txt', '8.txt', '9.txt', '10.txt', '11.txt', '12.txt', '13.txt', '14.txt', '15.txt', '16.txt', '17.txt', '18.txt', '19.txt', '20.txt', '21.txt', '22.txt', '23.txt', '24.txt', '25.txt', '26.txt', '27.txt', '28.txt', '29.txt', '30.txt', '31.txt', '32.txt', '33.txt', '34.txt', '35.txt', '36.txt', '37.txt', '38.txt', '39.txt', '40.txt', '41.txt', '42.txt', '43.txt', '44.txt', '45.txt', '46.txt', '47.txt', '48.txt', '49.txt', '50.txt', '51.txt', '52.txt', '53.txt', '54.txt', '55.txt', '56.txt', '57.txt', '58.txt', '59.txt', '60.txt', '61.txt', '62.txt', '63.txt', '64.txt', '65.txt', '66.txt', '67.txt', '68.txt', '69.txt', '70.txt', '71.txt', '72.txt', '73.txt', '74.txt', '75.txt', '76.txt', '77.txt', '78.txt', '79.txt', '80.txt', '81.txt', '82.txt', '83.txt', '84.txt', '85.txt', '86.txt', '87.txt', '88.txt', '89.txt', '90.txt', '91.txt', '92.txt', '93.txt', '94.txt', '95.txt', '96.txt', '97.txt', '98.txt', '99.txt', '100.txt'

### 读取这些文件里的光谱强度，存储为一个数组，每行代表一条光谱，形状为(光谱数量 * 光谱维度)

In [6]:
speclist = []
for file in filelist:
    with open(os.path.join(data_dir, file), 'r') as f:
        data = f.readlines()

    # For every non-blank line take the field after the first comma and convert
    # each whitespace-separated value in it to float16.
    file_spec = [
        np.float16(x)
        for line in data
        if line.strip()
        for x in line.split(',')[1].split()
    ]
    speclist.append(file_spec)
# One row per file, in the order given by `filelist`.
spec_array = np.array(speclist, dtype=np.float16)
print(spec_array)

[[0.012245 0.012245 0.012245 ...      nan      nan      nan]
 [0.01837  0.01837  0.01837  ...      nan      nan      nan]
 [0.0102   0.012245 0.012245 ...      nan      nan      nan]
 ...
 [0.0102   0.012245 0.012245 ...      nan      nan      nan]
 [0.0102   0.012245 0.012245 ...      nan      nan      nan]
 [0.012245 0.012245 0.012245 ...      nan      nan      nan]]


### 读取`data/inchi.txt`文件，每一行代表一个分子的InChI（International Chemical Identifier，一种分子结构的表示方法）

In [7]:
# Path of the InChI list; built once and reused for the open() call below.
inchi_dir = os.path.join('./data', 'inchi.txt')
with open(inchi_dir, 'r') as f:
    # One InChI per line; surrounding whitespace is stripped and blank lines are skipped.
    # NOTE(review): skipping a blank line shifts every later entry up by one, so this
    # assumes inchi.txt never uses a blank line as a placeholder for a molecule -
    # confirm, because the list is later paired with the spectra by position.
    inchis = [line.strip() for line in f if line.strip()]
print(inchis)

['InChI=1S/C4H7Cl3O2Si/c1-4(8)9-2-3-10(5,6)7/h2-3H2,1H3', 'InChI=1S/C6H13ClO2Si/c1-6(8)9-4-5-10(2,3)7/h4-5H2,1-3H3', 'InChI=1S/C5H10Cl2O2Si/c1-5(8)9-3-4-10(2,6)7/h3-4H2,1-2H3', 'InChI=1S/C6H12Cl2O2Si/c1-6(9)10-4-3-5-11(2,7)8/h3-5H2,1-2H3', 'InChI=1S/C8H18O5Si/c1-8(9)13-6-5-7-14(10-2,11-3)12-4/h5-7H2,1-4H3', 'InChI=1S/C15H31NO6Si/c1-5-15(18)19-13-14(17)12-16-10-9-11-23(20-6-2,21-7-3)22-8-4/h5,14,16-17H,1,6-13H2,2-4H3', 'InChI=1S/C13H30O4Si3/c1-9-13(14)15-11-10-12-20(8,16-18(2,3)4)17-19(5,6)7/h9H,1,10-12H2,2-8H3', 'InChI=1S/C7H12Cl2O2Si/c1-3-7(10)11-5-4-6-12(2,8)9/h3H,1,4-6H2,2H3', 'InChI=1S/C9H18O4Si/c1-5-9(10)13-7-6-8-14(4,11-2)12-3/h5H,1,6-8H2,2-4H3', 'InChI=1S/C6H9Cl3O2Si/c1-2-6(10)11-4-3-5-12(7,8)9/h2H,1,3-5H2', 'InChI=1S/C9H18O5Si/c1-5-9(10)14-7-6-8-15(11-2,12-3)13-4/h5H,1,6-8H2,2-4H3', 'InChI=1S/C12H19Cl3Si/c13-16(14,15)2-1-12-6-9-3-10(7-12)5-11(4-9)8-12/h9-11H,1-8H2/t9-,10+,11-,12+', 'InChI=1S/C5H11ClSi/c1-4-5-7(2,3)6/h4H,1,5H2,2-3H3', 'InChI=1S/C5H12Si/c1-4-5-6(2)3/h4,6H,1,5H2,2

## 3 使用[pandas](https://www.runoob.com/pandas/pandas-tutorial.html)对数据进行批量处理

In [8]:
import pandas as pd

### pandas.DataFrame是一个类似表格的类
建立一个`pandas.DataFrame`实例，第一列称为`spectrum`，存储读取到的光谱强度；第二列称为`inchi`，对应每个物质的InChI：

In [9]:
# One row per molecule: its spectrum next to its InChI (structure information).
df = pd.DataFrame(
    data={
        'spectrum': [row for row in spec_array],
        'inchi': inchis,
    }
)

### 使用```drop```方法并规定一定的条件，可以根据某一列的值进行筛选，丢弃掉不符合条件的行
丢弃掉没有InChI的物质所在的行。

In [10]:
# Rows whose 'inchi' text does not start with the 'InChI=' prefix are removed by label.
missing_inchi = ~df['inchi'].str.startswith('InChI=')
df = df.drop(index=df.loc[missing_inchi].index)

### 使用```apply```方法可以以某一列作为输入，进行批量的操作
使用`inchi`列批量获得分子的SMILES，作为新的列`smiles`加入DataFrame：

In [14]:
'''
定义一个把InChI转换为SMILES的函数：'''
from rdkit import Chem
from rdkit.Chem import inchi
def inchi2smiles(inchi):
    """Convert an InChI string to a SMILES string with RDKit.

    Args:
        inchi: InChI identifier of one molecule.

    Returns:
        The string returned by ``Chem.MolToSmiles``, or ``None`` when ``inchi``
        is not a non-blank string or ``Chem.MolFromInchi`` yields no molecule.
    """
    # A column may hold missing values (None/NaN) or blank text instead of an
    # identifier; report those as "no structure" rather than passing them to RDKit.
    if not isinstance(inchi, str) or not inchi.strip():
        return None
    mol = Chem.MolFromInchi(inchi)
    if mol is None:
        return None
    return Chem.MolToSmiles(mol)

# Convert every entry of the 'inchi' column and store the results as a new 'smiles' column.
smiles_column = df['inchi'].apply(inchi2smiles)
df['smiles'] = smiles_column

[11:18:45] Explicit valence for atom # 6 C, 6, is greater than permitted
[11:18:45] ERROR: Explicit valence for atom # 6 C, 6, is greater than permitted



In [15]:
# Drop every molecule for which inchi2smiles returned None (RDKit produced no molecule).
# Selecting by the missing 'smiles' value instead of the hard-coded row label 76
# keeps this cell re-runnable (a second drop(76) raises KeyError) and correct if
# the input data changes.
df = df.dropna(subset=['smiles'])

In [16]:
def is_valid_smiles(smiles):
    """Return True when RDKit can parse ``smiles`` into a molecule.

    Args:
        smiles: Candidate SMILES value; anything that is not a ``str``
            (for example ``None``) is reported as invalid.

    Returns:
        ``True`` if ``Chem.MolFromSmiles`` returns a molecule, ``False`` if it
        returns ``None`` or raises an ordinary exception.
    """
    if not isinstance(smiles, str):
        return False
    try:
        mol = Chem.MolFromSmiles(smiles)
    # Catch ordinary errors only: a bare ``except:`` would also swallow
    # KeyboardInterrupt/SystemExit and make the cell impossible to interrupt.
    except Exception:
        return False
    return mol is not None
# Flag the rows whose 'smiles' value fails validation, then show them.
invalid_mask = ~df['smiles'].apply(is_valid_smiles)
invalid_smiles = df.loc[invalid_mask]
print(invalid_smiles)

Empty DataFrame
Columns: [spectrum, inchi, smiles]
Index: []


### 继续使用```apply```，将光谱数据通过插值法批量缩小至1024维

In [17]:
'''
定义一个插值函数：'''
from scipy.interpolate import interp1d
import numpy as np
def interpolate_to_1024(spectrum):
    """Resample a spectrum to 1024 evenly spaced points by linear interpolation.

    The input samples are placed at evenly spaced positions on [0, 1] and the
    result is read off at 1024 evenly spaced positions on the same interval,
    so the first and last input values are preserved.

    Args:
        spectrum: Sequence or array of intensities; converted to float32.
            Non-finite entries are replaced with ``np.nan_to_num`` before
            interpolating (NaN becomes 0).

    Returns:
        A float32 array of length 1024.

    Raises:
        ValueError: If ``spectrum`` is empty.
    """
    spectrum = np.asarray(spectrum, dtype=np.float32)
    orig_len = len(spectrum)
    # Fail with a clear message instead of an incidental IndexError from spectrum[0].
    if orig_len == 0:
        raise ValueError("cannot interpolate an empty spectrum")
    if not np.all(np.isfinite(spectrum)):
        spectrum = np.nan_to_num(spectrum)
    # A single sample has no slope to interpolate; repeat it.
    if orig_len == 1:
        return np.full(1024, spectrum[0], dtype=np.float32)

    x_orig = np.linspace(0, 1, orig_len)
    x_new = np.linspace(0, 1, 1024)

    # x_new covers the same [0, 1] range as x_orig, so the edge fill values are
    # only a safeguard against out-of-range queries.
    interp_fn = interp1d(
        x_orig,
        spectrum,
        kind='linear',
        bounds_error=False,
        fill_value=(spectrum[0], spectrum[-1]),
    )
    return interp_fn(x_new).astype(np.float32)

# Run the interpolation on every stored spectrum and write the results back to the column.
resampled = df['spectrum'].apply(interpolate_to_1024)
df['spectrum'] = resampled

### 把这个dataframe存为```.pkl```文件，命名为```dataset.pkl```：

In [18]:
# Serialize the DataFrame to dataset.pkl in the current working directory.
df.to_pickle(path="dataset.pkl")