In [1]:
# 依旧先导包
# 老规矩，先导包和中文解码问题
import numpy as np
import pandas as pd
import seaborn as sns

# Make Chinese text render correctly in figures; this only needs to run once per kernel.
# `plt` is bound to matplotlib.pyplot (the conventional meaning of the alias) rather than
# to the top-level matplotlib package, so later plotting calls such as plt.figure() work.
# pyplot exposes the same global rcParams mapping as the top-level package.
import matplotlib.pyplot as plt

# Use the SimHei font for sans-serif text so Chinese characters are drawn.
plt.rcParams['font.sans-serif'] = ['SimHei']
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

In [None]:
import os

# Collect the names of every entry in the current working directory and display them.
list_dir = os.listdir(os.curdir)
list_dir

['.git',
 '.idea',
 'analyse_dataset.ipynb',
 'dataset',
 'model',
 'models.py',
 'README.md',
 'requirements.txt',
 'train.py',
 'utils.py',
 '__pycache__',
 '研究日记.md']

In [2]:
# Dispersion dataset: read the workbook and keep only the first 8793 rows.
# NOTE(review): the cutoff presumably separates COMSOL rows from network-generated rows
# (the workbook has a marker column mentioning 8793) — confirm the exact boundary.
disper_comsol_df = pd.read_excel('./dataset/色散comsol+网络.xlsx').iloc[:8793]

# Effective-refractive-index-difference dataset: same treatment, first 11545 rows.
diff_neff_comsol_df = pd.read_excel('./dataset/Δneff_comsol+网络.xlsx').iloc[:11545]

In [5]:
# 1. Load the Δneff dataset from its Excel workbook.
#    Only the first 11545 rows are kept: per the original note these are genuine
#    simulation results, while the rows after them were generated by a neural network.
#    The last column is dropped as well.
data = pd.read_excel('./dataset/Δneff_comsol+网络.xlsx').iloc[:11545, :-1]

# 2. Split into feature columns (the first six) and label columns (everything after);
#    every label value is multiplied by 10000.
x, y = data.iloc[:, :6], data.iloc[:, 6:] * 10000
x

Unnamed: 0,n1,n2,r8,r9,r10,wl
0,0.047,-0.010,3.00,6.00,7.50,1.50
1,0.047,-0.010,3.00,6.00,7.50,1.52
2,0.047,-0.010,3.00,6.00,7.50,1.54
3,0.047,-0.010,3.00,6.00,7.50,1.56
4,0.047,-0.010,3.00,6.00,7.50,1.58
...,...,...,...,...,...,...
11540,0.050,-0.030,5.05,7.00,7.92,1.56
11541,0.060,-0.040,4.92,7.03,8.01,1.54
11542,0.060,-0.040,4.92,7.03,8.01,1.55
11543,0.060,-0.040,4.92,7.03,8.01,1.56


In [10]:
# 1. Load the Δneff dataset from its Excel workbook.
#    Only the first 11545 rows are kept: per the original note these are genuine
#    simulation results, while the rows after them were generated by a neural network.
#    The last column is dropped as well.
data = pd.read_excel('./dataset/Δneff_comsol+网络.xlsx').iloc[:11545, :-1]

# 2. Split into x (first six columns) and y (remaining columns), both cast to float32
#    because the downstream neural-network arithmetic is done in floating point.
#    Per the original inline notes, x holds the structural parameters (used as both
#    input and output) and y holds the optical-property values (used as the condition input).
x, y = (part.astype(np.float32) for part in (data.iloc[:, :6], data.iloc[:, 6:]))

# Scale the first two x columns by 100 ...
x.iloc[:, :2] = x.iloc[:, :2].mul(100)

# ... then scale all of x and all of y by 10000.
# NOTE(review): the first two x columns therefore end up scaled by 100 * 10000 in total —
# confirm this combined factor is intended.
x, y = x.mul(10000), y.mul(10000)
y

Unnamed: 0,TE01-HE21,HE21-TM01,HE31-EH11,HE41-EH21,HE51-EH31,HE61-EH41
0,47972.121094,58859.847656,11848.740234,4911.460449,2931.800537,13842.285156
1,49441.925781,60756.968750,12288.101562,5034.927734,3196.682617,14652.287109
2,50925.968750,62675.152344,12732.131836,5154.427734,3479.208984,15498.324219
3,52423.761719,64613.691406,13180.486328,5269.503418,3780.067627,16380.730469
4,53934.789062,66571.859375,13632.813477,5379.686523,4099.952148,17299.818359
...,...,...,...,...,...,...
11540,107001.640625,209744.875000,117892.000000,76398.101562,51500.277344,33525.785156
11541,102295.781250,194120.250000,105715.554688,67934.523438,45829.972656,29713.513672
11542,103827.148438,197407.843750,107715.085938,69269.921875,46759.417969,30304.757812
11543,105363.484375,200704.062500,109718.648438,70607.320312,47688.890625,30893.144531


In [9]:
# Row count plus minimum and maximum of every column summarised by describe()
# for the dispersion dataset.
disper_comsol_df.describe().loc[['count', 'min', 'max'], :]

Unnamed: 0,n1,n2,r8,r9,r10,wl,HE11,HE21,TE01,TM01,HE31,EH11,HE41,EH21,HE51,EH31,HE61,EH41,8793(含)前为comsol
count,8793.0,8793.0,8793.0,8793.0,8793.0,8793.0,8793.0,8793.0,8793.0,8793.0,8793.0,8793.0,8793.0,8793.0,8793.0,8793.0,8793.0,8793.0,0.0
min,0.018,-0.05,2.709,5.7,7.0,1.51,-47.864918,-49.7978,-47.073422,-115.965649,-51.621914,-115.537541,-53.496668,-114.09636,-56.356086,-115.267863,-62.296721,-115.907453,
max,0.09,-0.01,8.1,14.0,16.0,1.59,51.093308,55.906172,51.82943,62.764967,68.885087,71.38496,89.993278,90.689616,118.444217,117.836674,153.609636,148.524927,


In [40]:
# Differences between adjacent radius columns of the dispersion dataset.
diff_r3_r2 = disper_comsol_df['r10'].sub(disper_comsol_df['r9'])  # r10 - r9
diff_r2_r1 = disper_comsol_df['r9'].sub(disper_comsol_df['r8'])   # r9 - r8

# Place both differences side by side under explicit, readable column names.
diff_df = pd.concat(
    [diff_r3_r2, diff_r2_r1],
    axis=1,
    keys=['diff_r3_r2', 'diff_r2_r1'],
)

diff_df.describe()

Unnamed: 0,diff_r3_r2,diff_r2_r1
count,8793.0,8793.0
mean,1.084347,2.482746
std,0.198466,1.067835
min,0.6,1.3
25%,1.0,1.8
50%,1.0,2.3
75%,1.1,2.95
max,4.5,8.8


In [38]:
# Row count plus minimum and maximum of every column summarised by describe()
# for the Δneff dataset.
diff_neff_comsol_df.describe().loc[['count', 'min', 'max'], :]

Unnamed: 0,n1,n2,r8,r9,r10,wl,TE01-HE21,HE21-TM01,HE31-EH11,HE41-EH21,HE51-EH31,HE61-EH41,11545(含)前为comsol
count,11545.0,11545.0,11545.0,11545.0,11545.0,11545.0,11545.0,11545.0,11545.0,11545.0,11545.0,11545.0,0.0
min,0.018,-0.05,2.709,5.7,7.0,1.5,0.257276,0.274097,0.015969,0.003171,0.002987,0.016602,
max,0.09,-0.01,8.1,14.0,16.0,1.6,13.220317,66.230488,56.617398,47.784099,41.014492,33.24719,


In [41]:
# Differences between adjacent radius columns of the Δneff dataset.
diff_r3_r2 = diff_neff_comsol_df['r10'].sub(diff_neff_comsol_df['r9'])  # r10 - r9
diff_r2_r1 = diff_neff_comsol_df['r9'].sub(diff_neff_comsol_df['r8'])   # r9 - r8

# Place both differences side by side under explicit, readable column names.
diff_df = pd.concat(
    [diff_r3_r2, diff_r2_r1],
    axis=1,
    keys=['diff_r3_r2', 'diff_r2_r1'],
)

diff_df.describe()

Unnamed: 0,diff_r3_r2,diff_r2_r1
count,11545.0,11545.0
mean,1.072984,2.459582
std,0.161427,1.031975
min,0.0,1.4
25%,1.0,1.8
50%,1.0,2.3
75%,1.1,2.95
max,2.127,8.8


In [47]:
# For every column of the dispersion dataset, print the 20 smallest distinct values
# together with how often each occurs (NaN is counted too and sorted last).
for column in disper_comsol_df.columns:
    val_counts_df = (
        disper_comsol_df[column]
        .value_counts(dropna=False)
        .reset_index()
        .set_axis(['元素', '出现次数'], axis=1)
        .sort_values(by='元素', ascending=True, na_position='last')
    )
    print(f'列{column}的统计：')
    print(val_counts_df.head(20))

列n1的统计：
        元素  出现次数
36  0.0180     9
53  0.0200     3
1   0.0250   726
2   0.0300   702
11  0.0350   481
35  0.0380    12
34  0.0390    12
8   0.0400   567
49  0.0410     3
54  0.0420     3
13  0.0440   309
39  0.0450     6
12  0.0470   386
38  0.0471     6
14  0.0480    78
19  0.0490    36
10  0.0500   488
15  0.0510    63
17  0.0520    48
20  0.0530    33
列n2的统计：
        元素  出现次数
41 -0.0500     3
9  -0.0490   135
42 -0.0485     3
7  -0.0470   620
19 -0.0460    24
5  -0.0450   878
32 -0.0440     9
11 -0.0430    60
20 -0.0420    21
37 -0.0410     3
6  -0.0400   848
34 -0.0390     9
15 -0.0380    36
28 -0.0370    15
27 -0.0360    15
4  -0.0350   913
36 -0.0340     6
18 -0.0330    27
26 -0.0320    15
12 -0.0310    57
列r8的统计：
        元素  出现次数
193  2.709     3
187  2.747     3
191  2.782     3
196  2.829     3
197  2.844     3
195  2.858     3
190  2.928     3
75   2.932     6
185  2.947     3
168  2.950     3
3    3.000   487
198  3.001     3
74   3.004     6
186  3.010     3
165  3.