## Khai báo thư viện sử dụng

In [9]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.decomposition import NMF

## Ví dụ

In [12]:
data = np.array([
    [0.6, 0.5, 0.5, 0.2, 0.3, 0.4],  # Khoáng vật 1
    [0.4, 0.2, 0.4, 0.5, 0.8, 0.7],  # Khoáng vật 2
    [0.9, 0.4, 0.8, 0.1, 0.0, 0.6],  # Khoáng vật 3
    [0.4, 0.3, 0.3, 0.8, 0.7, 0.7],  # Khoáng vật 4
    [0.8, 0.2, 0.6, 0.2, 0.2, 0.3],  # Khoáng vật 5
])

# Tạo đối tượng NMF, với r = 2 và random_state cố định để đảm bảo kết quả giống nhau
r=2
nmf = NMF(n_components=r, init='random', random_state=42)

# Phân tách ma trận dữ liệu thành W và H
W = nmf.fit_transform(data)  # Kích thước W là (5, 2)
H = nmf.components_  # Kích thước H là (2, 6)

# Làm tròn kết quả đến 3 chữ số thập phân
W = np.round(W, 3)
H = np.round(H, 3)

# In ra kết quả
print("Ma trận W (Cơ sở):")
print(W)
print("\nMa trận H (Hệ số):")
print(H)

# Chuyển đổi kết quả thành DataFrame để dễ hiểu
df_W = pd.DataFrame(W, columns=['r1', 'r2'])
df_H = pd.DataFrame(H, columns=['Mg', 'Ca', 'Fe', 'O2', 'N2', 'H2' ], index=['r1', 'r2'])

print("\nMa trận W (Cơ sở) dưới dạng bảng:")
print(df_W)

print("\nMa trận H (Hệ số) dưới dạng bảng:")
print(df_H)

# Dự đoán lại ma trận V
reconstructed_data = np.dot(W, H)
reconstructed_data = np.round(reconstructed_data, 3)

print("\nDự đoán lại ma trận V:")
print(reconstructed_data)

Ma trận W (Cơ sở):
[[0.409 0.777]
 [1.156 0.258]
 [0.115 1.316]
 [1.288 0.178]
 [0.234 0.931]]

Ma trận H (Hệ số):
[[0.204 0.169 0.177 0.53  0.618 0.501]
 [0.704 0.303 0.588 0.031 0.    0.326]]

Ma trận W (Cơ sở) dưới dạng bảng:
      r1     r2
0  0.409  0.777
1  1.156  0.258
2  0.115  1.316
3  1.288  0.178
4  0.234  0.931

Ma trận H (Hệ số) dưới dạng bảng:
       Mg     Ca     Fe     O2     N2     H2
r1  0.204  0.169  0.177  0.530  0.618  0.501
r2  0.704  0.303  0.588  0.031  0.000  0.326

Dự đoán lại ma trận V:
[[0.63  0.305 0.529 0.241 0.253 0.458]
 [0.417 0.274 0.356 0.621 0.714 0.663]
 [0.95  0.418 0.794 0.102 0.071 0.487]
 [0.388 0.272 0.333 0.688 0.796 0.703]
 [0.703 0.322 0.589 0.153 0.145 0.421]]


## Đọc bảng dữ liệu

In [15]:
Minerals_Database = pd.read_csv('Minerals_Database.csv')

Minerals_Database_filter = Minerals_Database[['Name', 'Magnesium', 'Calcium', 'Iron','Molar Mass']]
Minerals_Database_use = Minerals_Database_filter[(Minerals_Database_filter['Magnesium'] > 0) 
                                                | (Minerals_Database_filter['Calcium'] > 0) 
                                                | (Minerals_Database_filter['Iron'] > 0)]
Minerals_Database_sort = Minerals_Database_use.sort_values(by='Magnesium', ascending=False)

display(Minerals_Database_sort)

Unnamed: 0,Name,Magnesium,Calcium,Iron,Molar Mass
873,Antigorite-T,48.0,0.0,0.0,4473.458800
66,Antigorite,32.0,0.0,0.0,2870.558900
872,Antigorite-M,26.0,0.0,0.0,2305.079000
902,Arrojadite-(KNa),13.0,1.0,14.0,2754.630123
1603,Gottardiite,12.0,12.0,11.0,3305.874045
...,...,...,...,...,...
1796,Iwakiite,0.0,0.0,1.0,181.722500
1798,Jagoite,0.0,0.0,3.0,1109.015800
719,Tobermorite,0.0,3.0,0.0,349.493700
718,Titanite,0.0,1.0,0.0,196.040500


## Tính phần trăm 

In [18]:
Minerals_Database_sort = Minerals_Database_sort.copy()

Minerals_Database_sort['MgO_Percent'] = (40 * Minerals_Database_sort['Magnesium'] * 100) / Minerals_Database_sort['Molar Mass']
Minerals_Database_sort['CaO_Percent'] = (56 * Minerals_Database_sort['Calcium'] * 100) / Minerals_Database_sort['Molar Mass']
Minerals_Database_sort['FeO_Percent'] = (72 * Minerals_Database_sort['Iron'] * 100) / Minerals_Database_sort['Molar Mass']

Minerals_Database_sort_drop = Minerals_Database_sort.drop(columns=['Magnesium', 'Calcium', 'Iron', 'Molar Mass'])

display(Minerals_Database_sort_drop)

Unnamed: 0,Name,MgO_Percent,CaO_Percent,FeO_Percent
873,Antigorite-T,42.919810,0.000000,0.000000
66,Antigorite,44.590620,0.000000,0.000000
872,Antigorite-M,45.117760,0.000000,0.000000
902,Arrojadite-(KNa),18.877308,2.032941,36.592935
1603,Gottardiite,14.519609,20.327453,23.957356
...,...,...,...,...
1796,Iwakiite,0.000000,0.000000,39.620850
1798,Jagoite,0.000000,0.000000,19.476729
719,Tobermorite,0.000000,48.069536,0.000000
718,Titanite,0.000000,28.565526,0.000000


## Cách 1: giảm chiều bằng NMF

In [21]:
if "Name" in Minerals_Database_sort_drop.columns:
    Minerals_Database_sort_drop = Minerals_Database_sort_drop.drop(columns=["Name"])
#Minerals_Database_sort_drop.to_csv('1.csv')
V = Minerals_Database_sort_drop.values

nmf2 = NMF(n_components=2, init='random', random_state=40, max_iter=10000, tol=1e-6)

W2 = nmf2.fit_transform(V) 
H2 = nmf2.components_  

print("Ma trận cơ sở W (r = 2) :")
display(pd.DataFrame(W2, columns=[f"Dimension {i+1}" for i in range(2)]))
print("Ma trận trọng số H (r = 2):")
display(pd.DataFrame(H2, columns=Minerals_Database_sort_drop.columns))

reconstructed2 = np.dot(W2, H2)
reconstructed_rounded2 = np.round(reconstructed2, 5)
print("Dữ liệu tái tạo (reconstructed2):")
display(pd.DataFrame(reconstructed_rounded2, columns=Minerals_Database_sort_drop.columns))


nmf3 = NMF(n_components=3, init='random', random_state=40, max_iter=10000, tol=1e-6)

W3 = nmf3.fit_transform(V) 
H3 = nmf3.components_       

print("Ma trận cơ sở W (r = 3) :")
display(pd.DataFrame(W3, columns=[f"Dimension {i+1}" for i in range(3)]))
print("Ma trận trọng số H (r = 3):")
display(pd.DataFrame(H3, columns=Minerals_Database_sort_drop.columns))

reconstructed3 = np.dot(W3, H3)
reconstructed_rounded3 = np.round(reconstructed3, 5)
print("Dữ liệu tái tạo (reconstructed3):")
display(pd.DataFrame(reconstructed3, columns=Minerals_Database_sort_drop.columns))

Ma trận cơ sở W (r = 2) :


Unnamed: 0,Dimension 1,Dimension 2
0,4.019864,0.533917
1,4.176352,0.554701
2,4.225724,0.561259
3,16.167219,0.473859
4,10.650298,3.588072
...,...,...
1809,15.601531,0.000000
1810,7.669365,0.000000
1811,0.000000,8.223752
1812,0.000000,4.886999


Ma trận trọng số H (r = 2):


Unnamed: 0,MgO_Percent,CaO_Percent,FeO_Percent
0,0.574546,0.0,2.402127
1,0.446563,5.81089,0.0


Dữ liệu tái tạo (reconstructed2):


Unnamed: 0,MgO_Percent,CaO_Percent,FeO_Percent
0,2.54803,3.10253,9.65623
1,2.64722,3.22331,10.03213
2,2.67851,3.26141,10.15073
3,9.50042,2.75354,38.83572
4,7.72139,20.84989,25.58337
...,...,...,...
1809,8.96380,0.00000,37.47687
1810,4.40641,0.00000,18.42279
1811,3.67242,47.78731,0.00000
1812,2.18235,28.39781,0.00000


Ma trận cơ sở W (r = 3) :


Unnamed: 0,Dimension 1,Dimension 2,Dimension 3
0,0.000000,0.000000,5.519338
1,0.000000,0.000000,5.734199
2,0.000000,0.000000,5.801987
3,16.813011,0.383027,2.427552
4,11.007460,3.829905,1.867168
...,...,...,...
1809,18.204218,0.000000,0.000000
1810,8.948789,0.000000,0.000000
1811,0.000000,9.056803,0.000000
1812,0.000000,5.382044,0.000000


Ma trận trọng số H (r = 3):


Unnamed: 0,MgO_Percent,CaO_Percent,FeO_Percent
0,6.019579e-07,0.0,2.176465
1,2.078777e-07,5.307561,0.0
2,7.776262,4.125969e-11,0.0


Dữ liệu tái tạo (reconstructed3):


Unnamed: 0,MgO_Percent,CaO_Percent,FeO_Percent
0,4.291982e+01,2.277262e-10,0.000000
1,4.459063e+01,2.365913e-10,0.000000
2,4.511777e+01,2.393882e-10,0.000000
3,1.887729e+01,2.032941e+00,36.592935
4,1.451960e+01,2.032745e+01,23.957356
...,...,...,...
1809,1.095817e-05,0.000000e+00,39.620850
1810,5.386794e-06,0.000000e+00,19.476729
1811,1.882708e-06,4.806954e+01,0.000000
1812,1.118807e-06,2.856553e+01,0.000000


## Cách 2 

In [24]:
Minerals_Database_sort_drop['Name'] = Minerals_Database_use['Name']
Minerals_Database_percent = Minerals_Database_sort_drop.copy()
Minerals_Database_percent['CO2'] = ( Minerals_Database_sort_drop['MgO_Percent'] * 1.092 
                                + Minerals_Database_sort_drop['CaO_Percent'] * 0.785 
                                + Minerals_Database_sort_drop['FeO_Percent'] * 0.859)

Minerals_Database_percent_sort = Minerals_Database_percent[['Name','MgO_Percent','CaO_Percent','FeO_Percent', 'CO2']].head(10)
display(Minerals_Database_percent_sort)

Unnamed: 0,Name,MgO_Percent,CaO_Percent,FeO_Percent,CO2
873,Antigorite-T,42.91981,0.0,0.0,46.868432
66,Antigorite,44.59062,0.0,0.0,48.692957
872,Antigorite-M,45.11776,0.0,0.0,49.268593
902,Arrojadite-(KNa),18.877308,2.032941,36.592935,53.643209
1603,Gottardiite,14.519609,20.327453,23.957356,52.391833
2796,Stornesite-(Y),21.757246,6.092029,35.246739,58.818105
1461,Filipstadite,28.311789,0.0,25.48061,52.804318
934,Bannisterite,11.041868,1.545862,19.875362,30.344157
2838,Takeuchiite,25.056912,0.0,0.0,27.362148
1867,Khmaralite,12.554887,0.0,47.708572,54.6916


In [26]:
Name = ['Antigorite-T', 'Antigorite', 'Arrojadite-(KNa)', 'Gottardiite', 'Khmaralite']

Minerals_Database_sort = Minerals_Database_percent_sort[Minerals_Database_percent_sort['Name'].isin(Name)]

Minerals_Database_sort['Name'] = pd.Categorical(Minerals_Database_sort['Name'], categories=Name, ordered=True)
Minerals_Database_sort2 = Minerals_Database_sort.sort_values(by='Name')

display(Minerals_Database_sort2)

Unnamed: 0,Name,MgO_Percent,CaO_Percent,FeO_Percent,CO2
873,Antigorite-T,42.91981,0.0,0.0,46.868432
66,Antigorite,44.59062,0.0,0.0,48.692957
902,Arrojadite-(KNa),18.877308,2.032941,36.592935,53.643209
1603,Gottardiite,14.519609,20.327453,23.957356,52.391833
1867,Khmaralite,12.554887,0.0,47.708572,54.6916
