## Khai báo thư viện sử dụng

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.decomposition import NMF

## Ví dụ

In [11]:
# Toy example: factorise a small non-negative matrix V (4 rows x 3 columns)
# into W (rows x r) and H (r x columns) with NMF, then rebuild V from W @ H.

# Input data (mineral matrix)
data = np.array([[0.7, 0.6, 0.8],
                 [0.5, 0.4, 0.3],
                 [0.9, 0.7, 0.9],
                 [0.2, 0.3, 0.1]])

# Number of components. All component labels below are derived from r, so
# changing r no longer breaks the DataFrame construction with a shape mismatch.
r = 2
component_labels = [f"r{i + 1}" for i in range(r)]
feature_labels = ['Mg', 'Ca', 'Fe']  # one label per column of `data`

# Create the NMF object; random_state is fixed so that the random
# initialisation produces the same result on every run.
nmf = NMF(n_components=r, init='random', random_state=42)

# Factorise the data matrix into W and H
W = nmf.fit_transform(data)  # W has shape (4, r)
H = nmf.components_          # H has shape (r, 3)

# Round the results to 3 decimal places
W = np.round(W, 3)
H = np.round(H, 3)

# Print the raw arrays
print("Ma trận W (Cơ sở):")
print(W)
print("\nMa trận H (Hệ số):")
print(H)

# Convert the results to labelled DataFrames for readability
df_W = pd.DataFrame(W, columns=component_labels)
df_H = pd.DataFrame(H, columns=feature_labels, index=component_labels)

print("\nMa trận W (Cơ sở) dưới dạng bảng:")
print(df_W)

print("\nMa trận H (Hệ số) dưới dạng bảng:")
print(df_H)

# Rebuild the matrix V from the factors; the result has the same shape as the input.
# NOTE: W and H were rounded above, so the reconstruction carries that rounding error.
reconstructed_data = np.dot(W, H)
reconstructed_data = np.round(reconstructed_data, 3)

print("\nDự đoán lại ma trận V:")
print(reconstructed_data)

Ma trận W (Cơ sở):
[[2.314 0.186]
 [0.882 0.393]
 [2.641 0.3  ]
 [0.219 0.33 ]]

Ma trận H (Hệ số):
[[0.27  0.195 0.339]
 [0.565 0.664 0.035]]

Ma trận W (Cơ sở) dưới dạng bảng:
      r1     r2
0  2.314  0.186
1  0.882  0.393
2  2.641  0.300
3  0.219  0.330

Ma trận H (Hệ số) dưới dạng bảng:
       Mg     Ca     Fe
r1  0.270  0.195  0.339
r2  0.565  0.664  0.035

Dự đoán lại ma trận V:
[[0.73  0.575 0.791]
 [0.46  0.433 0.313]
 [0.883 0.714 0.906]
 [0.246 0.262 0.086]]


## Đọc bảng dữ liệu

In [14]:
# Load the full minerals table from the CSV file.
Minerals_Database = pd.read_csv('Minerals_Database.csv')

# Keep only the mineral name, the three element columns and the molar mass.
Minerals_Database_filter = Minerals_Database.loc[:, ['Name', 'Magnesium', 'Calcium', 'Iron', 'Molar Mass']]

# Retain the rows where at least one of Magnesium / Calcium / Iron is strictly positive.
has_any_element = Minerals_Database_filter[['Magnesium', 'Calcium', 'Iron']].gt(0).any(axis=1)
Minerals_Database_use = Minerals_Database_filter[has_any_element]

# Order the rows from the highest to the lowest Magnesium value.
Minerals_Database_sort = Minerals_Database_use.sort_values('Magnesium', ascending=False)

display(Minerals_Database_sort)

Unnamed: 0,Name,Magnesium,Calcium,Iron,Molar Mass
873,Antigorite-T,48.0,0.0,0.0,4473.458800
66,Antigorite,32.0,0.0,0.0,2870.558900
872,Antigorite-M,26.0,0.0,0.0,2305.079000
902,Arrojadite-(KNa),13.0,1.0,14.0,2754.630123
1603,Gottardiite,12.0,12.0,11.0,3305.874045
...,...,...,...,...,...
1796,Iwakiite,0.0,0.0,1.0,181.722500
1798,Jagoite,0.0,0.0,3.0,1109.015800
719,Tobermorite,0.0,3.0,0.0,349.493700
718,Titanite,0.0,1.0,0.0,196.040500


## Tính phần trăm MgO, CaO, FeO

In [17]:
# Add one percentage column per element, computed as
#   (constant * element column * 100) / 'Molar Mass'
# NOTE(review): 40, 56 and 72 look like rounded molar masses of MgO, CaO and
# FeO (matching the column names) — confirm.
molar_mass = Minerals_Database_sort['Molar Mass']

# assign() returns a new frame, so the frame produced by the previous cell is
# not modified in place; the name is simply rebound to the extended copy.
Minerals_Database_sort = Minerals_Database_sort.assign(
    MgO_Percent=(40 * Minerals_Database_sort['Magnesium'] * 100) / molar_mass,
    CaO_Percent=(56 * Minerals_Database_sort['Calcium'] * 100) / molar_mass,
    FeO_Percent=(72 * Minerals_Database_sort['Iron'] * 100) / molar_mass,
)

# Keep only the name and the derived percentage columns.
Minerals_Database_sort_drop = Minerals_Database_sort.drop(
    ['Magnesium', 'Calcium', 'Iron', 'Molar Mass'], axis=1
)

display(Minerals_Database_sort_drop)

Unnamed: 0,Name,MgO_Percent,CaO_Percent,FeO_Percent
873,Antigorite-T,42.919810,0.000000,0.000000
66,Antigorite,44.590620,0.000000,0.000000
872,Antigorite-M,45.117760,0.000000,0.000000
902,Arrojadite-(KNa),18.877308,2.032941,36.592935
1603,Gottardiite,14.519609,20.327453,23.957356
...,...,...,...,...
1796,Iwakiite,0.000000,0.000000,39.620850
1798,Jagoite,0.000000,0.000000,19.476729
719,Tobermorite,0.000000,48.069536,0.000000
718,Titanite,0.000000,28.565526,0.000000


## Cách 1: giảm chiều bằng NMF

In [20]:
# Factorise the oxide-percentage matrix V into W (one row per mineral) and
# H (one row per component) with NMF, then rebuild V from W @ H.

# Number of NMF components, defined once and reused for the column labels.
# NOTE(review): this equals the number of percentage columns (3), so the
# factorisation does not actually lower the dimensionality — confirm intent.
n_components = 3

# Build the numeric feature frame as a separate object instead of dropping
# "Name" from Minerals_Database_sort_drop in place: the frame displayed by the
# earlier cell stays untouched and this cell gives the same result on re-run.
# errors="ignore" keeps it working whether or not "Name" is present.
nmf_features = Minerals_Database_sort_drop.drop(columns=["Name"], errors="ignore")

V = nmf_features.values

# Fixed random_state so the random initialisation is reproducible.
nmf = NMF(n_components=n_components, init='random', random_state=40)

W = nmf.fit_transform(V)
H = nmf.components_

print("Ma trận cơ sở W:")
display(pd.DataFrame(W, columns=[f"Dimension {i+1}" for i in range(n_components)]))
print("Ma trận trọng số H:")
display(pd.DataFrame(H, columns=nmf_features.columns))

# Reconstruct the input from the factors and round for display.
reconstructed = np.dot(W, H)
reconstructed_rounded = np.round(reconstructed, 5)
print("Dữ liệu tái tạo (reconstructed):")
display(pd.DataFrame(reconstructed_rounded, columns=nmf_features.columns))

Ma trận cơ sở W:


Unnamed: 0,Dimension 1,Dimension 2,Dimension 3
0,0.000000,0.000000,5.519475
1,0.000000,0.000000,5.734340
2,0.000000,0.000000,5.802130
3,16.813011,0.383027,2.427339
4,11.007460,3.829905,1.867016
...,...,...,...
1809,18.204218,0.000000,0.000000
1810,8.948789,0.000000,0.000000
1811,0.000000,9.056803,0.000000
1812,0.000000,5.382044,0.000000


Ma trận trọng số H:


Unnamed: 0,MgO_Percent,CaO_Percent,FeO_Percent
0,4.4e-05,0.0,2.176465
1,1.5e-05,5.307561,0.0
2,7.776196,7.504234e-08,0.0


Dữ liệu tái tạo (reconstructed):


Unnamed: 0,MgO_Percent,CaO_Percent,FeO_Percent
0,42.92052,0.00000,0.00000
1,44.59135,0.00000,0.00000
2,45.11850,0.00000,0.00000
3,18.87620,2.03294,36.59293
4,14.51882,20.32745,23.95736
...,...,...,...
1809,0.00079,0.00000,39.62085
1810,0.00039,0.00000,19.47673
1811,0.00014,48.06954,0.00000
1812,0.00008,28.56553,0.00000


## Cách 2: tính CO2 từ phần trăm MgO, CaO, FeO

In [23]:
# Put the mineral names back on the percentage frame; pandas aligns the
# assigned Series on the row index.
Minerals_Database_sort_drop['Name'] = Minerals_Database_use['Name']

# CO2 is a weighted sum of the three percentage columns.
# NOTE(review): the weights 1.092 / 0.785 / 0.859 are not explained anywhere
# in the visible notebook — confirm their source.
mgo_term = Minerals_Database_sort_drop['MgO_Percent'] * 1.092
cao_term = Minerals_Database_sort_drop['CaO_Percent'] * 0.785
feo_term = Minerals_Database_sort_drop['FeO_Percent'] * 0.859
Minerals_Database_percent = Minerals_Database_sort_drop.assign(CO2=mgo_term + cao_term + feo_term)

# Show the first 10 rows with the name first and CO2 last.
report_columns = ['Name', 'MgO_Percent', 'CaO_Percent', 'FeO_Percent', 'CO2']
Minerals_Database_percent_sort = Minerals_Database_percent.loc[:, report_columns].head(10)
display(Minerals_Database_percent_sort)

Unnamed: 0,Name,MgO_Percent,CaO_Percent,FeO_Percent,CO2
873,Antigorite-T,42.91981,0.0,0.0,46.868432
66,Antigorite,44.59062,0.0,0.0,48.692957
872,Antigorite-M,45.11776,0.0,0.0,49.268593
902,Arrojadite-(KNa),18.877308,2.032941,36.592935,53.643209
1603,Gottardiite,14.519609,20.327453,23.957356,52.391833
2796,Stornesite-(Y),21.757246,6.092029,35.246739,58.818105
1461,Filipstadite,28.311789,0.0,25.48061,52.804318
934,Bannisterite,11.041868,1.545862,19.875362,30.344157
2838,Takeuchiite,25.056912,0.0,0.0,27.362148
1867,Khmaralite,12.554887,0.0,47.708572,54.6916


In [25]:
# Minerals to show, listed in the order they should appear.
Name = ['Antigorite-T', 'Antigorite', 'Arrojadite-(KNa)', 'Gottardiite', 'Khmaralite']

# Keep only the listed minerals. The explicit .copy() makes the result an
# independent frame, so the column assignment below is not a chained
# assignment on a slice (which can raise pandas' SettingWithCopyWarning —
# silenced in this notebook by the blanket warnings filter).
# NOTE: this rebinds Minerals_Database_sort, the name earlier cells use for
# the Magnesium-sorted frame; re-running those cells afterwards needs a fresh run
# from the loading cell.
Minerals_Database_sort = Minerals_Database_percent_sort[
    Minerals_Database_percent_sort['Name'].isin(Name)
].copy()

# An ordered categorical makes sort_values follow the order of `Name`
# rather than alphabetical order.
Minerals_Database_sort['Name'] = pd.Categorical(Minerals_Database_sort['Name'], categories=Name, ordered=True)
Minerals_Database_sort2 = Minerals_Database_sort.sort_values(by='Name')

display(Minerals_Database_sort2)

Unnamed: 0,Name,MgO_Percent,CaO_Percent,FeO_Percent,CO2
873,Antigorite-T,42.91981,0.0,0.0,46.868432
66,Antigorite,44.59062,0.0,0.0,48.692957
902,Arrojadite-(KNa),18.877308,2.032941,36.592935,53.643209
1603,Gottardiite,14.519609,20.327453,23.957356,52.391833
1867,Khmaralite,12.554887,0.0,47.708572,54.6916
