## Các thư viện sử dụng

In [3]:
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF

## Đọc bảng và lọc các dòng, cột cần dùng

In [5]:
#Đọc dữ liệu
Minerals_Database = pd.read_csv('Minerals_Database.csv')

#Lọc các cột cần dùng và các hàng có cả 3 giá trị Magnesium, Calcium, Iron bằng 0
Minerals_Database_filter = Minerals_Database[['Name', 'Magnesium', 'Calcium', 'Iron', 'Molar Mass']]
Minerals_Database_use = Minerals_Database_filter[
    ~((Minerals_Database_filter['Magnesium'] == 0) & 
      (Minerals_Database_filter['Calcium'] == 0) & 
      (Minerals_Database_filter['Iron'] == 0))]

# Hiển thị kết quả sau khi lọc
display(Minerals_Database_use)

Unnamed: 0,Name,Magnesium,Calcium,Iron,Molar Mass
4,Actinolite,4.0,1.0,4.0,861.185368
7,Adelite,1.0,1.0,0.0,251.283292
8,Admontite,2.0,0.0,0.0,407.639360
9,Aegirine,0.0,0.0,1.0,154.920468
10,Aenigmatite,4.0,1.0,7.0,1110.587536
...,...,...,...,...,...
3104,Zinnwaldite-2M1,0.0,0.0,3.0,699.777801
3106,Zirsinalite,0.0,1.0,0.0,269.364636
3107,Zoltaiite,1.0,0.0,2.0,677.090039
3110,Zwieselite,0.0,0.0,2.0,225.663765


## Tính thành phần phần trăm từng chất 

In [7]:
# Tạo bản sao độc lập 
Minerals_Database_use = Minerals_Database_use.copy()

# Thêm cột Percent_Ma, Percent_Ca, Percent_Ir
Minerals_Database_use['Percent_Ma'] = (40 * Minerals_Database_use['Magnesium'] * 100) / Minerals_Database_use['Molar Mass']
Minerals_Database_use['Percent_Ca'] = (56 * Minerals_Database_use['Calcium'] * 100) / Minerals_Database_use['Molar Mass']
Minerals_Database_use['Percent_Ir'] = (72 * Minerals_Database_use['Iron'] * 100) / Minerals_Database_use['Molar Mass']

# Loại bỏ các cột Magnesium, Calcium, Iron và Molar Mass
Minerals_Database_use = Minerals_Database_use.drop(columns=['Magnesium', 'Calcium', 'Iron', 'Molar Mass'])

# Hiển thị DataFrame sau khi xóa cột
display(Minerals_Database_use)

Unnamed: 0,Name,Percent_Ma,Percent_Ca,Percent_Ir
4,Actinolite,18.579043,6.502665,33.442277
7,Adelite,15.918289,22.285604,0.000000
8,Admontite,19.625190,0.000000,0.000000
9,Aegirine,0.000000,0.000000,46.475460
10,Aenigmatite,14.406789,5.042376,45.381385
...,...,...,...,...
3104,Zinnwaldite-2M1,0.000000,0.000000,30.866941
3106,Zirsinalite,0.000000,20.789663,0.000000
3107,Zoltaiite,5.907634,0.000000,21.267482
3110,Zwieselite,0.000000,0.000000,63.811751


## Giảm chiều của bảng bằng NMF

In [9]:
# Lọc cột "Name" 
if "Name" in Minerals_Database_use.columns:
    Minerals_Database_use = Minerals_Database_use.drop(columns=["Name"])
display(Minerals_Database_use)

# Chuyển dữ liệu thành NumPy array
V = Minerals_Database_use.values
# Khởi tạo NMF với số chiều giảm (k)
nmf = NMF(n_components=3, init='random', random_state=40)

# Thực hiện NMF
W = nmf.fit_transform(V)  # Ma trận cơ sở
H = nmf.components_       # Ma trận trọng số

print("Ma trận cơ sở W:")
display(pd.DataFrame(W, columns=[f"Dimension {i+1}" for i in range(3)]))
print("Ma trận trọng số H:")
display(pd.DataFrame(H, columns=Minerals_Database_use.columns))

reconstructed_data = np.dot(W, H)
reconstructed_data_rounded = np.round(reconstructed_data, 4)
print("Dữ liệu tái tạo (reconstructed data):")
display(pd.DataFrame(reconstructed_data_rounded, columns=Minerals_Database_use.columns))

Unnamed: 0,Percent_Ma,Percent_Ca,Percent_Ir
4,18.579043,6.502665,33.442277
7,15.918289,22.285604,0.000000
8,19.625190,0.000000,0.000000
9,0.000000,0.000000,46.475460
10,14.406789,5.042376,45.381385
...,...,...,...
3104,0.000000,0.000000,30.866941
3106,0.000000,20.789663,0.000000
3107,5.907634,0.000000,21.267482
3110,0.000000,0.000000,63.811751


Ma trận cơ sở W:


Unnamed: 0,Dimension 1,Dimension 2,Dimension 3
0,15.422668,1.225292,2.372810
1,0.000000,4.199259,2.033192
2,0.000000,0.000000,2.506692
3,21.433217,0.000000,0.000000
4,20.928659,0.950131,1.839806
...,...,...,...
1809,14.234993,0.000000,0.000000
1810,0.000000,3.917380,0.000000
1811,9.807984,0.000000,0.754411
1812,29.428242,0.000000,0.000000


Ma trận trọng số H:


Unnamed: 0,Percent_Ma,Percent_Ca,Percent_Ir
0,4.4e-05,0.0,2.168385
1,1.5e-05,5.307032,0.0
2,7.829249,7.756732e-08,0.0


Dữ liệu tái tạo (reconstructed data):


Unnamed: 0,Percent_Ma,Percent_Ca,Percent_Ir
0,18.5780,6.5027,33.4423
1,15.9184,22.2856,0.0000
2,19.6255,0.0000,0.0000
3,0.0009,0.0000,46.4755
4,14.4052,5.0424,45.3814
...,...,...,...
1809,0.0006,0.0000,30.8669
1810,0.0001,20.7897,0.0000
1811,5.9069,0.0000,21.2675
1812,0.0013,0.0000,63.8118
