## Các thư viện sử dụng

In [111]:
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF

## Đọc bảng và lọc các dòng, cột cần dùng

In [114]:
# Load the raw minerals table from the CSV file.
Minerals_Database = pd.read_csv('Minerals_Database.csv')

# Keep only the columns used by the rest of the notebook.
Minerals_Database_filter = Minerals_Database.loc[
    :, ['Name', 'Magnesium', 'Calcium', 'Iron', 'Molar Mass']
]

# Discard the rows in which Magnesium, Calcium and Iron are all equal to 0,
# i.e. keep a row as soon as at least one of the three values is non-zero.
element_columns = ['Magnesium', 'Calcium', 'Iron']
has_any_element = (Minerals_Database_filter[element_columns] != 0).any(axis=1)
Minerals_Database_use = Minerals_Database_filter[has_any_element]

# Show the filtered table.
display(Minerals_Database_use)

Unnamed: 0,Name,Magnesium,Calcium,Iron,Molar Mass
4,Actinolite,4.0,1.0,4.0,861.185368
7,Adelite,1.0,1.0,0.0,251.283292
8,Admontite,2.0,0.0,0.0,407.639360
9,Aegirine,0.0,0.0,1.0,154.920468
10,Aenigmatite,4.0,1.0,7.0,1110.587536
...,...,...,...,...,...
3104,Zinnwaldite-2M1,0.0,0.0,3.0,699.777801
3106,Zirsinalite,0.0,1.0,0.0,269.364636
3107,Zoltaiite,1.0,0.0,2.0,677.090039
3110,Zwieselite,0.0,0.0,2.0,225.663765


## Tính thành phần phần trăm khối lượng của từng nguyên tố

In [117]:
# Work on an independent copy, so the column assignments below write to this
# frame only.
Minerals_Database_use = Minerals_Database_use.copy()

# Source column -> (output column, atomic mass used in the formula).
# NOTE(review): 24 / 40 / 56 are integer-rounded atomic masses (Mg ~ 24.305,
# Ca ~ 40.078, Fe ~ 55.845) — confirm the rounding is intended.
percent_spec = {
    'Magnesium': ('Percent_Ma', 24),
    'Calcium': ('Percent_Ca', 40),
    'Iron': ('Percent_Ir', 56),
}

# percent = atomic_mass * element value * 100 / 'Molar Mass', row by row.
# Assumes the element columns hold atom counts per formula unit — TODO confirm.
for element, (percent_column, atomic_mass) in percent_spec.items():
    Minerals_Database_use[percent_column] = (
        (atomic_mass * Minerals_Database_use[element] * 100)
        / Minerals_Database_use['Molar Mass']
    )

# The raw inputs are no longer needed once the percentages exist.
Minerals_Database_use = Minerals_Database_use.drop(
    columns=[*percent_spec, 'Molar Mass']
)

# Show the frame after the input columns have been removed.
display(Minerals_Database_use)

Unnamed: 0,Name,Percent_Ma,Percent_Ca,Percent_Ir
4,Actinolite,11.147426,4.644761,26.010660
7,Adelite,9.550973,15.918289,0.000000
8,Admontite,11.775114,0.000000,0.000000
9,Aegirine,0.000000,0.000000,36.147580
10,Aenigmatite,8.644073,3.601697,35.296632
...,...,...,...,...
3104,Zinnwaldite-2M1,0.000000,0.000000,24.007621
3106,Zirsinalite,0.000000,14.849759,0.000000
3107,Zoltaiite,3.544580,0.000000,16.541375
3110,Zwieselite,0.000000,0.000000,49.631362


## Giảm chiều của bảng bằng NMF

In [120]:
# Number of NMF components (k). Defined once so the factorisation and the
# column labels derived from it cannot drift apart.
# NOTE(review): the frame displayed below has three numeric columns, so k = 3
# does not reduce the dimensionality — confirm the intended value of k.
N_COMPONENTS = 3

# Drop the "Name" column if it is still present, so only the percentage
# columns are passed to NMF.
Minerals_Database_use = Minerals_Database_use.drop(columns=["Name"], errors="ignore")
display(Minerals_Database_use)

# Convert the frame to a NumPy array.
V = Minerals_Database_use.values

# Fixed random_state keeps the random initialisation reproducible.
nmf = NMF(n_components=N_COMPONENTS, init='random', random_state=40)

# Factorise V ~ W @ H.
# NOTE(review): the printed labels below call W the "basis" matrix and H the
# "weight" matrix; in scikit-learn, fit_transform returns the transformed data
# and components_ is the factorisation (dictionary) matrix — confirm the
# intended naming before relying on the labels.
W = nmf.fit_transform(V)  # one row per sample, one column per component
H = nmf.components_       # one row per component, one column per input feature

print("Ma trận cơ sở W:")
display(pd.DataFrame(W, columns=[f"Dimension {i+1}" for i in range(W.shape[1])]))
print("Ma trận trọng số H:")
display(pd.DataFrame(H, columns=Minerals_Database_use.columns))

# Rebuild the input from the two factors to inspect the approximation,
# rounded to 4 decimals for display.
reconstructed_data = np.dot(W, H)
reconstructed_data_rounded = np.round(reconstructed_data, 4)
print("Dữ liệu tái tạo (reconstructed data):")
display(pd.DataFrame(reconstructed_data_rounded, columns=Minerals_Database_use.columns))

Unnamed: 0,Percent_Ma,Percent_Ca,Percent_Ir
4,11.147426,4.644761,26.010660
7,9.550973,15.918289,0.000000
8,11.775114,0.000000,0.000000
9,0.000000,0.000000,36.147580
10,8.644073,3.601697,35.296632
...,...,...,...
3104,0.000000,0.000000,24.007621
3106,0.000000,14.849759,0.000000
3107,3.544580,0.000000,16.541375
3110,0.000000,0.000000,49.631362


Ma trận cơ sở W:


Unnamed: 0,Dimension 1,Dimension 2,Dimension 3
0,12.980340,1.030104,1.727337
1,0.000000,3.530326,1.480295
2,0.000000,0.000000,1.825062
3,18.039061,0.000000,0.000000
4,17.614405,0.798776,1.339185
...,...,...,...
1809,11.980745,0.000000,0.000000
1810,0.000000,3.293351,0.000000
1811,8.254795,0.000000,0.549115
1812,24.767997,0.000000,0.000000


Ma trận trọng số H:


Unnamed: 0,Percent_Ma,Percent_Ca,Percent_Ir
0,7.3e-05,0.0,2.00385
1,2.5e-05,4.509012,0.0
2,6.452147,7.144168e-07,0.0


Dữ liệu tái tạo (reconstructed data):


Unnamed: 0,Percent_Ma,Percent_Ca,Percent_Ir
0,11.1460,4.6448,26.0107
1,9.5512,15.9183,0.0000
2,11.7756,0.0000,0.0000
3,0.0013,0.0000,36.1476
4,8.6419,3.6017,35.2966
...,...,...,...
1809,0.0009,0.0000,24.0076
1810,0.0001,14.8498,0.0000
1811,3.5436,0.0000,16.5414
1812,0.0018,0.0000,49.6314
