In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.mixture import GaussianMixture
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [6]:
# Configure matplotlib for Chinese text: use the 'Heiti' font for the
# sans-serif family and turn off the Unicode minus glyph for axis labels.
plt.rcParams.update({
    'font.sans-serif': ['Heiti'],
    'axes.unicode_minus': False,
})

In [9]:
# Load the height/weight dataset; the goal of the notebook is to separate
# the two sexes using only the feature columns.
data_path = './HeightWeight.csv'
data = pd.read_csv(data_path)
print('数据总量', data.shape)

# The first column is used as the label; every remaining column is a feature.
Y, X = data.iloc[:, 0], data.iloc[:, 1:]

# Hold out 20% of the rows as a test split; the fixed seed keeps the
# partition reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=9,
)

In [26]:
# Build the model: a two-component Gaussian mixture with one full
# covariance matrix per component and a fixed seed.
gmm = GaussianMixture(
    n_components=2,
    covariance_type='full',
    random_state=9,
)
# Fitting is unsupervised, so the labels do not need to be passed.
gmm.fit(x_train)

GaussianMixture(covariance_type='full', init_params='kmeans', max_iter=100,
                means_init=None, n_components=2, n_init=1, precisions_init=None,
                random_state=9, reg_covar=1e-06, tol=0.001, verbose=0,
                verbose_interval=10, warm_start=False, weights_init=None)

In [27]:
# Inspect the fitted parameters of the mixture.
# means_ holds one mean vector per component.
print('均值：\n', gmm.means_)
# The model is built with covariance_type='full', so covariances_ holds one
# full covariance matrix per component (off-diagonal terms included), not
# just per-feature variances; the label therefore reads "covariance".
print('协方差：\n', gmm.covariances_)

均值：
 [[171.57828048  64.39643684]
 [160.33425543  54.98061253]]
方差：
 [[[30.72302234 13.61786099]
  [13.61786099 68.65372029]]

 [[19.01020213 10.1415953 ]
  [10.1415953  32.59087701]]]


In [40]:
# Predict the mixture component of every sample, for the full dataset and
# for the held-out test split.
component_all = gmm.predict(X)
component_test = gmm.predict(x_test)

# GMM component indices are arbitrary: which component ends up as index 0
# depends on initialisation, seed and data, so a hard-coded 0 <-> 1 swap is
# fragile. Instead, learn the component -> label mapping from the training
# split: each component is assigned the label that is most frequent among
# the training samples it claims.
train_components = gmm.predict(x_train)
train_labels = np.asarray(y_train)
component_to_label = np.array([
    pd.Series(train_labels[train_components == k]).mode().iloc[0]
    for k in range(gmm.n_components)
])

# Translate component indices into original label values with one
# vectorised lookup.
y_hat = component_to_label[component_all]
y_test_hat = component_to_label[component_test]

In [43]:
# Compute accuracy on the test split and on the full dataset.
# accuracy_score expects (y_true, y_pred); keeping that order matters as soon
# as an asymmetric metric (precision, recall, ...) is substituted here.
acc_test = accuracy_score(y_test, y_test_hat)
acc = accuracy_score(Y, y_hat)
print(f'测试集的准确率{acc_test * 100:.2f}%')
print(f'全数据的准确率{acc * 100:.2f}%')

测试集的准确率86.96%
全数据的准确率82.46%
