In [24]:
import pandas as pd

# 读取第一个 CSV 文件
file1 = './datasets/mushroom.csv'
df1 = pd.read_csv(file1)

# 读取第二个 CSV 文件
file2 = './datasets/mushrooms_add.csv'
df2 = pd.read_csv(file2)

# 获取列名
columns1 = df1.columns
columns2 = df2.columns

# 比较列名
if set(columns1) == set(columns2):
    print("The columns in both CSV files are the same.")
else:
    print("The columns in the CSV files are different.")

    # 找出不同的列
    columns_only_in_file1 = set(columns1) - set(columns2)
    columns_only_in_file2 = set(columns2) - set(columns1)

    if columns_only_in_file1:
        print(f"Columns only in {file1}: {columns_only_in_file1}")
    if columns_only_in_file2:
        print(f"Columns only in {file2}: {columns_only_in_file2}")
        
common_columns = columns1.intersection(columns2)

for column in common_columns:
        # 获取第一个文件中该列的唯一值集合
        unique_values1 = set(df1[column].dropna().unique())
        
        # 获取第二个文件中该列的唯一值集合
        unique_values2 = set(df2[column].dropna().unique())
        
         # 比较两个文件中该列的唯一值集合是否完全相等
        if unique_values1 == unique_values2:
            print(f"Column '{column}' has the same unique values in both files.")
            print(f"  {file1}: {unique_values1}")
        else:
            print(f"Column '{column}' has different unique values in the files:")
            print(f"  {file1}: {unique_values1}")
            print(f"  {file2}: {unique_values2}")

The columns in the CSV files are different.
Columns only in ./datasets/mushroom.csv: {'ruises'}
Columns only in ./datasets/mushrooms_add.csv: {'bruises'}
Column 'class' has the same unique values in both files.
  ./datasets/mushroom.csv: {'e', 'p'}
Column 'cap-shape' has the same unique values in both files.
  ./datasets/mushroom.csv: {'b', 's', 'x', 'f', 'c', 'k'}
Column 'cap-surface' has the same unique values in both files.
  ./datasets/mushroom.csv: {'y', 'g', 's', 'f'}
Column 'cap-color' has the same unique values in both files.
  ./datasets/mushroom.csv: {'y', 'n', 'u', 'b', 'e', 'p', 'w', 'r', 'g', 'c'}
Column 'odor' has the same unique values in both files.
  ./datasets/mushroom.csv: {'y', 'n', 's', 'm', 'l', 'p', 'f', 'a', 'c'}
Column 'gill-attachment' has the same unique values in both files.
  ./datasets/mushroom.csv: {'a', 'f'}
Column 'gill-spacing' has the same unique values in both files.
  ./datasets/mushroom.csv: {'c', 'w'}
Column 'gill-size' has the same unique values 

In [25]:
print(df1.columns)
print(df2.columns)
df2 = df2.rename(columns={'bruises': 'ruises'})
print(df2.columns)

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'ruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')
Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')
Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'ruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gi

In [26]:
mushrooms=df1
print(df1.shape)

(25986, 23)


In [27]:
mushrooms.replace('?', pd.NA, inplace=True)
df2.replace('?', pd.NA, inplace=True)
df1.isnull().sum()


class                          0
cap-shape                   3473
cap-surface                 3479
cap-color                   3459
ruises                      3472
odor                        3450
gill-attachment             3481
gill-spacing                3399
gill-size                   3492
gill-color                  3568
stalk-shape                 3474
stalk-root                  9584
stalk-surface-above-ring    3508
stalk-surface-below-ring    3423
stalk-color-above-ring      3573
stalk-color-below-ring      3433
veil-type                   3497
veil-color                  3503
ring-number                 3489
ring-type                   3508
spore-print-color           3493
population                  3511
habitat                     3484
dtype: int64

In [28]:
df2.isnull().sum()

class                          0
cap-shape                      0
cap-surface                    0
cap-color                      0
ruises                         0
odor                           0
gill-attachment                0
gill-spacing                   0
gill-size                      0
gill-color                     0
stalk-shape                    0
stalk-root                  2480
stalk-surface-above-ring       0
stalk-surface-below-ring       0
stalk-color-above-ring         0
stalk-color-below-ring         0
veil-type                      0
veil-color                     0
ring-number                    0
ring-type                      0
spore-print-color              0
population                     0
habitat                        0
dtype: int64

In [29]:
mushrooms['stalk-root'].value_counts()

stalk-root
b    8123
e    3555
c    2705
r    2019
Name: count, dtype: int64

In [31]:
# 利用众数填充缺失值
# mushrooms = mushrooms.fillna(mushrooms.mode().iloc[0])
# print(mushrooms.shape)

mode_class_0 = mushrooms[mushrooms['class'] == 'p'].mode().iloc[0]
mushrooms.loc[mushrooms['class'] == 'p'] = mushrooms.loc[mushrooms['class'] == 'p'].fillna(mode_class_0)

# 计算 class 值为 '1' 的众数

mode_class_1 = mushrooms[mushrooms['class'] == 'e'].mode().iloc[0]
mushrooms.loc[mushrooms['class'] == 'e'] = mushrooms.loc[mushrooms['class'] == 'e'].fillna(mode_class_1)

# 显示填充后的数据集
print(mushrooms.head())


  class cap-shape cap-surface cap-color ruises odor gill-attachment  \
0     e         x           f         n      f    n               f   
1     p         x           y         g      t    f               f   
2     e         b           y         n      t    n               f   
3     e         x           g         g      t    n               f   
4     e         x           f         n      t    n               a   

  gill-spacing gill-size gill-color  ... stalk-surface-below-ring  \
0            w         n          b  ...                        y   
1            c         b          k  ...                        s   
2            c         b          n  ...                        s   
3            w         b          n  ...                        s   
4            w         n          n  ...                        k   

  stalk-color-above-ring stalk-color-below-ring veil-type veil-color  \
0                      w                      p         p          n   
1             

In [33]:

#df2也要填充
mode_class_0 = df2[df2['class'] == 'p'].mode().iloc[0]
df2.loc[df2['class'] == 'p'] = df2.loc[df2['class'] == 'p'].fillna(mode_class_0)

# 计算 class 值为 '1' 的众数

mode_class_1 = df2[df2['class'] == 'e'].mode().iloc[0]
df2.loc[df2['class'] == 'e'] = df2.loc[df2['class'] == 'e'].fillna(mode_class_1)

# 显示填充后的数据集
print(mushrooms.head())


  class cap-shape cap-surface cap-color ruises odor gill-attachment  \
0     e         x           f         n      f    n               f   
1     p         x           y         g      t    f               f   
2     e         b           y         n      t    n               f   
3     e         x           g         g      t    n               f   
4     e         x           f         n      t    n               a   

  gill-spacing gill-size gill-color  ... stalk-surface-below-ring  \
0            w         n          b  ...                        y   
1            c         b          k  ...                        s   
2            c         b          n  ...                        s   
3            w         b          n  ...                        s   
4            w         n          n  ...                        k   

  stalk-color-above-ring stalk-color-below-ring veil-type veil-color  \
0                      w                      p         p          n   
1             

In [34]:

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

labelencoder=LabelEncoder()
onehotencoder = OneHotEncoder()

for col in df2.columns:
    df2[col] = labelencoder.fit_transform(df2[col])
    
for col in mushrooms.columns:
    mushrooms[col] = labelencoder.fit_transform(mushrooms[col])
    
X = mushrooms.drop('class', axis=1)
print(X.shape)
y = mushrooms['class']
X = onehotencoder.fit_transform(X).toarray()
print(X.shape)

X_add = df2.drop('class', axis=1)
print(X_add.shape)
y_add = df2['class']
X_add = onehotencoder.fit_transform(X_add).toarray()
print(X_add.shape)

(25986, 22)
(25986, 116)
(8124, 22)
(8124, 116)


In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import joblib

# 将数据集拆分为训练集和测试集，按照8:2划分。
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = np.vstack([X_train, X_add])
y_train = np.hstack([y_train, y_add])
print(X_train.shape)
# 创建 SVM 模型并进行训练
svm = SVC()
svm.fit(X_train, y_train)

joblib.dump(svm, './results/mushroom_svm_add_model.pkl')

(6499, 116)


['./results/mushroom_svm_add_model.pkl']

In [46]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score


path = './results/mushroom_svm_add_model.pkl'
model = joblib.load(path)
# 预测并评估模型
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)
print("Confusion matrix:\n", confusion)

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1: 1.0
Confusion matrix:
 [[843   0]
 [  0 782]]


In [None]:

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)
print("Confusion matrix:\n", confusion)