In [73]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import os
from sklearn.impute import SimpleImputer
import category_encoders as ce
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV


In [74]:
file_path = r'C:\Users\nonohuang\OneDrive\桌面\kaggle\kaggle\task4\introml_2024_task4_train.csv'

if os.path.exists(file_path):
    print("File exists")
    data = pd.read_csv(r'C:\Users\nonohuang\OneDrive\桌面\kaggle\kaggle\task4\introml_2024_task4_train.csv')
    print(data.head())
else:
    print("File does not exist")

File exists
   id   f0   f1   f2   f3   f4   f5   f6   f7      f8  ...     f10     f11  \
0   0  f01  f11  f21    ?  f41  f51  f61  f71  0.2261  ...  0.9735  0.5856   
1   1  f01  f10  f20  f30  f40    ?  f61  f71  0.3586  ...  0.8581  0.5175   
2   2  f01  f10  f21  f31  f40  f50    ?  f71  0.9501  ...  0.0511  0.3113   
3   3  f01    ?  f20  f30  f41  f50  f60  f71  0.8244  ...  0.8692  0.6662   
4   4  f01  f11  f21  f31  f41  f50  f61  f71  1.1654  ...  0.1525  0.8781   

      f12     f13     f14     f15   f16   f17   f18 class  
0  1.5732  4.7563  0.8080  0.8188  f160  f170  f181    C0  
1  3.2270  4.1908  0.2499  2.9982  f161  f170  f181    C0  
2  1.2130  5.0398  0.6183  4.2670  f161  f171  f181    C0  
3  1.2181  4.7104  0.3333  2.9876  f161  f170  f180    C0  
4  2.2129  1.5904  0.3565  1.0673  f160  f170  f180    C0  

[5 rows x 21 columns]


In [75]:
# 將'?'替換為NaN
data.replace('?', np.nan, inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4800 entries, 0 to 4799
Data columns (total 21 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      4800 non-null   int64 
 1   f0      4658 non-null   object
 2   f1      4658 non-null   object
 3   f2      4634 non-null   object
 4   f3      4658 non-null   object
 5   f4      4654 non-null   object
 6   f5      4670 non-null   object
 7   f6      4660 non-null   object
 8   f7      4651 non-null   object
 9   f8      4662 non-null   object
 10  f9      4639 non-null   object
 11  f10     4639 non-null   object
 12  f11     4666 non-null   object
 13  f12     4657 non-null   object
 14  f13     4645 non-null   object
 15  f14     4637 non-null   object
 16  f15     4652 non-null   object
 17  f16     4800 non-null   object
 18  f17     4800 non-null   object
 19  f18     4800 non-null   object
 20  class   4800 non-null   object
dtypes: int64(1), object(20)
memory usage: 787.6+ KB


In [76]:
# 檢查資料缺失情況
missing_values = data.isnull().sum()
print(missing_values)

id         0
f0       142
f1       142
f2       166
f3       142
f4       146
f5       130
f6       140
f7       149
f8       138
f9       161
f10      161
f11      134
f12      143
f13      155
f14      163
f15      148
f16        0
f17        0
f18        0
class      0
dtype: int64


In [77]:
# Analyze the categorical columns (f0~f7 and f16~f18) for unique values and missing data

# List of categorical columns
categorical_columns = ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f16', 'f17', 'f18']

# Display unique values and count of missing values for each categorical column
for col in categorical_columns:
    unique_values = data[col].unique()
    missing_count = data[col].isnull().sum()
    print(f"Column: {col}")
    print(f"Unique Values: {unique_values}")
    print(f"Missing Values: {missing_count}\n")

Column: f0
Unique Values: ['f01' 'f00' nan]
Missing Values: 142

Column: f1
Unique Values: ['f11' 'f10' nan]
Missing Values: 142

Column: f2
Unique Values: ['f21' 'f20' nan]
Missing Values: 166

Column: f3
Unique Values: [nan 'f30' 'f31']
Missing Values: 142

Column: f4
Unique Values: ['f41' 'f40' nan]
Missing Values: 146

Column: f5
Unique Values: ['f51' nan 'f50']
Missing Values: 130

Column: f6
Unique Values: ['f61' nan 'f60']
Missing Values: 140

Column: f7
Unique Values: ['f71' 'f70' nan]
Missing Values: 149

Column: f16
Unique Values: ['f160' 'f161']
Missing Values: 0

Column: f17
Unique Values: ['f170' 'f171']
Missing Values: 0

Column: f18
Unique Values: ['f181' 'f180']
Missing Values: 0



In [78]:
import matplotlib.pyplot as plt
# Analyze the numerical columns (f8~f15) for statistical properties and missing data

# List of numerical columns
numerical_columns = ['f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15']

# Display descriptive statistics for numerical columns
numerical_stats = data[numerical_columns].describe()
print("Descriptive Statistics for Numerical Columns:")
print(numerical_stats)

# Check for missing values in numerical columns
missing_values_numerical = data[numerical_columns].isnull().sum()
print("\nMissing Values in Numerical Columns:")
print(missing_values_numerical)


Descriptive Statistics for Numerical Columns:
            f8      f9     f10     f11     f12     f13     f14     f15
count     4662    4639    4639    4666    4657    4645    4637    4652
unique    4008    4122    3940    3889    4368    4345    3916    4544
top     1.1121  0.8001  1.1608  0.5778  1.5771  1.2898  0.5271  2.1275
freq         4       4       4       5       4       3       7       3

Missing Values in Numerical Columns:
f8     138
f9     161
f10    161
f11    134
f12    143
f13    155
f14    163
f15    148
dtype: int64


In [79]:
# Columns to fill with mean values
columns_to_fill = ['f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15']

# Fill missing values with column mean
for col in columns_to_fill:
    data[col] = data[col].astype(float)  # Ensure the column is of float type
    data[col].fillna(data[col].mean(), inplace=True)

# Verify that there are no missing values left in the specified columns
print(data[columns_to_fill].isnull().sum())

data

f8     0
f9     0
f10    0
f11    0
f12    0
f13    0
f14    0
f15    0
dtype: int64


Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f10,f11,f12,f13,f14,f15,f16,f17,f18,class
0,0,f01,f11,f21,,f41,f51,f61,f71,0.2261,...,0.9735,0.5856,1.5732,4.7563,0.808000,0.818800,f160,f170,f181,C0
1,1,f01,f10,f20,f30,f40,,f61,f71,0.3586,...,0.8581,0.5175,3.2270,4.1908,0.249900,2.998200,f161,f170,f181,C0
2,2,f01,f10,f21,f31,f40,f50,,f71,0.9501,...,0.0511,0.3113,1.2130,5.0398,0.618300,4.267000,f161,f171,f181,C0
3,3,f01,,f20,f30,f41,f50,f60,f71,0.8244,...,0.8692,0.6662,1.2181,4.7104,0.333300,2.987600,f161,f170,f180,C0
4,4,f01,f11,f21,f31,f41,f50,f61,f71,1.1654,...,0.1525,0.8781,2.2129,1.5904,0.356500,1.067300,f160,f170,f180,C0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4795,4795,f01,f11,f20,f31,f40,f50,f61,f71,0.2228,...,0.2169,1.0913,2.5216,2.6287,0.789400,1.481700,f161,f170,f180,C5
4796,4796,,f10,f20,f30,f40,f51,f61,f71,0.1310,...,0.1186,0.1703,1.7939,0.6713,0.624647,4.641500,f161,f170,f181,C5
4797,4797,f01,f10,f20,f30,f40,f51,f60,f71,0.4483,...,0.3918,1.2872,1.1864,0.1411,0.146900,5.321200,f161,f171,f181,C5
4798,4798,f00,f10,f20,f30,f41,f51,f60,f71,0.4568,...,0.8263,0.5226,1.6286,3.2508,0.064900,1.299000,f160,f170,f180,C5


In [80]:
# Print the count of unique values for each categorical column
for col in categorical_columns:
    print(f"Column: {col}")
    print("Value Counts:")
    print(data[col].value_counts())
    print("\n")

Column: f0
Value Counts:
f01    4019
f00     639
Name: f0, dtype: int64


Column: f1
Value Counts:
f10    4112
f11     546
Name: f1, dtype: int64


Column: f2
Value Counts:
f20    2894
f21    1740
Name: f2, dtype: int64


Column: f3
Value Counts:
f31    2758
f30    1900
Name: f3, dtype: int64


Column: f4
Value Counts:
f40    3611
f41    1043
Name: f4, dtype: int64


Column: f5
Value Counts:
f50    2974
f51    1696
Name: f5, dtype: int64


Column: f6
Value Counts:
f61    2635
f60    2025
Name: f6, dtype: int64


Column: f7
Value Counts:
f71    3258
f70    1393
Name: f7, dtype: int64


Column: f16
Value Counts:
f161    2660
f160    2140
Name: f16, dtype: int64


Column: f17
Value Counts:
f170    2765
f171    2035
Name: f17, dtype: int64


Column: f18
Value Counts:
f181    2564
f180    2236
Name: f18, dtype: int64




In [81]:
# Categorical columns to check
categorical_columns = ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f16', 'f17', 'f18']

# Print the count of unique values for each categorical column
for col in categorical_columns:
    print(f"Column: {col}")
    print(data[col].value_counts())
    print("\n")

Column: f0
f01    4019
f00     639
Name: f0, dtype: int64


Column: f1
f10    4112
f11     546
Name: f1, dtype: int64


Column: f2
f20    2894
f21    1740
Name: f2, dtype: int64


Column: f3
f31    2758
f30    1900
Name: f3, dtype: int64


Column: f4
f40    3611
f41    1043
Name: f4, dtype: int64


Column: f5
f50    2974
f51    1696
Name: f5, dtype: int64


Column: f6
f61    2635
f60    2025
Name: f6, dtype: int64


Column: f7
f71    3258
f70    1393
Name: f7, dtype: int64


Column: f16
f161    2660
f160    2140
Name: f16, dtype: int64


Column: f17
f170    2765
f171    2035
Name: f17, dtype: int64


Column: f18
f181    2564
f180    2236
Name: f18, dtype: int64




In [82]:
# Categorical columns to check
categorical_columns = ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f16', 'f17', 'f18']

# Fill NaN values with the most frequent value in each column
for col in categorical_columns:
    most_frequent_value = data[col].mode()[0]
    data[col].fillna(most_frequent_value, inplace=True)

# Verify that there are no missing values left in the specified columns
print(data[categorical_columns].isnull().sum())

f0     0
f1     0
f2     0
f3     0
f4     0
f5     0
f6     0
f7     0
f16    0
f17    0
f18    0
dtype: int64


In [83]:
data.head()

Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f10,f11,f12,f13,f14,f15,f16,f17,f18,class
0,0,f01,f11,f21,f31,f41,f51,f61,f71,0.2261,...,0.9735,0.5856,1.5732,4.7563,0.808,0.8188,f160,f170,f181,C0
1,1,f01,f10,f20,f30,f40,f50,f61,f71,0.3586,...,0.8581,0.5175,3.227,4.1908,0.2499,2.9982,f161,f170,f181,C0
2,2,f01,f10,f21,f31,f40,f50,f61,f71,0.9501,...,0.0511,0.3113,1.213,5.0398,0.6183,4.267,f161,f171,f181,C0
3,3,f01,f10,f20,f30,f41,f50,f60,f71,0.8244,...,0.8692,0.6662,1.2181,4.7104,0.3333,2.9876,f161,f170,f180,C0
4,4,f01,f11,f21,f31,f41,f50,f61,f71,1.1654,...,0.1525,0.8781,2.2129,1.5904,0.3565,1.0673,f160,f170,f180,C0


In [84]:
# 進行頻率編碼
for col in categorical_columns:
    freq_encoding = data[col].value_counts() / len(data)
    data[col] = data[col].map(freq_encoding)



In [85]:
# 分割特徵和標籤
X = data.drop('class', axis=1)
y = data['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [86]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC(random_state=42))
])
param_grid = {
    'svc__C': [0.1, 1, 10],
    'svc__gamma': [0.1, 0.01]
}
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Best Parameters: {grid_search.best_params_}')
print(f'Accuracy: {accuracy}')
# 訓練模型
#model = RandomForestClassifier(random_state=42)
#model.fit(X_train, y_train)
# 訓練 SVC 模型
#model = SVC(random_state=42)
#model.fit(X_train, y_train)

# 進行預測
#y_pred = model.predict(X_test)

# 評估模型
#accuracy = accuracy_score(y_test, y_pred)
#print(f'Accuracy: {accuracy}')

Best Parameters: {'svc__C': 10, 'svc__gamma': 0.01}
Accuracy: 0.9583333333333334


In [87]:
file_path = r'C:\\Users\\nonohuang\\OneDrive\\桌面\\kaggle\\kaggle\\task4\\introml_2024_task4_test_NO_answers_shuffled.csv'
if os.path.exists(file_path):
    print("File exists")
    data = pd.read_csv(file_path)
    print(data.head())
else:
    print("File does not exist")
data.replace('?', np.nan, inplace=True)
data.info()
missing_values = data.isnull().sum()
print(missing_values)

File exists
   id   f0   f1   f2   f3   f4   f5   f6   f7      f8      f9     f10     f11  \
0   0    ?  f10  f21  f31  f41  f51  f60  f70  0.6502  1.0392  0.8372  0.4417   
1   1  f01  f10  f20  f30  f41  f50  f61  f71  0.4965  0.2219  0.8066  1.0555   
2   2  f01  f10  f21  f31  f40  f51  f60  f71  1.0281  0.6482  0.4687  0.3296   
3   3  f01  f10  f20  f30  f40  f51  f60  f70       ?  0.3092  1.2234  0.3484   
4   4  f01  f10  f20  f31  f40  f51  f61  f71  0.7464       ?  0.1788  1.1686   

      f12     f13     f14      f15   f16   f17   f18  
0  0.9828  5.0786  0.9674  10.5419  f160  f170  f180  
1  1.0774  3.0361  0.9684  10.7465  f161  f170  f180  
2  1.9474  1.5626   0.061   2.8468  f160  f170  f180  
3  2.5767  1.4449  0.9238  12.2682  f160  f170  f181  
4  1.0778  3.6329  1.0652   9.7108  f160  f170  f181  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 20 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  -------------- 

In [88]:
# 將'?'替換為NaN
data.replace('?', np.nan, inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 20 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      1200 non-null   int64 
 1   f0      1165 non-null   object
 2   f1      1157 non-null   object
 3   f2      1166 non-null   object
 4   f3      1170 non-null   object
 5   f4      1157 non-null   object
 6   f5      1164 non-null   object
 7   f6      1167 non-null   object
 8   f7      1158 non-null   object
 9   f8      1163 non-null   object
 10  f9      1168 non-null   object
 11  f10     1158 non-null   object
 12  f11     1153 non-null   object
 13  f12     1157 non-null   object
 14  f13     1171 non-null   object
 15  f14     1159 non-null   object
 16  f15     1168 non-null   object
 17  f16     1200 non-null   object
 18  f17     1200 non-null   object
 19  f18     1200 non-null   object
dtypes: int64(1), object(19)
memory usage: 187.6+ KB


In [89]:
# 檢查資料缺失情況
missing_values = data.isnull().sum()
print(missing_values)

id      0
f0     35
f1     43
f2     34
f3     30
f4     43
f5     36
f6     33
f7     42
f8     37
f9     32
f10    42
f11    47
f12    43
f13    29
f14    41
f15    32
f16     0
f17     0
f18     0
dtype: int64


In [90]:
import matplotlib.pyplot as plt
# Analyze the numerical columns (f0~f7) for statistical properties and missing data

# List of numerical columns
numerical_columns = ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7']

# Display descriptive statistics for numerical columns
numerical_stats = data[numerical_columns].describe()
print("Descriptive Statistics for Numerical Columns:")
print(numerical_stats)

# Check for missing values in numerical columns
missing_values_numerical = data[numerical_columns].isnull().sum()
print("\nMissing Values in Numerical Columns:")
print(missing_values_numerical)


Descriptive Statistics for Numerical Columns:
          f0    f1    f2    f3    f4    f5    f6    f7
count   1165  1157  1166  1170  1157  1164  1167  1158
unique     2     2     2     2     2     2     2     2
top      f01   f10   f20   f31   f40   f50   f61   f71
freq    1019  1017   756   714   890   749   627   826

Missing Values in Numerical Columns:
f0    35
f1    43
f2    34
f3    30
f4    43
f5    36
f6    33
f7    42
dtype: int64


In [91]:
import matplotlib.pyplot as plt
# Analyze the numerical columns (f8~f15) for statistical properties and missing data

# List of numerical columns
numerical_columns = ['f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15']

# Display descriptive statistics for numerical columns
numerical_stats = data[numerical_columns].describe()
print("Descriptive Statistics for Numerical Columns:")
print(numerical_stats)

# Check for missing values in numerical columns
missing_values_numerical = data[numerical_columns].isnull().sum()
print("\nMissing Values in Numerical Columns:")
print(missing_values_numerical)


Descriptive Statistics for Numerical Columns:
            f8      f9     f10     f11     f12    f13     f14     f15
count     1163    1168    1158    1153    1157   1171    1159    1168
unique    1112    1143    1104    1115    1140   1151    1105    1161
top     0.0506  1.3125  1.0008  0.3868  0.9563  0.496  1.1708  0.9241
freq         3       2       3       3       2      2       3       2

Missing Values in Numerical Columns:
f8     37
f9     32
f10    42
f11    47
f12    43
f13    29
f14    41
f15    32
dtype: int64


In [92]:
# Categorical columns to check
categorical_columns = ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7']

# Fill NaN values with the most frequent value in each column
for col in categorical_columns:
    most_frequent_value = data[col].mode()[0]
    data[col].fillna(most_frequent_value, inplace=True)

# Verify that there are no missing values left in the specified columns
print(data[categorical_columns].isnull().sum())

f0    0
f1    0
f2    0
f3    0
f4    0
f5    0
f6    0
f7    0
dtype: int64


In [93]:
# Columns to fill with mean values
columns_to_fill = ['f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15']

# Fill missing values with column mean
for col in columns_to_fill:
    data[col] = data[col].astype(float)  # Ensure the column is of float type
    data[col].fillna(data[col].mean(), inplace=True)

# Verify that there are no missing values left in the specified columns
print(data[columns_to_fill].isnull().sum())

data

f8     0
f9     0
f10    0
f11    0
f12    0
f13    0
f14    0
f15    0
dtype: int64


Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18
0,0,f01,f10,f21,f31,f41,f51,f60,f70,0.650200,1.039200,0.8372,0.4417,0.9828,5.0786,0.9674,10.5419,f160,f170,f180
1,1,f01,f10,f20,f30,f41,f50,f61,f71,0.496500,0.221900,0.8066,1.0555,1.0774,3.0361,0.9684,10.7465,f161,f170,f180
2,2,f01,f10,f21,f31,f40,f51,f60,f71,1.028100,0.648200,0.4687,0.3296,1.9474,1.5626,0.0610,2.8468,f160,f170,f180
3,3,f01,f10,f20,f30,f40,f51,f60,f70,0.679981,0.309200,1.2234,0.3484,2.5767,1.4449,0.9238,12.2682,f160,f170,f181
4,4,f01,f10,f20,f31,f40,f51,f61,f71,0.746400,0.913385,0.1788,1.1686,1.0778,3.6329,1.0652,9.7108,f160,f170,f181
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,1195,f01,f10,f21,f31,f40,f50,f60,f71,0.307000,0.072700,0.6140,0.1230,0.9269,1.2372,1.0087,2.6803,f160,f170,f180
1196,1196,f01,f10,f21,f30,f40,f50,f60,f71,1.148200,0.189100,0.5938,0.0750,1.6252,1.0328,0.3019,4.0333,f161,f171,f180
1197,1197,f01,f11,f20,f30,f40,f50,f61,f71,0.056000,0.131400,0.2242,0.3124,0.8023,0.6861,0.8151,2.7759,f160,f170,f180
1198,1198,f01,f10,f21,f31,f40,f51,f61,f71,0.485300,0.060100,0.9925,0.4389,2.0334,0.1627,0.0531,8.1689,f161,f171,f181


In [94]:
file_path = r'C:\\Users\\nonohuang\\OneDrive\\桌面\\kaggle\\kaggle\\task4\\introml_2024_task4_test_NO_answers_shuffled.csv'
if os.path.exists(file_path):
    print("File exists")
    data = pd.read_csv(file_path)
    print(data.head())
else:
    print("File does not exist")
data.replace('?', np.nan, inplace=True)
data.info()
missing_values = data.isnull().sum()
print(missing_values)

File exists
   id   f0   f1   f2   f3   f4   f5   f6   f7      f8      f9     f10     f11  \
0   0    ?  f10  f21  f31  f41  f51  f60  f70  0.6502  1.0392  0.8372  0.4417   
1   1  f01  f10  f20  f30  f41  f50  f61  f71  0.4965  0.2219  0.8066  1.0555   
2   2  f01  f10  f21  f31  f40  f51  f60  f71  1.0281  0.6482  0.4687  0.3296   
3   3  f01  f10  f20  f30  f40  f51  f60  f70       ?  0.3092  1.2234  0.3484   
4   4  f01  f10  f20  f31  f40  f51  f61  f71  0.7464       ?  0.1788  1.1686   

      f12     f13     f14      f15   f16   f17   f18  
0  0.9828  5.0786  0.9674  10.5419  f160  f170  f180  
1  1.0774  3.0361  0.9684  10.7465  f161  f170  f180  
2  1.9474  1.5626   0.061   2.8468  f160  f170  f180  
3  2.5767  1.4449  0.9238  12.2682  f160  f170  f181  
4  1.0778  3.6329  1.0652   9.7108  f160  f170  f181  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 20 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  -------------- 

In [95]:
categorical_columns = ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f16', 'f17', 'f18']
for col in categorical_columns:
    most_frequent_value = data[col].mode()[0]
    data[col].fillna(most_frequent_value, inplace=True)
print(data[categorical_columns].isnull().sum())

columns_to_fill = ['f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15']
for col in columns_to_fill:
    data[col] = data[col].astype(float)
    data[col].fillna(data[col].mean(), inplace=True)
print(data[columns_to_fill].isnull().sum())

f0     0
f1     0
f2     0
f3     0
f4     0
f5     0
f6     0
f7     0
f16    0
f17    0
f18    0
dtype: int64


f8     0
f9     0
f10    0
f11    0
f12    0
f13    0
f14    0
f15    0
dtype: int64


In [96]:
# 保留 id 特徵
X_test = data.copy()

# 進行頻率編碼
for col in categorical_columns:
    freq_encoding = X_test[col].value_counts() / len(X_test)
    X_test[col] = X_test[col].map(freq_encoding)

# 確保所有特徵列都已經正確編碼為數值類型
for col in X_test.columns:
    if X_test[col].dtype == 'object':
        X_test[col] = X_test[col].astype(float)

# 確保特徵名稱與訓練時一致
X_test = X_test[X.columns]

# 預測
test_predictions = best_model.predict(X_test)

# 將預測結果保存到csv文件
submission = pd.DataFrame({'id': data['id'], 'class': test_predictions})
submission.to_csv(r'C:\\Users\\nonohuang\\OneDrive\\桌面\\kaggle\\kaggle\\task4\\submission\\submission.csv', index=False)

print("Predictions saved to submission.csv")

Predictions saved to submission.csv
