## **載入必要套件**

In [58]:
import numpy as np 
import pandas as pd 
import os
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
import category_encoders as ce


## **讀取資料**

In [59]:
file_path = r'C:\Users\nonohuang\OneDrive\桌面\kaggle\kaggle\task1\introml_2024_task1_train.csv'

if os.path.exists(file_path):
    print("File exists")
    data = pd.read_csv(r'C:\Users\nonohuang\OneDrive\桌面\kaggle\kaggle\task1\introml_2024_task1_train.csv')
    print(data)
else:
    print("File does not exist")

File exists
       f0   f1   f2   f3   f4   f5   f6   f7   f8   f9  ...   f11   f12   f13  \
0     f01  f11  f21  f31  f40  f50  f61  f71  f81  f90  ...  f111  f121  f130   
1     f00  f10  f21  f31  f40  f51  f60  f71  f81  f91  ...  f111  f121  f131   
2     f01  f10  f21  f30  f40  f50  f61  f71  f81  f90  ...  f111  f121  f131   
3     f00  f10  f21  f30  f40  f51  f60  f71  f80  f90  ...  f111  f121  f130   
4     f00  f11  f21  f30  f40  f50  f60  f71  f81  f90  ...  f111  f121  f131   
...   ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   ...   ...   ...   
2395  f00  f10  f21  f31  f40  f50  f60  f70  f80  f90  ...  f111  f121  f131   
2396  f01  f10  f21  f30  f40  f51  f60  f70  f80  f90  ...  f111  f121  f131   
2397  f00  f11  f21  f31  f41  f51  f60  f71  f81  f90  ...  f111  f121  f131   
2398  f00  f11  f21  f31  f41  f51  f60  f71  f80  f91  ...  f111  f121  f131   
2399  f00  f11  f20  f30  f40  f50  f60  f71  f81  f90  ...  f111  f121  f131   

       f14   f1

## **檢查資料缺失值並補齊**

In [60]:
# 檢查是否有缺失值
print(data.isnull().sum())

# 處理缺失值（這裡我們選擇用均值填補缺失值）
data.fillna(data.mean(), inplace=True)

f0       0
f1       0
f2       0
f3       0
f4       0
f5       0
f6       0
f7       0
f8       0
f9       0
f10      0
f11      0
f12      0
f13      0
f14      0
f15      0
f16      0
f17      0
f18      0
f19      0
class    0
dtype: int64


  data.fillna(data.mean(), inplace=True)


In [61]:
# 分割特徵和標籤
X = data.drop('class', axis=1)
y = data['class']

# 使用FrequencyEncoder對類別型特徵進行編碼
encoder = ce.CountEncoder()
X_encoded = encoder.fit_transform(X)
# 標籤編碼
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [62]:
X_encoded
y_encoded

array([0, 0, 0, ..., 2, 2, 2])

In [63]:
# 分割資料為訓練集和測試集
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

In [64]:
# 建立SVC分類模型
model = SVC()

# 進行交叉驗證
scores = cross_val_score(model, X_encoded, y_encoded, cv=5)

model.fit(X_train, y_train)

In [65]:
# 輸出每次交叉驗證的準確率
print(f'Cross-validation scores: {scores}')

# 輸出平均準確率
print(f'Average cross-validation score: {scores.mean()}')

Cross-validation scores: [0.83541667 0.78541667 0.80625    0.775      0.80625   ]
Average cross-validation score: 0.8016666666666667


In [66]:
file_path = r'C:\Users\nonohuang\OneDrive\桌面\kaggle\kaggle\task1\introml_2024_task1_test_NO_answers_shuffled.csv'

if os.path.exists(file_path):
    print("File exists")
    test_data = pd.read_csv(file_path)
    print(test_data)
else:
    print("File does not exist")


File exists
      id   f0   f1   f2   f3   f4   f5   f6   f7   f8  ...   f10   f11   f12  \
0      0  f00  f10  f21  f31  f41  f50  f61  f71  f80  ...  f100  f110  f121   
1      1  f00  f10  f21  f31  f41  f50  f61  f71  f81  ...  f101  f111  f120   
2      2  f00  f10  f21  f30  f40  f51  f61  f70  f81  ...  f100  f111  f121   
3      3  f00  f10  f21  f30  f41  f51  f60  f71  f80  ...  f101  f111  f121   
4      4  f00  f10  f20  f31  f40  f50  f60  f71  f81  ...  f100  f111  f121   
..   ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   ...   ...   ...   
595  595  f01  f10  f21  f30  f41  f50  f60  f70  f81  ...  f100  f111  f121   
596  596  f00  f11  f21  f30  f40  f50  f60  f71  f81  ...  f100  f111  f121   
597  597  f00  f10  f21  f30  f40  f50  f60  f71  f81  ...  f100  f111  f121   
598  598  f00  f10  f21  f30  f40  f50  f60  f70  f81  ...  f100  f111  f121   
599  599  f00  f11  f20  f30  f40  f50  f60  f71  f81  ...  f100  f111  f121   

      f13   f14   f15   f16

In [67]:
# 使用FrequencyEncoder對類別型特徵進行編碼
X_test_encoded = encoder.transform(test_data.drop(columns=['id']))
# 預測
test_predictions = model.predict(X_test_encoded)
# 將預測結果轉換回原始標籤
test_predictions_labels = label_encoder.inverse_transform(test_predictions)


In [68]:
# 將預測結果保存到csv文件
submission = pd.DataFrame({'id': test_data['id'], 'class': test_predictions_labels})
submission.to_csv('submission.csv', index=False)

print("Predictions saved to submission.csv")

Predictions saved to submission.csv
