## **載入必要套件**

In [6]:
import numpy as np 
import pandas as pd 
import os
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
import category_encoders as ce


## **讀取資料**

In [None]:
# Absolute location of the training CSV on the author's machine.
file_path = r'C:\Users\nonohuang\OneDrive\桌面\kaggle\kaggle\task1\introml_2024_task1_train.csv'

# Report whether the file is present; the training frame `data` is only
# created (and echoed) when the file can actually be found.
if not os.path.exists(file_path):
    print("File does not exist")
else:
    print("File exists")
    data = pd.read_csv(file_path)
    print(data)

## **檢查資料缺失值並補齊**

In [None]:
# Report the number of missing values in every column.
print(data.isnull().sum())

# Fill missing values with the per-column mean.
# The mean is restricted to numeric columns: on pandas >= 2.0 a plain
# DataFrame.mean() raises TypeError when the frame contains non-numeric
# columns (this dataset appears to have some, given the categorical encoding
# applied further down). Older pandas silently skipped such columns, so the
# result is unchanged there.
# NOTE(review): missing values in non-numeric columns are not filled by this
# step — confirm that is acceptable for the downstream encoder.
data.fillna(data.mean(numeric_only=True), inplace=True)

In [None]:
# 分割特徵和標籤
X = data.drop('class', axis=1)
y = data['class']

# 使用FrequencyEncoder對類別型特徵進行編碼
encoder = ce.CountEncoder()
X_encoded = encoder.fit_transform(X)
# 標籤編碼
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [None]:
# Inspect the encoded features and the encoded labels.
# NOTE(review): if this runs as a single notebook cell, presumably only the
# last bare expression (y_encoded) is echoed — confirm.
X_encoded
y_encoded

In [None]:
# Hold out 20% of the encoded rows as a test split; the fixed random_state
# makes the shuffle reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Support-vector classifier with the library's default hyper-parameters.
model = SVC()

# 5-fold cross-validation over the complete encoded dataset (not only the
# training split); the per-fold scores are kept in `scores`.
scores = cross_val_score(estimator=model, X=X_encoded, y=y_encoded, cv=5)

# Fit the model that is used for prediction later, on the training split only.
# NOTE(review): X_test / y_test are not evaluated anywhere in the visible code.
model.fit(X_train, y_train)

In [None]:
# Show the score obtained on each cross-validation fold.
print(f'Cross-validation scores: {scores}')

# Show the mean of the fold scores.
average_score = scores.mean()
print(f'Average cross-validation score: {average_score}')

In [None]:
# Absolute location of the unlabeled test CSV on the author's machine.
# This rebinds `file_path`, which previously pointed at the training CSV.
file_path = r'C:\Users\nonohuang\OneDrive\桌面\kaggle\kaggle\task1\introml_2024_task1_test_NO_answers_shuffled.csv'

# Report whether the file is present; `test_data` is only created (and
# echoed) when the file can actually be found.
if not os.path.exists(file_path):
    print("File does not exist")
else:
    print("File exists")
    test_data = pd.read_csv(file_path)
    print(test_data)


In [None]:
# Drop the identifier column, then apply the already-fitted CountEncoder to
# the remaining test features.
# NOTE(review): this assumes the training features had no 'id' column, so
# that the test columns line up with what the encoder was fitted on — confirm.
test_features = test_data.drop(columns=['id'])
X_test_encoded = encoder.transform(test_features)

# Predict integer class codes with the fitted model.
test_predictions = model.predict(X_test_encoded)

# Convert the integer codes back to the original class labels.
test_predictions_labels = label_encoder.inverse_transform(test_predictions)


In [None]:
# Build the submission table: the test ids alongside the predicted labels.
submission = test_data[['id']].copy()
submission['class'] = test_predictions_labels

# Write the table to disk without the DataFrame index column.
submission.to_csv('submission.csv', index=False)

print("Predictions saved to submission.csv")