In [None]:
# Mount Google Drive at /content/drive (Colab only) so files stored on Drive are
# reachable through the regular filesystem.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
import glob
import os
from scipy.stats import skew, kurtosis # 引入偏度和峰度
from scipy.signal import find_peaks # 引入尋找峰值
# 移除 GridSearchCV 相關引入
# from sklearn.model_selection import GridSearchCV # 引入網格搜索
from sklearn.model_selection import train_test_split # 引入訓練集測試集分割
from sklearn.metrics import accuracy_score, log_loss, f1_score # 引入更多評估指標

# Load train_info, the table that is later joined to the extracted features on
# `unique_id` (this step is unchanged from earlier versions of the notebook).
train_info_csv = '/content/drive/MyDrive/AI_CUP/39_Training_Dataset/train_info.csv'
train_info = pd.read_csv(train_info_csv)

# Feature engineering: per-axis statistics, simple time-series descriptors and
# cross-axis relations computed from one recording file.
_SENSOR_COLUMNS = ('Ax', 'Ay', 'Az', 'Gx', 'Gy', 'Gz')
# Column-index pairs whose correlation is exported as '<first>_<second>_corr'.
_CORR_PAIRS = ((0, 1), (0, 2), (1, 2), (3, 4), (3, 5), (4, 5))


def _safe_corr(a, b):
    """Return the Pearson correlation of two 1-D arrays, or 0.0 where it is undefined.

    The correlation is undefined with fewer than two samples or when either input is
    constant; np.corrcoef would produce NaN (and a RuntimeWarning) for constant input.
    """
    if len(a) < 2 or np.std(a) == 0 or np.std(b) == 0:
        return 0.0
    return np.corrcoef(a, b)[0, 1]


def extract_features(file_path):
    """Turn one recording file into a flat dict of numeric features.

    The file is read with np.loadtxt and must hold at least six numeric columns,
    which are taken in order as Ax, Ay, Az, Gx, Gy, Gz (extra columns are ignored).

    Parameters
    ----------
    file_path : str
        Path of the text file to read.

    Returns
    -------
    dict or None
        Feature name -> float value. For every axis: mean, std, max, min, range,
        median, variance, skewness, kurtosis, rms, zero_crossing_rate, num_peaks;
        plus six pairwise correlations and four statistics of the acceleration
        magnitude. None is returned (after printing a message) when the file cannot
        be read, contains no data, or has fewer than six columns.
    """
    try:
        # ndmin=2 keeps a one-line file as a single (1, n_columns) row instead of a
        # flat vector, so the column count can be validated uniformly below.
        data = np.loadtxt(file_path, ndmin=2)
    except Exception as e:
        print(f"❌ 讀取錯誤：{file_path}, 錯誤訊息: {e}")
        return None

    if data.size == 0:
        print(f"⚠️ 空數據檔案：{file_path}")
        return None
    if data.shape[1] < len(_SENSOR_COLUMNS):
        # Previously this surfaced as an uncaught IndexError while slicing columns.
        print(f"❌ 讀取錯誤：{file_path}, 錯誤訊息: "
              f"expected {len(_SENSOR_COLUMNS)} columns, got {data.shape[1]}")
        return None

    n_samples = data.shape[0]
    features = {}
    for i, col in enumerate(_SENSOR_COLUMNS):
        series = data[:, i]
        series_min = np.min(series)
        series_max = np.max(series)
        series_std = np.std(series)
        # Skewness/kurtosis are undefined for a single sample or a constant signal;
        # 0.0 is used in both cases so no NaN leaks into the feature table.
        has_spread = n_samples > 1 and series_std > 0

        # Basic statistics.
        features[f'{col}_mean'] = np.mean(series)
        features[f'{col}_std'] = series_std
        features[f'{col}_max'] = series_max
        features[f'{col}_min'] = series_min
        features[f'{col}_range'] = series_max - series_min
        features[f'{col}_median'] = np.median(series)
        features[f'{col}_variance'] = np.var(series)
        features[f'{col}_skewness'] = skew(series) if has_spread else 0.0
        features[f'{col}_kurtosis'] = kurtosis(series) if has_spread else 0.0
        features[f'{col}_rms'] = np.sqrt(np.mean(series**2))

        # Zero-crossing rate: number of sign changes relative to the sample count.
        zero_crossings = np.where(np.diff(np.sign(series)))[0]
        features[f'{col}_zero_crossing_rate'] = len(zero_crossings) / n_samples

        # Number of local maxima. Stored as float so the column has the same dtype
        # as every other feature and is not lost by float-dtype column selection.
        peaks, _ = find_peaks(series)
        features[f'{col}_num_peaks'] = float(len(peaks))

        # Possible extension: signal energy, np.sum(series**2).

    # Relations between axes (accelerometer pairs first, then gyroscope pairs).
    for a, b in _CORR_PAIRS:
        name = f'{_SENSOR_COLUMNS[a]}_{_SENSOR_COLUMNS[b]}_corr'
        features[name] = _safe_corr(data[:, a], data[:, b])

    # Magnitude of the total acceleration vector at each sample.
    total_accel_magnitude = np.sqrt(data[:, 0]**2 + data[:, 1]**2 + data[:, 2]**2)
    features['total_accel_mean'] = np.mean(total_accel_magnitude)
    features['total_accel_std'] = np.std(total_accel_magnitude)
    features['total_accel_max'] = np.max(total_accel_magnitude)
    features['total_accel_min'] = np.min(total_accel_magnitude)

    return features

# Batch-process the training recordings into one feature row per file.
# The listing is sorted because glob returns paths in arbitrary order; the row order
# carries through to the seeded model fits, so an unsorted listing can change the
# fitted models from one run/machine to the next. The test files are listed the same way.
feature_list = []
train_files = sorted(glob.glob('/content/drive/MyDrive/AI_CUP/39_Training_Dataset/train_data/*.txt'))
for file in train_files:
    # The file stem is used as the integer unique_id; splitext strips only the
    # trailing extension rather than every '.txt' substring.
    uid = os.path.splitext(os.path.basename(file))[0]
    feats = extract_features(file)
    if feats is not None:  # skip files whose features could not be extracted
        feats['unique_id'] = int(uid)
        feature_list.append(feats)

feature_df = pd.DataFrame(feature_list)
print("增加更多特徵後的數據類型:")
print(feature_df.dtypes)
print("增加更多特徵後的數據頭部:")
print(feature_df.head())


# Attach the labels to the extracted features (inner join on the recording id).
data = pd.merge(feature_df, train_info, on='unique_id')

# Model key -> name of the label column in the merged table.
targets = {
    'gender': 'gender',
    'hold racket handed': 'hold racket handed',
    'years': 'play years',
    'level': 'level'
}

# Model inputs are exactly the engineered columns, selected by name.
# Selecting by float dtype silently dropped integer-valued features such as the
# `*_num_peaks` counts, which are built from len(...) and therefore stored as int64.
excluded_cols = set(targets.values()) | {'unique_id'}
feature_cols = [
    col for col in feature_df.columns
    if col not in excluded_cols and col in data.columns
]

print(f"\n使用的特徵列數量: {len(feature_cols)}")

# Hyperparameter search has been removed: every final model is trained once, on all
# merged training rows, with the hand-picked settings below.
# Training runs on CPU; add task_type='GPU', devices='0' to the dict to use a GPU.
_FINAL_MODEL_PARAMS = dict(
    iterations=300,
    learning_rate=0.05,
    depth=6,
    verbose=0,
    random_seed=42,
)

# Model key -> CatBoost loss: Logloss for the two binary targets, MultiClass otherwise.
_FINAL_MODEL_LOSSES = {
    'gender': 'Logloss',
    'hold racket handed': 'Logloss',
    'years': 'MultiClass',
    'level': 'MultiClass',
}


def _train_final_model(label_column, loss_function):
    """Fit and return one CatBoostClassifier for `label_column`.

    Uses the module-level `data` and `feature_cols` as the training table and the
    input columns, and `_FINAL_MODEL_PARAMS` for every hyperparameter except the loss.
    """
    model = CatBoostClassifier(loss_function=loss_function, **_FINAL_MODEL_PARAMS)
    model.fit(data[feature_cols], data[label_column])
    return model


final_models = {}
for model_key, model_loss in _FINAL_MODEL_LOSSES.items():
    # Only the first progress line is preceded by a blank line.
    leading_newline = "\n" if not final_models else ""
    print(f"{leading_newline}訓練最終的 {model_key} 模型...")
    final_models[model_key] = _train_final_model(targets[model_key], model_loss)

# Individual names kept for any later cell that refers to the models directly.
model_gender = final_models['gender']
model_handed = final_models['hold racket handed']
model_years = final_models['years']
model_level = final_models['level']


# Test-set inference: extract features for every test recording with the same
# extract_features function, score them with the trained models and write the
# submission file.
test_files = sorted(glob.glob('/content/drive/MyDrive/AI_CUP/39_Test_Dataset/test_data/*.txt'))
test_features = []
for test_path in test_files:
    test_uid = os.path.basename(test_path).replace('.txt', '')
    test_feats = extract_features(test_path)
    if test_feats is None:
        # Unreadable or empty file: extract_features has already printed the reason.
        continue
    test_feats['unique_id'] = int(test_uid)
    test_features.append(test_feats)

test_df = pd.DataFrame(test_features)

# The test matrix uses exactly the columns the models were trained on.
X_test = test_df[feature_cols]

# One row per test recording, identified by unique_id.
submission = pd.DataFrame()
submission['unique_id'] = test_df['unique_id']

# Binary targets: export the probability assigned to class label 1.
for target in ('gender', 'hold racket handed'):
    if target not in final_models:
        continue
    binary_model = final_models[target]
    if 1 not in binary_model.classes_:
        # Class 1 was never seen during training, so there is no probability column
        # for it; 0.0 is written as a placeholder.
        print(f"⚠️ 目標 '{target}' 在訓練集中可能缺少類別 1。預測機率可能不包含類別 1。")
        submission[target] = 0.0
        continue
    cls_idx = list(binary_model.classes_).index(1)
    submission[target] = binary_model.predict_proba(X_test)[:, cls_idx]

# Play years: one probability column per class known to the model.
if 'years' in final_models:
    years_proba = final_models['years'].predict_proba(X_test)
    for class_label, class_column in zip(final_models['years'].classes_, years_proba.T):
        submission[f'play years_{class_label}'] = class_column

# Level: one probability column per class known to the model.
if 'level' in final_models:
    level_proba = final_models['level'].predict_proba(X_test)
    for class_label, class_column in zip(final_models['level'].classes_, level_proba.T):
        submission[f'level_{class_label}'] = class_column

submission = submission.round(5)

# Write the result to Drive.
submission.to_csv(
    '/content/drive/MyDrive/AI_CUP/submission_advanced.csv',
    index=False,
    float_format='%.5f',
)
print("\n✅ 預測完成，結果已儲存至 submission_advanced.csv")

增加更多特徵後的數據類型:
Ax_mean             float64
Ax_std              float64
Ax_max              float64
Ax_min              float64
Ax_range            float64
                     ...   
total_accel_mean    float64
total_accel_std     float64
total_accel_max     float64
total_accel_min     float64
unique_id             int64
Length: 83, dtype: object
增加更多特徵後的數據頭部:
       Ax_mean       Ax_std   Ax_max   Ax_min  Ax_range  Ax_median  \
0 -3146.685385  2219.015363    892.0 -15598.0   16490.0    -2594.0   
1  3221.629355  4064.627333  18426.0  -6957.0   25383.0     2211.0   
2  4220.467986  5093.116190  23700.0  -1889.0   25589.0     2096.0   
3  3392.260139  2601.734034  10127.0  -1337.0   11464.0     2545.0   
4  3099.495894  2136.362474  10121.0   -287.0   10408.0     2612.0   

    Ax_variance  Ax_skewness  Ax_kurtosis       Ax_rms  ...  Ax_Az_corr  \
0  4.924029e+06    -1.042162     0.617539  3850.410120  ...    0.631420   
1  1.652120e+07     0.953767     0.980418  5186.529770  ...    0.41