## **Data mining final project**

### **寶可夢數值預測**
- 利用名字、屬性預測其種族值
  - 特徵
    - 名字
    - 種族1
    - 種族2
  - 目標
    - 種族值

In [64]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [65]:
pokemon = pd.read_csv('pokedex.csv', encoding='big5')

pokemon

Unnamed: 0,Image,Index,English Name,Chinese name,Normal,Fighting,Flying,Poison,Ground,Rock,...,Dark,Fairy,Total,HP,Attack,Defense,SP. Atk.,SP. Def,Speed,Legendary
0,images/1.png,1,Bulbasaur,妙蛙種子,0,0,0,1,0,0,...,0,0,318,45,49,49,65,65,45,0
1,images/2.png,2,Ivysaur,妙蛙草,0,0,0,1,0,0,...,0,0,405,60,62,63,80,80,60,0
2,images/3.png,3,Venusaur,妙蛙花,0,0,0,1,0,0,...,0,0,525,80,82,83,100,100,80,0
3,images/4.png,3,Mega Venusaur,MEGA妙蛙花,0,0,0,1,0,0,...,0,0,625,80,100,123,122,120,80,0
4,images/5.png,4,Charmander,小火龍,0,0,0,0,0,0,...,0,0,309,39,52,43,60,50,65,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1210,images/1211.png,1023,Iron Crown,鐵頭殼,0,0,0,0,0,0,...,0,0,590,90,72,100,122,108,98,1
1211,images/1212.png,1024,Terapagos Normal Form,太樂巴戈斯 普通形態,1,0,0,0,0,0,...,0,0,450,90,65,85,65,85,60,1
1212,images/1213.png,1024,Terapagos Terastal Form,太樂巴戈斯 太晶形態,1,0,0,0,0,0,...,0,0,600,95,95,110,105,110,85,1
1213,images/1214.png,1024,Terapagos Stellar Form,太樂巴戈斯 星晶形態,1,0,0,0,0,0,...,0,0,700,160,105,110,130,110,85,1


### **XGBoost**
##### **Use English name, Type1, Type2, Legendary to predict 6 value ( HP, attack defence, sp.atk, sp.def, speed )**
##### **Use Grid Search to find the hyper paremeter for each XGBoost predict model**
HP:
  Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300, 'reg_alpha': 0, 'reg_lambda': 10.0, 'subsample': 0.8}

Attack:
  Best Parameters: {'colsample_bytree': 0.6, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200, 'reg_alpha': 0.1, 'reg_lambda': 10.0, 'subsample': 0.8}

Defense:
  Best Parameters: {'colsample_bytree': 0.6, 'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 100, 'reg_alpha': 0, 'reg_lambda': 1.0, 'subsample': 0.6}

SP. Atk.:
  Best Parameters: {'colsample_bytree': 0.6, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'reg_alpha': 0.1, 'reg_lambda': 10.0, 'subsample': 0.8}

SP. Def:
  Best Parameters: {'colsample_bytree': 0.6, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 200, 'reg_alpha': 1.0, 'reg_lambda': 1.0, 'subsample': 0.6}

Speed:
  Best Parameters: {'colsample_bytree': 0.6, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'reg_alpha': 1.0, 'reg_lambda': 1.0, 'subsample': 0.6}

In [67]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

# 編碼 English Name 欄位
label_encoder = LabelEncoder()
pokemon['English Name Encoded'] = label_encoder.fit_transform(pokemon['English Name'])

# 確保所有特徵都是數值型
features = [col for col in pokemon.columns if col not in [
    'Image', 'Index', 'English Name', 'Chinese name', 'Total',
    'HP', 'Attack', 'Defense', 'SP. Atk.', 'SP. Def', 'Speed', 'Legendary'
]]

# 確保特徵全為數值型
X = pokemon[features]
X = X.apply(pd.to_numeric, errors='coerce').fillna(0)

# 定義目標值
targets = ['HP', 'Attack', 'Defense', 'SP. Atk.', 'SP. Def', 'Speed']

# 每個目標值的最佳參數
best_params_per_target = {
    'HP': {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300, 'reg_alpha': 0, 'reg_lambda': 10.0, 'subsample': 0.8},
    'Attack': {'colsample_bytree': 0.6, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200, 'reg_alpha': 0.1, 'reg_lambda': 10.0, 'subsample': 0.8},
    'Defense': {'colsample_bytree': 0.6, 'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 100, 'reg_alpha': 0, 'reg_lambda': 1.0, 'subsample': 0.6},
    'SP. Atk.': {'colsample_bytree': 0.6, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'reg_alpha': 0.1, 'reg_lambda': 10.0, 'subsample': 0.8},
    'SP. Def': {'colsample_bytree': 0.6, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 200, 'reg_alpha': 1.0, 'reg_lambda': 1.0, 'subsample': 0.6},
    'Speed': {'colsample_bytree': 0.6, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'reg_alpha': 1.0, 'reg_lambda': 1.0, 'subsample': 0.6}
}

# 儲存結果
xgb_final_results = {}

# 使用最佳參數進行預測
for target in targets:
    y = pokemon[target].apply(pd.to_numeric, errors='coerce').fillna(0)
    
    # 分割資料集
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # 使用對應的最佳參數初始化 XGBoost 模型
    best_params = best_params_per_target[target]
    xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42, **best_params)
    xgb_model.fit(X_train, y_train)
    
    # 預測與評估
    y_pred = xgb_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    
    # 儲存結果
    xgb_final_results[target] = {
        'rmse': rmse,
        'predictions': y_pred[:5]  # 儲存前五個預測值
    }

# 輸出結果
for target, result in xgb_final_results.items():
    print(f"{target}:")
    print(f"  Root Mean Squared Error: {result['rmse']:.2f}")
    print(f"  Predictions: {result['predictions']}")
    print()


HP:
  Root Mean Squared Error: 22.57
  Predictions: [76.15513  72.59458  69.34144  69.984215 85.807076]

Attack:
  Root Mean Squared Error: 30.48
  Predictions: [82.46661 77.80936 64.50413 73.37535 74.20354]

Defense:
  Root Mean Squared Error: 28.21
  Predictions: [73.613976 69.31342  63.30112  67.14781  61.372322]

SP. Atk.:
  Root Mean Squared Error: 27.77
  Predictions: [60.310413 98.793594 65.72345  72.06292  61.11575 ]

SP. Def:
  Root Mean Squared Error: 25.57
  Predictions: [67.45473 79.09104 69.93683 70.89569 68.95411]

Speed:
  Root Mean Squared Error: 28.43
  Predictions: [63.194744 77.35519  70.03602  71.51669  63.83572 ]



### **ResNet50**

In [7]:
import pandas as pd
import numpy as np
import os
import torch
import torchvision.transforms as transforms
from torchvision.models import resnet50
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb

# 讀取CSV數據
file_path = 'pokedex.csv'
pokemon = pd.read_csv('pokedex.csv', encoding='big5')

# 檢查image欄位的圖片路徑
image_dir = 'images/'  # 假設圖片存儲在這個資料夾中
pokemon['image_path'] = pokemon['Image'].apply(lambda x: os.path.join(image_dir, x) if isinstance(x, str) else None)

# 加載預訓練的ResNet50模型，並移除全連接層以獲取特徵向量
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
resnet_model = resnet50(pretrained=True)
resnet_model.fc = torch.nn.Identity()  # 移除全連接層
resnet_model = resnet_model.to(device)
resnet_model.eval()

# 定義圖片轉換和特徵提取函數
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

def extract_image_features_pytorch(image_path):
    try:
        img = Image.open(image_path).convert('RGB')
        img_tensor = transform(img).unsqueeze(0).to(device)
        with torch.no_grad():
            features = resnet_model(img_tensor)
        return features.cpu().numpy().flatten()
    except Exception as e:
        return np.zeros((2048,))  # 如果圖片無法載入，返回零向量

# 提取所有圖片特徵
pokemon['image_features'] = pokemon['image_path'].apply(extract_image_features_pytorch)

# 展開圖片特徵並與數據框合併
image_features = np.vstack(pokemon['image_features'].values)
image_features_df = pd.DataFrame(image_features, columns=[f'img_feat_{i}' for i in range(image_features.shape[1])])
pokemon = pd.concat([pokemon.reset_index(drop=True), image_features_df], axis=1)

print(pokemon.columns)

# 移除不需要的欄位，確保所有特徵為數值型
features = [col for col in pokemon.columns if col not in [
    'Image', 'Index', 'English Name', 'Chinese name', 'Total', 'HP', 'Attack',
    'Defense', 'SP. Atk.', 'SP. Def', 'Speed', 'image_path'
]]

# 定義數值特徵與目標值
X = pokemon[features]
X = X.apply(pd.to_numeric, errors='coerce').fillna(0)
targets = ['HP', 'Attack', 'Defense', 'SP. Atk.', 'SP. Def', 'Speed']

# 儲存結果
xgb_final_results_with_images = {}

# 每個目標值的最佳參數
best_params_per_target = {
    'HP': {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300, 'reg_alpha': 0, 'reg_lambda': 10.0, 'subsample': 0.8},
    'Attack': {'colsample_bytree': 0.6, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200, 'reg_alpha': 0.1, 'reg_lambda': 10.0, 'subsample': 0.8},
    'Defense': {'colsample_bytree': 0.6, 'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 100, 'reg_alpha': 0, 'reg_lambda': 1.0, 'subsample': 0.6},
    'SP. Atk.': {'colsample_bytree': 0.6, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'reg_alpha': 0.1, 'reg_lambda': 10.0, 'subsample': 0.8},
    'SP. Def': {'colsample_bytree': 0.6, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 200, 'reg_alpha': 1.0, 'reg_lambda': 1.0, 'subsample': 0.6},
    'Speed': {'colsample_bytree': 0.6, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'reg_alpha': 1.0, 'reg_lambda': 1.0, 'subsample': 0.6}
}

# 使用最佳參數進行預測
for target in targets:
    y = pokemon[target].apply(pd.to_numeric, errors='coerce').fillna(0)
    
    # 分割資料集
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # 使用對應的最佳參數初始化 XGBoost 模型
    best_params = best_params_per_target[target]
    xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42, **best_params)
    xgb_model.fit(X_train, y_train)
    
    # 預測與評估
    y_pred = xgb_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    
    # 儲存結果
    xgb_final_results_with_images[target] = {
        'rmse': rmse,
        'predictions': y_pred[:5]  # 儲存前五個預測值
    }

# 顯示結果
xgb_final_results_with_images




Index(['Image', 'Index', 'English Name', 'Chinese name', 'Normal', 'Fighting',
       'Flying', 'Poison', 'Ground', 'Rock',
       ...
       'img_feat_2038', 'img_feat_2039', 'img_feat_2040', 'img_feat_2041',
       'img_feat_2042', 'img_feat_2043', 'img_feat_2044', 'img_feat_2045',
       'img_feat_2046', 'img_feat_2047'],
      dtype='object', length=2080)


{'HP': {'rmse': 21.126325205585132,
  'predictions': array([75.58118, 69.47629, 67.31301, 67.87396, 78.5401 ], dtype=float32)},
 'Attack': {'rmse': 29.492922129329383,
  'predictions': array([80.00818 , 62.756943, 69.89932 , 68.62924 , 70.29132 ],
        dtype=float32)},
 'Defense': {'rmse': 27.746378193307045,
  'predictions': array([72.26491 , 65.75871 , 63.02259 , 66.24035 , 63.123528],
        dtype=float32)},
 'SP. Atk.': {'rmse': 25.477021612124314,
  'predictions': array([ 62.12312, 102.10347,  61.67697,  66.16504,  66.0623 ],
        dtype=float32)},
 'SP. Def': {'rmse': 24.52176971417481,
  'predictions': array([67.98766 , 77.54687 , 68.62617 , 66.868805, 66.157776],
        dtype=float32)},
 'Speed': {'rmse': 26.6725848745233,
  'predictions': array([60.915733, 70.84258 , 67.76022 , 66.117905, 69.476326],
        dtype=float32)}}