## **Data mining final project**

### **寶可夢數值預測**
- 利用名字、屬性預測其種族值
  - 特徵
    - 名字
    - 屬性1
    - 屬性2
    - 是否為傳說寶可夢
  - 目標
    - 種族值

In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [53]:
# Load the Pokedex table from the CSV next to the notebook; the file is read
# with the Big5 codec, as requested by the `encoding` argument.
pokemon = pd.read_csv(filepath_or_buffer='pokedex.csv', encoding='big5')

# Bare expression as the last statement so the notebook renders the frame.
pokemon

Unnamed: 0,Image,Index,English Name,Chinese name,Type 1,Type 2,Total,HP,Attack,Defense,SP. Atk.,SP. Def,Speed,Legendary
0,images/1.png,1,Bulbasaur,妙蛙種子,Grass,Poison,318,45,49,49,65,65,45,False
1,images/2.png,2,Ivysaur,妙蛙草,Grass,Poison,405,60,62,63,80,80,60,False
2,images/3.png,3,Venusaur,妙蛙花,Grass,Poison,525,80,82,83,100,100,80,False
3,images/4.png,3,Mega Venusaur,MEGA妙蛙花,Grass,Poison,625,80,100,123,122,120,80,False
4,images/5.png,4,Charmander,小火龍,Fire,,309,39,52,43,60,50,65,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1210,images/1211.png,1023,Iron Crown,鐵頭殼,Steel,Psychic,590,90,72,100,122,108,98,True
1211,images/1212.png,1024,Terapagos Normal Form,太樂巴戈斯 普通形態,Normal,,450,90,65,85,65,85,60,True
1212,images/1213.png,1024,Terapagos Terastal Form,太樂巴戈斯 太晶形態,Normal,,600,95,95,110,105,110,85,True
1213,images/1214.png,1024,Terapagos Stellar Form,太樂巴戈斯 星晶形態,Normal,,700,160,105,110,130,110,85,True


### **random forest regressor**

In [54]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder

# Random-forest baseline: regress the 'Total' column on the name, the two
# type columns and the legendary flag, then report the test-set MSE.

# Feature columns and regression target
features = ['English Name', 'Type 1', 'Type 2', 'Legendary']
target = 'Total'

# Take an explicit copy: assigning a column into `pokemon[features]` directly
# writes into a slice of `pokemon` and triggers pandas' SettingWithCopyWarning.
X = pokemon[features].copy()
y = pokemon[target]

# Replace 'English Name' with integer codes from a LabelEncoder.
# NOTE(review): the encoder is fitted on every row before the train/test
# split, and the names look close to unique per row, so this column may act
# as an arbitrary ID rather than a real signal -- confirm this is intended.
label_encoder = LabelEncoder()
X['English Name'] = label_encoder.fit_transform(X['English Name'])

# Hold out 20% of the rows for testing; the seed is fixed for repeatability
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing: one-hot encode the type columns and the legendary flag;
# the remaining column (the encoded name) is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('type', OneHotEncoder(handle_unknown='ignore'), ['Type 1', 'Type 2']),
        ('legendary', OneHotEncoder(handle_unknown='ignore'), ['Legendary'])
    ],
    remainder='passthrough'
)

# Chain preprocessing and the regressor so both are fitted on training data only
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42))
])

# Fit on the training split
pipeline.fit(X_train, y_train)

# Predict 'Total' for the held-out rows
y_pred = pipeline.predict(X_test)

# Mean squared error on the test split
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse:.2f}')

# Show the raw predictions
print(y_pred)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['English Name'] = label_encoder.fit_transform(X['English Name'])


Mean Squared Error: 11365.67
[340.79 487.54 335.86 485.14 440.29 424.6  418.36 384.8  456.64 390.2
 377.04 454.06 427.9  581.2  331.31 342.95 449.61 468.23 460.95 418.82
 350.45 622.4  299.21 438.87 405.14 402.54 650.25 343.69 316.93 345.44
 269.47 395.26 501.28 403.09 444.53 562.   621.75 459.28 516.01 435.06
 363.71 478.23 608.66 471.78 458.69 456.47 383.96 581.2  445.46 638.28
 445.94 469.98 533.59 456.75 409.62 481.11 442.25 460.48 557.6  494.85
 435.13 460.69 453.61 386.63 549.6  453.64 432.88 420.18 420.25 463.08
 323.4  363.3  474.11 543.66 479.1  423.05 479.09 431.76 365.34 583.07
 456.19 403.02 385.17 443.97 576.42 406.1  416.25 231.6  440.9  290.6
 381.22 451.57 391.73 409.8  454.59 408.66 479.6  675.6  482.24 435.04
 476.16 587.6  430.77 453.84 565.1  475.51 583.9  434.51 435.17 267.96
 489.41 422.98 381.05 348.71 430.68 337.16 441.37 444.19 391.52 425.36
 387.28 642.7  398.8  392.98 484.69 441.07 501.73 346.75 490.25 447.29
 666.1  324.04 451.47 401.71 456.29 392.94 385.45 

### **XGBoost**

In [55]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# OneHotEncoder is used below; import it here so the cell does not depend on
# an earlier cell having been executed first.
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

# XGBoost variant: regress the 'Total' column on the name, the two type
# columns and the legendary flag, then report the test-set MSE.

# Feature columns and regression target
features = ['English Name', 'Type 1', 'Type 2', 'Legendary']
target = 'Total'

# Take an explicit copy: assigning a column into `pokemon[features]` directly
# writes into a slice of `pokemon` and triggers pandas' SettingWithCopyWarning.
X = pokemon[features].copy()
y = pokemon[target]

# Replace 'English Name' with integer codes from a LabelEncoder.
# NOTE(review): the encoder is fitted on every row before the train/test
# split, and the names look close to unique per row, so this column may act
# as an arbitrary ID rather than a real signal -- confirm this is intended.
label_encoder = LabelEncoder()
X['English Name'] = label_encoder.fit_transform(X['English Name'])

# Hold out 20% of the rows for testing; the seed is fixed for repeatability
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing: one-hot encode the type columns and the legendary flag;
# the remaining column (the encoded name) is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('type', OneHotEncoder(handle_unknown='ignore'), ['Type 1', 'Type 2']),
        ('legendary', OneHotEncoder(handle_unknown='ignore'), ['Legendary'])
    ],
    remainder='passthrough'
)

# Chain preprocessing and the regressor so both are fitted on training data only
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(random_state=42))
])

# Fit on the training split
pipeline.fit(X_train, y_train)

# Predict 'Total' for the held-out rows
y_pred = pipeline.predict(X_test)

# Mean squared error on the test split
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse:.2f}')

# Show the raw predictions
print(y_pred)


Mean Squared Error: 12294.94
[316.63998 446.22824 337.41916 434.95444 387.82028 352.77667 457.25113
 363.28906 413.11655 411.68713 341.6947  378.82355 462.35883 560.6157
 274.13882 359.73926 435.03198 431.47348 458.48328 411.66742 344.59546
 643.4743  305.84958 439.58438 381.9504  418.4683  687.91516 290.2051
 355.02524 359.87668 329.08002 387.7703  399.5585  402.11606 471.154
 441.28873 594.48157 487.58646 485.4624  446.18982 337.41916 467.59592
 614.7889  417.9594  516.63525 434.2915  297.68762 560.6157  452.44385
 653.91986 445.15598 468.14148 565.958   482.25757 335.7488  469.25082
 412.8022  460.75015 434.103   443.90625 416.27856 411.66077 423.90192
 453.11588 563.0351  495.80948 375.12915 367.47202 470.72525 432.2812
 355.3149  329.38437 474.3599  534.017   446.09286 363.94113 490.73123
 399.0604  315.91876 621.1062  465.68658 418.4683  374.86212 545.50226
 561.4338  450.76468 409.21368 250.5467  436.95035 551.0455  427.13995
 462.66486 403.50046 341.65674 487.50955 417.34668 50

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['English Name'] = label_encoder.fit_transform(X['English Name'])


### **LightGBM**

In [56]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# OneHotEncoder is used below; import it here so the cell does not depend on
# an earlier cell having been executed first.
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

# LightGBM variant: regress the 'Total' column on the name, the two type
# columns and the legendary flag, then report the test-set MSE.

# Feature columns and regression target
features = ['English Name', 'Type 1', 'Type 2', 'Legendary']
target = 'Total'

# Take an explicit copy: assigning a column into `pokemon[features]` directly
# writes into a slice of `pokemon` and triggers pandas' SettingWithCopyWarning.
X = pokemon[features].copy()
y = pokemon[target]

# Replace 'English Name' with integer codes from a LabelEncoder.
# NOTE(review): the encoder is fitted on every row before the train/test
# split, and the names look close to unique per row, so this column may act
# as an arbitrary ID rather than a real signal -- confirm this is intended.
label_encoder = LabelEncoder()
X['English Name'] = label_encoder.fit_transform(X['English Name'])

# Hold out 20% of the rows for testing; the seed is fixed for repeatability
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing: one-hot encode the type columns and the legendary flag;
# the remaining column (the encoded name) is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('type', OneHotEncoder(handle_unknown='ignore'), ['Type 1', 'Type 2']),
        ('legendary', OneHotEncoder(handle_unknown='ignore'), ['Legendary'])
    ],
    remainder='passthrough'
)

# Chain preprocessing and the regressor so both are fitted on training data only
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LGBMRegressor(random_state=42))
])

# Fit on the training split
pipeline.fit(X_train, y_train)

# Predict 'Total' for the held-out rows
y_pred = pipeline.predict(X_test)

# Mean squared error on the test split
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse:.2f}')

# Show the raw predictions
print(y_pred)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['English Name'] = label_encoder.fit_transform(X['English Name'])


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000222 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 319
[LightGBM] [Info] Number of data points in the train set: 972, number of used features: 33
[LightGBM] [Info] Start training from score 441.732510
Mean Squared Error: 9265.17
[359.02121172 498.24790689 355.21528136 457.57063264 408.8569582
 575.22427811 511.95530372 522.24851789 412.63688103 384.82854457
 381.57833874 465.30102917 453.78397545 537.13732662 351.1162439
 341.73527957 427.43343177 452.53931177 440.07084169 365.56686091
 371.00071168 569.77141558 353.05091121 481.28777008 411.85331364
 434.991009   617.23882178 374.63305228 345.69820735 367.84508347
 321.39057954 358.48680036 474.49428839 402.20580397 435.74608637
 611.19967484 587.06948691 432.03641705 516.56699542 460.66348222
 355.21528136 504.72087533 589.82180142 421.52978465 521.21556192
 450.62518747 378.42466336 537.13732662

### **CatBoost**

In [59]:
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# OneHotEncoder is used below; import it here so the cell does not depend on
# an earlier cell having been executed first.
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

# CatBoost variant: regress the 'Total' column on the name, the two type
# columns and the legendary flag, then report the test-set MSE.

# Feature columns and regression target
features = ['English Name', 'Type 1', 'Type 2', 'Legendary']
target = 'Total'

# Take an explicit copy: assigning a column into `pokemon[features]` directly
# writes into a slice of `pokemon` and triggers pandas' SettingWithCopyWarning.
X = pokemon[features].copy()
y = pokemon[target]

# Replace 'English Name' with integer codes from a LabelEncoder.
# NOTE(review): the encoder is fitted on every row before the train/test
# split, and the names look close to unique per row, so this column may act
# as an arbitrary ID rather than a real signal -- confirm this is intended.
label_encoder = LabelEncoder()
X['English Name'] = label_encoder.fit_transform(X['English Name'])

# Hold out 20% of the rows for testing; the seed is fixed for repeatability
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing: one-hot encode the type columns and the legendary flag;
# the remaining column (the encoded name) is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('type', OneHotEncoder(handle_unknown='ignore'), ['Type 1', 'Type 2']),
        ('legendary', OneHotEncoder(handle_unknown='ignore'), ['Legendary'])
    ],
    remainder='passthrough'
)

# Chain preprocessing and the regressor so both are fitted on training data only.
# verbose=0 silences CatBoost's per-iteration training log.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', CatBoostRegressor(random_state=42, verbose=0, depth=6, learning_rate=0.03, iterations=500))
])


# Fit on the training split
pipeline.fit(X_train, y_train)

# Predict 'Total' for the held-out rows
y_pred = pipeline.predict(X_test)

# Mean squared error on the test split
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse:.2f}')

# Show the raw predictions
print(y_pred)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['English Name'] = label_encoder.fit_transform(X['English Name'])


Mean Squared Error: 9023.96
[399.42015014 446.73596462 379.49949799 414.66467848 427.15502304
 593.12438033 424.93101705 504.68714107 470.70819619 425.04422929
 377.68538549 433.60323825 468.23022362 536.09109678 342.25069487
 376.57118284 419.65338841 449.22251217 409.65543539 380.8863956
 393.41636123 589.42788329 372.7365112  442.89492174 379.19050267
 422.84380089 611.28439831 401.05137225 374.93680934 415.44614971
 302.69084406 374.94394945 432.51486146 399.33078025 404.54815281
 536.11355143 613.60360322 415.9608405  533.15525597 422.91351807
 379.49949799 492.45017158 580.0192128  390.5199224  490.4801588
 464.00139958 379.19039668 536.09109678 424.01727723 664.00318179
 400.46152459 463.02632844 485.53046949 427.82991279 420.07368924
 461.44724798 418.06944753 426.40636506 597.27453564 383.79040382
 380.86927718 426.67359553 400.87986849 425.81329501 568.96672845
 452.71569312 417.34232826 406.11364889 371.67814348 432.38090709
 381.11231362 378.17721697 443.38181794 540.269925