In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
import ast

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score


In [2]:
# GET DATA

def load_dataset(file_path):
    """
    Load a dataset from a CSV file into a pandas DataFrame.

    Parameters:
    - file_path (str): The file path to the CSV dataset.

    Returns:
    - pd.DataFrame: The loaded dataset as a pandas DataFrame.
    """
    try:
        data = pd.read_csv(file_path)
        return data
    except FileNotFoundError:
        print("File not found. Please provide a valid file path.")
        return None


In [3]:
file_path = "/Users/lapiscine/code/CasparRitchie/casparcatchemall/pokemon.csv"
data = load_dataset(file_path)
data


Unnamed: 0,abilities,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,...,percentage_male,pokedex_number,sp_attack,sp_defense,speed,type1,type2,weight_kg,generation,is_legendary
0,"['Overgrow', 'Chlorophyll']",1.00,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,1,65,65,45,grass,poison,6.9,1,0
1,"['Overgrow', 'Chlorophyll']",1.00,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,2,80,80,60,grass,poison,13.0,1,0
2,"['Overgrow', 'Chlorophyll']",1.00,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,3,122,120,80,grass,poison,100.0,1,0
3,"['Blaze', 'Solar Power']",0.50,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,88.1,4,60,50,65,fire,,8.5,1,0
4,"['Blaze', 'Solar Power']",0.50,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,88.1,5,80,65,80,fire,,19.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,['Beast Boost'],0.25,1.0,0.5,2.0,0.5,1.0,2.0,0.5,1.0,...,,797,107,101,61,steel,flying,999.9,7,1
796,['Beast Boost'],1.00,1.0,0.5,0.5,0.5,2.0,4.0,1.0,1.0,...,,798,59,31,109,grass,steel,0.1,7,1
797,['Beast Boost'],2.00,0.5,2.0,0.5,4.0,2.0,0.5,1.0,0.5,...,,799,97,53,43,dark,dragon,888.0,7,1
798,['Prism Armor'],2.00,2.0,1.0,1.0,1.0,0.5,1.0,1.0,2.0,...,,800,127,89,79,psychic,,230.0,7,1


In [4]:
def preprocess_pokemon_data(data):
    # Check initial shape
    print("Initial Shape:", data.shape)

    # Fill missing values
    if 'height_m' in data.columns:
        data['height_m'].fillna(data['height_m'].median(), inplace=True)
    if 'weight_kg' in data.columns:
        data['weight_kg'].fillna(data['weight_kg'].median(), inplace=True)

    # Drop columns
    columns_to_drop = ['percentage_male']
    data = data.drop(columns=[col for col in columns_to_drop if col in data.columns])
    print("After dropping columns:", data.shape)
    print("After dropping columns:", data.head())

        # Convert 'capture_rate' to numeric and create 'catchability'
    data['capture_rate'] = pd.to_numeric(data['capture_rate'], errors='coerce')
    data['catchability'] = data['capture_rate'] / 2.55
    data = data.dropna(subset=['catchability'])
    print("After capture rate conversion to catchability:", data.shape)

    data = data.drop(columns=['capture_rate'])
    # Check shape after dropping columns
    print("Shape after dropping capture rate:", data.shape)
    print(" Head after dropping capture rate:", data.head())

    # Handle 'type2' and create 'combined_type'
    data['type2'].fillna('None', inplace=True)
    data['combined_type'] = data['type1'] + "_" + data['type2']

    # Check shape after creating combined type
    print("Shape after combined type:", data.shape)
    print("Head after combined type:", data.head)

    # Handle 'abilities'
    if 'abilities' in data.columns:
        # Convert string representation of list to actual list
        data['abilities'] = data['abilities'].apply(ast.literal_eval)

        # Collect all unique abilities from the dataset
        all_abilities = set().union(*data['abilities'])

        # Prepare data for new DataFrame
        abilities_dicts = []
        for index, row in data.iterrows():
            abilities_dict = {ability: int(ability in row['abilities']) for ability in all_abilities}
            abilities_dicts.append(abilities_dict)

        # Create a DataFrame from list of dictionaries
        abilities_data = pd.DataFrame(abilities_dicts, index=data.index)

        # Concatenate the abilities data
        data = pd.concat([data, abilities_data], axis=1)
        data.drop(columns=['abilities'], inplace=True)
    else:
        print("'abilities' column is missing")
    print("Shape after abilities:", data.shape)
    print("Head after abilities:", data.head)

    # One-hot encoding for 'combined_type' using pd.get_dummies
    data = pd.get_dummies(data, columns=['combined_type'])
    print("Shape after get dummies on combined type:", data.shape)
    print("Head after get dummies on combined type:", data.head)

    # One-hot encoding 'classfication' misspelt field and dropping original columns
    data = pd.get_dummies(data, columns=['classfication'])
    print("Shape after get dummies on classfication:", data.shape)
    print("Head after get dummies on classfication:", data.head)

    data.drop(columns=['japanese_name', 'name', 'base_total', 'type1', 'type2'], inplace=True)
    print("Shape after drop final columns:", data.shape)
    print("Head after drop final columns:", data.head)

    # # Scale numerical columns
    # numerical_cols = data.select_dtypes(include=[np.number])
    # scaler = StandardScaler()
    # data[numerical_cols.columns] = scaler.fit_transform(numerical_cols)
    # print("Shape after scaler:", data.shape)
    # print("Head after scaler:", data.head)

    return data


In [5]:
data


Unnamed: 0,abilities,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,...,percentage_male,pokedex_number,sp_attack,sp_defense,speed,type1,type2,weight_kg,generation,is_legendary
0,"['Overgrow', 'Chlorophyll']",1.00,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,1,65,65,45,grass,poison,6.9,1,0
1,"['Overgrow', 'Chlorophyll']",1.00,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,2,80,80,60,grass,poison,13.0,1,0
2,"['Overgrow', 'Chlorophyll']",1.00,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,3,122,120,80,grass,poison,100.0,1,0
3,"['Blaze', 'Solar Power']",0.50,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,88.1,4,60,50,65,fire,,8.5,1,0
4,"['Blaze', 'Solar Power']",0.50,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,88.1,5,80,65,80,fire,,19.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,['Beast Boost'],0.25,1.0,0.5,2.0,0.5,1.0,2.0,0.5,1.0,...,,797,107,101,61,steel,flying,999.9,7,1
796,['Beast Boost'],1.00,1.0,0.5,0.5,0.5,2.0,4.0,1.0,1.0,...,,798,59,31,109,grass,steel,0.1,7,1
797,['Beast Boost'],2.00,0.5,2.0,0.5,4.0,2.0,0.5,1.0,0.5,...,,799,97,53,43,dark,dragon,888.0,7,1
798,['Prism Armor'],2.00,2.0,1.0,1.0,1.0,0.5,1.0,1.0,2.0,...,,800,127,89,79,psychic,,230.0,7,1


In [6]:
def scale_numerical_cols(data, fitted_scaler=None):
    numerical_cols = data.select_dtypes(include=[np.number])
    if fitted_scaler:
        # Use the pre-fitted scaler
        data[numerical_cols.columns] = fitted_scaler.transform(numerical_cols)
    else:
        # Fit a new scaler if no pre-fitted scaler is provided
        fitted_scaler = StandardScaler()
        data[numerical_cols.columns] = fitted_scaler.fit_transform(numerical_cols)

    return data, fitted_scaler


In [7]:
# Make sure combined_type_encoder is defined and fitted before this call
processed_data = preprocess_pokemon_data(data)
processed_data, fitted_scaler = scale_numerical_cols(processed_data)
processed_data


Initial Shape: (800, 41)
After dropping columns: (800, 40)
After dropping columns:                      abilities  against_bug  against_dark  against_dragon  \
0  ['Overgrow', 'Chlorophyll']          1.0           1.0             1.0   
1  ['Overgrow', 'Chlorophyll']          1.0           1.0             1.0   
2  ['Overgrow', 'Chlorophyll']          1.0           1.0             1.0   
3     ['Blaze', 'Solar Power']          0.5           1.0             1.0   
4     ['Blaze', 'Solar Power']          0.5           1.0             1.0   

   against_electric  against_fairy  against_fight  against_fire  \
0               0.5            0.5            0.5           2.0   
1               0.5            0.5            0.5           2.0   
2               0.5            0.5            0.5           2.0   
3               1.0            0.5            1.0           0.5   
4               1.0            0.5            1.0           0.5   

   against_flying  against_ghost  ...        name  

Unnamed: 0,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,against_grass,...,classfication_Wish Pokémon,classfication_Wolf Pokémon,classfication_Wood Gecko Pokémon,classfication_Woodpecker Pokémon,classfication_Wool Pokémon,classfication_Woolly Crab Pokémon,classfication_Worm Pokémon,classfication_Wrestling Pokémon,classfication_Young Fowl Pokémon,classfication_Zen Charm Pokémon
0,0.005235,-0.130524,0.088513,-0.875670,-1.089822,-0.788606,1.249119,1.334872,0.026869,-0.993874,...,-0.035377,-0.035377,-0.050063,-0.035377,-0.050063,-0.035377,-0.050063,-0.035377,-0.035377,-0.035377
1,0.005235,-0.130524,0.088513,-0.875670,-1.089822,-0.788606,1.249119,1.334872,0.026869,-0.993874,...,-0.035377,-0.035377,-0.050063,-0.035377,-0.050063,-0.035377,-0.050063,-0.035377,-0.035377,-0.035377
2,0.005235,-0.130524,0.088513,-0.875670,-1.089822,-0.788606,1.249119,1.334872,0.026869,-0.993874,...,-0.035377,-0.035377,-0.050063,-0.035377,-0.050063,-0.035377,-0.050063,-0.035377,-0.035377,-0.035377
3,-0.832301,-0.130524,0.088513,-0.111310,-1.089822,-0.091496,-0.920118,-0.320783,0.026869,-0.676975,...,-0.035377,-0.035377,-0.050063,-0.035377,-0.050063,-0.035377,-0.050063,-0.035377,-0.035377,-0.035377
4,-0.832301,-0.130524,0.088513,-0.111310,-1.089822,-0.091496,-0.920118,-0.320783,0.026869,-0.676975,...,-0.035377,-0.035377,-0.050063,-0.035377,-0.050063,-0.035377,-0.050063,-0.035377,-0.035377,-0.035377
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,-1.251069,-0.130524,-1.327694,1.417411,-1.089822,-0.091496,1.249119,-1.148611,0.026869,-0.993874,...,-0.035377,-0.035377,-0.050063,-0.035377,-0.050063,-0.035377,-0.050063,-0.035377,-0.035377,-0.035377
796,0.005235,-0.130524,-1.327694,-0.875670,-1.089822,1.302724,4.141435,-0.320783,0.026869,-0.993874,...,-0.035377,-0.035377,-0.050063,-0.035377,-0.050063,-0.035377,-0.050063,-0.035377,-0.035377,-0.035377
797,1.680306,-1.271720,2.920926,-0.875670,5.613090,1.302724,-0.920118,-0.320783,-0.868777,-0.676975,...,-0.035377,-0.035377,-0.050063,-0.035377,-0.050063,-0.035377,-0.050063,-0.035377,-0.035377,-0.035377
798,1.680306,2.151867,0.088513,-0.111310,-0.132263,-0.788606,-0.197039,-0.320783,1.818162,-0.043177,...,-0.035377,-0.035377,-0.050063,-0.035377,-0.050063,-0.035377,-0.050063,-0.035377,-0.035377,-0.035377


In [8]:
# import numpy as np
# from sklearn.model_selection import train_test_split, cross_val_score
# from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
# from sklearn.linear_model import LinearRegression, Ridge
# from sklearn.metrics import r2_score

# def train_evaluate_stacked_models(processed_data, test_size=0.2, random_state=42):
#     # Align the processed data with the original data indices
#     data = data.loc[processed_data.index]

#     # Define the target variable 'y' and features 'X'
#     y = data['catchability']
#     X = processed_data.drop('catchability', axis=1)  # Features

#     # Splitting the dataset into training and test sets
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

#     # Initialize base models
#     base_model1 = RandomForestRegressor(random_state=random_state)
#     base_model2 = GradientBoostingRegressor(random_state=random_state)

#     # Train base models
#     base_model1.fit(X_train, y_train)
#     base_model2.fit(X_train, y_train)

#     # Make predictions using base models
#     pred1 = base_model1.predict(X_test)
#     pred2 = base_model2.predict(X_test)

#     # Create a new feature matrix using base model predictions
#     X_stacked = np.column_stack((pred1, pred2))

#     # Initialize and train the meta-model (Linear Regression)
#     meta_model = LinearRegression()
#     meta_model.fit(X_stacked, y_test)

#     # Make predictions and evaluate the stacked model
#     stacked_predictions = meta_model.predict(X_stacked)
#     stacked_score = r2_score(y_test, stacked_predictions)
#     print("Stacked Model Score on Test Data:", stacked_score)

#     # K-Fold Cross Validation for Linear Regression Meta-Model
#     scores = cross_val_score(meta_model, X, y, cv=5)
#     print("Linear Regression Cross-Validation Scores:", scores)

#     # Initialize and train the Ridge meta-model
#     ridge_meta_model = Ridge(alpha=1.0)
#     ridge_meta_model.fit(X_stacked, y_test)

#     # Make predictions and evaluate the Ridge stacked model
#     ridge_stacked_predictions = ridge_meta_model.predict(X_stacked)
#     ridge_stacked_score = r2_score(y_test, ridge_stacked_predictions)
#     print("Ridge Stacked Model Score on Test Data:", ridge_stacked_score)

#     # K-Fold Cross Validation for Ridge Meta-Model
#     ridge_scores = cross_val_score(ridge_meta_model, X, y, cv=5)
#     print("Ridge Cross-Validation Scores:", ridge_scores)

#     # Returning models for further use if needed
#     return base_model1, base_model2, meta_model, ridge_meta_model


In [9]:
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

def train_evaluate_enhanced_models(processed_data, test_size=0.2, random_state=42):
    # Define the target variable 'y' and features 'X'
    y = processed_data['catchability']
    X = processed_data.drop('catchability', axis=1)  # Features

    # Fit the scaler only on feature columns
    scaler = StandardScaler().fit(X)

    # Transform the training data
    X_scaled = scaler.transform(X)

    # Splitting the dataset into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=test_size, random_state=random_state)

    # Define base models for stacking
    estimators = [
        ('rf', RandomForestRegressor(random_state=random_state)),
        ('gb', GradientBoostingRegressor(random_state=random_state)),
        ('svr', SVR()),  # Support Vector Regressor
        ('knn', KNeighborsRegressor())  # K-Neighbors Regressor
    ]

    # Initialize Stacking Regressor with a meta-regressor
    stack_reg = StackingRegressor(estimators=estimators, final_estimator=Ridge(alpha=1.0))

    # Fit the model
    stack_reg.fit(X_train, y_train)

    # Make predictions and evaluate
    predictions = stack_reg.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    print("Evaluation Metrics:")
    print("Mean Squared Error:", mse)
    print("Mean Absolute Error:", mae)
    print("R-squared Score:", r2)

    # Optional: Grid Search for Hyperparameter Tuning
    # parameters = {'rf__n_estimators': [100, 200], 'gb__n_estimators': [100, 200]}
    # grid_search = GridSearchCV(stack_reg, param_grid=parameters, cv=5)
    # grid_search.fit(X_train, y_train)
    # best_params = grid_search.best_params_
    # print("Best Parameters:", best_params)

    # Returning the trained model
    return stack_reg, scaler

trained_model, fitted_scaler = train_evaluate_enhanced_models(processed_data)


Evaluation Metrics:
Mean Squared Error: 0.31586637837607695
Mean Absolute Error: 0.38036539258990054
R-squared Score: 0.6901951673013773


In [10]:
# def preprocess_pokemon_data(data, original_encoder):
#     # Check initial shape
#     print("Initial Shape:", data.shape)

#     # Fill missing values
#     if 'height_m' in data.columns:
#         data['height_m'].fillna(data['height_m'].median(), inplace=True)
#     if 'weight_kg' in data.columns:
#         data['weight_kg'].fillna(data['weight_kg'].median(), inplace=True)

#     # Drop columns
#     columns_to_drop = ['percentage_male']
#     data = data.drop(columns=[col for col in columns_to_drop if col in data.columns])

#     # Check shape after dropping columns
#     print("Shape after dropping columns:", data.shape)

#     # Combined Type
#     if 'type1' in data.columns and 'type2' in data.columns:
#         data['type2'].fillna('None', inplace=True)
#         data['combined_type'] = data['type1'] + "_" + data['type2']
#     else:
#         print("Type1 and/or Type2 columns are missing")

#     # Check shape after creating combined type
#     print("Shape after combined type:", data.shape)

#     # One-hot encoding for 'combined_type'
#     if 'combined_type' in data.columns:
#         combined_type_encoded = original_encoder.transform(data[['combined_type']])
#         encoded_columns = pd.DataFrame(combined_type_encoded.toarray(),
#                                        columns=original_encoder.get_feature_names_out(['combined_type']))
#         data = pd.concat([data, encoded_columns], axis=1)
#         data.drop(columns=['combined_type'], inplace=True)
#     else:
#         print("'combined_type' column is missing")


In [11]:
trained_model


In [12]:
new_pokemon_data = pd.read_csv('/Users/lapiscine/code/mtthibault/catchemall/notebooks/pokemon-prediction.csv')
new_pokemon_data.head()


Unnamed: 0,abilities,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,...,percentage_male,pokedex_number,sp_attack,sp_defense,speed,type1,type2,weight_kg,generation,is_legendary
0,['Shields Down'],0,0,0,0,0,0,0,0,0,...,,774,1000,1000,1000,grass,water,4.0,7,0


In [13]:
# Preprocess the new prediction data
processed_new_pokemon_data = preprocess_pokemon_data(new_pokemon_data)
print(processed_new_pokemon_data.head())
# Add missing columns in the prediction data and fill them with zeros
# Create a DataFrame with zeros for all the missing columns
missing_cols = set(processed_data.columns) - set(processed_new_pokemon_data.columns)
missing_cols_data = pd.DataFrame(0, index=processed_new_pokemon_data.index, columns=missing_cols)

# Concatenate this new DataFrame with the original DataFrame
processed_new_pokemon_data = pd.concat([processed_new_pokemon_data, missing_cols_data], axis=1)

# Drop any columns in the prediction data that are not in the training data
extra_cols = set(processed_new_pokemon_data.columns) - set(processed_data.columns)
processed_new_pokemon_data = processed_new_pokemon_data.drop(columns=extra_cols)

# Ensure the order of the columns in the prediction data matches the training data
processed_new_pokemon_data = processed_new_pokemon_data[processed_data.columns]

print(processed_new_pokemon_data.head())


Initial Shape: (1, 41)
After dropping columns: (1, 40)
After dropping columns:           abilities  against_bug  against_dark  against_dragon  \
0  ['Shields Down']            0             0               0   

   against_electric  against_fairy  against_fight  against_fire  \
0                 0              0              0             0   

   against_flying  against_ghost  ...            name  pokedex_number  \
0               0              0  ...  MiniorCASPICHU             774   

   sp_attack  sp_defense  speed  type1  type2  weight_kg  generation  \
0       1000        1000   1000  grass  water        4.0           7   

   is_legendary  
0             0  

[1 rows x 40 columns]
After capture rate conversion to catchability: (1, 41)
Shape after dropping capture rate: (1, 40)
 Head after dropping capture rate:           abilities  against_bug  against_dark  against_dragon  \
0  ['Shields Down']            0             0               0   

   against_electric  against_fairy  

In [14]:
# Drop 'catchability' from the new prediction data if it's present
if 'catchability' in processed_new_pokemon_data.columns:
    processed_new_pokemon_data = processed_new_pokemon_data.drop(columns=['catchability'])

# Add missing columns in the prediction data and fill them with zeros
missing_cols = set(processed_data.drop(columns=['catchability']).columns) - set(processed_new_pokemon_data.columns)
for col in missing_cols:
    processed_new_pokemon_data[col] = 0

# Drop any columns in the prediction data that are not in the training data
extra_cols = set(processed_new_pokemon_data.columns) - set(processed_data.drop(columns=['catchability']).columns)
processed_new_pokemon_data = processed_new_pokemon_data.drop(columns=extra_cols)

# Ensure the order of the columns in the prediction data matches the training data
processed_new_pokemon_data = processed_new_pokemon_data[processed_data.drop(columns=['catchability']).columns]

# Now apply the scaling
processed_new_pokemon_data_scaled = fitted_scaler.transform(processed_new_pokemon_data)

# Predict using the trained model
catchability_prediction = trained_model.predict(processed_new_pokemon_data_scaled)

# Output the prediction
print("Predicted Catchability:", catchability_prediction)


Predicted Catchability: [-0.91009309]


In [15]:
rescaled_prediction = (catchability_prediction + 1) * 255

# Output the rescaled prediction
print("Rescaled Predicted Catchability:", rescaled_prediction)


Rescaled Predicted Catchability: [22.92626269]


# Create some fake Pokemons for testing