# Feature Engineering

In [2]:
# Comprehensive Feature Engineering based on STATS
import pandas as pd
import numpy as np
from datetime import datetime

# Read the raw test split; every feature below is added to this frame in place.
df = pd.read_csv('csv/pokemon_test.csv')

# 1. Stat ratios
# A zero denominator is swapped for +inf before dividing, so a finite
# numerator over a zero stat yields 0 instead of inf/NaN.
defense_denominator = df['Defense'].replace(0, np.inf)
sp_def_denominator = df['Sp. Def'].replace(0, np.inf)
# Mean of the five non-Speed stats, used as the yardstick for Speed.
non_speed_mean = (
    df['HP']
    .add(df['Attack'])
    .add(df['Defense'])
    .add(df['Sp. Atk'])
    .add(df['Sp. Def'])
    .div(5)
    .replace(0, np.inf)
)

df['Attack_Defense_Ratio'] = df['Attack'].div(defense_denominator)
df['SpAtk_SpDef_Ratio'] = df['Sp. Atk'].div(sp_def_denominator)
df['Speed_Avg_Ratio'] = df['Speed'].div(non_speed_mean)

# 2. Stat totals (plain sums of the named stat columns)
df['Physical_Total'] = df['HP'].add(df['Attack']).add(df['Defense'])
df['Special_Total'] = df['Sp. Atk'].add(df['Sp. Def'])
df['Offensive_Total'] = df['Attack'].add(df['Sp. Atk'])
df['Defensive_Total'] = df['Defense'].add(df['Sp. Def'])

# 3. Stat balance
# The six base-stat columns that the row-wise statistics below operate on.
stat_columns = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']
base_stats = df[stat_columns]
row_max = base_stats.max(axis=1)
row_min = base_stats.min(axis=1)

df['Stat_StdDev'] = base_stats.std(axis=1)
df['Stat_Range'] = row_max - row_min

# 4. Stat specialization
# Share of the row's stat total that sits in its single highest stat.
df['Highest_Stat_Percentage'] = row_max / base_stats.sum(axis=1)
# nlargest(2) lists the highest stat first, so diff() produces
# (second-highest - highest): the stored value is zero or negative.
# NOTE(review): confirm this sign convention is what consumers expect.
df['Top_Two_Stat_Difference'] = base_stats.apply(
    lambda stats_row: stats_row.nlargest(2).diff().iloc[-1],
    axis=1,
)

# 5. Stat products: HP multiplied by each defensive stat.
hp = df['HP']
df['Physical_Bulk'] = hp.mul(df['Defense'])
df['Special_Bulk'] = hp.mul(df['Sp. Def'])

# 6. Survivability index: cube root of HP * Defense * Sp. Def.
one_third = 1 / 3
df['Survivability_Index'] = hp.mul(df['Defense']).mul(df['Sp. Def']).pow(one_third)

# 7. Offensive potential: cube root of Attack * Sp. Atk * Speed.
df['Offensive_Potential'] = (
    df['Attack'].mul(df['Sp. Atk']).mul(df['Speed']).pow(one_third)
)

# 8. Size-related
weight_kg = df['Weight(kg)']
height_m = df['Height(m)']
# BMI-style ratio: weight over height squared.
df['BMI'] = weight_kg.div(height_m.pow(2))
df['Weight_Height_Ratio'] = weight_kg.div(height_m)

def size_class(height, weight):
    """Bucket a height/weight pair into 'Small', 'Large' or 'Medium'.

    'Small' requires height below 1 and weight below 20. Failing that,
    'Large' applies when height exceeds 2 or weight exceeds 100. Every
    other combination is 'Medium'.
    """
    is_small = height < 1 and weight < 20
    if is_small:
        return 'Small'

    is_large = height > 2 or weight > 100
    return 'Large' if is_large else 'Medium'

# Tag every row with its size bucket, derived from its height and weight.
df['Size_Class'] = df.apply(
    lambda record: size_class(record['Height(m)'], record['Weight(kg)']),
    axis=1,
)

# 9. Generation-based
# Per generation: average each stat column over that generation's rows, then
# average those column means into one per-stat baseline value.
per_generation_stat_means = df.groupby('Generation')[stat_columns].mean()
generation_avg_stats = per_generation_stat_means.mean(axis=1)

# Relative_Power = the row's summed stats divided by its generation's baseline.
# NOTE(review): the numerator is a sum across the stat columns while the
# baseline is a per-stat mean, so values are scaled up by the number of stat
# columns rather than centred on 1 - confirm this scale is intended.
stat_totals = df[stat_columns].sum(axis=1)
generation_baseline = df['Generation'].map(generation_avg_stats)
df['Relative_Power'] = stat_totals / generation_baseline

# Function to calculate type rarity
def calculate_type_rarity(row, generation_type_counts):
    """Return how rare a row's type combination is within its generation.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing
            'Generation', 'Primary_Type' and 'Secondary_Type'.
        generation_type_counts: Dict keyed by generation; each value maps a
            type-combination label to how often it occurs in that generation.

    Returns:
        ``1 - count / total``, where ``count`` is the number of entries with
        the row's type combination and ``total`` is the sum of all counts for
        the row's generation. Returns ``0`` when the generation has no counts,
        including when the generation has no entry in the dict at all.
    """
    gen = row['Generation']
    primary_type = row['Primary_Type']
    secondary_type = row['Secondary_Type']

    # Label is "Primary-Secondary" for dual types, or just the primary type
    # when the secondary type is missing.
    type_combo = f"{primary_type}-{secondary_type}" if pd.notna(secondary_type) else primary_type

    # Look the generation up with .get(): a generation without an entry
    # (e.g. a NaN generation, which never matches a dict key by equality)
    # then falls through to the zero-total branch instead of raising KeyError.
    gen_counts = generation_type_counts.get(gen, {})

    # Occurrences of this combination, and total rows counted for the generation.
    combo_count = gen_counts.get(type_combo, 0)
    gen_total = sum(gen_counts.values())

    # Rarity is the complement of the combination's relative frequency.
    rarity = 1 - (combo_count / gen_total) if gen_total > 0 else 0

    return rarity

# Count how often each type combination occurs inside every generation.
# Combination labels are "Primary-Secondary", or just the primary type when
# the secondary type is missing.
generation_type_counts = {}
for gen in df['Generation'].unique():
    same_generation_rows = df.loc[df['Generation'].eq(gen)]
    combo_labels = same_generation_rows.apply(
        lambda row: row['Primary_Type']
        if pd.isna(row['Secondary_Type'])
        else f"{row['Primary_Type']}-{row['Secondary_Type']}",
        axis=1,
    )
    generation_type_counts[gen] = combo_labels.value_counts().to_dict()

# Score each row against the per-generation counts built above.
df['Type_Rarity'] = df.apply(
    calculate_type_rarity,
    axis=1,
    args=(generation_type_counts,),
)

# 10. Speed Tier
def speed_tier(speed):
    """Translate a Speed value into a tier label.

    Tiers are checked in ascending order of their exclusive upper bound:
    below 40 'Very Slow', below 60 'Slow', below 80 'Average', below 100
    'Fast'. Any value that is not below 100 is 'Very Fast'.
    """
    tier_upper_bounds = (
        (40, 'Very Slow'),
        (60, 'Slow'),
        (80, 'Average'),
        (100, 'Fast'),
    )
    for upper_bound, label in tier_upper_bounds:
        if speed < upper_bound:
            return label
    return 'Very Fast'

# Label each row with the tier matching its Speed value.
df['Speed_Tier'] = df['Speed'].map(speed_tier)

# Write the frame, original plus engineered columns, without the index.
engineered_csv_path = 'pokemon_test_engineered.csv'
df.to_csv(engineered_csv_path, index=False)

print("Feature engineering complete. New dataset saved.")

Feature engineering complete. New dataset saved.


In [None]:
# Initial Feature Engineering 
import pandas as pd
import numpy as np

# Load the original data
df = pd.read_csv('csv/pokemon_train.csv')

# Work on a copy so `df` keeps the raw columns; the type columns of the copy
# are replaced by one-hot dummies further down.
df_engineered = df.copy()

# Feature Engineering
# Stat_Total is the sum of the six base stats; each *_Ratio feature expresses
# a group of stats as a share of that total.
df_engineered['Stat_Total'] = df['HP'] + df['Attack'] + df['Defense'] + df['Sp. Atk'] + df['Sp. Def'] + df['Speed']
df_engineered['Physical_Ratio'] = (df['Attack'] + df['Defense']) / df_engineered['Stat_Total']
df_engineered['Special_Ratio'] = (df['Sp. Atk'] + df['Sp. Def']) / df_engineered['Stat_Total']
df_engineered['Offensive_Ratio'] = (df['Attack'] + df['Sp. Atk']) / df_engineered['Stat_Total']
df_engineered['Defensive_Ratio'] = (df['Defense'] + df['Sp. Def']) / df_engineered['Stat_Total']
df_engineered['Speed_Ratio'] = df['Speed'] / df_engineered['Stat_Total']
# BMI-style ratio: weight over height squared.
df_engineered['BMI'] = df['Weight(kg)'] / (df['Height(m)'] ** 2)

# Flag rows without a secondary type. A missing secondary type must be
# detected as NaN: the recorded output of this cell has no `Secondary_None`
# dummy column, which indicates the missing values are loaded as NaN rather
# than as the string 'None' (comparing against 'None' alone would therefore
# flag nothing). The literal 'None' is still accepted in case the data is
# loaded with NA parsing disabled.
secondary_type = df['Secondary_Type']
df_engineered['Is_Single_Type'] = (secondary_type.isna() | (secondary_type == 'None')).astype(int)

# One-hot encode Primary and Secondary Types (the two source columns are
# replaced by `Primary_*` / `Secondary_*` indicator columns).
df_engineered = pd.get_dummies(df_engineered, columns=['Primary_Type', 'Secondary_Type'], prefix=['Primary', 'Secondary'])

# Create binary features for early vs late generation (generation 3 or
# earlier counts as early; a missing generation compares False and gets 0).
df_engineered['Is_Early_Gen'] = (df_engineered['Generation'] <= 3).astype(int)

# Identify the highest stat (stores the column name of the largest base stat)
stat_columns = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']
df_engineered['Highest_Stat'] = df_engineered[stat_columns].idxmax(axis=1)

# Calculate stat spread (standard deviation of stats)
df_engineered['Stat_Spread'] = df_engineered[stat_columns].std(axis=1)

# Print the first few rows of the new DataFrame
print(df_engineered.head())

# Save the engineered features to a new CSV file
df_engineered.to_csv('pokemon_train_engineered.csv', index=False)

        Name  Generation  HP  Attack  Defense  Sp. Atk  Sp. Def  Speed  Total  \
0    Malamar         6.0  86      92       88       68       75     73    482   
1    Gothita         5.0  45      30       50       55       65     45    290   
2     Deoxys         3.0  50     150       50      150       50    150    600   
3  Staraptor         4.0  85     120       70       50       60    100    485   
4  Mismagius         4.0  60      60       60      105      105    105    495   

   Height(m)  ...  Secondary_Ice Secondary_Normal  Secondary_Poison  \
0        1.5  ...          False            False             False   
1        0.4  ...          False            False             False   
2        1.7  ...          False            False             False   
3        1.2  ...          False            False             False   
4        0.9  ...          False            False             False   

   Secondary_Psychic  Secondary_Rock  Secondary_Steel  Secondary_Water  \
0           