In [4]:
# One-off preprocessing: split the combined 'Type' column of the CSV into primary/secondary types (not needed anymore)
import pandas as pd

# Read the existing CSV file. The 'Type' column is split on whitespace below.
# NOTE(review): assumes each 'Type' value holds one or two type names — confirm.
df = pd.read_csv('csv/pokemon_duplicates.csv')

# Split 'Type' at the first run of whitespace only (n=1), so a row can never
# yield more than two parts, and reindex so that both part columns exist even
# when no row has a second type. The previous unbounded split raised a
# ValueError whenever the widest row did not have exactly two parts.
type_parts = df['Type'].str.split(n=1, expand=True).reindex(columns=[0, 1])
df['Primary_Type'] = type_parts[0]

# Rows without a second part get the literal label 'None'.
# NOTE(review): the null count printed later in this notebook for
# csv/pokemon.csv suggests this label is parsed back as a missing value by a
# default pd.read_csv — confirm, and read with keep_default_na=False (or pick
# another label) if the label must survive the round trip.
df['Secondary_Type'] = type_parts[1].fillna('None')

# Optional: Remove the original 'Type' column
df = df.drop(columns='Type')

# Reorder columns (adjust this list as needed)
column_order = [
    'Name', 'Primary_Type', 'Secondary_Type', 'Generation',
    'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed', 'Total',
    'Height(m)', 'Weight(kg)', 'Image URL',
]
df = df[column_order]

# Save to a new CSV file (without the index column)
df.to_csv('csv/pokemon_duplicates_updated.csv', index=False)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tabulate import tabulate

# Read the Pokémon table that the train/test partition is drawn from
df = pd.read_csv('csv/pokemon.csv')

# 80/20 partition stratified on Primary_Type; the fixed random_state makes the
# partition repeatable. Each part gets a fresh 0..n-1 index.
train, test = (
    part.reset_index(drop=True)
    for part in train_test_split(
        df,
        test_size=0.2,
        stratify=df['Primary_Type'],
        random_state=42,
    )
)

# Write each part to its own CSV file (index column omitted)
# NOTE(review): these land in the working directory, while another cell of this
# notebook reads 'csv/pokemon_test.csv' — confirm which location is intended.
train.to_csv('pokemon_train.csv', index=False)
test.to_csv('pokemon_test.csv', index=False)

# Rows of the size summary: label, row count, share of the full dataset
sample_sizes = [["Total", len(df), "100.00%"]] + [
    [label, len(part), f"{len(part)/len(df):.2%}"]
    for label, part in (("Training", train), ("Test", test))
]

print(
    tabulate(
        sample_sizes,
        headers=["Dataset", "Samples", "Percentage"],
        tablefmt="pipe",
    )
)

# Function to create distribution table
def create_distribution_table(feature):
    """Print how the values of one column are distributed across the datasets.

    For every distinct value of ``feature`` in the full dataset, prints its
    relative frequency in the full data, the training split and the test
    split as a pipe-formatted table.

    Reads the module-level ``df``, ``train`` and ``test`` DataFrames.

    Args:
        feature: Name of the column to summarise.

    Returns:
        None. The table is printed.
    """
    def shares(frame):
        # dropna=False keeps missing values as a category of their own.
        # With the default they are dropped, so a column with many missing
        # entries would report shares among the non-missing rows only.
        return frame[feature].value_counts(normalize=True, dropna=False)

    orig_dist = shares(df)
    # Align both splits to the full dataset's categories (same order); a
    # category that does not occur in a split counts as 0%.
    train_dist = shares(train).reindex(orig_dist.index, fill_value=0)
    test_dist = shares(test).reindex(orig_dist.index, fill_value=0)

    table_data = [
        [
            # Give the missing-value category a readable label
            "(missing)" if pd.isna(category) else category,
            f"{orig_share:.2%}",
            f"{train_share:.2%}",
            f"{test_share:.2%}",
        ]
        for category, orig_share, train_share, test_share in zip(
            orig_dist.index, orig_dist, train_dist, test_dist
        )
    ]

    print(f"\n{feature} Distribution:")
    print(tabulate(table_data, headers=[feature, "Original", "Training", "Test"], tablefmt="pipe"))

# Columns whose distributions are compared between the full data and the splits
features = ['Primary_Type', 'Secondary_Type', 'Generation']

# Create distribution tables
for feature in features:
    create_distribution_table(feature)

# Create maximum difference table: for each feature, the largest absolute gap
# between a category's share in the full data and its share in a split
max_diff_data = []

for feature in features:
    # dropna=False counts missing values as their own category instead of
    # silently excluding those rows from the shares
    orig_dist = df[feature].value_counts(normalize=True, dropna=False)

    gaps = []
    for split in (train, test):
        split_dist = split[feature].value_counts(normalize=True, dropna=False)
        # A category absent from the split counts as a share of 0. Plain
        # subtraction would give NaN for it, and the builtin max() over
        # values containing NaN is order-dependent, so the largest gaps
        # could be lost or reported as NaN.
        aligned = split_dist.reindex(orig_dist.index, fill_value=0)
        gaps.append((orig_dist - aligned).abs().max())
    train_diff, test_diff = gaps

    max_diff_data.append([feature, f"{train_diff:.2%}", f"{test_diff:.2%}"])

print("\nMaximum Distribution Difference:")
print(tabulate(max_diff_data, headers=["Feature", "Train", "Test"], tablefmt="pipe"))

In [5]:
import pandas as pd

# Count the missing entries in every column of the dataset; used here to check
# how many rows have no Secondary_Type value
df = pd.read_csv('csv/pokemon.csv')
print(df.isnull().sum())

Name                0
Primary_Type        0
Secondary_Type    499
Generation          0
HP                  0
Attack              0
Defense             0
Sp. Atk             0
Sp. Def             0
Speed               0
Total               0
Height(m)           0
Weight(kg)          0
Image URL           0
dtype: int64


In [None]:
# get all images
import requests
import os
from PIL import Image
from io import BytesIO

def download_and_save_image(row, base_dir='pokemon_test_images', timeout=30):
    """Download the image for one row and save it under a per-generation folder.

    The image is written to ``<base_dir>/gen_<Generation>/<Name>.png``; the
    folder is created if it does not exist. Download, decode and save failures
    are reported with a printed message instead of being raised, so one bad
    row does not stop a bulk run.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            'Name', 'Image URL' and 'Generation'.
        base_dir: Root directory for the per-generation folders.
        timeout: Seconds to wait for the server before giving up; without a
            timeout a stalled connection would block the run indefinitely.

    Returns:
        None.

    Raises:
        KeyError: If ``row`` lacks one of the required keys (the lookups
            happen outside the error-reporting ``try`` block).
    """
    name = row['Name']
    url = row['Image URL']
    gen = row['Generation']

    dir_path = os.path.join(base_dir, f'gen_{gen}')
    os.makedirs(dir_path, exist_ok=True)

    try:
        response = requests.get(url, timeout=timeout)
        # Fail on HTTP error statuses rather than handing an error page to
        # the image decoder
        response.raise_for_status()
        save_path = os.path.join(dir_path, f'{name}.png')
        # Context manager releases the image's resources once it is saved
        with Image.open(BytesIO(response.content)) as img:
            img.save(save_path)
        print(f"Saved: {name}")
    except Exception as e:
        # Deliberately broad: this is a best-effort bulk download
        print(f"Error saving {name}: {str(e)}")

# Read the rows whose images should be fetched
# NOTE(review): the split cell in this notebook writes 'pokemon_test.csv' to the
# working directory, not to 'csv/' — confirm this path points at the current split.
df = pd.read_csv('csv/pokemon_test.csv')

# Hand every row to the downloader; the result is a Series of the function's
# return values, one per row
df.apply(download_and_save_image, axis='columns')