<a href="https://colab.research.google.com/github/martinthetechie/machine_learning/blob/main/knn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# K-Nearest Neighbors Implementation for Pokemon Dataset scrapeted from Serebii.net/provided by Kaggle

In [5]:
# Import Libraries
import pandas as pd
# sklearn lib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


In [13]:
# Read pokemon dataset
pokemon_df = pd.read_csv('pokemon.csv')
pokemon_df.columns

Index(['abilities', 'against_bug', 'against_dark', 'against_dragon',
       'against_electric', 'against_fairy', 'against_fight', 'against_fire',
       'against_flying', 'against_ghost', 'against_grass', 'against_ground',
       'against_ice', 'against_normal', 'against_poison', 'against_psychic',
       'against_rock', 'against_steel', 'against_water', 'attack',
       'base_egg_steps', 'base_happiness', 'base_total', 'capture_rate',
       'classfication', 'defense', 'experience_growth', 'height_m', 'hp',
       'japanese_name', 'name', 'percentage_male', 'pokedex_number',
       'sp_attack', 'sp_defense', 'speed', 'type1', 'type2', 'weight_kg',
       'generation', 'is_legendary'],
      dtype='object')

In [14]:
# Features and target
features = ['height_m', 'weight_kg', 'hp', 'attack', 'defense', 'sp_attack', 'sp_defense', 'speed', 'experience_growth', 'base_happiness', 'capture_rate', 'type1', 'type2', 'generation', 'is_legendary']
target = 'type1'  # Example target for classification

In [15]:
X = pokemon_df[features]
y = pokemon_df[target]


In [36]:
# X.head()
# y.head()
# print(X[numeric_features].dtypes)
pokemon_df['capture_rate']
print(pokemon_df['capture_rate'].unique())
pokemon_df = pokemon_df[pokemon_df['capture_rate'] != '30 (Meteorite)255 (Core)']
pokemon_df['capture_rate'] = pokemon_df['capture_rate'].astype(float)
pokemon_df['capture_rate']


['45' '255' '120' '127' '90' '190' '75' '235' '150' '25' '170' '50' '200'
 '100' '180' '60' '225' '30' '35' '3' '65' '70' '125' '205' '155' '145'
 '130' '140' '15' '220' '160' '80' '55' '30 (Meteorite)255 (Core)']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pokemon_df['capture_rate'] = pokemon_df['capture_rate'].astype(float)


0       45.0
1       45.0
2       45.0
3       45.0
4       45.0
       ...  
796     25.0
797    255.0
798     15.0
799      3.0
800      3.0
Name: capture_rate, Length: 800, dtype: float64

In [37]:
X = pokemon_df[features]
y = pokemon_df[target]

In [38]:
# Handle missing values and encode categorical features using sklearn.Pipeline
numeric_features = ['height_m', 'weight_kg', 'hp', 'attack', 'defense', 'sp_attack', 'sp_defense', 'speed', 'experience_growth', 'base_happiness', 'capture_rate']
categorical_features = ['type1', 'type2', 'generation', 'is_legendary']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numeric_features),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features)
    ])

In [39]:
# Preprocessing pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', KNeighborsClassifier(n_neighbors=3))])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Train model
pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred = pipeline.predict(X_test)


In [40]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.65


In [41]:
# Using GridSearchCv to find best params

In [42]:
# Import library
from sklearn.model_selection import GridSearchCV

In [43]:
# Define parameter grid
param_grid = {
    'classifier__n_neighbors': [3, 5, 7, 9],
    'classifier__p': [1, 2]  # p=1 for Manhattan distance, p=2 for Euclidean distance
}

# Grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameters and score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.2f}")



Best parameters: {'classifier__n_neighbors': 9, 'classifier__p': 1}
Best cross-validation score: 0.75


In [44]:
# Predict with best parameters
y_pred = grid_search.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred):.2f}')

Accuracy: 0.77
