In [78]:
import pandas as pd
from tabulate import tabulate
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.utils import resample
from sklearn.metrics import classification_report, accuracy_score

In [24]:
pokemon = pd.read_csv("/Users/michaeljeon/Desktop/INST414/Module 6 Assignment/67f1685d36ad3.csv")

clean_pokemon = pokemon.drop(['id', 'height', 'weight', 'hp', 'attack', 'defense', 's_attack', 's_defense', 'speed', 'type', 'evo_set'], axis=1).dropna()

animals = pd.read_csv("/Users/michaeljeon/Desktop/INST414/Module 6 Assignment/animal-fun-facts-dataset.csv")

clean_animal = animals.drop(['source', 'media_link', 'wikipedia_link'], axis=1).dropna().drop_duplicates(subset="animal_name", keep="first")
clean_animal = clean_animal.reset_index(drop=True)

In [25]:
pokemon_df = clean_pokemon.rename(columns={"info": "description"})
animal_df = clean_animal.rename(columns={"animal_name": "name", "text": "description"})

In [26]:
pokemon_df["label"] = 1   
animal_df["label"] = 0    

In [28]:
combined_df = pd.concat([pokemon_df, animal_df], ignore_index=True)

combined_df = combined_df.sample(frac=1).reset_index(drop=True)

print(tabulate(combined_df.head(10), headers='keys'))

    name                      description                                                                                                                                                  label
--  ------------------------  ---------------------------------------------------------------------------------------------------------------------------------------------------------  -------
 0  short-tailed fruit bat    There are over 1,000 different species of bats, and they occur on every continent except Antarctica. Only the mammal order Rodentia numbers more species.        0
 1  kangaroo rat              Merriam’s kangaroo rats literally never have to drink water                                                                                                      0
 2  croagunk                  Its cheeks hold poison sacs. It tries to catch foes off guard to jab them with toxic fingers.                                                                    1
 3  sichuan takin             They 

In [107]:
combined_df['name'] = combined_df['name'].fillna("").str.lower()
combined_df['description'] = combined_df['description'].fillna("").str.lower()

combined_df['text'] = combined_df['name'] + " " + combined_df['description']

df_majority = combined_df[combined_df['label'] == 0] 
df_minority = combined_df[combined_df['label'] == 1]  

df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=42
)

df_balanced = pd.concat([df_majority, df_minority_upsampled]).sample(frac=1, random_state=42)

X = df_balanced['text']
y = df_balanced['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,2))
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

model = MultinomialNB()
model.fit(X_train_vec, y_train)

y_pred = model.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

def predict_is_pokemon(name="", description=""):
    """
    Input a name and/or description.
    Returns prediction and probability.
    """
    text = (name + " " + description).lower()
    vec = vectorizer.transform([text])
    pred = model.predict(vec)[0]
    prob = model.predict_proba(vec)[0][1] 
    return {
        "prediction": "Pokémon" if pred == 1 else "Not Pokémon",
        "probability_pokemon": round(prob, 4)
    }

print(predict_is_pokemon("Pikachu", "Electric mouse Pokémon"))
print(predict_is_pokemon("Dog", "A domesticated four-legged animal"))
print(predict_is_pokemon("Charizard", "Fire-breathing dragon Pokémon"))
print(predict_is_pokemon("Orangutan", "Strong orange monkey"))
print(predict_is_pokemon("Mewtwo", "Psychic humanoid mammal"))

print(predict_is_pokemon("Bulbasaur", "frog with plant bulb on back"))
print(predict_is_pokemon("Tynamo", "Eel-like electric fish"))
print(predict_is_pokemon("Blastoise", "turtle-like creature with cannons on back"))
print(predict_is_pokemon("Groudon", "Fire dinosaur that causes volcanoes"))
print(predict_is_pokemon("Kitty", "viscious sharp claws"))

Accuracy: 0.9808
              precision    recall  f1-score   support

           0       0.99      0.97      0.98       596
           1       0.97      0.99      0.98       654

    accuracy                           0.98      1250
   macro avg       0.98      0.98      0.98      1250
weighted avg       0.98      0.98      0.98      1250

{'prediction': 'Pokémon', 'probability_pokemon': np.float64(0.9959)}
{'prediction': 'Not Pokémon', 'probability_pokemon': np.float64(0.0)}
{'prediction': 'Pokémon', 'probability_pokemon': np.float64(0.9995)}
{'prediction': 'Not Pokémon', 'probability_pokemon': np.float64(0.0546)}
{'prediction': 'Pokémon', 'probability_pokemon': np.float64(0.8782)}
{'prediction': 'Not Pokémon', 'probability_pokemon': np.float64(0.1501)}
{'prediction': 'Not Pokémon', 'probability_pokemon': np.float64(0.0032)}
{'prediction': 'Not Pokémon', 'probability_pokemon': np.float64(0.384)}
{'prediction': 'Not Pokémon', 'probability_pokemon': np.float64(0.0626)}
{'prediction': 