# ACNH Villagers
This notebook loads the ACNH villager dataset and cleans its `Species` and `Personality` columns.

In [None]:
# If you need to install packages in this environment, uncomment and run
# (%pip installs into the environment of the running kernel):
# %pip install pandas numpy matplotlib seaborn scikit-learn scipy -q

In [None]:
# Imports
import json
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from scipy.stats import chi2_contingency
%matplotlib inline

## Load data
Read the CSV and show basic info.

In [None]:
# Relative path to the villager CSV
data_path = '../data/acnh-data/villagers.csv'

# Read the whole file into a DataFrame
df = pd.read_csv(filepath_or_buffer=data_path)

# Quick overview: dimensions, the first five rows, then the per-column summary
print('Shape:', df.shape)
display(df.head(n=5))
df.info()

## Basic cleaning & type normalization
Trim whitespace, make `Species` and `Personality` categorical, and check for missing values. Then create a working copy (`df_work`) for fast iteration; by default it is the full dataset, not a subset.

In [None]:
# Strip whitespace from column names so lookups such as df['Species'] are reliable
df.columns = df.columns.str.strip()

# Strip whitespace from the string values of the two columns of interest.
# astype(str) on its own turns a missing value into the literal text 'nan',
# which would then become a real category and make the missing-value check
# below always report zero. The original NaNs are therefore restored with
# .where(notna) after stripping.
for c in ['Species', 'Personality']:
    if c in df.columns:
        df[c] = df[c].astype(str).str.strip().where(df[c].notna())

# Convert to categorical (missing values stay NaN and are not a category)
df['Species'] = df['Species'].astype('category')
df['Personality'] = df['Personality'].astype('category')

# Missing values
missing = df[['Species', 'Personality']].isna().sum()
print('Missing values:', missing)

# Use a working copy for fast iteration (full df by default)
df_work = df.copy()
df_work.shape