In [None]:
# 1. Actual vs. Predicted scatter plot
# NOTE: y_test and y_pred are produced by the train/evaluate code further down
# in this file, so that code has to be run before this plot.
actual_lo, actual_hi = min(y_test), max(y_test)
identity_span = [actual_lo, actual_hi]

plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
# Identity line (predicted == actual), drawn over the range of the actual values.
plt.plot(identity_span, identity_span, color='red')
plt.xlabel('Actual Counts')
plt.ylabel('Predicted Counts')
plt.title('Actual vs. Predicted Counts')
plt.show()

# 2. Residuals plot
# A residual is the actual value minus the predicted value, so points above
# zero are under-predictions and points below zero are over-predictions.
residuals = y_test - y_pred
pred_lo, pred_hi = min(y_pred), max(y_pred)

plt.figure(figsize=(8, 6))
plt.scatter(y_pred, residuals, alpha=0.5)
# Zero-residual reference line spanning the range of the predictions.
plt.hlines(0, pred_lo, pred_hi, colors='red')
plt.xlabel('Predicted Counts')
plt.ylabel('Residuals')
plt.title('Residuals vs. Predicted Counts')
plt.show()

# 3. Feature importance plot.
# Only applicable when `model` exposes `feature_importances_`, as the
# RandomForestRegressor fitted elsewhere in this file does.
feature_importance = model.feature_importances_
# argsort is ascending, so the most important feature ends up as the top bar.
sorted_idx = feature_importance.argsort()

# Open an explicit figure, like the other two plots do. There is one bar per
# one-hot column of encoded_df, so grow the height with the number of features
# to keep the tick labels from overlapping.
plt.figure(figsize=(8, max(6, 0.25 * len(feature_importance))))
plt.barh(encoded_df.columns[sorted_idx], feature_importance[sorted_idx])
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importance')
# Make room for long feature names on the y-axis.
plt.tight_layout()
plt.show()


from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Keep only the rows whose COMMON NAME is one of the 10 most frequent values.
species_counts = data['COMMON NAME'].value_counts()
top_birds = species_counts.nlargest(10).index
is_top_bird = data['COMMON NAME'].isin(top_birds)
filtered_data = data[is_top_bird]

# Count the rows for every (STATE, COMMON NAME) pair; the count lands in a
# column named COUNT.
pair_sizes = filtered_data.groupby(['STATE', 'COMMON NAME']).size()
grouped_data = pair_sizes.reset_index(name='COUNT')

# One-hot encode both categorical columns into a dense array and wrap it in a
# DataFrame whose columns carry the encoder's generated feature names.
categorical_cols = ['STATE', 'COMMON NAME']
encoder = OneHotEncoder(sparse_output=False)
encoded_features = encoder.fit_transform(grouped_data[categorical_cols])
encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoder.get_feature_names_out(categorical_cols),
)

# Features are the one-hot columns; the target is the per-group COUNT.
X, y = encoded_df, grouped_data['COUNT']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest regressor on the training rows
# (fit() returns the estimator itself, so it can be chained).
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows and report the mean squared error.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')