# Gemini Code

This code was generated by Google's Gemini 2.0 Flash to provide a baseline against which I can compare my results.

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
import numpy as np


## CAM ADDED

## Directory that holds the training JSON files (a relative path, so it is
## resolved against the notebook's current working directory)
path_name = "../data/gemini-training/"

## END CAM ADDED

# Read the four classified datasets, one JSON file each, in a fixed order
df_peakWn, df_peakWd, df_oPeakWn, df_oPeakWd = map(
    pd.read_json,
    (
        f"{path_name}classified_data_atl_peakWn.json",
        f"{path_name}classified_data_atl_peakWd.json",
        f"{path_name}classified_data_atl_oPeakWn.json",
        f"{path_name}classified_data_atl_oPeakWd.json",
    ),
)

# Stack the four frames row-wise; ignore_index=True renumbers the rows 0..n-1
combined_df = pd.concat(
    objs=[df_peakWn, df_peakWd, df_oPeakWn, df_oPeakWd],
    ignore_index=True,
)

# Feature matrix (four numeric columns) and the target label column
# NOTE(review): 'stid' and 'seqNum' look like identifier/ordering fields —
# confirm they are meaningful as numeric model inputs.
feature_columns = ['stid', 'seqNum', 'latitude', 'longitude']
X = combined_df[feature_columns]
y = combined_df['behavior']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply that
# same transform to both the training rows and the held-out rows
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyperparameter candidates: 6 values of C x 2 solvers = 12 combinations
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga'],
}

# One-vs-rest logistic regression; C and solver are filled in by the search.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases —
# confirm against the installed version before upgrading.
base_estimator = LogisticRegression(multi_class='ovr', random_state=42, max_iter=1000)

# Exhaustive search over param_grid with 3-fold cross-validation, ranked by
# weighted F1, running on all available cores (n_jobs=-1).
# NOTE(review): the inputs were scaled before this search, so every CV fold
# shares the same scaling statistics; wrapping scaler + model in a Pipeline
# would isolate the folds if that matters for this baseline.
grid_search = GridSearchCV(
    estimator=base_estimator,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train_scaled, y_train)

# Estimator carrying the best-scoring parameter combination from the search
best_model = grid_search.best_estimator_

# Hard class predictions and per-class probabilities for the held-out rows
y_pred = best_model.predict(X_test_scaled)
y_proba = best_model.predict_proba(X_test_scaled)

# Weighted-average F1 and one-vs-rest ROC AUC on the test split
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')
auc_roc = roc_auc_score(y_true=y_test, y_score=y_proba, multi_class='ovr')

# Emit the summary, one line per item
report_lines = (
    "Refined Metrics for Combined Dataset:",
    f"Best Parameters: {grid_search.best_params_}",
    f"F1 Score: {f1:.4f}",
    f"AUC-ROC Score: {auc_roc:.4f}",
)
print(*report_lines, sep="\n")

Fitting 3 folds for each of 12 candidates, totalling 36 fits




Refined Metrics for Combined Dataset:
Best Parameters: {'C': 10, 'solver': 'saga'}
F1 Score: 0.4919
AUC-ROC Score: 0.7162
