<a href="https://colab.research.google.com/github/niigoatnightcord/595_group/blob/main/Group_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Input CSV locations
gas_file = "gas_gaf_595.csv"
stable_file = "stable_595.csv"

# Keep only rows whose ID is below 5000; per the original author's note,
# rows with larger IDs contain many missing values.
gas_data = pd.read_csv(gas_file)
gas_filtered = gas_data.loc[gas_data['ID'] < 5000]

# Only the six 20-year columns are used so far. Averaging them is an
# exploratory choice that still needs to be validated.
columns_of_interest = [
    'GRXX_OPM_20_yr',
    'GRXX_APM_20_yr',
    'GRXX_FPM_20_yr',
    'GRXX_APY_20_yr',
    'GRXX_FPY_20_yr',
    'GRXX_OPY_20_yr',
]

# Drop every row with a missing value in ID or any selected column, then
# append the row-wise mean of the selected columns as 'mean_value'.
# `assign` returns a fresh DataFrame, so later column additions do not
# write into a slice of `gas_filtered`.
gas_selected = (
    gas_filtered.loc[:, ['ID', *columns_of_interest]]
    .dropna()
    .assign(mean_value=lambda frame: frame[columns_of_interest].mean(axis=1))
)

# The default thresholds (80 / 40) are uncalibrated placeholders; pass
# `high` / `low` to try other cut-offs without editing this function.
def classify(mean, high=80, low=40):
    """Bucket a mean value into category 1, 2 or 3.

    Args:
        mean: Numeric value to classify.
        high: Values strictly greater than this are category 1.
        low: Values from ``low`` up to and including ``high`` are
            category 2.

    Returns:
        1 if ``mean > high``; 2 if ``low <= mean <= high``; 3 otherwise.
        NaN compares false against both thresholds and therefore also
        yields 3.
    """
    if mean > high:
        return 1
    # Reaching here means `mean` is not above `high`, so only the lower
    # bound still needs checking.
    if mean >= low:
        return 2
    return 3

# Tag each gas row with its category, then keep just the ID -> category
# mapping that will serve as the label table.
gas_selected['category'] = gas_selected['mean_value'].apply(classify)
labels = gas_selected.loc[:, ['ID', 'category']]

# Load the second dataset, apply the same ID < 5000 filter, and keep a
# single IQ column (picked by the original author for having the fewest
# missing values); rows where it is missing are dropped.
stable_data = pd.read_csv(stable_file)
stable_filtered = stable_data.loc[stable_data['ID'] < 5000]
stable_selected = (
    stable_filtered.loc[:, ['ID', 'fiq_strict_bartlett_24m_rescaled']]
    .dropna()
    .copy()
)

# Join on ID (default inner join) so each IQ row carries its category.
merged_data = stable_selected.merge(labels, on='ID')

# Single-feature design matrix and target vector; 80% train / 20% test
# with a fixed seed for reproducibility.
X = merged_data.loc[:, ['fiq_strict_bartlett_24m_rescaled']]
y = merged_data['category']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=59,
)

# Fit a logistic regression classifier on the training portion.
model = LogisticRegression(max_iter=1000, random_state=59).fit(X_train, y_train)

# Score the held-out portion.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Model accuracy: {accuracy:.2f}")

# Persist the labelled gas table (ID, selected columns, mean, category).
output_file = "classified_data.csv"
gas_selected.to_csv(output_file, index=False)
print(f"Classification data saved to {output_file}")
