<a href="https://colab.research.google.com/github/niigoatnightcord/595_group_project_4/blob/main/Group_Project_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Input CSV locations (relative to the notebook's working directory).
gas_file = "gas_gaf_595.csv"
stable_file = "stable_595.csv"

# Keep only rows with ID < 5000; per the original author's note, rows with
# larger IDs contain many missing values.
gas_data = pd.read_csv(gas_file)
low_id_mask = gas_data['ID'] < 5000
gas_filtered = gas_data.loc[low_id_mask]

# Only the 20-year measure is used for now. Because a single column is
# selected, the row-wise mean computed below equals that column's value; it
# only becomes a real average if more columns are added to this list.
columns_of_interest = ['GRXX_OPY_20_yr']
selected_columns = ['ID'] + columns_of_interest
gas_selected = gas_filtered[selected_columns].dropna().copy()  # independent frame, safe to mutate

# Row-wise mean of the selected measure columns, stored as a new column.
gas_selected['mean_value'] = gas_selected[columns_of_interest].mean(axis=1)

# TODO: the thresholds below (31 and 61) were chosen arbitrarily and need to be revisited.
def classify(mean):
    """Map a mean score to a category code.

    Returns 1 when ``mean`` is greater than 61, 2 when it lies in the
    inclusive range [31, 61], and 3 otherwise (below 31, or NaN, since every
    comparison with NaN is false).
    """
    if mean > 61:
        return 1
    # Reaching this point means ``mean`` is not above 61, so only the lower
    # bound still has to be checked.
    if mean >= 31:
        return 2
    return 3

# Tag every row with the category code derived from its mean value.
gas_selected['category'] = gas_selected['mean_value'].apply(classify)
labels = gas_selected.loc[:, ['ID', 'category']]  # ID + category only, used as the label table

# IQ feature: the author picked the IQ column with the fewest missing values.
iq_column = 'fiq_strict_bartlett_24m_rescaled'
stable_data = pd.read_csv(stable_file)
stable_filtered = stable_data.loc[stable_data['ID'] < 5000]  # same ID < 5000 restriction as the labels
stable_selected = stable_filtered[['ID', iq_column]].dropna().copy()

# Attach the labels to the IQ rows (inner join on ID: unmatched IDs are dropped).
merged_data = stable_selected.merge(labels, on='ID')

# Single-feature design matrix and target vector.
X = merged_data[[iq_column]]
y = merged_data['category']

# 80% train / 20% test, with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=59
)

# Fit a logistic regression classifier on the training split.
model = LogisticRegression(max_iter=1000, random_state=59)
model.fit(X_train, y_train)

# Score the held-out split.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Model accuracy: {accuracy:.2f}")

# Persist the labelled gas data (ID, measure, mean_value, category).
output_file = "classified_data.csv"
gas_selected.to_csv(output_file, index=False)
print(f"Classification data saved to {output_file}")


In [28]:
import pandas as pd

# Load the merged dataset.
# NOTE: 'stable_merged.csv' is written by the merge cell further down in this
# notebook, so on a fresh kernel that cell has to be run before this one.
df = pd.read_csv('stable_merged.csv')

# Keep the two columns being compared and drop rows where either is missing.
iq_col = 'fiq_strict_bartlett_20y_rescaled'
score_col = 'GRXX_OPY_20_yr'
df_cleaned = df[[iq_col, score_col]].dropna()

# Calculate the correlation (Series.corr defaults to the Pearson method).
correlation = df_cleaned[iq_col].corr(df_cleaned[score_col])

print(f"The correlation between 'fiq_strict_bartlett_20y_rescaled' and 'GRXX_OPY_20_yr' is: {correlation}")


The correlation between 'fiq_strict_bartlett_20y_rescaled' and 'GRXX_OPY_20_yr' is: 0.4890066884539084


In [34]:
import pandas as pd
from scipy.stats import pearsonr

# Load the merged dataset.
# NOTE: "stable_merged.csv" is written by the merge cell further down in this
# notebook, so on a fresh kernel that cell has to be run before this one.
merged_file = "stable_merged.csv"
merged_data = pd.read_csv(merged_file)

# Keep only rows where both columns of interest are present.
required_cols = ['fiq_strict_bartlett_20y_rescaled', 'GRXX_OPY_20_yr']
merged_data = merged_data.dropna(subset=required_cols)

# Pearson correlation coefficient together with its p-value.
iq_values = merged_data['fiq_strict_bartlett_20y_rescaled']
score_values = merged_data['GRXX_OPY_20_yr']
corr, p_value = pearsonr(iq_values, score_values)

# Report both numbers: coefficient to two decimals, p-value in scientific notation.
print(f"Pearson correlation coefficient: {corr:.2f}")
print(f"P-value: {p_value:.2e}")


Pearson correlation coefficient: 0.49
P-value: 6.83e-16


In [2]:
import pandas as pd

# Load the gas dataset.
file_path = "gas_gaf_595.csv"
data = pd.read_csv(file_path)

# Per-column number of non-missing entries (True values of notna(), summed).
non_na_counts = data.notna().sum()

# Show the counts so sparsely populated columns can be spotted.
print("Number of non-NA values for each column:")
print(non_na_counts)


Number of non-NA values for each column:
Unnamed: 0         1039
ID                 1039
GFX_LOW_10yr        460
GFX_HIGH_10yr       461
GRXX_APM_20_yr      696
GRXX_FPM_20_yr      697
GRXX_OPM_20_yr      696
GRXX_APY_20_yr      686
GRXX_FPY_20_yr      703
GRXX_OPY_20_yr      697
GRXXV_APM_25_yr     539
GRXXV_FPM_25_yr     546
GRXXV_OPM_25_yr     547
GRXXV_APY_25_yr     534
GRXXV_FPY_25_yr     541
GRXXV_OPY_25_yr     541
GR27_APM_27_yr      494
GR27_FPM_27_yr      494
GR27_APY_27_yr      494
GR27_FPY_27_yr      493
dtype: int64


In [18]:
import pandas as pd

# Source files.
gas_gaf_file = "gas_gaf_595.csv"
stable_file = "stable_595.csv"

gas_gaf_data = pd.read_csv(gas_gaf_file)
stable_data = pd.read_csv(stable_file)

# Pull the two outcome columns (20-year and 25-year) plus ID, the merge key.
outcome_columns = ["ID", "GRXX_OPY_20_yr", "GRXXV_OPY_25_yr"]
grxx_opy_column = gas_gaf_data[outcome_columns]

# Left join on ID: every row of the stable data is kept, and IDs without a
# match in the gas data get NaN in the outcome columns.
merged_data = stable_data.merge(grxx_opy_column, on="ID", how="left")

# Write the merged table; other cells in this notebook read this file back.
output_file = "stable_merged.csv"
merged_data.to_csv(output_file, index=False)

print(f"Merged dataset saved to {output_file}")


Merged dataset saved to stable_merged.csv


In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# File path for the merged dataset (produced by the merge cell that saves
# "stable_merged.csv").
merged_file = "stable_merged.csv"

# Load the merged dataset
merged_data = pd.read_csv(merged_file)

# Keep rows with ID < 5000 that have both the label source column and the IQ
# feature present. The trailing .copy() makes `filtered_data` an independent
# frame: the rest of this cell adds a column to it and renames one in place,
# which should not be done on a filtered selection of `merged_data` (pandas
# may emit SettingWithCopyWarning). This matches the other filter-then-mutate
# sites in the notebook, which all call .copy().
filtered_data = (
    merged_data[merged_data['ID'] < 5000]
    .dropna(subset=['GRXX_OPY_20_yr', 'fiq_strict_bartlett_20y_rescaled'])
    .copy()
)

# Classify based on GRXX_OPY_20_yr value
def classify(value, low=31, high=60):
    """Bucket a score into one of three category codes.

    The cut-offs are keyword parameters so they can be tuned without editing
    the function; the defaults reproduce the original hard-coded thresholds,
    so ``series.apply(classify)`` behaves exactly as before.

    Args:
        value: Numeric score to classify.
        low: Inclusive lower bound of the middle category (default 31).
        high: Inclusive upper bound of the middle category (default 60).

    Returns:
        1 if ``value > high``, 2 if ``low <= value <= high``, otherwise 3.
        A NaN ``value`` falls through to 3 because every comparison with NaN
        is false, so missing values should be dropped before classifying.
    """
    if value > high:
        return 1
    if low <= value <= high:
        return 2
    return 3

# Derive the class label for every row from its GRXX_OPY_20_yr score.
category_labels = filtered_data['GRXX_OPY_20_yr'].apply(classify)
filtered_data['category'] = category_labels

# Expose the 20-year IQ score under the shorter column name "IQ".
filtered_data = filtered_data.rename(columns={'fiq_strict_bartlett_20y_rescaled': 'IQ'})

# Single-feature design matrix (X) and target vector (y).
X = filtered_data[['IQ']]
y = filtered_data['category']

# 80% train / 20% test, with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=59
)

# Fit a logistic regression classifier on the training split.
model = LogisticRegression(max_iter=1000, random_state=59)
model.fit(X_train, y_train)




In [31]:
# Predict on the held-out split from the training cell and report accuracy.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Model accuracy: {accuracy:.2f}")

# Persist the filtered frame, including the derived 'category' column.
output_file = "stable_classified.csv"
filtered_data.to_csv(output_file, index=False)
print(f"Processed data with classifications saved to {output_file}")

# Confusion matrix of true labels against predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:", cm, sep="\n")

# Per-class precision / recall / F1 summary.
report = classification_report(y_true=y_test, y_pred=y_pred)
print("Classification Report:", report, sep="\n")



Model accuracy: 0.71
Processed data with classifications saved to stable_classified.csv
Confusion Matrix:
[[ 3  5  0]
 [ 4 31  0]
 [ 0  5  1]]
Classification Report:
              precision    recall  f1-score   support

           1       0.43      0.38      0.40         8
           2       0.76      0.89      0.82        35
           3       1.00      0.17      0.29         6

    accuracy                           0.71        49
   macro avg       0.73      0.48      0.50        49
weighted avg       0.73      0.71      0.68        49



In [33]:
# Test set 2: IQ taken from fiq_strict_bartlett_25y_rescaled, labels derived
# from GRXXV_OPY_25_yr. Relies on `merged_data`, `classify` and `model` from
# the training cell.
# NOTE(review): unlike the training data, this set is not restricted to
# ID < 5000, and it is drawn from the same `merged_data` frame the training
# rows came from, so it may include subjects the model was fitted on --
# confirm this is intended before reading the score as out-of-sample.
feature_col_25 = 'fiq_strict_bartlett_25y_rescaled'
label_source_25 = 'GRXXV_OPY_25_yr'

# Drop rows missing either the 25-year IQ or the 25-year outcome.
test_25_data = merged_data.dropna(subset=[feature_col_25, label_source_25]).copy()

# Labels use the same classify() thresholds as the training labels.
test_25_data['category'] = test_25_data[label_source_25].apply(classify)

# The model was fitted on a column named "IQ", so present the 25-year score
# under that same name.
test_25_data = test_25_data.rename(columns={feature_col_25: 'IQ'})

# Features and labels for the second test set.
X_test_25 = test_25_data[['IQ']]
y_test_25 = test_25_data['category']

# Score the already-fitted model on the second test set.
y_pred_25 = model.predict(X_test_25)
accuracy_25 = accuracy_score(y_true=y_test_25, y_pred=y_pred_25)
print(f"Model accuracy on the 100% test set (IQ from fiq_strict_bartlett_25y_rescaled): {accuracy_25:.2f}")

# Confusion matrix of true labels against predicted labels.
cm_25 = confusion_matrix(y_true=y_test_25, y_pred=y_pred_25)
print("Confusion Matrix:", cm_25, sep="\n")

# Per-class precision / recall / F1 summary (rebinds `report` from the previous cell).
report = classification_report(y_true=y_test_25, y_pred=y_pred_25)
print("Classification Report:", report, sep="\n")


Model accuracy on the 100% test set (IQ from fiq_strict_bartlett_25y_rescaled): 0.63
Confusion Matrix:
[[ 12  32   0]
 [  4 128   3]
 [  1  43   3]]
Classification Report:
              precision    recall  f1-score   support

           1       0.71      0.27      0.39        44
           2       0.63      0.95      0.76       135
           3       0.50      0.06      0.11        47

    accuracy                           0.63       226
   macro avg       0.61      0.43      0.42       226
weighted avg       0.62      0.63      0.55       226

