In [1]:
# Attach Google Drive to the Colab runtime so the dataset file is reachable
from google.colab import drive

drive.mount('/content/drive')

# Location of the training CSV on the mounted drive; the later cells read it
# through this variable
training_data_path = "/content/drive/My Drive/842975_Data.csv"




Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Individual feature evaluation: fit each candidate feature (together with any
# mandatory features) against the target and report correlation, R² and RMSE.

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Mandatory features to include in every fitted feature set
mandatory_features = []

# Load the training data; the listed spreadsheet error markers are read as NaN
training_data = pd.read_csv(training_data_path, na_values=['#NUM!', '#DIV/0!'])

# Drop rows with NaN values
training_data.dropna(inplace=True)

# Candidate features: every column that is not the target, a mandatory feature
# or one of the columns listed in excluded_columns
target_column = 'Gamma'
excluded_columns = mandatory_features + [target_column, 'Well', 'Depth','Hg', 'Bi','Nb','Nd', 'Pr','W','Cd','La','Sp','Ag','LE']
all_features = [col for col in training_data.columns if col not in excluded_columns]

# Target variable
y = training_data[target_column]

# One tuple per evaluated feature: (feature_set, correlation, r2, rmse, equation)
results = []

for feature in all_features:
    # Combine mandatory features with the current feature
    feature_set = mandatory_features + [feature]
    X = training_data[feature_set]

    # Correlation of the last column (the feature added in this iteration)
    # with the target
    correlation = X.corrwith(y).iloc[-1]

    # Skip features whose correlation is undefined
    if pd.isna(correlation):
        continue

    # No-intercept linear fit; it is scored on the same rows it was fitted on
    model = LinearRegression(fit_intercept=False)
    model.fit(X, y)
    y_pred = model.predict(X)

    r2 = model.score(X, y)
    rmse = np.sqrt(mean_squared_error(y, y_pred))

    # Human-readable form of the fitted equation
    terms = [f"({coef:.4f} * {name})" for coef, name in zip(model.coef_, feature_set)]
    equation = "Gamma = " + " + ".join(terms)

    results.append((feature_set, correlation, r2, rmse, equation))

# Sort results by (signed) correlation in descending order
results.sort(key=lambda x: x[1], reverse=True)

# Display results
print("\nEquations with Each Features (Sorted by Correlation):")
for idx, (feature_set, correlation, r2, rmse, equation) in enumerate(results, start=1):
    print(f"\nEquation {idx}:")
    print(f"Features: {feature_set}")
    print(f"Correlation: {correlation:.4f}")
    print(f"R-squared: {r2:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"{equation}")


In [None]:
# List each feature's correlation, held-out R², RMSE and fitted equation to
# rate how useful it might be

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Mandatory features to include
mandatory_features = []

# Load the training data
training_data = pd.read_csv(training_data_path, na_values=['#NUM!', '#DIV/0!'])

# Drop rows with NaN values
training_data.dropna(inplace=True)

# Candidate features: every column that is not the target, a mandatory feature
# or one of the columns listed in excluded_columns
target_column = 'Gamma'
excluded_columns = mandatory_features + [target_column, 'Well', 'Depth','Hg', 'Bi','Nb','Nd', 'Pr','W','Cd','La','Sp','Ag','LE']
features = [col for col in training_data.columns if col not in excluded_columns]

# Target variable
y = training_data[target_column]

# One tuple per evaluated feature: (feature_set, correlation, r2, rmse, equation)
results = []

# Iterate over the list built in this cell rather than a variable left behind
# by another cell
for feature in features:
    # Combine mandatory features with the current feature
    feature_set = mandatory_features + [feature]
    X = training_data[feature_set]

    # test_size=0.7 holds out 70% of the rows for testing and fits on 30%.
    # NOTE(review): an earlier comment described this as an 80/20 split;
    # confirm which split is intended.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=42)

    # Correlation of the last column (the feature added in this iteration)
    # with the target, on the training rows only
    correlation = X_train.corrwith(y_train).iloc[-1]

    # Skip features whose correlation is undefined
    if pd.isna(correlation):
        continue

    # No-intercept linear fit on the training rows
    model = LinearRegression(fit_intercept=False)
    model.fit(X_train, y_train)

    # Score on the held-out rows
    y_pred = model.predict(X_test)
    r2 = model.score(X_test, y_test)

    # Root mean squared error (the square root is taken, so this is RMSE)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Human-readable form of the fitted equation
    terms = [f"({coef:.4f} * {name})" for coef, name in zip(model.coef_, feature_set)]
    equation = "Gamma = " + " + ".join(terms)

    results.append((feature_set, correlation, r2, rmse, equation))

# Sort results by (signed) correlation in descending order
results.sort(key=lambda x: x[1], reverse=True)

# Display results
print("\nEquations with Mandatory Features and One Additional Feature (Excluding 'Depth', Sorted by Correlation):")
for idx, (feature_set, correlation, r2, rmse, equation) in enumerate(results, start=1):
    print(f"\nEquation {idx}:")
    print(f"Features: {feature_set}")
    print(f"Correlation: {correlation:.4f}")
    print(f"R-squared: {r2:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"{equation}")



In [None]:
#Picks the 10 most important Features by using decision tree

import pandas as pd
from sklearn.tree import DecisionTreeRegressor
import psutil

# Function to monitor memory usage
def check_memory_limit(limit_gb):
    """Raise MemoryError when the RAM reported as used exceeds the limit.

    limit_gb: ceiling in units of 1024**3 bytes. The value compared against it
    is ``psutil.virtual_memory().used``.
    """
    limit_bytes = limit_gb * 1024**3
    if psutil.virtual_memory().used > limit_bytes:
        raise MemoryError(f"Memory usage exceeded {limit_gb} GB.")

from sklearn.preprocessing import StandardScaler

# Resource settings; none of these are referenced later in this cell
MEMORY_LIMIT_GB = 12
batch_size = 10000      # Smaller batches for better memory control
num_processes = 30     # Reduce to limit multiprocessing overhead

# Defined here so the cell does not depend on a value left by an earlier cell
mandatory_features = []

# Load the training data
training_data = pd.read_csv(training_data_path, na_values=['#NUM!', '#DIV/0!'])

# Drop rows with NaN values
training_data.dropna(inplace=True)

# Store float64 columns as float32 to halve their memory footprint
for col in training_data.select_dtypes(include='float64').columns:
    training_data[col] = training_data[col].astype('float32')

# Candidate features: every column that is not the target, a mandatory feature
# or one of the columns listed in excluded_columns
target_column = 'Gamma'
excluded_columns = mandatory_features + [target_column, 'Well', 'Depth','Hg', 'Bi','Nb','Nd', 'Pr','W','Cd','La','Sp','Ag','LE']
features = [col for col in training_data.columns if col not in excluded_columns]

# Separate features (X) and target (y)
X = training_data[features]
y = training_data[target_column]

# Standardize the features.
# NOTE(review): tree splits depend only on the ordering of values within a
# column, so scaling is not expected to change the importances; it is kept to
# leave the pipeline as it was.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 1: fit an unrestricted-depth decision tree to obtain feature importances
tree_model = DecisionTreeRegressor(random_state=42, max_depth=None)
tree_model.fit(X_scaled, y)

# Importances labelled with the corresponding column names
feature_importances = pd.Series(tree_model.feature_importances_, index=features)

# Step 2: select the 10 features with the largest importance
top_10 = feature_importances.nlargest(10)
top_10_features = top_10.index.tolist()

# Print the top 10 features and their importance
print(f"Top 10 Important Features based on Decision Tree:\n{top_10}\n")




In [None]:
# Creates engineered features by multiplying and dividing pairs of columns and
# lists them by highest absolute correlation with the target

import pandas as pd
from itertools import combinations

# Load the training data
training_data = pd.read_csv(training_data_path, na_values=['#NUM!', '#DIV/0!'])

# Drop rows with NaN values
training_data.dropna(inplace=True)

# Target column
target_column = 'Gamma'

# Columns to exclude from consideration for feature engineering
excluded_columns = [target_column, 'Well', 'Depth', 'Hg', 'Bi', 'Nb', 'Nd', 'Pr', 'W', 'Cd', 'La', 'Sp', 'Ag', 'LE']

# Keep only columns that take more than one distinct value
all_features = [
    col for col in training_data.columns
    if col not in excluded_columns and training_data[col].nunique() > 1
]

# Target variable
y = training_data[target_column]

# Whether each column is free of zeros and can therefore serve as a divisor.
# Computed once per column instead of once per pair.
zero_free = {col: bool((training_data[col] != 0).all()) for col in all_features}

# (engineered feature name, correlation with the target)
results = []

# Iterate through combinations of two features
for feature1, feature2 in combinations(all_features, 2):
    col1 = training_data[feature1]
    col2 = training_data[feature2]

    # The product is always generated; a quotient only when its divisor
    # column contains no zeros
    engineered_features = {f"{feature1}*{feature2}": col1 * col2}
    if zero_free[feature2]:
        engineered_features[f"{feature1}/{feature2}"] = col1 / col2
    if zero_free[feature1]:
        engineered_features[f"{feature2}/{feature1}"] = col2 / col1

    # Evaluate correlation with the target
    for feat_name, feat_values in engineered_features.items():
        try:
            correlation = feat_values.corr(y)
            if not pd.isna(correlation):  # Skip if correlation is NaN
                results.append((feat_name, correlation))
        except Exception as e:
            # Report the failure and continue with the remaining features
            print(f"Error calculating correlation for {feat_name}: {e}")

# Sort results by absolute value of correlation in descending order
results.sort(key=lambda x: abs(x[1]), reverse=True)

# Display the top 100 results
if results:
    print("\nTop 100 Engineered Features with Highest Correlation to Target:")
    for idx, (feature_name, correlation) in enumerate(results[:100], start=1):
        print(f"{idx}. Feature: {feature_name}, Correlation: {correlation:.4f}")
else:
    print("\nNo significant engineered features found with non-NaN correlation.")



In [None]:
# Creates engineered features by multiplying and dividing pairs of columns and
# lists them by lowest absolute correlation with the target

import pandas as pd
from itertools import combinations

# Load the training data
training_data = pd.read_csv(training_data_path, na_values=['#NUM!', '#DIV/0!'])

# Drop rows with NaN values
training_data.dropna(inplace=True)

# Target column
target_column = 'Gamma'

# Columns to exclude from consideration for feature engineering
excluded_columns = [target_column, 'Well', 'Depth', 'Hg', 'Bi', 'Nb', 'Nd', 'Pr', 'W', 'Cd', 'La', 'Sp', 'Ag', 'LE']

# Keep only columns that take more than one distinct value
all_features = [
    col for col in training_data.columns
    if col not in excluded_columns and training_data[col].nunique() > 1
]

# Target variable
y = training_data[target_column]

# Divisor eligibility per column (True when the column holds no zeros),
# evaluated once up front rather than for every pair
has_no_zero = {col: bool((training_data[col] != 0).all()) for col in all_features}

# (engineered feature name, correlation with the target)
results = []

# Iterate through combinations of two features
for feature1, feature2 in combinations(all_features, 2):
    first = training_data[feature1]
    second = training_data[feature2]

    # Product first, then each quotient whose divisor column has no zeros
    engineered_features = {f"{feature1}*{feature2}": first * second}
    if has_no_zero[feature2]:
        engineered_features[f"{feature1}/{feature2}"] = first / second
    if has_no_zero[feature1]:
        engineered_features[f"{feature2}/{feature1}"] = second / first

    # Evaluate correlation with the target
    for feat_name, feat_values in engineered_features.items():
        try:
            correlation = feat_values.corr(y)
            if not pd.isna(correlation):  # Skip if correlation is NaN
                results.append((feat_name, correlation))
        except Exception as e:
            # Report the failure and continue with the remaining features
            print(f"Error calculating correlation for {feat_name}: {e}")

# Sort results by absolute value of correlation in ascending order
results.sort(key=lambda x: abs(x[1]))

# Display the 100 least correlated features
if results:
    print("\nTop 100 Engineered Features with Lowest Correlation to Target:")
    for idx, (feature_name, correlation) in enumerate(results[:100], start=1):
        print(f"{idx}. Feature: {feature_name}, Correlation: {correlation:.4f}")
else:
    print("\nNo significant engineered features found with non-NaN correlation.")


In [None]:
#Checks every combination of features not on the exclusion list, in batches of 10000 combinations
#Very time-consuming and memory-intensive

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from itertools import combinations
from tqdm.notebook import tqdm
import psutil

# Function to monitor memory usage
def check_memory_limit(limit_gb):
    """Abort with MemoryError once used RAM is above ``limit_gb`` * 1024**3 bytes.

    The figure checked is the ``used`` field of ``psutil.virtual_memory()``.
    """
    used_bytes = psutil.virtual_memory().used
    ceiling_bytes = limit_gb * 1024**3
    if used_bytes > ceiling_bytes:
        raise MemoryError(f"Memory usage exceeded {limit_gb} GB.")

from itertools import islice

# Resource settings
MEMORY_LIMIT_GB = 12   # Ceiling passed to check_memory_limit; adjust as needed
batch_size = 10000     # Number of feature sets evaluated between memory checks
num_processes = 30     # Not referenced in this cell
TOP_N = 100            # Number of best equations kept and printed

# Defined here so the cell does not depend on a value left by an earlier cell
mandatory_features = []

# Load the training data
training_data = pd.read_csv(training_data_path, na_values=['#NUM!', '#DIV/0!'])

# Drop rows with NaN values
training_data.dropna(inplace=True)

# Store float64 columns as float32 to halve their memory footprint
for col in training_data.select_dtypes(include='float64').columns:
    training_data[col] = training_data[col].astype('float32')

# Candidate features: every column that is not the target, a mandatory feature
# or one of the columns listed in excluded_columns
target_column = 'Gamma'
excluded_columns = mandatory_features + [target_column, 'Well', 'Depth','Hg', 'Bi','Nb','Nd', 'Pr','W','Cd','La','Sp','Ag','LE']
features = [col for col in training_data.columns if col not in excluded_columns]

# Model to evaluate (refitted for every feature set)
model = LinearRegression()

# Best results so far: (feature_set, r2, coefficients, intercept).
# The coefficients are stored with each result because `model` is overwritten
# by every later fit.
top_equations = []

# Every non-empty combination of the candidate features, generated lazily
combination_generator = (
    combo for r in range(1, len(features) + 1) for combo in combinations(features, r)
)

# Target variable (the same for every feature set)
y = training_data[target_column]

# Pull the combinations in chunks of batch_size
progress = iter(tqdm(combination_generator, desc="Evaluating feature sets"))
while True:
    batch = list(islice(progress, batch_size))
    if not batch:
        break

    # Check memory usage before working on the chunk
    check_memory_limit(MEMORY_LIMIT_GB)

    for feature_set in batch:
        X = training_data[list(feature_set)]

        # test_size=0.7: fit on 30% of the rows, score on the remaining 70%
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=42)

        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        r2 = r2_score(y_test, predictions)

        top_equations.append((feature_set, r2, model.coef_.copy(), float(model.intercept_)))

    # Keep only the best TOP_N so the list does not grow with the number of
    # combinations; the sort is stable, so ties keep their generation order
    top_equations.sort(key=lambda x: x[1], reverse=True)
    del top_equations[TOP_N:]

# Display the best equations with the coefficients of their own fit
print(f"\nTop {TOP_N} Equations with R2 Scores:")
for idx, (feature_set, r2, coefficients, intercept) in enumerate(top_equations, start=1):
    equation = " + ".join(f"({coef:.4f} * {name})" for coef, name in zip(coefficients, feature_set))
    print(f"Equation {idx}: R2 Score = {r2:.4f}")
    print(f"Gamma = {equation} + {intercept:.4f}\n")


In [None]:
"""Compares the Mandatory features plus 1 additional feature and produces a linear equasion with no intercept
Add features you want to include to the mandatory_features list.
Add features you want to exclude to the excluded_columns list."""

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Mandatory features to include
mandatory_features = []

# Load the training data
training_data = pd.read_csv(training_data_path, na_values=['#NUM!', '#DIV/0!'])

# Drop rows with NaN values
training_data.dropna(inplace=True)

# Store float64 columns as float32 to halve their memory footprint
for col in training_data.select_dtypes(include='float64').columns:
    training_data[col] = training_data[col].astype('float32')

# Candidate features: every column that is not the target, a mandatory feature
# or one of the columns listed in excluded_columns
target_column = 'Gamma'
excluded_columns = mandatory_features + [target_column, 'Well', 'Depth','Hg', 'Bi','Nb','Nd', 'Pr','W','Cd','La','Sp','Ag','LE']
all_features = [col for col in training_data.columns if col not in excluded_columns]

# One no-intercept estimator, refitted for every feature set
model = LinearRegression(fit_intercept=False)

# Target variable
y = training_data[target_column]

# One tuple per evaluated feature: (feature_set, equation, r2)
results = []

for feature in all_features:
    # Combine mandatory features with the current feature
    feature_set = mandatory_features + [feature]
    X = training_data[feature_set]

    # test_size=0.7: fit on 30% of the rows, score on the remaining 70%
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=42)

    # Fit and score on the held-out rows
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    r2 = r2_score(y_test, predictions)

    # Human-readable form of the fitted equation
    coefficients = model.coef_
    terms = [f"({coef:.4f} * {name})" for coef, name in zip(coefficients, feature_set)]
    equation = "Gamma = " + " + ".join(terms)
    results.append((feature_set, equation, r2))

# Sort results by R2 score, best first
results.sort(key=lambda x: x[2], reverse=True)

# Display results
print("\nEquations with Mandatory Features and One Additional Feature (Sorted by R2):")
for idx, (feature_set, equation, r2) in enumerate(results, start=1):
    print(f"\nEquation {idx}:")
    print(f"Features: {feature_set}")
    print(f"R2 Score: {r2:.4f}")
    print(f"{equation}")


In [None]:
"""Compares the Mandatory features plus a set number of features and produces a linear equasion with an intercept

running combinations of more than 4 can greatly increase run time

Add features you want to include to the mandatory_features list.
Add features you want to exclude to the excluded_columns list."""

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from itertools import combinations

# Mandatory features to include
mandatory_features = []

# Number of candidate features added to the mandatory ones in each equation.
# More than 4 can greatly increase run time.
N_ADDITIONAL_FEATURES = 4

# Load the training data
training_data = pd.read_csv(training_data_path, na_values=['#NUM!', '#DIV/0!'])

# Drop rows with NaN values
training_data.dropna(inplace=True)

# Store float64 columns as float32 to halve their memory footprint
for col in training_data.select_dtypes(include='float64').columns:
    training_data[col] = training_data[col].astype('float32')

# Candidate features: every column that is not the target, a mandatory feature
# or one of the columns listed in excluded_columns
target_column = 'Gamma'
excluded_columns = mandatory_features + [target_column, 'Well', 'Depth','Hg', 'Bi','Nb','Nd', 'Pr','W','Cd','La','Sp','Ag','LE']
features = [col for col in training_data.columns if col not in excluded_columns]

# Initialize the model with an intercept (refitted for every feature set)
model = LinearRegression(fit_intercept=True)

# Target variable
y = training_data[target_column]

# One tuple per evaluated combination: (feature_set, equation, r2)
results = []

# Iterate over the list built in this cell rather than a variable left behind
# by another cell
for feature_combo in combinations(features, N_ADDITIONAL_FEATURES):
    # Combine mandatory features with the current combination
    feature_set = mandatory_features + list(feature_combo)
    X = training_data[feature_set]

    # test_size=0.7: fit on 30% of the rows, score on the remaining 70%
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=42)

    # Fit and score on the held-out rows
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    r2 = r2_score(y_test, predictions)

    # Human-readable form of the fitted equation, intercept last
    coefficients = model.coef_
    terms = [f"({coef:.4f} * {name})" for coef, name in zip(coefficients, feature_set)]
    equation = "Gamma = " + " + ".join(terms) + f" + ({model.intercept_:.4f})"
    results.append((feature_set, equation, r2))

# Sort results by R2 score, best first
results.sort(key=lambda x: x[2], reverse=True)

# Display results
print("\nEquations with Mandatory Features and Additional Features (With Intercept, Sorted by R2):")
for idx, (feature_set, equation, r2) in enumerate(results, start=1):
    print(f"\nEquation {idx}:")
    print(f"Features: {feature_set}")
    print(f"R2 Score: {r2:.4f}")
    print(f"{equation}")


In [None]:
"""Compares the Mandatory features plus a set number of features and produces a linear equasion with no intercept

running combinations of more than 4 can greatly increase run time

Add features you want to include to the mandatory_features list.
Add features you want to exclude to the excluded_columns list."""

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from itertools import combinations

# Mandatory features to include
mandatory_features = []

# Number of candidate features added to the mandatory ones in each equation.
# More than 4 can greatly increase run time.
N_ADDITIONAL_FEATURES = 4

# Load the training data
training_data = pd.read_csv(training_data_path, na_values=['#NUM!', '#DIV/0!'])

# Drop rows with NaN values
training_data.dropna(inplace=True)

# Store float64 columns as float32 to halve their memory footprint
for col in training_data.select_dtypes(include='float64').columns:
    training_data[col] = training_data[col].astype('float32')

# Candidate features: every column that is not the target, a mandatory feature
# or one of the columns listed in excluded_columns
target_column = 'Gamma'
excluded_columns = mandatory_features + [target_column, 'Well', 'Depth','Hg', 'Bi','Nb','Nd', 'Pr','W','Cd','La','Sp','Ag','LE']
features = [col for col in training_data.columns if col not in excluded_columns]

# Initialize the model without an intercept (refitted for every feature set)
model = LinearRegression(fit_intercept=False)

# Target variable, taken from the frame loaded in this cell
y = training_data[target_column]

# One tuple per evaluated combination: (feature_set, equation, r2)
results = []

# Iterate over the list built in this cell rather than a variable left behind
# by another cell
for feature_combo in combinations(features, N_ADDITIONAL_FEATURES):
    # Combine mandatory features with the current combination
    feature_set = mandatory_features + list(feature_combo)
    X = training_data[feature_set]

    # test_size=0.7: fit on 30% of the rows, score on the remaining 70%
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=42)

    # Fit and score on the held-out rows
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    r2 = r2_score(y_test, predictions)

    # Human-readable form of the fitted equation
    coefficients = model.coef_
    terms = [f"({coef:.4f} * {name})" for coef, name in zip(coefficients, feature_set)]
    equation = "Gamma = " + " + ".join(terms)
    results.append((feature_set, equation, r2))

# Sort results by R2 score, best first
results.sort(key=lambda x: x[2], reverse=True)

# Display results
print("\nEquations with Mandatory Features and Additional Features (No Intercept, Sorted by R2):")
for idx, (feature_set, equation, r2) in enumerate(results, start=1):
    print(f"\nEquation {idx}:")
    print(f"Features: {feature_set}")
    print(f"R2 Score: {r2:.4f}")
    print(f"{equation}")


In [None]:
"""Compares the Mandatory features plus a set number of features and produces a linear equasion with no intercept
Requested Overfitting model that uses the 100% of the dataset as training data

running combinations of more than 4 can greatly increase run time

Add features you want to include to the mandatory_features list.
Add features you want to exclude to the excluded_columns list."""

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from itertools import combinations

# Mandatory features to include
mandatory_features = []

# Number of candidate features added to the mandatory ones in each equation.
# More than 4 can greatly increase run time.
N_ADDITIONAL_FEATURES = 4

# Load the training data
training_data = pd.read_csv(training_data_path, na_values=['#NUM!', '#DIV/0!'])

# Drop rows with NaN values; no further NaN filtering is needed afterwards
training_data.dropna(inplace=True)

# Store float64 columns as float32 to halve their memory footprint
for col in training_data.select_dtypes(include='float64').columns:
    training_data[col] = training_data[col].astype('float32')

# Candidate features: every column that is not the target, a mandatory feature
# or one of the columns listed in excluded_columns
target_column = 'Gamma'
excluded_columns = mandatory_features + [target_column, 'Well', 'Depth', 'Hg', 'Bi', 'Nb', 'Nd', 'Pr', 'W', 'Cd', 'La', 'Sp', 'Ag', 'LE']
all_features = [col for col in training_data.columns if col not in excluded_columns]

# Initialize the model without an intercept (refitted for every feature set)
model = LinearRegression(fit_intercept=False)

# Target variable (the same for every feature set)
y = training_data[target_column]

# One tuple per evaluated combination: (feature_set, equation, r2)
results = []

for feature_combo in combinations(all_features, N_ADDITIONAL_FEATURES):
    # Combine mandatory features with the current combination
    feature_set = mandatory_features + list(feature_combo)
    X = training_data[feature_set]

    # Fit on every row and score on those same rows (deliberately no hold-out)
    model.fit(X, y)
    predictions = model.predict(X)
    r2 = r2_score(y, predictions)

    # Human-readable form of the fitted equation
    coefficients = model.coef_
    terms = [f"({coef:.5f} * {name})" for coef, name in zip(coefficients, feature_set)]
    equation = "Gamma = " + " + ".join(terms)
    results.append((feature_set, equation, r2))

# Sort results by R2 score, best first
results.sort(key=lambda x: x[2], reverse=True)

# Display only the top 100 results
print("\nTop 100 Equations with Mandatory Features and Additional Features (No Intercept, Sorted by R2):")
for idx, (feature_set, equation, r2) in enumerate(results[:100], start=1):
    print(f"\nEquation {idx}:")
    print(f"Features: {feature_set}")
    print(f"R2 Score: {r2:.4f}")
    print(f"{equation}")


In [None]:
"""Compares the features in all combinations and produces a linear equasion with an intercept
It checks them in batches to handle large datasets without crashing.

For datasets with many features it can take a long time to run.

Add features you want to exclude to the excluded_columns list."""


import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from itertools import combinations
from tqdm.notebook import tqdm
import psutil

# Function to monitor memory usage
def check_memory_limit(limit_gb):
    """Guard against running past a RAM budget.

    Compares ``psutil.virtual_memory().used`` with ``limit_gb`` * 1024**3
    bytes and raises MemoryError when the former is larger.
    """
    budget_bytes = limit_gb * 1024**3
    snapshot = psutil.virtual_memory()
    if snapshot.used > budget_bytes:
        raise MemoryError(f"Memory usage exceeded {limit_gb} GB.")

from itertools import islice

# Resource settings
MEMORY_LIMIT_GB = 18   # Ceiling passed to check_memory_limit
batch_size = 1000      # Number of feature sets evaluated between memory checks
TOP_N = 100            # Number of best equations kept and printed

# Load the training data
training_data = pd.read_csv(training_data_path, na_values=['#NUM!', '#DIV/0!'])

# Drop rows with NaN values
training_data.dropna(inplace=True)

# Store float64 columns as float32 to halve their memory footprint
for col in training_data.select_dtypes(include='float64').columns:
    training_data[col] = training_data[col].astype('float32')

# Candidate features: every column except the target and the columns listed
# in excluded_columns
target_column = 'Gamma'
excluded_columns = [target_column, 'Well', 'Depth', 'Hg', 'Bi', 'Nb', 'Nd', 'Pr', 'W', 'Cd', 'La', 'Sp', 'Ag', 'LE']
features = [col for col in training_data.columns if col not in excluded_columns]

# Model to evaluate (refitted for every feature set)
model = LinearRegression()

# Best results so far: (feature_set, r2, coefficients, intercept).
# The coefficients are stored with each result because `model` is overwritten
# by every later fit.
top_equations = []

# Every non-empty combination of the candidate features, generated lazily
combination_generator = (
    combo for r in range(1, len(features) + 1) for combo in combinations(features, r)
)

# Target variable (the same for every feature set)
y = training_data[target_column]

# Pull the combinations in chunks of batch_size
progress = iter(tqdm(combination_generator, desc="Evaluating feature sets"))
while True:
    batch = list(islice(progress, batch_size))
    if not batch:
        break

    # Check memory usage before working on the chunk
    check_memory_limit(MEMORY_LIMIT_GB)

    for feature_set in batch:
        X = training_data[list(feature_set)]

        # test_size=0.7: fit on 30% of the rows, score on the remaining 70%
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=42)

        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        r2 = r2_score(y_test, predictions)

        top_equations.append((feature_set, r2, model.coef_.copy(), float(model.intercept_)))

    # Keep only the best TOP_N so the list does not grow with the number of
    # combinations; the sort is stable, so ties keep their generation order
    top_equations.sort(key=lambda x: x[1], reverse=True)
    del top_equations[TOP_N:]

# Display the best equations with the coefficients of their own fit
print(f"\nTop {TOP_N} Equations with R2 Scores:")
for idx, (feature_set, r2, coefficients, intercept) in enumerate(top_equations, start=1):
    equation = " + ".join(f"({coef:.4f} * {name})" for coef, name in zip(coefficients, feature_set))
    print(f"Equation {idx}: R2 Score = {r2:.4f}")
    print(f"Gamma = {equation} + {intercept:.4f}\n")

In [None]:
"""Runs through the set number of itterations and produces a linear equasion with an intercept

Different random seeds will produce different results.

Add features you want to exclude to the excluded_columns list."""

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
import psutil
from tqdm.notebook import tqdm


def check_memory_limit(limit_gb):
    """Raise MemoryError when ``psutil.virtual_memory().used`` exceeds
    ``limit_gb`` * 1024**3 bytes."""
    if psutil.virtual_memory().used > limit_gb * 1024**3:
        raise MemoryError(f"Memory usage exceeded {limit_gb} GB.")


# Settings defined here so the cell does not depend on values left behind by
# other cells
MEMORY_LIMIT_GB = 18
batch_size = 1000  # Number of features fitted together in one model

# Load the training data
training_data = pd.read_csv(training_data_path, na_values=['#NUM!', '#DIV/0!'])

# Drop rows with NaN values
training_data.dropna(inplace=True)

# Candidate features: every column except the target and the columns listed
# in excluded_columns
target_column = 'Gamma'
excluded_columns = [target_column, 'Well', 'Depth', 'Hg', 'Bi', 'Nb', 'Nd', 'Pr', 'W', 'Cd', 'La', 'Sp', 'Ag', 'LE']
features = [col for col in training_data.columns if col not in excluded_columns]

# Store float64 columns as float32 to halve their memory footprint
for col in training_data.select_dtypes(include='float64').columns:
    training_data[col] = training_data[col].astype('float32')

# Model to evaluate using SGD (refitted for every batch of features)
model = SGDRegressor(max_iter=100000, tol=1e-3, random_state=42)

# One tuple per batch: (feature names, r2, raw-unit coefficients, raw-unit intercept)
top_equations = []

# Target variable (the same for every batch)
y = training_data[target_column]

for batch_start in tqdm(range(0, len(features), batch_size), desc="Evaluating feature sets"):
    batch_features = features[batch_start:batch_start + batch_size]
    X = training_data[batch_features]

    # Split first so the scaler only sees training rows.
    # test_size=0.7: fit on 30% of the rows, score on the remaining 70%.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=42)

    # Standardize using statistics of the training rows only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train the model using SGD and score it on the held-out rows
    model.fit(X_train_scaled, y_train)
    predictions = model.predict(X_test_scaled)
    r2 = r2_score(y_test, predictions)

    # The model was fitted on (x - mean) / scale, so convert its parameters
    # back to the units of the original columns before reporting them:
    #   c * (x - mean) / scale + b  ==  (c / scale) * x + (b - sum(c * mean / scale))
    raw_coefficients = model.coef_ / scaler.scale_
    raw_intercept = model.intercept_[0] - np.dot(raw_coefficients, scaler.mean_)

    # Store the parameters with the result; `model` is overwritten by later fits
    top_equations.append((batch_features, r2, raw_coefficients, raw_intercept))

    # Check memory usage during batch processing
    check_memory_limit(MEMORY_LIMIT_GB)

# Sort the equations by R2 score in descending order
top_equations.sort(key=lambda x: x[1], reverse=True)

# Display the best-scoring equation
print("\nEquision with R2 Score:")
for idx, (feature_set, r2, coefficients, intercept) in enumerate(top_equations[:1], start=1):
    equation = " + ".join(f"({coef:.4f} * {name})" for coef, name in zip(coefficients, feature_set))
    print(f"Equation {idx}: R2 Score = {r2:.4f}")
    print(f"Gamma = {equation} + {intercept:.4f}\n")


