In [2]:
from scipy import stats
import pandas as pd

# Load the training and test splits from CSV
training_data = pd.read_csv('combined-training.csv')
test_data = pd.read_csv('combined-test.csv')

# Column-wise z-scores of the columns used as model inputs
feature_columns = ['x1', 'x4', 'x5', 'x6', 'x7']
z_scores = stats.zscore(training_data[feature_columns])

# A row counts as an outlier when any of its features has |z| greater than 3
exceeds_limit = abs(z_scores) > 3
outliers = exceeds_limit.any(axis=1)

# Keep only the training rows that were not flagged
cleaned_data = training_data.loc[~outliers]



In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Input columns shared by the training and test splits
feature_columns = ['x1', 'x4', 'x5', 'x6', 'x7']

# Training split: the outlier-filtered rows held in cleaned_data
X_train, Y_train = cleaned_data[feature_columns], cleaned_data['BIS']

# Test split: used as loaded, with no outlier filtering applied
X_test, Y_test = test_data[feature_columns], test_data['BIS']

# Build the random forest and fit it in one step (fit() returns the estimator itself)
model = RandomForestRegressor(
    max_depth=10,
    min_samples_leaf=20,
    random_state=42,
).fit(X_train, Y_train)

# Predict BIS for the test rows
Y_pred = model.predict(X_test)

# Score the predictions against the true test values
r2 = r2_score(Y_test, Y_pred)
mse = mean_squared_error(Y_test, Y_pred)

print(f'R-squared: {r2:.4f}')
print(f'Mean Squared Error: {mse:.4f}')

# Importances are reported in the same order as feature_columns
feature_importance = model.feature_importances_
print(f'Feature Importance: {feature_importance}')

R-squared: 0.8006
Mean Squared Error: 82.1019
Feature Importance: [0.10401986 0.5443496  0.06775944 0.06054052 0.22333057]


In [4]:
from scipy.stats import pearsonr

# pearsonr returns the correlation coefficient first, followed by the p-value
correlation = pearsonr(Y_test, Y_pred)
pearson_r = correlation[0]
print(f"Pearson correlation for Random Forest Model: {pearson_r}")

Pearson correlation for Random Forest Model: 0.8995091520254755


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from scipy import stats

# File locations (adjust to wherever the CSVs live)
training_file_path, test_file_path = 'combined-training.csv', 'combined-test.csv'

# Only the training split is read in this cell
training_data = pd.read_csv(training_file_path)

# Target column and model inputs
target_variable = 'BIS'
selected_features = ['x1', 'x4', 'x5', 'x6', 'x7']

# Z-score outlier filter: a row is kept only when every checked feature
# has |z| strictly below the threshold
threshold = 3
features_to_check = ['x1', 'x4', 'x5', 'x6', 'x7']
z_scores = stats.zscore(training_data[features_to_check])
within_threshold = abs(z_scores) < threshold
filtered_indices = within_threshold.all(axis=1)
training_data_cleaned = training_data[filtered_indices]

# Split the cleaned frame into inputs and target
X_train, y_train = (
    training_data_cleaned[selected_features],
    training_data_cleaned[target_variable],
)

# Random forest with default hyperparameters and a fixed seed
rf_model_cv = RandomForestRegressor(random_state=42)

# 5-fold cross-validation scored with R^2
cv_scores = cross_val_score(rf_model_cv, X_train, y_train, scoring='r2', cv=5)

# Report the per-fold scores followed by their mean and spread
for label, value in (
    ("Cross-Validation R^2 Scores: ", cv_scores),
    ("Mean R^2 Score: ", cv_scores.mean()),
    ("Standard Deviation of R^2 Scores: ", cv_scores.std()),
):
    print(label, value)


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from scipy import stats

# Hyperparameter tuning: grid-search a random forest on the outlier-filtered
# training data with 5-fold cross-validation, then score the best estimator
# on the test split.

# Load the datasets (Update the file paths accordingly)
training_file_path = 'combined-training.csv'
test_file_path = 'combined-test.csv'

# Reading the datasets
training_data = pd.read_csv(training_file_path)
test_data = pd.read_csv(test_file_path)

# Features and target variable
selected_features = ['x1', 'x4', 'x5', 'x6', 'x7']
target_variable = 'BIS'

# Clean the training data (removing outliers using Z-score method).
# The outlier check runs on exactly the model's input columns, so the list is
# derived from selected_features instead of being typed out a second time.
features_to_check = list(selected_features)
z_scores = stats.zscore(training_data[features_to_check])
threshold = 3
# Keep a row only when every checked feature has |z| strictly below the threshold
filtered_indices = (abs(z_scores) < threshold).all(axis=1)
training_data_cleaned = training_data[filtered_indices]

# Separate features and target in the training data
X_train = training_data_cleaned[selected_features]
y_train = training_data_cleaned[target_variable]

# Separate features and target in the test data (test rows are not outlier-filtered)
X_test = test_data[selected_features]
y_test = test_data[target_variable]

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [10, 20, None],  # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required at a leaf node
    # Number of features to consider when looking for the best split.
    # 1.0 (all features) replaces the former 'auto' option: 'auto' was deprecated in
    # scikit-learn 1.1 and removed in 1.3, where it fails parameter validation. For
    # RandomForestRegressor, 'auto' meant "use every feature", which 1.0 expresses.
    # NOTE(review): with 5 input features 'sqrt' and 'log2' should both resolve to
    # 2 features per split, so one of them is likely redundant - confirm before pruning.
    'max_features': [1.0, 'sqrt', 'log2']
}

# Initialize the RandomForestRegressor model
rf_model = RandomForestRegressor(random_state=42)

# Perform grid search with cross-validation (n_jobs=-1 uses all available cores)
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='r2', n_jobs=-1, verbose=2)

# Fit the model with the training data
grid_search.fit(X_train, y_train)

# Get the best parameters from the grid search
best_params = grid_search.best_params_
print("Best Parameters from Grid Search: ", best_params)

# Evaluate the tuned model on the test data
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE) on Test Data: {mse}")
print(f"R² Score on Test Data: {r2}")
