In [2]:
from scipy import stats
import pandas as pd

# Columns screened for outliers (the same five columns are used as model inputs further down).
FEATURE_COLUMNS = ['x1', 'x4', 'x5', 'x6', 'x7']
# A row counts as an outlier when any screened column lies more than this many
# standard deviations away from that column's mean.
Z_SCORE_THRESHOLD = 3

# Load the training and test splits from CSV files in the working directory.
training_data = pd.read_csv('combined-training.csv')
test_data = pd.read_csv('combined-test.csv')

#Calculating z-scores
# nan_policy='omit' computes each column's mean/std from its non-missing values.
# With the default ('propagate'), one NaN in a column turns every z-score of that
# column into NaN, the comparison below is then always False, and outlier
# detection for that column is silently switched off.
z_scores = stats.zscore(training_data[FEATURE_COLUMNS], nan_policy='omit')

#Detecting outliers
# Boolean mask with one entry per training row: True if ANY screened column exceeds
# the threshold. NaN z-scores compare as False, so rows with missing values are kept.
outliers = (abs(z_scores) > Z_SCORE_THRESHOLD).any(axis=1)

#Removing outliers (result is kept in memory only; nothing is written to disk)
# Only the training split is filtered; test_data is left untouched.
cleaned_data = training_data[~outliers]



In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Predictor columns shared by both splits, and the regression target column.
feature_columns = ['x1', 'x4', 'x5', 'x6', 'x7']
target_column = 'BIS'

# Training split: rows of cleaned_data (produced by the outlier-filtering cell).
X_train = cleaned_data.loc[:, feature_columns]
Y_train = cleaned_data.loc[:, target_column]

# Test split: taken from test_data as loaded, with the same columns.
X_test = test_data.loc[:, feature_columns]
Y_test = test_data.loc[:, target_column]

# Random forest regressor; random_state is fixed so repeated runs give the same model.
model = RandomForestRegressor(
    random_state=42,
    min_samples_leaf=20,
    max_depth=10,
)

# Fit on the training split, then predict BIS for every test row.
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)

# Score the predictions against the true test-set BIS values.
r2 = r2_score(Y_test, Y_pred)
mse = mean_squared_error(Y_test, Y_pred)

# Per-feature importances, in the same order as feature_columns.
feature_importance = model.feature_importances_

# Report: one line per result, in the original order.
print(
    f'R-squared: {r2:.4f}',
    f'Mean Squared Error: {mse:.4f}',
    f'Feature Importance: {feature_importance}',
    sep='\n',
)

R-squared: 0.8006
Mean Squared Error: 82.1019
Feature Importance: [0.10401986 0.5443496  0.06775944 0.06054052 0.22333057]
