## Import libraries

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the raw data from the CSV file and preview its first rows.
dataset = pd.read_csv('DATAPOINT.csv')
dataset.head()

In [None]:
# Count how many rows of the raw data fall into each value of "class".
pd.crosstab(index=dataset["class"],columns="count")

In [21]:
# Remove the listed columns from the working frame.
# NOTE(review): several names carry a trailing space - presumably they mirror
# the CSV header exactly; confirm before "cleaning" them up.
dataset2=dataset.drop(["Label","Vy ","Dy ","Dm ","Vu","Du "], axis=1)

In [None]:
# Preview the frame after the column removal.
dataset2.head()

In [23]:
# Keep only the rows that have no missing value in any remaining column.
dataset3=dataset2.dropna()

In [None]:
# Class counts after the incomplete rows were removed.
pd.crosstab(index=dataset3["class"],columns="count")

In [None]:
# Summary statistics of the cleaned data.
dataset3.describe()

## Correlation Matrix

In [None]:
# The font must be configured before the heatmap is drawn: text artists pick
# up the font family when they are created, not when the figure is shown.
plt.rcParams['font.family'] = 'Times New Roman'

# Correlate numeric (and boolean) columns only. pandas >= 2.0 no longer drops
# non-numeric columns silently, so calling corr() on a frame that still holds
# a non-numeric column would raise instead of being ignored.
numeric_data = dataset3.select_dtypes(include=["number", "bool"])
corr_matrix = numeric_data.corr()
corr_matrix2 = np.round(corr_matrix, 2)

plt.subplots(figsize=(12, 12))
sns.heatmap(corr_matrix2, annot=True, cmap='plasma')
plt.yticks(rotation=0)
plt.show()

In [None]:
# Configure the font before any text is created so the labels below use it.
plt.rcParams['font.family'] = 'Times New Roman'

# Work on the non-missing values only. The raw frame has not been through
# dropna(); np.median does not skip NaN the way the pandas-backed
# min/max/mean do, and NaN values break the automatic bin range of plt.hist.
ts_values = dataset['ts '].dropna()

plt.hist(ts_values, bins=10, color='red')
min_val = np.min(ts_values)
max_val = np.max(ts_values)
mean_val = np.mean(ts_values)
median_val = np.median(ts_values)

# Write the summary statistics inside the plot using axes-fraction coordinates.
axes = plt.gca()
stat_labels = (
    (0.9, f'Min: {min_val:.2f}'),
    (0.85, f'Max: {max_val:.2f}'),
    (0.7, f'Mean: {mean_val:.2f}'),
    (0.75, f'Median: {median_val:.2f}'),
)
for y_pos, caption in stat_labels:
    plt.text(0.5, y_pos, caption, transform=axes.transAxes, fontsize=12, color='black')
plt.grid(alpha=0)

plt.xlabel("ts", fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.show()

In [None]:
# One box plot per column of the cleaned data, each in its own figure.
for column in dataset3.columns:
    plt.figure()
    column_values = dataset3[column]
    sns.boxplot(x=column_values)

## Train & Test Split

In [29]:
# Two stratified 80/20 shuffles of the cleaned data, stratified on "class".
# Each entry of stratified_split is a [train_frame, test_frame] pair.
sss = StratifiedShuffleSplit(n_splits=2, test_size=0.2, random_state=40)
stratified_split = [
    [dataset3.iloc[train_i], dataset3.iloc[test_i]]
    for train_i, test_i in sss.split(dataset3, dataset3["class"])
]
# Keep the frames of the last split available under their original names.
stratified_train_set_n, stratified_test_set_n = stratified_split[-1]

In [None]:
# Print the training part of every split. The loop variables stay bound after
# the loop, so train_set / test_set refer to the last split from here on.
for split_no, (train_set, test_set) in enumerate(stratified_split, start=1):
    print(f"Train set {split_no}:")
    print(train_set)

In [None]:
# Display the training frame left bound by the preceding loop (the last split).
train_set

In [None]:
# Display the test frame left bound by the preceding loop (the last split).
test_set

In [None]:
# Share of each class in the test set (class count divided by number of rows).
test_set["class"].value_counts()/len(test_set)

In [None]:
# Share of each class in the training set, for comparison with the test set.
train_set["class"].value_counts()/len(train_set)

In [None]:
# "class" was only needed for stratification; remove it from the training data.
train_set = train_set.drop("class", axis=1)

In [44]:
# Remove the stratification column from the test data as well.
test_set = test_set.drop("class", axis=1)

In [45]:
# Separate the training frame into the "Vm " target and the predictor columns.
train_target = train_set["Vm "]
train_features = train_set.drop(columns="Vm ")

In [46]:
# Separate the test frame into the "Vm " target and the predictor columns.
test_target = test_set["Vm "]
test_features = test_set.drop(columns="Vm ")

In [None]:
# Show the test target as a single-column DataFrame.
test_target.to_frame()

## Scaling & Transformation

In [49]:
from sklearn.compose import TransformedTargetRegressor

In [51]:
# Min-max scaler instance; it is fitted on the training features further down.
MMS = MinMaxScaler()

In [54]:
# Fit the feature scaler on the training features only and scale them.
# MMS stays fitted on the features so it can later transform the test features.
train_values = MMS.fit_transform(train_features)

# Random forest trained on a min-max scaled target. The target gets its own,
# fresh scaler instead of the feature scaler instance: TransformedTargetRegressor
# clones whatever transformer it is given, so sharing MMS worked only by virtue
# of that cloning and read as if the features' scaling were applied to the target.
model = TransformedTargetRegressor(
    regressor=RandomForestRegressor(
        n_estimators=100,
        max_depth=None,
        min_samples_split=2,
        random_state=40,
    ),
    transformer=MinMaxScaler(),
)

In [55]:
# Train on the scaled training features; the target is passed as a
# single-column DataFrame rather than a Series.
model.fit(train_values,
          train_target.to_frame())

## Train Prediction

In [None]:
# In-sample predictions: predict on the same scaled features used for fitting.
Pred_model = model.predict(train_values)
Pred_model

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score

# Side-by-side table of actual vs predicted training targets.
if isinstance(train_target, pd.DataFrame):
    train_target_df = train_target
else:
    train_target_df = train_target.to_frame()
pred_model_df = pd.DataFrame(Pred_model, columns=['Predicted'])

# The original index is discarded so both columns line up by position.
results = pd.concat(
    [train_target_df.reset_index(drop=True), pred_model_df],
    axis=1,
)

# Lift the row display limit so the whole table is printed.
pd.set_option('display.max_rows', None)
print(results)

In [51]:
# R^2 of the in-sample predictions, stored as the only entry of r2_scores.
r2 = r2_score(train_target, Pred_model)
r2_scores = [r2]

In [None]:
# Report every stored R^2 value together with its position in the list.
for i in range(len(r2_scores)):
    r2 = r2_scores[i]
    print(f"R^2 score for split {i}: {r2}")

In [None]:
# Mean of the stored R^2 values.
average_r2 = np.mean(r2_scores)

print("Average R^2 Score:", average_r2)

In [54]:
# Error metrics of the in-sample (training) predictions.
MSE = mean_squared_error(train_target, Pred_model)
MAE = mean_absolute_error(train_target, Pred_model)
R2 = r2_score(train_target, Pred_model)
# Root of the MSE. The `squared=False` switch of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6, so the root is taken
# explicitly to keep the cell working on every version.
RMS = np.sqrt(MSE)

In [None]:
# Print the training metrics rounded to two decimals.
print("MSE: {:.2f}".format(MSE),"MAE: {:.2f}".format(MAE),"R2: {:.2f}".format(R2),"RMS: {:.2f}".format(RMS))

In [None]:
# Bar chart of the most recently computed RMS, MAE and R2 values.
metric_by_label = {'RMS': RMS, 'MAE': MAE, 'R2': R2}
labels = list(metric_by_label.keys())
values = list(metric_by_label.values())

plt.bar(labels, values, color=['red', 'green', 'blue'])
plt.title('Model Evaluation Metrics')
plt.ylabel('Metric Value')

# Write each value just above the top of its bar.
for i in range(len(values)):
    value = values[i]
    plt.text(i, value + 0.01, f'{value:.4f}', ha='center')
plt.show()

## Test Prediction

In [None]:
# Scale the test features with the scaler fitted on the training features
# (transform only, no refit).
test_values = MMS.transform(test_features)
test_values

In [None]:
# Out-of-sample predictions on the scaled test features.
Pred_model_for_test = model.predict(test_values)
Pred_model_for_test 

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score

# Side-by-side table of actual vs predicted test targets.
if isinstance(test_target, pd.DataFrame):
    test_target_df = test_target
else:
    test_target_df = test_target.to_frame()
pred_model_for_test_df = pd.DataFrame(Pred_model_for_test, columns=['Predicted'])

# The original index is discarded so both columns line up by position.
results = pd.concat(
    [test_target_df.reset_index(drop=True), pred_model_for_test_df],
    axis=1,
)

# Lift the row display limit so the whole table is printed.
pd.set_option('display.max_rows', None)
print(results)

In [60]:
# Error metrics of the test predictions (these overwrite the training values
# held in MSE / MAE / R2 / RMS).
MSE = mean_squared_error(test_target, Pred_model_for_test)
MAE = mean_absolute_error(test_target, Pred_model_for_test)
R2 = r2_score(test_target, Pred_model_for_test)
# Root of the MSE. The `squared=False` switch of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6, so the root is taken
# explicitly to keep the cell working on every version.
RMS = np.sqrt(MSE)

In [None]:
# Print the test MAE, R2 and RMS rounded to two decimals.
print("MAE: {:.2f}".format(MAE),
      "R2: {:.2f}".format(R2),"RMS: {:.2f}".format(RMS))

In [None]:
# Print all four test metrics rounded to two decimals.
print("MSE: {:.2f}".format(MSE),"MAE: {:.2f}".format(MAE),"R2: {:.2f}".format(R2),"RMS: {:.2f}".format(RMS))

In [74]:
# Positional x coordinates (0 .. n-1), one per test sample, for the line plot.
x_values = range(len(test_target))

## Graph

In [None]:
# Actual and predicted test values drawn against the sample position.
actual_style = dict(color='black', label='Actual', linewidth=1)
predicted_style = dict(color='red', label='Predicted', linestyle='-.', linewidth=2)

plt.plot(x_values, test_target, **actual_style)
# No x given: the predictions are drawn against their own positions 0 .. n-1.
plt.plot(Pred_model_for_test, **predicted_style)

plt.grid(alpha=0.3)
plt.xlabel('Experiment')
plt.ylabel('Vm (kN)')
plt.legend()
plt.show()

In [None]:
# Measured vs predicted scatter for both sets, with a dashed 1:1 reference line.
plt.figure(figsize=(5,5))
plt.scatter(test_target, Pred_model_for_test, label='Test set', edgecolor='orange', c='none')
plt.scatter(train_target, Pred_model, label='Train set', c='none', edgecolor='green')

# Flatten everything to 1-D so the extrema can be taken over plain arrays.
test_target_flat = np.ravel(test_target)
train_target_flat = np.ravel(train_target)
Pred_model_for_test_flat = np.ravel(Pred_model_for_test)
Pred_model_flat = np.ravel(Pred_model)

# The 1:1 line must span every plotted point. Previously only the test
# predictions and the training targets were considered, so the line could
# stop short of the test targets or the training predictions.
all_plotted = np.concatenate([
    test_target_flat,
    train_target_flat,
    Pred_model_for_test_flat,
    Pred_model_flat,
])
p1 = all_plotted.max()
p2 = all_plotted.min()
plt.plot([p1, p2], [p1, p2], 'b--')

plt.xlabel('Measured shear strength (kN)', fontsize=10)
plt.ylabel('Predicted shear strength (kN)', fontsize=10)
plt.axis('equal')
plt.legend()
plt.show()

## Cross-Validation

In [105]:
from sklearn.model_selection import cross_val_score

In [106]:
# 12-fold cross-validation on the training data, scored by negative MSE
# (negated so that larger is better).
# NOTE(review): train_values were scaled with a scaler fitted on the whole
# training set before the folds are formed - confirm this is acceptable.
scores = cross_val_score(model, train_values, train_target, cv=12, scoring='neg_mean_squared_error')

In [107]:
# Same 12-fold cross-validation, scored by negative MAE.
scores1 = cross_val_score(model, train_values, train_target, cv=12, scoring='neg_mean_absolute_error')

In [108]:
# Same 12-fold cross-validation, scored by R^2.
scores2 = cross_val_score(model, train_values, train_target, cv=12, scoring='r2')

In [None]:
# Per-fold negative MSE values.
print("All Scores:", scores)

In [None]:
# Per-fold negative MAE values.
print("All Scores:", scores1)

In [None]:
# Per-fold R^2 values.
print("All Scores:", scores2)

In [None]:
# The scorer returns negative MSE, so the sign is flipped before reporting.
print("Cross-validation Mean MSE: {:.2f}".format(-scores.mean()))

In [None]:
# The scorer returns negative MAE, so the sign is flipped before reporting.
print("Cross-validation Mean MAE: {:.2f}".format(-scores1.mean()))

In [None]:
# Mean R^2 over the folds (no sign flip needed for this scorer).
print("Cross-validation R2: {:.2f}".format(scores2.mean()))

## Feature Selection

In [116]:
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestRegressor

In [118]:
# Stand-alone random forest with the same hyper-parameters as the one wrapped
# in `model`, but without the target scaling wrapper.
rf_regressor = RandomForestRegressor(n_estimators=100,max_depth=None,min_samples_split=2,random_state=40)

In [119]:
# Recursive feature elimination with 4-fold cross-validation: one feature is
# removed per step and candidates are scored by negative MSE; n_jobs=-1 asks
# for all available cores.
selector = RFECV(estimator=rf_regressor, step=1, cv=4, scoring='neg_mean_squared_error', n_jobs=-1)


In [120]:
# Run the feature elimination on the scaled training features and raw target.
selector.fit(train_values, train_target)

In [None]:
# Number of features kept by the cross-validated elimination.
print("Optimal number of features selected:", selector.n_features_)

In [None]:
# Boolean mask over the feature columns: True marks a selected feature.
print("Mask of selected features:", selector.support_)

## Feature Importance

In [123]:
# Fit the stand-alone forest so its feature importances can be read below.
rf_regressor.fit(train_values, train_target)

In [124]:
# Wrap the scaled feature array in a DataFrame and restore the feature names.
# The scaler returns a bare array in the same column order as train_features;
# without the names, everything derived from this frame (e.g. the importance
# chart) would be labelled 0, 1, 2, ... instead of by feature.
train_values_df = pd.DataFrame(train_values, columns=train_features.columns)

columns = train_values_df.columns

In [None]:
# Configure the font before any label is created so this figure uses it.
plt.rcParams['font.family'] = 'Times New Roman'

# Importance of every feature, sorted from most to least important.
importances = rf_regressor.feature_importances_
columns = train_values_df.columns
importance_df = pd.DataFrame(
    importances, index=columns, columns=['Importance']
).sort_values(by='Importance', ascending=False)

plt.figure(figsize=(5.5,4))
sns.barplot(x=importance_df.index, y='Importance', data=importance_df, color="yellow")
plt.xlabel('Feature', fontsize=18)
plt.ylabel('Importance', fontsize=18)
plt.title('Random Forest', fontsize=18)

# Keep the 0-0.5 range by default, but extend it when a single feature
# dominates so that its bar and annotation are not cut off.
plt.ylim(0, max(0.5, 1.1 * importances.max()))

# Annotate every bar with its height, centred just above the bar top.
axes = plt.gca()
for rect in axes.patches:
    height = rect.get_height()
    axes.annotate(f'{height:.2f}', xy=(rect.get_x() + rect.get_width() / 2, height),
                  xytext=(0, 0.4), fontsize=10, textcoords="offset points", ha='center', va='bottom')

# Tick font sizes are set once; they do not depend on the individual bars.
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.show()

In [126]:
from sklearn.model_selection import GridSearchCV,KFold
from sklearn.datasets import load_iris

In [127]:
# Data for the hyper-parameter search: scaled training features and target.
# (The iris dataset that used to be loaded here was never used.)
X, y = train_values, train_target

In [128]:
# Hyper-parameter values explored by the grid search (3 x 3 x 3 combinations).
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5, 10],
)

In [129]:
# Shuffled 5-fold splitter with a fixed seed so the folds are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [130]:
# Exhaustive search over param_grid for the stand-alone forest, scored by R^2
# on the folds produced by kf.
grid_search = GridSearchCV(estimator=rf_regressor, param_grid=param_grid,scoring='r2', cv=kf)

In [131]:
# Run the search on the scaled training features and the training target.
grid_search.fit(X, y)

In [None]:
# Parameter combination that achieved the best cross-validated score.
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

In [None]:
# Cross-validated score of the best combination; the search was configured
# with scoring='r2', so this is an R^2 value.
best_score = grid_search.best_score_
print("Best Score:", best_score)