In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/datacenterstats/updated_stats_file.xlsx


In [104]:
import pandas as pd
from scipy.stats import pearsonr

# Load the data
df = pd.read_excel("/kaggle/input/datacenterstats/updated_stats_file.xlsx")  # Replace with your file path
df = df.dropna()

# Define the independent variables (X) and dependent variable (TTM PUE)
X = df[["Altitude m", "Average Temperature C", "Average Humidity"]]
y = df["Trailing twelve-month TTM PUE"]  # This is the PUS column

# Calculate Pearson's correlation and p-value for each feature in X
for col in X.columns:
    corr, p_value = pearsonr(X[col], y)
    print(f"Correlation between {col} and TTM PUE: {corr:.16f}, p-value: {p_value:.16f}")


Correlation between Altitude m and TTM PUE: 0.1718211737556656, p-value: 0.0000409561099715
Correlation between Average Temperature C and TTM PUE: 0.3293317370857915, p-value: 0.0000000000000010
Correlation between Average Humidity and TTM PUE: 0.1097917055692047, p-value: 0.0090670213141021


In [117]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from scipy.stats import uniform, randint
import numpy as np

# Load the data
df = pd.read_excel("/kaggle/input/datacenterstats/updated_stats_file.xlsx")  # Replace with your file path
df = df.dropna()

# Load your data (example)
X = df[["Altitude m", "Average Temperature C", "Average Humidity"]]
y = df["Trailing twelve-month TTM PUE"]

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model
model = GradientBoostingRegressor(random_state=42)

# Define the hyperparameters to search over
param_dist = {
    'n_estimators': randint(1, 5000),  # Number of boosting stages to be run
    'learning_rate': uniform(0.01, 100),  # Step size for each boosting stage
    'max_depth': randint(1, 500),  # Maximum depth of the individual trees
    'min_samples_split': randint(1, 300),  # Minimum samples required to split an internal node
    'min_samples_leaf': randint(1, 300),  # Minimum samples required to be at a leaf node
    'subsample': uniform(0.5, 0.5),  # Fraction of samples used for fitting each tree
}

# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(
    model, 
    param_distributions=param_dist, 
    n_iter=50,  # Number of random combinations to try
    scoring='neg_mean_squared_error',  # Use negative MSE for evaluation
    cv=5,  # 5-fold cross-validation
    random_state=42,
    n_jobs=-1,  # Use all processors for faster computation
    verbose=1
)

random_search.fit(X_train, y_train)

# Get the best model from the random search
best_model = random_search.best_estimator_

# Print the best hyperparameters
print("Best hyperparameters found by RandomizedSearchCV:", random_search.best_params_)

# Predict on the test set
y_pred = best_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")


Fitting 5 folds for each of 50 candidates, totalling 250 fits


             nan             nan             nan             nan
             nan             nan             nan             nan
             nan -5.75029424e-04             nan            -inf
             nan             nan             nan             nan
             nan             nan             nan             nan
             nan             nan             nan             nan
             nan             nan             nan             nan
             nan             nan             nan             nan
             nan             nan             nan             nan
             nan             nan             nan             nan
             nan             nan             nan             nan
             nan             nan]
  (array - array_means[:, np.newaxis]) ** 2, axis=1, weights=weights


Best hyperparameters found by RandomizedSearchCV: {'learning_rate': 1.5736406741193931, 'max_depth': 9, 'min_samples_leaf': 129, 'min_samples_split': 136, 'n_estimators': 1496, 'subsample': 0.5070399113575422}
Mean Squared Error: 0.0008439321054541381
R² Score: -0.003367695953810035


In [401]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import plotly.graph_objects as go
from datetime import datetime

current_year = datetime.now().year
# Create the annotation text with the copyright symbol
copyright_text = f"<b>© {current_year} NEB Synergy</b>"

# Assuming df is your dataframe
X = df[["Altitude m", "Average Temperature C", "Average Humidity"]]
y = df["Trailing twelve-month TTM PUE"]
df = df.dropna()

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the model
model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.05, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred_precursor = model.predict(X_test)
y_pred = np.round(y_pred_precursor, 2)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
residuals = y_test - y_pred
# Calculate the standard deviation of the residuals
std_dev_residuals = np.std(residuals)
print(f"Standard Deviation of Residuals (Actual - Predicted PUE): {std_dev_residuals}")

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")


# Create 3D plot for actual values only (without predicted values)
fig = go.Figure()

# Add actual values with spheres
fig.add_trace(go.Scatter3d(
    x=X_test["Altitude m"],
    y=X_test["Average Temperature C"],
    z=X_test["Average Humidity"],
    mode='markers',
    marker=dict(
        size=8,  # Size of the spheres
        color=y_test,  # Color by actual PUE values
        colorscale='viridis',  # Colormap for actual PUE
        symbol='circle',  # Sphere marker
        opacity=0.7
    ),
    text=[f'Actual TTM PUE: {val}<br>Predicted TTM PUE: {pred:.2f}' for val, pred in zip(y_test, y_pred)],  # Display actual and predicted PUE values on hover
    hoverinfo='text',  # Show the 'text' when hovering
    name='All Test Points'
))

# Add points where predicted PUE is within 1 standard deviation of actual PUE
within_std_dev = np.abs(y_pred - y_test) <= std_dev_residuals
# Calculate the number of points where predicted PUE is within the standard deviation
correct_within_std_dev = np.sum(within_std_dev)
# Calculate the percentage of points within the standard deviation
percentage_correct_within_std_dev = (correct_within_std_dev / len(y_test)) * 100
# Print the result
print(f"Percentage of predicted points within 1 standard deviation of actual PUE: {percentage_correct_within_std_dev:.2f}%")

fig.add_trace(go.Scatter3d(
    x=X_test["Altitude m"][within_std_dev],
    y=X_test["Average Temperature C"][within_std_dev],
    z=X_test["Average Humidity"][within_std_dev],
    mode='markers',
    marker=dict(
        size=8,  # Size of the spheres
        color=y_pred[within_std_dev],  # Color by predicted PUE values
        colorscale='blues',  # Different color scale for points within std deviation
        symbol='circle',  # Sphere marker
        opacity=0.7
    ),
    text=[f'Actual TTM PUE: {val}<br>Predicted TTM PUE: {pred:.2f}' for val, pred in zip(y_test[within_std_dev], y_pred[within_std_dev])],  # Display actual and predicted PUE values on hover
    hoverinfo='text',  # Show the 'text' when hovering
    name='Correct Predictions'
))

# Add points where predicted PUE is outside 1 standard deviation of actual PUE
outside_std_dev = np.abs(y_pred - y_test) > std_dev_residuals
fig.add_trace(go.Scatter3d(
    x=X_test["Altitude m"][outside_std_dev],
    y=X_test["Average Temperature C"][outside_std_dev],
    z=X_test["Average Humidity"][outside_std_dev],
    mode='markers',
    marker=dict(
        size=8,  # Size of the spheres
        color=y_pred[outside_std_dev],  # Color by predicted PUE values
        colorscale='reds',  # Different color scale for points outside std deviation
        symbol='circle',  # X marker for points outside standard deviation
        opacity=0.7
    ),
    text=[f'Actual TTM PUE: {val}<br>Predicted TTM PUE: {pred:.2f}' for val, pred in zip(y_test[outside_std_dev], y_pred[outside_std_dev])],  # Display actual and predicted PUE values on hover
    hoverinfo='text',  # Show the 'text' when hovering
    name='Misses'
))

# Update layout for interactive plot
fig.update_layout(
    title='Machine Learning Model Evaluation: PUE Predictions',
    scene=dict(
        xaxis=dict(
            title='Altitude m',
            tickfont=dict(color='white'),
            titlefont=dict(color='white'),
            showgrid=True, gridcolor="white",  # White grid lines
            zeroline=False,  
            backgroundcolor='rgba(0,0,0,0)'  # Makes the plane transparent
        ),
        yaxis=dict(
            title='Average Temperature C',
            tickfont=dict(color='white'),
            titlefont=dict(color='white'),
            showgrid=True, gridcolor="white",
            zeroline=False,
            backgroundcolor='rgba(0,0,0,0)'  # Transparent plane
        ),
        zaxis=dict(
            title='Average Humidity',
            tickfont=dict(color='white'),
            titlefont=dict(color='white'),
            showgrid=True, gridcolor="white",
            zeroline=False,
            backgroundcolor='rgba(0,0,0,0)'  # Transparent plane
        ),
        bgcolor="black"  # Keep the 3D plot background black
    ),
    showlegend=True,
    hovermode='closest',
    margin=dict(l=100, r=300, t=50, b=50),
    paper_bgcolor='black',
    plot_bgcolor='black',
    font=dict(color='white'),
    annotations=[
        dict(
            x=1.6, y=0.3,
            xref='paper', yref='paper',
            text=f'<b>R-squared:</b> {r2:.2f}',
            showarrow=False,
            font=dict(size=12, color='white'),
            align='left'
        ),
        dict(
            x=1.6, y=0.2,
            xref='paper', yref='paper',
            text=f'<b>Mean Squared Error:</b> {mse:.5f}',
            showarrow=False,
            font=dict(size=12, color='white'),
            align='left'
        ),
        dict(
            x=1.6, y=0.1,
            xref='paper', yref='paper',
            text=f'<b>Percent Within Residual Std Dev:</b> {percentage_correct_within_std_dev:.2f}%',
            showarrow=False,
            font=dict(size=12, color='white'),
            align='left'
        ),
        dict(
            x=1.6, y=0,
            xref='paper', yref='paper',
            text=f'<b>Residual Std Dev:</b> {std_dev_residuals:.2f}',
            showarrow=False,
            font=dict(size=12, color='white'),
            align='left'
        ),
        dict(
            x=-0.2, y=0,
            xref='paper', yref='paper',
            text=copyright_text,
            showarrow=False,
            font=dict(size=12, color='white'),
            align='left'
        )
    ]
)

# Show the plot
fig.show()


Standard Deviation of Residuals (Actual - Predicted PUE): 0.02050540061309192
Mean Squared Error: 0.0004230088495575216
R-squared: 0.49707635009310913
Percentage of predicted points within 1 standard deviation of actual PUE: 79.65%


In [258]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load dataset from uploaded Excel file
df = pd.read_excel("/kaggle/input/datacenterstats/updated_stats_file.xlsx")  # Replace with actual file name

# Drop missing values
df = df.dropna()

# Features and target
X = df[["Altitude m", "Average Temperature C", "Average Humidity"]]
y = df["Trailing twelve-month TTM PUE"]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid for RandomizedSearchCV
param_dist = {
    'n_estimators': randint(1, 10000),  # Number of trees in the forest
    'max_depth': randint(1, 10000),         # Maximum depth of the trees
    'min_samples_split': randint(1, 1000), # Minimum samples required to split a node
    'min_samples_leaf': randint(1,1000),  # Minimum samples required to be a leaf node
}

# Create RandomForestRegressor model
rf = RandomForestRegressor(random_state=42)

# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=rf, param_distributions=param_dist, n_iter=50, 
    scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=-1, random_state=42
)
random_search.fit(X_train, y_train)

# Output best parameters
print("Best parameters found:", random_search.best_params_)

# Evaluate best model on test set
best_rf = random_search.best_estimator_
y_pred = best_rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Test MSE:", mse)
print("Test R-squared:", r2)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters found: {'max_depth': 3841, 'min_samples_leaf': 5, 'min_samples_split': 218, 'n_estimators': 503}
Test MSE: 0.0006094214488621806
Test R-squared: 0.27544669641329556


In [281]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load dataset from uploaded Excel file
df = pd.read_excel("/kaggle/input/datacenterstats/updated_stats_file.xlsx")  # Replace with actual file name

# Drop missing values
df = df.dropna()

# Features and target
X = df[["Altitude m", "Average Temperature C", "Average Humidity"]]
y = df["Trailing twelve-month TTM PUE"]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(
    n_estimators=218,         # Number of trees in the forest
    max_depth=131,            # Maximum depth of each tree
    min_samples_split=5,    # Minimum samples required to split an internal node
    min_samples_leaf=1,       # Minimum samples required to be at a leaf node
    random_state=42           # For reproducibility
)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate the Mean Squared Error (MSE) and R-squared score
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Output the results
print(f"Test MSE: {mse}")
print(f"Test R-squared: {r2}")


Test MSE: 0.00043731249180366016
Test R-squared: 0.4800704648192782


In [164]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score

# Load dataset from uploaded Excel file
df = pd.read_excel("/kaggle/input/datacenterstats/updated_stats_file.xlsx")  # Replace with actual file name

# Drop missing values
df = df.dropna()

# Assuming df is your dataframe
X = df[["Altitude m", "Average Temperature C", "Average Humidity"]]
y = df["Trailing twelve-month TTM PUE"]
df = df.dropna()

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the Lasso regression model with alpha (regularization strength)
model = Lasso(alpha=0.0001, random_state=42)  # Adjust alpha if needed

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")


Mean Squared Error: 0.0006053964997973133
R-squared: 0.2802320385556877


In [267]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score

X = df[["Altitude m", "Average Temperature C", "Average Humidity"]]
y = df["Trailing twelve-month TTM PUE"]
df = df.dropna()  # Remove NaN values

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the Ridge regression model with alpha (regularization strength)
model = Ridge(random_state=42)  # Adjust alpha if needed

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")


Mean Squared Error: 0.0006052241930281404
R-squared: 0.28043689750685785


In [184]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.svm import SVR
from scipy.stats import uniform
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Load dataset from uploaded Excel file
df = pd.read_excel("/kaggle/input/datacenterstats/updated_stats_file.xlsx")  # Replace with actual file name

# Drop missing values
df = df.dropna()

# Assuming df is your dataframe
X = df[["Altitude m", "Average Temperature C", "Average Humidity"]]
y = df["Trailing twelve-month TTM PUE"]

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the Support Vector Regression model (SVR)
svr = SVR()

# Define the parameter distribution for RandomizedSearchCV
param_dist = {
    'C': uniform(0.1, 100),  # Random range for C
    'epsilon': uniform(0.01, 0.5),  # Random range for epsilon
    'kernel': ['linear', 'poly', 'rbf']  # Kernel types
}

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=svr,             # Estimator: SVR
    param_distributions=param_dist,  # Hyperparameter search space
    n_iter=10,                # Number of iterations (trials)
    cv=5,                      # Cross-validation folds
    scoring='neg_mean_squared_error',  # Score metric to optimize
    n_jobs=-1,                 # Use all CPU cores for parallelization
    random_state=42,            # Random state for reproducibility
    verbose=1
)

# Train the model using RandomizedSearchCV
random_search.fit(X_train, y_train)

# Get the best parameters found after random search
best_params_random = random_search.best_params_
print(f"Best Parameters (Random Search): {best_params_random}")

# Get the best model after random search
best_model = random_search.best_estimator_

# Predict on the test set using the best model
y_pred = best_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Parameters (Random Search): {'C': 40.08609717152555, 'epsilon': 0.033332831606807715, 'kernel': 'rbf'}
Mean Squared Error: 0.0004571220924071917
R-squared: 0.45651843594530284


In [280]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Load dataset from uploaded Excel file
df = pd.read_excel("/kaggle/input/datacenterstats/updated_stats_file.xlsx")  # Replace with actual file name

# Drop missing values
df = df.dropna()

# Assuming df is your dataframe
X = df[["Altitude m", "Average Temperature C", "Average Humidity"]]
y = df["Trailing twelve-month TTM PUE"]

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Create the Support Vector Regression model (SVR)
model = SVR(kernel='rbf', C=40, epsilon=0.02)  # You can tune C, epsilon, and kernel

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")


Mean Squared Error: 0.0004222079104144818
R-squared: 0.49802860259939163
