In [None]:
import plotly.express as px
import pandas as pd
import sqlite3 as db
import time
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator, FuncFormatter
import os
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import matplotlib.colors as mcolors

# Open the SQLite database and read the subsector table into a DataFrame.
# The query is read-only, so no commit is needed; the finally block makes sure
# the connection is released even when the query raises (e.g. missing table).
con = db.connect('mydatabase.db')
try:
    # Load the complete table into a pandas DataFrame
    df_grundkapital = pd.read_sql_query('SELECT * FROM "H2_final_subsectors_F"', con)
finally:
    con.close()

In [None]:
# Print the row count first: only the last expression of a cell is rendered,
# so head() has to come last for the preview table to be displayed.
print("Elements in the dataframe:", len(df_grundkapital))
df_grundkapital.head()

### 1. Plots removing outliers and separating sectors

In [None]:
def lighten_color(color, factor=1.9):
    """Return `color` as an RGB array with each channel multiplied by `factor`.

    The colour is first converted with matplotlib's `to_rgb`; the scaled
    channels are clipped to the range [0, 1].
    """
    scaled_rgb = np.multiply(mcolors.to_rgb(color), factor)
    return scaled_rgb.clip(0, 1)

# Function to plot the capital types with different shades for each subsector
def plot_subsector_with_capital_types(df, subsector_name, base_colormap, title):
    """Scatter hydrogen potential against each capital type and overlay a linear fit.

    Rows without any positive 'Stammkapital', 'Grundkapital' or 'Hafteinlage'
    are dropped. Each capital type is drawn in its own shade derived from
    `base_colormap` (only the rows where that type is > 0), a linear regression
    of the MWh potential on 'Capital' is drawn in red, and the figure is saved
    as a PNG inside the module-level `save_dir` before being shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Stammkapital', 'Grundkapital', 'Hafteinlage',
        'Capital' and 'Distributed H2-Potential in TWh'. The caller's frame is
        not modified.
    subsector_name : str
        Used to build the output file name and the skip message.
    base_colormap : str
        Name of a matplotlib colormap from which the shades are taken.
    title : str
        Currently unused: the plt.title call below is commented out.

    Returns
    -------
    None
    """
    # Keep rows where at least one capital type is positive. The copy makes the
    # column assignments below act on an independent frame instead of a slice.
    has_capital = (df['Stammkapital'] > 0) | (df['Grundkapital'] > 0) | (df['Hafteinlage'] > 0)
    df = df[has_capital].copy()

    # Nothing to plot and the regression cannot be fitted on zero rows
    if df.empty:
        print(f"No data available for {subsector_name}. Skipping plot.")
        return

    # Convert H2-Potential from TWh to GWh (plotting) and MWh (regression)
    df['Distributed H2-Potential in GWh'] = df['Distributed H2-Potential in TWh'] * 1000
    df['Distributed H2-Potential in MWh'] = df['Distributed H2-Potential in GWh'] * 1000

    # Create the plot
    fig = plt.figure(figsize=(20, 14))

    base_color = plt.get_cmap(base_colormap)(0.6)  # Base color for Stammkapital
    # Darker color for Grundkapital. The previous call to `darken_colormap`
    # referenced a name that is not defined in this notebook, so the shade is
    # derived directly from the base color by scaling its RGB channels.
    dark_color = tuple(np.clip(np.array(mcolors.to_rgb(base_color)) * 0.7, 0, 1))
    light_color = lighten_color(base_color, factor=1.9)  # Lighter color for Hafteinlage

    # Plot each capital type using only the rows where that type is > 0, so
    # companies without that capital type do not show up as points at x = 0.
    capital_styles = (
        ('Stammkapital', base_color),
        ('Grundkapital', dark_color),
        ('Hafteinlage', light_color),
    )
    for capital_column, shade in capital_styles:
        positive_rows = df[df[capital_column] > 0]
        if not positive_rows.empty:
            plt.scatter(positive_rows[capital_column], positive_rows['Distributed H2-Potential in GWh'],
                        s=100, color=shade, label=capital_column)

    # Extract the variables for the trend line based on the combined 'Capital'
    X = df[['Capital']]
    y = df['Distributed H2-Potential in MWh']

    # Linear regression model
    model = LinearRegression()
    model.fit(X, y)

    # Predict in MWh, then convert to GWh to match the plotted y-axis
    y_pred = model.predict(X)
    y_pred_in_GWh = y_pred / 1000

    # Calculate R^2
    r_squared = r2_score(y, y_pred)

    # Plot the trend line
    plt.plot(df['Capital'], y_pred_in_GWh, color='red', linewidth=2)

    # Add labels (the title is intentionally disabled)
    plt.xlabel('Capital (Mio. €)', labelpad=15)
    plt.ylabel('Hydrogen-Potential (GWh)', labelpad=15)
    # plt.title(title, pad=30)

    # Rotate x-axis labels for better readability
    plt.xticks(rotation=45)

    # Tick formatter for the x-axis: shows the raw tick value divided by 1e6
    def x_axis_formatter(tick_val, tick_pos):
        return '{:.0f}'.format(tick_val / 1e6)

    # Set the formatter and locator for the x-axis (Capital)
    plt.gca().xaxis.set_major_formatter(FuncFormatter(x_axis_formatter))
    plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))

    # Display the equation of the line and R^2 value below the axes.
    # NOTE(review): the fit uses the unscaled 'Capital' column while the text
    # says "Mio. €" — confirm the stated units match the coefficient.
    equation_text = f'y = {model.coef_[0]:.4f}x + {model.intercept_:.4f} (in MWh and Mio. €)\n$R^2$ = {r_squared:.4f}'
    plt.figtext(0.54, -0.05, equation_text, ha="center", fontsize=25, color='red', wrap=True)

    # Add a legend to distinguish between capital types
    plt.legend()

    plt.grid(True)
    plt.tight_layout()

    # Save the plot as an image file, then show it
    safe_name = subsector_name.replace(", ", "_").replace(" ", "_")
    save_path = os.path.join(save_dir, f'H2pot_{safe_name}_flipped_GWh.png')
    plt.savefig(save_path, bbox_inches='tight')
    plt.show()

    # Release the figure so repeated calls in a loop do not accumulate open figures
    plt.close(fig)

# Generate one plot per configured subsector
for subsector, colormap in subsectors_colormap.items():
    # Select the rows of the current subsector. regex=False makes the name match
    # literally (case-insensitive), so characters such as '(' or '.' in a
    # subsector name cannot be misread as a pattern; missing names never match.
    in_subsector = df_grundkapital['Subsector_Name'].str.contains(subsector, case=False, na=False, regex=False)
    df_subsector = df_grundkapital[in_subsector].copy()
    plot_subsector_with_capital_types(df_subsector, subsector, colormap, f'H2-Potential in GWh vs. Capital in {subsector.capitalize()}')


### 2. Plots for lower-potential companies (the threshold can be set as desired)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from matplotlib.ticker import MaxNLocator, FuncFormatter
import sqlite3 as db
import matplotlib.colors as mcolors

# Open the SQLite database and read the subsector table into a DataFrame.
# The query is read-only, so no commit is needed; the finally block makes sure
# the connection is released even when the query raises (e.g. missing table).
# NOTE(review): this cell reads 'ubicando.db' while the first cell of the
# notebook reads 'mydatabase.db' — confirm which file is intended.
con = db.connect('ubicando.db')
try:
    # Load the complete table into a pandas DataFrame
    df_grundkapital = pd.read_sql_query('SELECT * FROM "H2_final_subsectors_F"', con)
finally:
    con.close()

# Filter out rows where Capital is 0
df_grundkapital = df_grundkapital[df_grundkapital['Capital'] != 0]

# Subsectors to analyse, each mapped to the matplotlib colormap used for its plots
subsectors_colormap = {
    #'refineries': 'Reds',
    'paper and printing': 'Blues',
    'chemical industry': 'Oranges',
    'metal processing': 'Greens',
    'steel, primary': 'Greens',
    'non-metallic minerals': 'Purples',
    'mineral processing': 'Purples'
}

# Default font sizes for titles, axis labels, tick labels, legend and body text
for rc_key, rc_size in (
    ('axes.titlesize', 22),
    ('axes.labelsize', 18),
    ('xtick.labelsize', 15),
    ('ytick.labelsize', 15),
    ('legend.fontsize', 18),
    ('font.size', 14),
):
    plt.rcParams[rc_key] = rc_size

# Directory that receives the saved plots; it is created if it does not exist yet.
# NOTE(review): this is an absolute, machine-specific path — adjust before
# running the notebook on another computer.
save_dir = 'C:\\Users\\marma\\Documents\\INGENIERIA  INDUSTRIAL\\2º MÁSTER\\TFM\\PLOTS comparing databases'
os.makedirs(save_dir, exist_ok=True)

# Function to remove outliers based on IQR
def remove_outliers(df, column):
    """Return the rows of `df` whose `column` value lies within the 1.5*IQR fences.

    The fences are computed from the 25 % and 75 % quantiles of `column`;
    values exactly on a fence are kept. Rows whose value is NaN fail the
    comparison and are therefore dropped.
    """
    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile

    # Series.between is inclusive on both ends, matching ">= lower and <= upper"
    within_fences = values.between(first_quartile - 1.5 * spread, third_quartile + 1.5 * spread)
    return df[within_fences]

# Define the custom formatter function for the y-axis (H2-Potential in GWh)
def y_axis_formatter(tick_val, tick_pos):
    """Format a y-axis tick value as a string with no decimal places.

    `tick_pos` is accepted to match the two-argument FuncFormatter callback
    signature and is not used.
    """
    return f'{tick_val:.0f}'

# Define the custom formatter function for the x-axis (Capital)
def x_axis_formatter(tick_val, tick_pos):
    """Format an x-axis tick as its value divided by 1e6, with no decimal places.

    `tick_pos` is accepted to match the two-argument FuncFormatter callback
    signature and is not used.
    """
    scaled_value = tick_val / 1e6
    return format(scaled_value, '.0f')

def darken_color(color, factor=0.5):
    """Return `color` as an RGB array with each channel multiplied by `factor`.

    The colour is converted with matplotlib's `to_rgb` first; the scaled
    channels are clipped to the range [0, 1].
    """
    rgb = np.array(mcolors.to_rgb(color))
    return np.clip(rgb * factor, 0, 1)

def lighten_color(color, factor=1.7):
    """Return `color` as an RGB array with each channel multiplied by `factor`.

    The colour is converted with matplotlib's `to_rgb` first; the scaled
    channels are clipped to the range [0, 1]. This definition replaces the
    earlier `lighten_color` in this notebook, whose default factor is 1.9.
    """
    rgb = np.array(mcolors.to_rgb(color))
    return np.clip(rgb * factor, 0, 1)

# Function to perform plotting
def plot_low_potentials_for_subsector(df, subsector_name, colormap, threshold=1):
    """Plot capital against hydrogen potential for the low-potential rows of one subsector.

    Rows with 'Distributed H2-Potential in TWh' above `threshold` are dropped,
    then IQR outliers are removed on the scaled capital and on the GWh
    potential. Each capital type is scattered in its own shade of `colormap`
    (only rows where that type is > 0), a linear regression of the kWh
    potential on the scaled capital is drawn in red, and the figure is saved as
    a PNG inside the module-level `save_dir` before being shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Capital', 'Stammkapital', 'Grundkapital', 'Hafteinlage'
        and 'Distributed H2-Potential in TWh'. The caller's frame is not modified.
    subsector_name : str
        Used in the skip message and in the output file name.
    colormap : str
        Name of a matplotlib colormap from which the shades are taken.
    threshold : float, default 1
        Upper limit (inclusive) on 'Distributed H2-Potential in TWh'. It is
        also written into the output file name.

    Returns
    -------
    None
    """
    # Filter the DataFrame for low H2 potentials
    df_low_h2 = df[df['Distributed H2-Potential in TWh'] <= threshold].copy()
    #df_low_h2 = df[df['Capital'] <= threshold].copy()

    # Convert H2-Potential from TWh to GWh (plotting), MWh and kWh (regression)
    df_low_h2['Distributed H2-Potential in GWh'] = df_low_h2['Distributed H2-Potential in TWh'] * 1000
    df_low_h2['Distributed H2-Potential in MWh'] = df_low_h2['Distributed H2-Potential in GWh'] * 1000
    df_low_h2['Distributed H2-Potential in kWh'] = df_low_h2['Distributed H2-Potential in GWh'] * 1000000

    # Scaled copies of the capital columns (original value * 1000).
    # NOTE(review): the columns are named "thsnd" but are multiplied, not
    # divided, by 1000 — confirm the unit of the source columns.
    capital_columns = ('Capital', 'Stammkapital', 'Grundkapital', 'Hafteinlage')
    for capital_column in capital_columns:
        df_low_h2[f'{capital_column} thsnd'] = df_low_h2[capital_column] * 1000

    # Remove outliers from both Capital and H2-Potential in GWh
    df_low_h2 = remove_outliers(df_low_h2, 'Capital thsnd')
    df_low_h2 = remove_outliers(df_low_h2, 'Distributed H2-Potential in GWh')

    if df_low_h2.empty:
        print(f"No data available for low H2 potentials in {subsector_name}. Skipping plot.")
        return

    # Create the plot
    fig = plt.figure(figsize=(12, 10))

    base_color = plt.get_cmap(colormap)(0.6)  # Base color for Stammkapital
    dark_color = darken_color(base_color, factor=0.7)  # Darker color for Grundkapital
    light_color = lighten_color(base_color, factor=1.3)  # Lighter color for Hafteinlage

    # Plot Stammkapital, Grundkapital, and Hafteinlage for the rows where the
    # respective value is > 0 (each selection is computed once and reused)
    capital_styles = (
        ('Stammkapital', base_color),
        ('Grundkapital', dark_color),
        ('Hafteinlage', light_color),
    )
    for capital_column, shade in capital_styles:
        scaled_column = f'{capital_column} thsnd'
        positive_rows = df_low_h2[df_low_h2[scaled_column] > 0]
        if not positive_rows.empty:
            plt.scatter(positive_rows[scaled_column], positive_rows['Distributed H2-Potential in GWh'],
                        s=100, color=shade, label=capital_column)

    # Extracting the variables
    X = df_low_h2[['Capital thsnd']]
    y = df_low_h2['Distributed H2-Potential in kWh']

    # Linear regression model
    model = LinearRegression()
    model.fit(X, y)

    # Predict in kWh, then convert back to GWh to match the plotted y-axis
    y_pred = model.predict(X)
    y_pred_in_GWh = y_pred / 1000000

    # Calculate R^2
    r_squared = r2_score(y, y_pred)

    # Plot the trend line
    plt.plot(df_low_h2['Capital thsnd'], y_pred_in_GWh, color='red', linewidth=2)

    # Add labels with padding
    plt.xlabel('Capital [Thsnd. €]', labelpad=15)
    plt.ylabel('Hydrogen-Potential [GWh]', labelpad=15)

    # Rotate x-axis labels for better readability
    plt.xticks(rotation=45)

    # Set the formatter and locator for the x-axis and y-axis
    plt.gca().xaxis.set_major_formatter(FuncFormatter(x_axis_formatter))
    plt.gca().yaxis.set_major_formatter(FuncFormatter(y_axis_formatter))
    plt.gca().yaxis.set_major_locator(MaxNLocator(integer=True))

    # Adjust the x-axis limits to focus on the range of the data
    #x_min = df_low_h2['Capital thsnd'].min() * 0.9  # Slightly extend beyond min
    #x_max = df_low_h2['Capital thsnd'].max() * 1.1  # Slightly extend beyond max
    #plt.xlim(x_min, x_max)

    # Fit the y-axis tightly around the data; the frame is known to be
    # non-empty here because of the early return above
    potential_gwh = df_low_h2['Distributed H2-Potential in GWh']
    plt.ylim(potential_gwh.min() - 0.1, potential_gwh.max() + 0.1)

    # Display the equation of the line and R^2 value below the plot
    equation_text = f'y = {model.coef_[0]:.4f}x + {model.intercept_:.4f} (in kWh and thsnd. €)\n$R^2$ = {r_squared:.4f}'
    plt.figtext(0.54, -0.05, equation_text, ha="center", fontsize=14, color='red', wrap=True)

    # Add a legend
    plt.legend()

    plt.grid(True)
    plt.tight_layout()

    # Save the plot as an image file. The threshold is written into the name
    # (":g" renders 1 as "1", 0.5 as "0.5"), so runs with different thresholds
    # no longer overwrite each other under a misleading "1twh" label.
    safe_name = subsector_name.replace(", ", "_").replace(" ", "_")
    save_path = os.path.join(save_dir, f'Low_H2_Potentials_threshold_{threshold:g}twh_{safe_name}_flipped_GWh.png')
    plt.savefig(save_path, bbox_inches='tight')
    plt.show()

    # Release the figure so repeated calls in a loop do not accumulate open figures
    plt.close(fig)

# Generate the low-H2-potential plot for every configured subsector
for subsector, colormap in subsectors_colormap.items():
    # Select the rows of the current subsector. regex=False makes the name match
    # literally (case-insensitive), so characters such as '(' or '.' in a
    # subsector name cannot be misread as a pattern; missing names never match.
    in_subsector = df_grundkapital['Subsector_Name'].str.contains(subsector, case=False, na=False, regex=False)
    df_subsector = df_grundkapital[in_subsector].copy()
    plot_low_potentials_for_subsector(df_subsector, subsector, colormap, threshold=1)
