In [15]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

def compute_variable_importance(file_path, target_column,
                                plot_path="Variable_Importance_Plot.png",
                                n_estimators=100, random_state=42):
    """
    Compute Gini-based variable importance using Random Forest and plot the results.

    Every column of the workbook other than ``target_column`` is used as a
    feature. The importances are drawn as a horizontal bar chart (largest at
    the top, no title) and the image is written to ``plot_path``.

    Parameters:
        file_path (str): Path to the Excel file containing the dataset.
        target_column (str): Name of the target (dependent) variable.
        plot_path (str): Destination of the bar-chart image. The default is the
            file name this function wrote before the parameter existed.
        n_estimators (int): Number of trees passed to RandomForestClassifier.
        random_state (int): Seed passed to RandomForestClassifier.

    Returns:
        pd.DataFrame: DataFrame with variable names and their importance scores,
        sorted from the most to the least important variable.

    Raises:
        KeyError: If ``target_column` is not a column of the dataset.
    """
    # Load the dataset
    data = pd.read_excel(file_path)

    # Fail early with a readable message instead of a bare pandas KeyError
    if target_column not in data.columns:
        raise KeyError(f"Target column {target_column!r} not found in {file_path!r}")

    # Separate features and target variable
    features = [col for col in data.columns if col != target_column]
    X = data[features]
    y = data[target_column]

    # Initialize and fit the Random Forest Classifier
    rf_model = RandomForestClassifier(random_state=random_state, n_estimators=n_estimators)
    rf_model.fit(X, y)

    # Extract Gini importance (feature importance)
    feature_importance = pd.DataFrame({
        'Variable': features,
        'Importance': rf_model.feature_importances_
    }).sort_values(by='Importance', ascending=False)

    # Plotting the feature importance; the figure handle is kept so that it is
    # always released, even when drawing or saving raises.
    fig = plt.figure(figsize=(10, 6))
    try:
        plt.barh(feature_importance['Variable'], feature_importance['Importance'], color='skyblue')
        plt.xlabel('Gini Importance')
        plt.gca().invert_yaxis()  # Highest importance at the top
        plt.tight_layout()

        # Save the plot
        fig.savefig(plot_path)
    finally:
        plt.close(fig)

    print(f"Feature importance plot saved to {plot_path}")

    return feature_importance

# Demo run: rank the columns of the training workbook by Gini importance.
# Adjust the workbook path and the dependent-variable name for your own data.
file_path, target_column = "Training.xlsx", "Target"

# Build the importance table (the function also writes the bar chart)
variable_importance = compute_variable_importance(file_path, target_column)

# Write the table to a workbook; the DataFrame index is left out of the sheet
output_path = "Variable_Importance_shima.xlsx"
variable_importance.to_excel(output_path, index=False)

print(f"Variable importance saved to {output_path}")

Feature importance plot saved to Variable_Importance_Plot.png
Variable importance saved to Variable_Importance_shima.xlsx


In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

def compute_variable_correlation(file_path, target_column,
                                 plot_path="Variable_Correlation_Plot.png"):
    """
    Compute Pearson correlation coefficients between features and target variable and plot the results.

    Every numeric (or boolean) column other than ``target_column`` is
    correlated with the target. Non-numeric feature columns cannot take part in
    a Pearson correlation; they are skipped with a warning instead of breaking
    the computation. The result is sorted from the highest to the lowest
    coefficient and drawn as a bar chart saved to ``plot_path``.

    Parameters:
        file_path (str): Path to the Excel file containing the dataset.
        target_column (str): Name of the target (dependent) variable.
        plot_path (str): Destination of the bar-chart image. The default is the
            file name this function wrote before the parameter existed.

    Returns:
        pd.DataFrame: DataFrame with variable names and their Pearson correlation coefficients.

    Raises:
        KeyError: If ``target_column`` is not a column of the dataset.
    """
    # Load the dataset
    data = pd.read_excel(file_path)

    # Fail early with a readable message instead of a bare pandas KeyError
    if target_column not in data.columns:
        raise KeyError(f"Target column {target_column!r} not found in {file_path!r}")

    # Separate features and target variable
    features = [col for col in data.columns if col != target_column]
    X = data[features].select_dtypes(include=['number', 'bool'])
    y = data[target_column]

    skipped = [col for col in features if col not in X.columns]
    if skipped:
        import warnings
        warnings.warn(f"Skipping non-numeric feature columns: {skipped}", stacklevel=2)

    # Compute Pearson correlation coefficients
    correlations = X.corrwith(y, method='pearson')

    # Labels are taken from the correlation result itself so that names and
    # coefficients can never get out of step with each other.
    correlation_df = pd.DataFrame({
        'Variable': list(correlations.index),
        'Correlation': correlations
    }).sort_values(by='Correlation', ascending=False)

    # Create a colormap for gradual color change
    norm = plt.Normalize(correlation_df['Correlation'].min(), correlation_df['Correlation'].max())
    colors = plt.cm.viridis(norm(correlation_df['Correlation']))

    # Plotting the correlations; the figure handle is kept so that it is
    # always released, even when drawing or saving raises.
    fig = plt.figure(figsize=(12, 6))
    try:
        plt.bar(correlation_df['Variable'], correlation_df['Correlation'], color=colors)
        plt.ylabel('Pearson Correlation')
        plt.xlabel('Features')
        plt.title('Feature Correlation with Target Variable')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()

        # Save the plot
        fig.savefig(plot_path)
    finally:
        plt.close(fig)

    print(f"Feature correlation plot saved to {plot_path}")

    return correlation_df

# Demo run: correlate every feature of the training workbook with the target.
# Adjust the workbook path and the dependent-variable name for your own data.
file_path, target_column = "Training.xlsx", "Target"

# Build the correlation table (the function also writes the bar chart)
variable_correlation = compute_variable_correlation(file_path, target_column)

# Write the table to a workbook; the DataFrame index is left out of the sheet
output_path = "Variable_Correlation_shima.xlsx"
variable_correlation.to_excel(output_path, index=False)

print(f"Variable correlation saved to {output_path}")

Feature correlation plot saved to Variable_Correlation_Plot.png
Variable correlation saved to Variable_Correlation_shima.xlsx


In [13]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

def compute_variable_correlation(file_path, target_column,
                                 plot_path="Variable_Correlation_Plot.png"):
    """
    Compute Pearson correlation coefficients between features and target variable and plot the results.
    Features on the x-axis are ordered as they appear in the Excel file.

    Every numeric (or boolean) column other than ``target_column`` is
    correlated with the target. Non-numeric feature columns cannot take part in
    a Pearson correlation; they are skipped with a warning instead of breaking
    the computation. The bar chart is drawn without a title, with 16 pt labels,
    and saved to ``plot_path``.

    Parameters:
        file_path (str): Path to the Excel file containing the dataset.
        target_column (str): Name of the target (dependent) variable.
        plot_path (str): Destination of the bar-chart image. The default is the
            file name this function wrote before the parameter existed.

    Returns:
        pd.DataFrame: DataFrame with variable names and their Pearson correlation coefficients.

    Raises:
        KeyError: If ``target_column`` is not a column of the dataset.
    """
    # Load the dataset
    data = pd.read_excel(file_path)

    # Fail early with a readable message instead of a bare pandas KeyError
    if target_column not in data.columns:
        raise KeyError(f"Target column {target_column!r} not found in {file_path!r}")

    # Get features in the order they appear in the Excel file
    features = [col for col in data.columns if col != target_column]
    X = data[features].select_dtypes(include=['number', 'bool'])
    y = data[target_column]

    skipped = [col for col in features if col not in X.columns]
    if skipped:
        import warnings
        warnings.warn(f"Skipping non-numeric feature columns: {skipped}", stacklevel=2)

    # Compute Pearson correlation coefficients
    correlations = X.corrwith(y, method='pearson')

    # Labels are taken from the correlation result itself so that names and
    # coefficients can never get out of step with each other.
    correlation_df = pd.DataFrame({
        'Variable': list(correlations.index),
        'Correlation': correlations
    })  # Keep original column order, no sorting

    # Create a colormap for gradual color change
    norm = plt.Normalize(correlation_df['Correlation'].min(), correlation_df['Correlation'].max())
    colors = plt.cm.viridis(norm(correlation_df['Correlation']))

    # Plotting the correlations; the figure handle is kept so that it is
    # always released, even when drawing or saving raises.
    fig = plt.figure(figsize=(12, 6))
    try:
        plt.bar(correlation_df['Variable'], correlation_df['Correlation'], color=colors)
        plt.ylabel('Pearson Correlation', fontsize=16)
        plt.xlabel('Features', fontsize=16)
        plt.xticks(rotation=45, ha='right', fontsize=16)
        plt.yticks(fontsize=16)
        plt.tight_layout()

        # Save the plot
        fig.savefig(plot_path)
    finally:
        plt.close(fig)

    print(f"Feature correlation plot saved to {plot_path}")

    return correlation_df

# Demo run: feature-vs-target Pearson correlations for the training workbook.
# Swap in your own workbook path and dependent-variable name below.
file_path, target_column = "Training.xlsx", "Target"

# Produce the correlation table (the bar chart is written by the function)
variable_correlation = compute_variable_correlation(file_path, target_column)

# Export the table to Excel without the DataFrame index column
output_path = "Variable_Correlation_shima.xlsx"
variable_correlation.to_excel(output_path, index=False)

print(f"Variable correlation saved to {output_path}")

Feature correlation plot saved to Variable_Correlation_Plot.png
Variable correlation saved to Variable_Correlation_shima.xlsx


In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

def compute_variable_importance(file_path, target_column,
                                plot_path="Variable_Importance_Plot.png",
                                n_estimators=100, random_state=42):
    """
    Compute Gini-based variable importance using Random Forest and plot the results.

    Every column of the workbook other than ``target_column`` is used as a
    feature. The importances are drawn as a horizontal bar chart (largest at
    the top, no title, 14 pt labels) and the image is written to ``plot_path``.

    Parameters:
        file_path (str): Path to the Excel file containing the dataset.
        target_column (str): Name of the target (dependent) variable.
        plot_path (str): Destination of the bar-chart image. The default is the
            file name this function wrote before the parameter existed.
        n_estimators (int): Number of trees passed to RandomForestClassifier.
        random_state (int): Seed passed to RandomForestClassifier.

    Returns:
        pd.DataFrame: DataFrame with variable names and their importance scores,
        sorted from the most to the least important variable.

    Raises:
        KeyError: If ``target_column`` is not a column of the dataset.
    """
    # Load the dataset
    data = pd.read_excel(file_path)

    # Fail early with a readable message instead of a bare pandas KeyError
    if target_column not in data.columns:
        raise KeyError(f"Target column {target_column!r} not found in {file_path!r}")

    # Separate features and target variable
    features = [col for col in data.columns if col != target_column]
    X = data[features]
    y = data[target_column]

    # Initialize and fit the Random Forest Classifier
    rf_model = RandomForestClassifier(random_state=random_state, n_estimators=n_estimators)
    rf_model.fit(X, y)

    # Extract Gini importance (feature importance)
    feature_importance = pd.DataFrame({
        'Variable': features,
        'Importance': rf_model.feature_importances_
    }).sort_values(by='Importance', ascending=False)

    # Plotting the feature importance; the figure handle is kept so that it is
    # always released, even when drawing or saving raises.
    fig = plt.figure(figsize=(10, 6))
    try:
        plt.barh(feature_importance['Variable'], feature_importance['Importance'], color='skyblue')
        plt.xlabel('Gini Importance', fontsize=14)
        plt.xticks(fontsize=14)
        plt.yticks(fontsize=14)
        plt.gca().invert_yaxis()  # Highest importance at the top
        plt.tight_layout()

        # Save the plot
        fig.savefig(plot_path)
    finally:
        plt.close(fig)

    print(f"Feature importance plot saved to {plot_path}")

    return feature_importance

# Demo run: Gini-importance ranking for the training workbook.
# Swap in your own workbook path and dependent-variable name below.
file_path, target_column = "Training.xlsx", "Target"

# Produce the importance table (the bar chart is written by the function)
variable_importance = compute_variable_importance(file_path, target_column)

# Export the table to Excel without the DataFrame index column
output_path = "Variable_Importance_shima.xlsx"
variable_importance.to_excel(output_path, index=False)

print(f"Variable importance saved to {output_path}")

Feature importance plot saved to Variable_Importance_Plot.png
Variable importance saved to Variable_Importance_shima.xlsx
