# Grouping

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from factor_analyzer import FactorAnalyzer, calculate_kmo
from scipy.stats import bartlett

# Load the survey responses from the CSV file.
file_name = 'data.csv'
data = pd.read_csv(file_name)

# Normalise the headers so that column matching below is insensitive to
# stray whitespace and letter case.
data.columns = data.columns.str.strip().str.upper()

# Questionnaire items are selected by name: a known scale prefix followed only
# by an item number (e.g. FAS3, SSRL12). A bare startswith() test on the short
# prefixes S, I and M would also accept any unrelated column that merely
# begins with one of those letters.
item_prefixes = ('FAS', 'LOC', 'S', 'QOA', 'I', 'M', 'STS', 'IPS', 'SSRL', 'SDS')
item_pattern = r'^(?:' + '|'.join(item_prefixes) + r')\d+$'
questions_columns = data.columns[data.columns.str.match(item_pattern)].tolist()
if not questions_columns:
    raise ValueError(f"No questionnaire item columns found in {file_name!r}")

# Fill missing answers with the mean of their column. The result is assigned
# rather than filled in place, because an in-place fill on a column selection
# of `data` triggers pandas' SettingWithCopy warning.
questions_data = data[questions_columns]
questions_data = questions_data.fillna(questions_data.mean())

# Standardise every item to zero mean and unit variance.
scaler = StandardScaler()
questions_data_scaled = scaler.fit_transform(questions_data)

# Checks that the data are suitable for factor analysis.
from factor_analyzer import calculate_bartlett_sphericity

# Kaiser-Meyer-Olkin measure: per-item values plus the overall score.
kmo_all, kmo_model = calculate_kmo(questions_data_scaled)
print(f"KMO Test Score: {kmo_model:.3f}")

# Bartlett's test of sphericity (is the correlation matrix distinguishable from
# an identity matrix?). scipy.stats.bartlett is a different procedure: it tests
# whether several samples have equal variances, which says nothing about
# whether the items are correlated enough to be factored.
chi_square_value, p_value = calculate_bartlett_sphericity(questions_data_scaled)
print(f"Bartlett’s Test p-value: {p_value:.3f}")

# Unrotated factor analysis; this fit is only used to read off the eigenvalues.
fa = FactorAnalyzer(rotation=None)
fa.fit(questions_data_scaled)

# Only the first array returned by get_eigenvalues() is used below.
eigenvalues, _ = fa.get_eigenvalues()

# Scree plot: eigenvalue against factor number, with a reference line at 1.
factor_numbers = np.arange(1, len(eigenvalues) + 1)
scree_fig, scree_ax = plt.subplots(figsize=(8, 5))
scree_ax.plot(factor_numbers, eigenvalues, marker='o', linestyle='--')
scree_ax.axhline(y=1, color='r', linestyle='-')
scree_ax.set_title('Scree Plot')
scree_ax.set_xlabel('Factors')
scree_ax.set_ylabel('Eigenvalue')
plt.show()

# Keep as many factors as there are eigenvalues above 1.
optimal_factors = (eigenvalues > 1).sum()
print(f'Optimal number of factors: {optimal_factors}')

# Refit with the retained number of factors, this time with a promax rotation.
fa_optimal = FactorAnalyzer(n_factors=optimal_factors, rotation='promax')
fa_optimal.fit(questions_data_scaled)

# Loadings table: one row per question, one column per factor.
factor_loadings = pd.DataFrame(fa_optimal.loadings_, index=questions_columns)

# Persist the raw loadings (factor columns only, no group label).
factor_loadings.to_csv('factor_loadings.csv')

# Assign every question to the factor on which it loads most strongly.
# The comparison uses the magnitude of the loading: a large negative loading
# ties an item to a factor just as firmly as a large positive one, and a plain
# argmax on signed values would hand such an item to some weaker factor.
factor_loadings_df = factor_loadings.copy()
strongest_factor = np.argmax(np.abs(fa_optimal.loadings_), axis=1) + 1

# Renumber the groups 1..k so that no number is skipped when a factor ends up
# with no questions. np.unique returns its values in ascending order.
unique_factors = np.unique(strongest_factor)
factor_mapping = {old_group: new_group for new_group, old_group in enumerate(unique_factors, start=1)}
factor_loadings_df['Group'] = [factor_mapping[group] for group in strongest_factor]

# Two-column table (question, factor group) ordered by group. The stable sort
# keeps the questionnaire order inside each group, so the output is
# reproducible from run to run.
grouped_questions_df = (
    pd.DataFrame({
        'Question': questions_columns,
        'Factor Group': factor_loadings_df['Group'].to_numpy(),
    })
    .sort_values(by='Factor Group', kind='stable')
    .reset_index(drop=True)
)

# Chart the assignment: one marker per question, drawn at the height of its
# factor group.
plt.figure(figsize=(14, 8))
sns.set(style="whitegrid")

# One colour per factor group, in the order the groups appear in the table.
group_labels = grouped_questions_df['Factor Group'].unique()
palette = sns.color_palette("husl", len(group_labels))

for factor, color in zip(group_labels, palette):
    in_group = grouped_questions_df['Factor Group'] == factor
    members = grouped_questions_df.loc[in_group, 'Question']
    plt.scatter(members, [factor] * len(members), label=f'Factor {factor}', s=100, edgecolor='k', color=color)

plt.xticks(rotation=90)
axes = plt.gca()
axes.set_xlabel('Question')
axes.set_ylabel('Factor Group')
axes.set_title('Grouping of Questions by Factors')
axes.legend(title='Factor Group')
plt.tight_layout()

# Write the chart to disk before showing it.
plt.savefig('grouped_questions_plot.png')
plt.show()

# Export the question/group table.
grouped_questions_df.to_csv('grouped_questions.csv', index=False)

# Echo the table to the console.
print("Grouped Questions by Factor:", grouped_questions_df, sep="\n")

# Reload the loadings written earlier; the first CSV column holds the question
# names and becomes the index.
factor_loadings_df = pd.read_csv('factor_loadings.csv', index_col=0)

# Lookup table: question name -> factor group.
group_mapping = dict(zip(grouped_questions_df['Question'], grouped_questions_df['Factor Group']))

# Every column of this file is a factor loading: the file was written before
# any group label was attached, so there is no trailing label column to strip.
# Dropping the last column here would silently discard one factor.
factor_loadings_array = factor_loadings_df.to_numpy()

# Square matrix of mean distances between groups; the diagonal stays at 0.
unique_groups = grouped_questions_df['Factor Group'].unique()

distance_matrix = pd.DataFrame(np.zeros((len(unique_groups), len(unique_groups))),
                               index=unique_groups, columns=unique_groups)

# Group of each row of the loadings table, and the row positions per group
# (computed once instead of once per pair of groups).
group_ids = np.array([group_mapping[q] for q in factor_loadings_df.index])
rows_by_group = {group: np.flatnonzero(group_ids == group) for group in unique_groups}

for position, group_i in enumerate(unique_groups):
    loadings_i = factor_loadings_array[rows_by_group[group_i]]
    # The measure is symmetric, so each unordered pair is computed once and
    # written to both cells.
    for group_j in unique_groups[position + 1:]:
        loadings_j = factor_loadings_array[rows_by_group[group_j]]
        # Broadcasting yields |a - b| for every question pair (a in i, b in j)
        # and every factor; the distance is the mean over all those values.
        mean_distance = np.abs(loadings_i[:, None, :] - loadings_j[None, :, :]).mean()
        distance_matrix.loc[group_i, group_j] = mean_distance
        distance_matrix.loc[group_j, group_i] = mean_distance

# Display the matrix showing the mean distances between each pair of groups
print("Mean Distances Between Groups:")
print(distance_matrix)

# Save the distance matrix to CSV
distance_matrix.to_csv('mean_distances_between_groups.csv')

# Mean absolute loading difference between the questions of each group.
group_mean_distances = {}

for group in unique_groups:
    group_questions = [q for q in factor_loadings_df.index if group_mapping.get(q) == group]

    # A group with a single question has no pairs; report a distance of 0.
    if len(group_questions) < 2:
        group_mean_distances[group] = 0
        continue

    # Use every column: the loadings file holds factor columns only, so slicing
    # off the last column would drop a factor from the comparison.
    group_data = factor_loadings_df.loc[group_questions].to_numpy()

    # Row indices of every unordered pair (i < j) within the group.
    first, second = np.triu_indices(len(group_questions), k=1)
    group_mean_distances[group] = np.abs(group_data[first] - group_data[second]).mean()

# Display mean distances within each group
for group, mean_distance in group_mean_distances.items():
    print(f"Mean distance within group {group}: {mean_distance:.4f}")

# Save the mean distances to a CSV file
mean_distances_df = pd.DataFrame.from_dict(group_mean_distances, orient='index', columns=['Mean Distance'])
mean_distances_df.to_csv('mean_distances_within_groups.csv')



# Random forest

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load your data from the CSV file
data = pd.read_csv('data.csv')  # Update 'data.csv' with the actual filename

# Normalise the headers (trim whitespace, upper-case) exactly as the grouping
# step does, so the upper-case item names used below resolve even when the
# file's headers differ in case or carry stray spaces.
data.columns = data.columns.str.strip().str.upper()

# Drop every row that has a missing value in any column.
if data.isnull().values.any():
    print("Missing values found. Handling missing values...")
    data = data.dropna()  # Optionally, you can fill missing values instead
    # Fail here with a clear message instead of deep inside model fitting.
    if data.empty:
        raise ValueError("No complete rows remain after dropping missing values")

def _items(prefix, numbers):
    """Return column names such as 'SDS1' built from a prefix and item numbers."""
    return [f'{prefix}{number}' for number in numbers]


# Questionnaire columns that make up each construct.
# NOTE(review): SDS2 is not assigned to any construct - confirm this is intended.
var_mapping = {
    'Autonomy': _items('SDS', (1, 4, 8, 11, 14, 17, 20)),
    'Competence': _items('SDS', (3, 5, 10, 13, 15, 19)),
    'Relatedness': _items('SDS', (6, 7, 9, 12, 16, 18, 21)),
    'Self_Regulation_Learning': _items('SSRL', range(1, 18)),
    'Irrational_Procrastination': _items('IPS', range(1, 10)),
    'Susceptibility_to_Temptation': _items('STS', range(1, 12)),
    'Academic_Commitment': (
        _items('LOC', range(1, 6))
        + _items('S', range(1, 9))
        + _items('QOA', range(1, 4))
        + _items('I', range(1, 6))
        + _items('M', range(1, 10))
    ),
    'Family_Academic_Support': _items('FAS', range(1, 16)),
}

# The 18 hypotheses are every combination of a first predictor construct, an
# outcome construct and a second predictor construct. They are numbered
# H1..H18 with the second predictor varying fastest, then the outcome, then
# the first predictor.
first_predictors = ('Competence', 'Autonomy', 'Relatedness')
outcomes = ('Irrational_Procrastination', 'Susceptibility_to_Temptation')
second_predictors = ('Academic_Commitment', 'Self_Regulation_Learning', 'Family_Academic_Support')

hypotheses = [
    {
        "IV": var_mapping[first] + var_mapping[second],
        "DV": var_mapping[outcome],
        "hypothesis_num": f"H{number}",
    }
    for number, (first, outcome, second) in enumerate(
        (
            (first, outcome, second)
            for first in first_predictors
            for outcome in outcomes
            for second in second_predictors
        ),
        start=1,
    )
]

# Function to run Gradient Boosting regression
# NOTE(review): this section is headed "Random forest", yet the estimator used
# by default is a GradientBoostingRegressor. If a random forest was intended,
# pass one through the `model` argument - confirm with the author.
def run_gradient_boosting(independent_vars, dependent_var, data, *, test_size=0.2,
                          n_estimators=100, random_state=42, model=None):
    """Fit a regressor on a train split and score it on the held-out split.

    Args:
        independent_vars: Names of the predictor columns in ``data``.
        dependent_var: Name of the single target column in ``data``.
        data: DataFrame holding all the columns named above.
        test_size: Share of rows held out for evaluation.
        n_estimators: Number of boosting stages for the default model.
        random_state: Seed used for both the split and the default model.
        model: Optional unfitted estimator to use instead of the default
            GradientBoostingRegressor. It must provide ``fit``, ``predict``
            and, once fitted, ``feature_importances_``.

    Returns:
        Tuple ``(mse, r2, feature_importances)``: the two scores are computed
        on the held-out rows, and the importances are in the order of
        ``independent_vars``.

    Raises:
        ValueError: If ``dependent_var`` does not select exactly one value
            per row (for example when a list of several columns is passed).
    """
    X = data[independent_vars]  # Predictor matrix
    Y = data[dependent_var].to_numpy().ravel()  # Target flattened to 1D

    # Flattening several target columns would give more values than rows;
    # report that directly rather than through a sample-count error later.
    if Y.shape[0] != len(X):
        raise ValueError("dependent_var must name exactly one column")

    # Hold out part of the rows for evaluation.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )

    if model is None:
        model = GradientBoostingRegressor(n_estimators=n_estimators, random_state=random_state)

    model.fit(X_train, Y_train)

    # Score on the held-out rows only.
    Y_pred = model.predict(X_test)
    mse = mean_squared_error(Y_test, Y_pred)
    r2 = r2_score(Y_test, Y_pred)

    feature_importances = model.feature_importances_

    return mse, r2, feature_importances

# R-squared bands as (exclusive upper bound, message); the first band whose
# bound exceeds the score is reported, otherwise the fallback message.
r2_bands = [
    (0, "The model does not explain any variance in the dependent variable."),
    (0.1, "The model explains very little variance."),
    (0.3, "The model explains a modest amount of variance."),
    (0.5, "The model explains a substantial amount of variance."),
]
r2_fallback = "The model explains a large amount of variance."

# Run the Gradient Boosting regression for each hypothesis and print the results
for hypothesis in hypotheses:
    IVs = hypothesis['IV']
    # NOTE(review): only the first item of the dependent construct is modelled
    # (e.g. IPS1 for Irrational_Procrastination), not a score over all of its
    # items - confirm this is intended.
    DV = hypothesis['DV'][0]

    # Execute the Gradient Boosting regression
    mse, r2, feature_importances = run_gradient_boosting(IVs, DV, data)

    # Print the results
    print(f"Results for Hypothesis {hypothesis['hypothesis_num']}:")
    print(f"Mean Squared Error: {mse:.4f}")
    print(f"R-squared: {r2:.4f}")
    print("Feature Importances:")

    # Importances paired with their predictor names, largest first.
    importance_df = pd.DataFrame({
        'Feature': IVs,
        'Importance': feature_importances
    }).sort_values(by='Importance', ascending=False)

    for feature, importance in zip(importance_df['Feature'], importance_df['Importance']):
        print(f"{feature}: {importance:.4f}")

    # Interpret the results
    interpretation = next(
        (message for upper_bound, message in r2_bands if r2 < upper_bound),
        r2_fallback,
    )
    print(f"Interpretation of R-squared: {interpretation}\n")

    # Open a fresh figure for every hypothesis. A single figure created before
    # the loop only gives the first chart the requested size; after the first
    # plt.show() the later charts would be drawn on default-sized figures.
    plt.figure(figsize=(16, 12))
    sns.barplot(x='Importance', y='Feature', data=importance_df)
    plt.title(f'Feature Importance for Hypothesis {hypothesis["hypothesis_num"]}')
    plt.show()


# Gradient Boosting Regressor

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load your data from the CSV file
data = pd.read_csv('data.csv')  # Update 'data.csv' with the actual filename

# Trim and upper-case the headers, matching the clean-up done in the grouping
# step; the construct mapping below refers to columns by upper-case name and
# would otherwise raise KeyError on headers that differ only in case or
# surrounding whitespace.
data.columns = data.columns.str.strip().str.upper()

# Check for missing values; rows with a gap in any column are removed.
if data.isnull().values.any():
    print("Missing values found. Handling missing values...")
    data = data.dropna()  # Optionally, you can fill missing values instead
    # Stop early with an explicit message if nothing is left to model.
    if data.empty:
        raise ValueError("No complete rows remain after dropping missing values")

def _numbered(prefix, last):
    """Return [prefix + '1', ..., prefix + str(last)]."""
    return [prefix + str(index) for index in range(1, last + 1)]


# Columns belonging to each construct.
# NOTE(review): SDS2 does not appear under any construct - confirm this is intended.
var_mapping = {}
var_mapping['Autonomy'] = ['SDS' + str(index) for index in (1, 4, 8, 11, 14, 17, 20)]
var_mapping['Competence'] = ['SDS' + str(index) for index in (3, 5, 10, 13, 15, 19)]
var_mapping['Relatedness'] = ['SDS' + str(index) for index in (6, 7, 9, 12, 16, 18, 21)]
var_mapping['Self_Regulation_Learning'] = _numbered('SSRL', 17)
var_mapping['Irrational_Procrastination'] = _numbered('IPS', 9)
var_mapping['Susceptibility_to_Temptation'] = _numbered('STS', 11)
var_mapping['Academic_Commitment'] = [
    *_numbered('LOC', 5),
    *_numbered('S', 8),
    *_numbered('QOA', 3),
    *_numbered('I', 5),
    *_numbered('M', 9),
]
var_mapping['Family_Academic_Support'] = _numbered('FAS', 15)

# Build H1..H18: for each first predictor construct and each outcome, pair the
# first predictor with each of the three second predictor constructs in turn.
hypotheses = []
for first_construct in ('Competence', 'Autonomy', 'Relatedness'):
    for outcome_construct in ('Irrational_Procrastination', 'Susceptibility_to_Temptation'):
        for second_construct in ('Academic_Commitment', 'Self_Regulation_Learning',
                                 'Family_Academic_Support'):
            hypotheses.append({
                "IV": var_mapping[first_construct] + var_mapping[second_construct],
                "DV": var_mapping[outcome_construct],
                "hypothesis_num": f"H{len(hypotheses) + 1}",
            })

# Function to run Gradient Boosting regression
def run_gradient_boosting(independent_vars, dependent_var, data, *, test_size=0.2,
                          n_estimators=100, random_state=42, model=None):
    """Train a regressor on part of ``data`` and evaluate it on the rest.

    Args:
        independent_vars: Column names used as predictors.
        dependent_var: Name of the one column used as the target.
        data: DataFrame containing the predictor and target columns.
        test_size: Fraction of rows reserved for testing.
        n_estimators: Number of boosting stages when the default model is used.
        random_state: Seed shared by the train/test split and the default model.
        model: Optional unfitted estimator replacing the default
            GradientBoostingRegressor; it needs ``fit``, ``predict`` and a
            ``feature_importances_`` attribute after fitting.

    Returns:
        ``(mse, r2, feature_importances)``, where ``mse`` and ``r2`` are
        measured on the test rows and ``feature_importances`` follows the
        order of ``independent_vars``.

    Raises:
        ValueError: If the target selection does not yield one value per row,
            e.g. because several columns were passed as ``dependent_var``.
    """
    X = data[independent_vars]  # Selecting the independent variables
    Y = data[dependent_var].to_numpy().ravel()  # Ensure Y is a 1D array

    # A multi-column target would be flattened into more values than there
    # are rows; reject it with an explicit message.
    if Y.shape[0] != len(X):
        raise ValueError("dependent_var must name exactly one column")

    # Split the data into training and testing sets
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )

    # Fall back to gradient boosting when the caller supplies no estimator.
    if model is None:
        model = GradientBoostingRegressor(n_estimators=n_estimators, random_state=random_state)

    # Fit on the training rows, then predict the held-out rows.
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)

    # Calculate performance metrics
    mse = mean_squared_error(Y_test, Y_pred)
    r2 = r2_score(Y_test, Y_pred)

    return mse, r2, model.feature_importances_

# Run the Gradient Boosting regression for each hypothesis and print the results
for hypothesis in hypotheses:
    label = hypothesis['hypothesis_num']
    IVs = hypothesis['IV']
    # NOTE(review): the target is the first item of the dependent construct
    # only, not a score built from all of its items - confirm this is intended.
    DV = hypothesis['DV'][0]

    # Execute the Gradient Boosting regression
    mse, r2, feature_importances = run_gradient_boosting(IVs, DV, data)

    # Print the results
    print(f"Results for Hypothesis {label}:")
    print(f"Mean Squared Error: {mse:.4f}")
    print(f"R-squared: {r2:.4f}")
    print("Feature Importances:")

    # Pair each predictor with its importance and rank from most to least important.
    importance_df = pd.DataFrame({
        'Feature': IVs,
        'Importance': feature_importances
    }).sort_values(by='Importance', ascending=False)

    # Print feature importances
    for feature, importance in zip(importance_df['Feature'], importance_df['Importance']):
        print(f"{feature}: {importance:.4f}")

    # Interpret the results: verbal label for the band the R-squared falls in.
    if r2 < 0:
        interpretation = "The model does not explain any variance in the dependent variable."
    elif r2 < 0.1:
        interpretation = "The model explains a very small amount of variance in the dependent variable."
    elif r2 < 0.3:
        interpretation = "The model explains a small amount of variance in the dependent variable."
    elif r2 < 0.5:
        interpretation = "The model explains a moderate amount of variance in the dependent variable."
    elif r2 < 0.7:
        interpretation = "The model explains a substantial amount of variance in the dependent variable."
    else:
        interpretation = "The model explains a high amount of variance in the dependent variable."

    print(f"Interpretation: {interpretation}\n")

    # Visualization of feature importance. The figure is created inside the
    # loop so that every hypothesis gets its own 16x12 canvas; a single figure
    # created before the loop would only apply that size to the first chart.
    plt.figure(figsize=(16, 12))
    sns.barplot(data=importance_df, x='Importance', y='Feature', palette='viridis')
    plt.title(f'Feature Importance for Hypothesis {label}')
    plt.xlabel('Importance')
    plt.ylabel('Feature')
    plt.xlim(0, importance_df['Importance'].max() * 1.1)  # Leave headroom right of the longest bar
    plt.grid(axis='x')
    plt.show()
