In [46]:
# Persephone Moran
# INST414 Sprint 3 Final Analysis
# May 18, 2025

import pandas as pd
import statsmodels.api as sm
from datetime import datetime

# Load the raw HR dataset
employee_data = pd.read_csv('hr_dashboard_data.csv')

# Convert joining date to datetime and calculate tenure in years
employee_data['Joining Date'] = pd.to_datetime(employee_data['Joining Date'], format='%b-%y')
today = pd.to_datetime('2025-05-18')  # Fixed reference point for tenure
employee_data['Tenure (Years)'] = (today - employee_data['Joining Date']).dt.days / 365

# Drop non-numeric or non-predictive columns (e.g., Name, Joining Date)
employee_data_cleaned = employee_data.drop(columns=['Name', 'Joining Date'])

# One-hot encode categorical variables and drop the first category to avoid multicollinearity
employee_data_encoded = pd.get_dummies(employee_data_cleaned, columns=['Gender', 'Department', 'Position'], drop_first=True)

# Ensure compatibility with regression modeling (cast all values to integers where needed)
employee_data_encoded = employee_data_encoded.astype(int)

# Save the cleaned and encoded dataset for reproducibility
employee_data_encoded.to_csv('employee_data_encoded.csv', index=False)

# Set up independent and dependent variables for the Productivity regression
X_prod = employee_data_encoded.drop(columns=['Productivity (%)', 'Satisfaction Rate (%)'])
y_prod = employee_data_encoded['Productivity (%)']
X_prod = sm.add_constant(X_prod)  # Add intercept term
model_prod = sm.OLS(y_prod, X_prod).fit()

# Output the summary for the Productivity model
print("=== Productivity Regression Results ===")
print(model_prod.summary())

# Set up independent and dependent variables for the Satisfaction regression
X_satisf = sm.add_constant(employee_data_encoded.drop(columns=['Productivity (%)', 'Satisfaction Rate (%)']))
y_satisf = employee_data_encoded['Satisfaction Rate (%)']
model_satisf = sm.OLS(y_satisf, X_satisf).fit()

# Output the summary for the Satisfaction model
print("\n=== Satisfaction Regression Results ===")
print(model_satisf.summary())

=== Productivity Regression Results ===
                            OLS Regression Results                            
Dep. Variable:       Productivity (%)   R-squared:                       0.080
Model:                            OLS   Adj. R-squared:                  0.005
Method:                 Least Squares   F-statistic:                     1.065
Date:                Sat, 17 May 2025   Prob (F-statistic):              0.392
Time:                        22:08:57   Log-Likelihood:                -945.15
No. Observations:                 200   AIC:                             1922.
Df Residuals:                     184   BIC:                             1975.
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------