# Hockey Travel Impact Analysis
Analyzing how travel affects team performance metrics

## 1. Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import ttest_ind, pearsonr, spearmanr
import warnings
warnings.filterwarnings('ignore')

# Notebook-wide plot defaults: seaborn "whitegrid" theme first, then a
# 12x6 default figure size applied through matplotlib's rcParams.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

## 2. Load and Explore the Hockey Dataset

In [None]:
# Read the hockey CSV into `df`, then print a quick profile of it:
# shape, head, dtypes, per-column null counts and describe() statistics.
DATA_PATH = '/Users/perrygregg/Downloads/finalhockey_df - away_stress_df.csv'
df = pd.read_csv(DATA_PATH)

print(f"Dataset Shape: {df.shape}")

# (heading, producer) pairs; each producer is only called when its section
# is printed, so the sections are evaluated in the order they are shown.
overview_sections = (
    ("\nFirst few rows:", df.head),
    ("\nColumn Data Types:", lambda: df.dtypes),
    ("\nMissing Values:", lambda: df.isnull().sum()),
    ("\nBasic Statistics:", df.describe),
)
for heading, produce in overview_sections:
    print(heading)
    print(produce())

## 3. Data Cleaning and Preparation

In [None]:
# Data cleaning and preparation
# Builds `df_clean` from `df`: drops exact duplicate rows, fills missing
# categorical values with 'Unknown', and coerces the listed numeric columns
# (unparseable entries become NaN).
df_clean = df.copy()  # work on a copy so the raw frame stays untouched

# Remove duplicates
df_clean = df_clean.drop_duplicates()
print(f"Shape after removing duplicates: {df_clean.shape}")

# Fill missing values for categorical columns with 'Unknown'
categorical_cols = ['travel_type', 'departure_airport']
for col in categorical_cols:
    df_clean[col] = df_clean[col].fillna('Unknown')

# Ensure numeric columns are properly typed
numeric_cols = ['travel_to_min', 'travel_from_min', 'total_travel_min',
                'travel_to_hours', 'travel_from_hours', 'total_travel_hours',
                'timezone_shift_hours', 'stress_score_default', 'nd_goals',
                'opp_goals', 'goal_diff', 'nd_shots_total', 'opp_shots_total']

# Only the listed columns that actually exist are coerced and reported, so
# the null summary below cannot raise KeyError for an absent column.
present_numeric_cols = [col for col in numeric_cols if col in df_clean.columns]
for col in present_numeric_cols:
    df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')

print("\nCleaned dataset info:")
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would emit a stray "None" line after the report.
df_clean.info()
print("\nRemaining missing values:")
print(df_clean[present_numeric_cols].isnull().sum())

## 4. Analyze Travel Impact on Performance Metrics

In [None]:
# Bucket the continuous travel, stress and timezone measures into labelled
# categories stored as new columns on `df_clean`.
#
# The last bin of each cut is open-ended (np.inf) and the first bin includes
# its lower edge where that edge is 0. With closed upper edges (4 h, 100, 3 h)
# and a left-open first bin, pd.cut returns NaN for anything outside the
# range -- including exact zeros -- and those games would silently drop out
# of every grouped summary built on these columns.

# Create travel intensity categories: [0, 1.5], (1.5, 2.5], (2.5, inf)
df_clean['travel_intensity'] = pd.cut(df_clean['total_travel_hours'],
                                      bins=[0, 1.5, 2.5, np.inf],
                                      labels=['Low', 'Medium', 'High'],
                                      include_lowest=True)

# Create stress level categories: [0, 30], (30, 45], (45, inf)
df_clean['stress_level'] = pd.cut(df_clean['stress_score_default'],
                                  bins=[0, 30, 45, np.inf],
                                  labels=['Low', 'Moderate', 'High'],
                                  include_lowest=True)

# Create timezone shift categories from the absolute shift. The -0.1 lower
# edge already places a shift of exactly 0 in the first bin.
df_clean['timezone_impact'] = pd.cut(np.abs(df_clean['timezone_shift_hours']),
                                     bins=[-0.1, 0.5, 1.5, np.inf],
                                     labels=['None/Minimal', 'Moderate', 'Significant'])

# Grouped performance tables: one aggregate table per grouping column
# (travel intensity, stress level, timezone impact, travel type), each
# rounded to 3 decimals and printed under a banner heading.
banner = "=" * 70
win_stats = ['mean', 'sum', 'count']   # win rate, wins, games played
goal_diff_stats = ['mean', 'std']

print(banner)
print("PERFORMANCE BY TRAVEL INTENSITY")
print(banner)
travel_spec = {
    'is_win': win_stats,
    'goal_diff': goal_diff_stats,
    'nd_goals': 'mean',
    'opp_goals': 'mean',
    'nd_shots_total': 'mean',
    'opp_shots_total': 'mean',
}
travel_perf = df_clean.groupby('travel_intensity').agg(travel_spec).round(3)
print(travel_perf)

print(f"\n{banner}")
print("PERFORMANCE BY STRESS LEVEL")
print(banner)
stress_spec = {
    'is_win': win_stats,
    'goal_diff': goal_diff_stats,
    'nd_goals': 'mean',
    'opp_goals': 'mean',
    'stress_score_default': 'mean',
}
stress_perf = df_clean.groupby('stress_level').agg(stress_spec).round(3)
print(stress_perf)

print(f"\n{banner}")
print("PERFORMANCE BY TIMEZONE IMPACT")
print(banner)
tz_spec = {
    'is_win': win_stats,
    'goal_diff': goal_diff_stats,
    'nd_goals': 'mean',
    'opp_goals': 'mean',
}
tz_perf = df_clean.groupby('timezone_impact').agg(tz_spec).round(3)
print(tz_perf)

print(f"\n{banner}")
print("PERFORMANCE BY TRAVEL TYPE")
print(banner)
tt_spec = {
    'is_win': win_stats,
    'goal_diff': goal_diff_stats,
    'nd_goals': 'mean',
    'stress_score_default': 'mean',
}
tt_perf = df_clean.groupby('travel_type').agg(tt_spec).round(3)
print(tt_perf)

## 5. Visualize Travel vs Performance Relationships

In [None]:
# Overview figure (2x2 grid):
#   top-left     win rate per travel-intensity bucket (bars, labelled)
#   top-right    mean goal differential per stress level (bars)
#   bottom-left  total travel hours vs goal differential, coloured by is_win
#   bottom-right mean stress score vs win rate over 5 equal-width stress bins
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(ax_travel, ax_stress), (ax_hours, ax_bins) = axes

# 1. Win percentage by travel intensity
travel_wins = df_clean.groupby('travel_intensity')['is_win'].agg(['mean', 'count'])
travel_wins['mean'].plot.bar(ax=ax_travel, color=['green', 'orange', 'red'])
ax_travel.set_title('Win Percentage by Travel Intensity', fontsize=12, fontweight='bold')
ax_travel.set_xlabel('Travel Intensity')
ax_travel.set_ylabel('Win %')
ax_travel.set_ylim(0, 1)
# Write each bar's win rate just above the bar
for bar_pos, win_rate in enumerate(travel_wins['mean']):
    ax_travel.text(bar_pos, win_rate + 0.02, f"{win_rate:.1%}", ha='center')
plt.setp(ax_travel.get_xticklabels(), rotation=45)

# 2. Goal differential by stress level
stress_goals = df_clean.groupby('stress_level')['goal_diff'].mean()
stress_goals.plot.bar(ax=ax_stress, color=['green', 'yellow', 'red'])
ax_stress.set_title('Avg Goal Differential by Stress Level', fontsize=12, fontweight='bold')
ax_stress.set_xlabel('Stress Level')
ax_stress.set_ylabel('Goal Differential')
ax_stress.axhline(y=0, color='black', linestyle='--', alpha=0.5)  # break-even line
plt.setp(ax_stress.get_xticklabels(), rotation=45)

# 3. Scatter: Total Travel Hours vs Goal Differential
scatter = ax_hours.scatter(
    df_clean['total_travel_hours'],
    df_clean['goal_diff'],
    c=df_clean['is_win'],
    cmap='RdYlGn',
    s=100,
    alpha=0.6,
)
ax_hours.set_title('Travel Hours vs Performance (Goal Differential)', fontsize=12, fontweight='bold')
ax_hours.set_xlabel('Total Travel Hours')
ax_hours.set_ylabel('Goal Differential')
ax_hours.axhline(y=0, color='black', linestyle='--', alpha=0.5)
fig.colorbar(scatter, ax=ax_hours, label='Win (1) / Loss (0)')

# 4. Scatter: Stress Score vs Win Rate (one point per stress-score bin)
stress_bins = pd.cut(df_clean['stress_score_default'], bins=5)
stress_win = df_clean.groupby(stress_bins).agg({
    'stress_score_default': 'mean',
    'is_win': 'mean',
})
ax_bins.scatter(stress_win['stress_score_default'], stress_win['is_win'],
                s=200, alpha=0.6, color='blue')
ax_bins.set_title('Stress Score vs Win Rate', fontsize=12, fontweight='bold')
ax_bins.set_xlabel('Average Stress Score')
ax_bins.set_ylabel('Win Rate')
ax_bins.set_ylim(0, 1)

plt.tight_layout()
plt.show()

print("Visualization Complete")

In [None]:
# Additional visualizations (2x2 grid):
#   top-left     goal differential distribution per travel-intensity bucket
#   top-right    nd_shots_total distribution per travel type
#   bottom-left  mean stress components per travel type
#   bottom-right win rate and mean goal differential per game_num (twin axes)
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# 1. Box plot: Goal difference by travel intensity
ax = axes[0, 0]
df_clean.boxplot(column='goal_diff', by='travel_intensity', ax=ax)
ax.set_title('Goal Differential Distribution by Travel Intensity', fontsize=12, fontweight='bold')
ax.set_xlabel('Travel Intensity')
ax.set_ylabel('Goal Differential')
plt.setp(ax.get_xticklabels(), rotation=45)

# 2. Box plot: Shots by travel type
ax = axes[0, 1]
df_clean.boxplot(column='nd_shots_total', by='travel_type', ax=ax)
ax.set_title('Team Shots by Travel Type', fontsize=12, fontweight='bold')
ax.set_xlabel('Travel Type')
ax.set_ylabel('Shots on Goal')
plt.setp(ax.get_xticklabels(), rotation=45)

# DataFrame.boxplot(by=...) sets a figure-level "Boxplot grouped by ..."
# suptitle. On this shared 2x2 figure it would sit above all four panels and
# only describe the last box plot, so clear it.
fig.suptitle('')

# 3. Stress components by travel type
ax = axes[1, 0]
stress_components = df_clean.groupby('travel_type')[['travel_component', 'timezone_component', 'transfer_component']].mean()
stress_components.plot(kind='bar', ax=ax)
ax.set_title('Stress Components by Travel Type', fontsize=12, fontweight='bold')
ax.set_ylabel('Stress Score Contribution')
ax.set_xlabel('Travel Type')
plt.setp(ax.xaxis.get_majorticklabels(), rotation=45)

# 4. Performance by game in trip (game_num)
ax = axes[1, 1]
game_perf = df_clean.groupby('game_num').agg({'is_win': 'mean', 'goal_diff': 'mean'})
ax2 = ax.twinx()  # second y-axis: goal differential is on a different scale than win rate
win_line = ax.plot(game_perf.index, game_perf['is_win'], 'o-', color='green',
                   linewidth=2, markersize=8, label='Win Rate')[0]
diff_line = ax2.plot(game_perf.index, game_perf['goal_diff'], 's--', color='blue',
                     linewidth=2, markersize=8, label='Goal Diff')[0]
ax.set_title('Performance by Game Number in Trip', fontsize=12, fontweight='bold')
ax.set_xlabel('Game Number in Trip')
ax.set_ylabel('Win Rate', color='green')
ax2.set_ylabel('Goal Differential', color='blue')
ax.set_ylim([0, 1])
ax.tick_params(axis='y', labelcolor='green')
ax2.tick_params(axis='y', labelcolor='blue')
# The two lines live on different axes, so build one combined legend from
# their handles; it is attached to the twin axis, which is drawn on top.
ax2.legend(handles=[win_line, diff_line], loc='best')

plt.tight_layout()
plt.show()

## 6. Statistical Testing and Correlation Analysis

In [None]:
# Correlation analysis
# Prints (1) a Pearson correlation table of travel factors (rows) against
# performance metrics (columns) and (2) every factor/metric pair with its
# correlation coefficient, p-value and a significance marker.
print("="*70)
print("CORRELATION ANALYSIS: Travel Factors vs Performance")
print("="*70)

travel_factors = ['total_travel_hours', 'travel_to_hours', 'travel_from_hours',
                  'stress_score_default', 'timezone_shift_hours', 'transfer_hours']
performance_metrics = ['is_win', 'goal_diff', 'nd_goals', 'opp_goals', 'nd_shots_total']

# Create correlation matrix
# DataFrame.corrwith(other_frame) only correlates columns that share a label;
# the factor and metric column sets are disjoint, so it would return all NaN.
# Take the factor-by-metric block of the full correlation matrix instead.
corr_data = df_clean[travel_factors + performance_metrics].dropna()
correlation_matrix = corr_data.corr().loc[travel_factors, performance_metrics]

print("\nPearson Correlations (Travel Factors vs Performance):")
print(correlation_matrix.round(3))

# Detailed correlations with p-values
print("\n" + "="*70)
print("DETAILED CORRELATION STATISTICS")
print("="*70)

for factor in travel_factors:
    for metric in performance_metrics:
        # Pairwise deletion: each pair keeps every row where both values
        # exist, so sample sizes can differ from the table above.
        valid_data = df_clean[[factor, metric]].dropna()
        if len(valid_data) > 2:
            corr_coef, p_value = pearsonr(valid_data[factor], valid_data[metric])
            # *** p<0.01, ** p<0.05, * p<0.1, ns otherwise
            significance = "***" if p_value < 0.01 else "**" if p_value < 0.05 else "*" if p_value < 0.1 else "ns"
            print(f"{factor:25} vs {metric:15} | r={corr_coef:7.3f}, p={p_value:.4f} {significance}")

# Statistical tests: Compare high vs low travel
# Splits games at the median of total_travel_hours (ties go to the high
# group), prints each group's summary, then runs independent two-sample
# t-tests on win indicator, goal differential and shots.
print("\n" + "="*70)
print("T-TESTS: HIGH vs LOW TRAVEL CONDITIONS")
print("="*70)

median_travel_hours = df_clean['total_travel_hours'].median()
high_travel = df_clean[df_clean['total_travel_hours'] >= median_travel_hours]
low_travel = df_clean[df_clean['total_travel_hours'] < median_travel_hours]

# The figure in parentheses is the group's mean travel time, so it is
# labelled "mean" (the groups are merely *defined* by the median split).
print(f"\nLow Travel (mean: {low_travel['total_travel_hours'].mean():.2f} hrs)")
print(f"  - Win Rate: {low_travel['is_win'].mean():.1%}")
print(f"  - Avg Goal Diff: {low_travel['goal_diff'].mean():.2f}")
print(f"  - Avg Shots: {low_travel['nd_shots_total'].mean():.1f}")

print(f"\nHigh Travel (mean: {high_travel['total_travel_hours'].mean():.2f} hrs)")
print(f"  - Win Rate: {high_travel['is_win'].mean():.1%}")
print(f"  - Avg Goal Diff: {high_travel['goal_diff'].mean():.2f}")
print(f"  - Avg Shots: {high_travel['nd_shots_total'].mean():.1f}")

# T-tests
# ttest_ind propagates NaN by default: one missing value in either sample
# makes both the statistic and the p-value NaN. Drop missing values per
# metric so the test matches the NaN-skipping means printed above.
t_stat_wins, p_val_wins = ttest_ind(high_travel['is_win'].dropna(),
                                    low_travel['is_win'].dropna())
t_stat_diff, p_val_diff = ttest_ind(high_travel['goal_diff'].dropna(),
                                    low_travel['goal_diff'].dropna())
t_stat_shots, p_val_shots = ttest_ind(high_travel['nd_shots_total'].dropna(),
                                      low_travel['nd_shots_total'].dropna())

print(f"\nT-Test Results:")
print(f"  Win Rate:      t={t_stat_wins:6.3f}, p={p_val_wins:.4f}")
print(f"  Goal Diff:     t={t_stat_diff:6.3f}, p={p_val_diff:.4f}")
print(f"  Shots on Goal: t={t_stat_shots:6.3f}, p={p_val_shots:.4f}")

## 7. Generate Summary Insights

In [None]:
# Comprehensive Summary Report
# Report header plus sections 1-2: win record per travel-intensity bucket
# and win rate / mean stress / mean goal differential per stress level.
print("="*80)
print(" " * 20 + "HOCKEY TRAVEL IMPACT ANALYSIS - SUMMARY REPORT")
print("="*80)

print("\n1. KEY FINDINGS - TRAVEL IMPACT ON WIN RATE")
print("-" * 80)
for intensity in ['Low', 'Medium', 'High']:
    subset = df_clean[df_clean['travel_intensity'] == intensity]
    # Skip buckets with no games (same guard as the timezone and travel-type
    # sections); otherwise the mean is NaN and a "nan%" line is printed.
    if len(subset) > 0:
        win_pct = subset['is_win'].mean() * 100
        games = len(subset)
        wins = subset['is_win'].sum()
        print(f"   {intensity:10} Travel: {win_pct:5.1f}% win rate ({int(wins):2}W-{int(games-wins):2}L in {games} games)")

print("\n2. KEY FINDINGS - STRESS LEVEL IMPACT")
print("-" * 80)
for level in ['Low', 'Moderate', 'High']:
    subset = df_clean[df_clean['stress_level'] == level]
    if len(subset) > 0:  # skip empty levels instead of printing NaN values
        win_pct = subset['is_win'].mean() * 100
        avg_stress = subset['stress_score_default'].mean()
        avg_goal_diff = subset['goal_diff'].mean()
        print(f"   {level:10} Stress:  {win_pct:5.1f}% W, Avg Stress={avg_stress:5.1f}, Goal Diff={avg_goal_diff:+.2f}")

# Report sections 3-4: win rate per timezone-impact bucket (with the mean
# absolute shift) and per travel type (with mean travel hours and stress).
# Groups without any games are skipped.
section_rule = "-" * 80

print("\n3. KEY FINDINGS - TIMEZONE IMPACT")
print(section_rule)
for impact in ('None/Minimal', 'Moderate', 'Significant'):
    subset = df_clean[df_clean['timezone_impact'] == impact]
    if subset.empty:
        continue
    win_pct = 100 * subset['is_win'].mean()
    avg_tz_shift = subset['timezone_shift_hours'].abs().mean()
    print(f"   {impact:15}: {win_pct:5.1f}% win rate (avg {avg_tz_shift:.1f}hr shift)")

print("\n4. KEY FINDINGS - TRAVEL TYPE COMPARISON")
print(section_rule)
# NOTE(review): only the literal labels 'Flight' and 'Bus' are reported;
# any other travel_type value (e.g. the 'Unknown' fill) is left out.
for ttype in ('Flight', 'Bus'):
    subset = df_clean[df_clean['travel_type'] == ttype]
    if subset.empty:
        continue
    win_pct = 100 * subset['is_win'].mean()
    avg_travel = subset['total_travel_hours'].mean()
    avg_stress = subset['stress_score_default'].mean()
    print(f"   {ttype:10}: {win_pct:5.1f}% W, {avg_travel:5.2f}hrs travel, Stress={avg_stress:5.1f}")

# Report section 5: Pearson correlations of three travel measures with
# is_win and goal_diff, listed from most positive to most negative.
print("\n5. CORRELATION INSIGHTS (Strongest Relations)")
print("-" * 80)
insight_cols = ['total_travel_hours', 'stress_score_default',
                'timezone_shift_hours', 'is_win', 'goal_diff']
corr_data = df_clean[insight_cols].dropna()  # complete rows only
corr_matrix = corr_data.corr()

print("   Correlations with Win Rate (is_win):")
win_corr = corr_matrix['is_win'].sort_values(ascending=False)
# Leave out the trivial self-correlation row
for factor, corr_val in win_corr.drop('is_win').items():
    print(f"      {factor:25} : {corr_val:7.3f}")

print("\n   Correlations with Goal Differential:")
diff_corr = corr_matrix['goal_diff'].sort_values(ascending=False)
for factor, corr_val in diff_corr.drop('goal_diff').items():
    print(f"      {factor:25} : {corr_val:7.3f}")

# Report section 6: per-game_num averages of win rate, goal differential,
# shots and stress score, one printed line per game number.
print("\n6. PERFORMANCE TRENDS")
print("-" * 80)
trend_metrics = ['is_win', 'goal_diff', 'nd_shots_total', 'stress_score_default']
game_perf = df_clean.groupby('game_num').agg(dict.fromkeys(trend_metrics, 'mean'))
print("\n   Performance by Game Number in Trip:")
for rec in game_perf.itertuples():
    print(
        f"      Game {rec.Index}: {rec.is_win:.1%} W | Goal Diff {rec.goal_diff:+.2f} | "
        f"Shots {rec.nd_shots_total:5.1f} | Stress {rec.stress_score_default:5.1f}"
    )

# Report section 7 and footer: median split on total_travel_hours (ties go
# to the high group), then independent two-sample t-tests on the win
# indicator and on goal differential, flagged at the 0.05 level.
print("\n7. STATISTICAL SIGNIFICANCE")
print("-" * 80)
median_travel_hours = df_clean['total_travel_hours'].median()
high_travel = df_clean[df_clean['total_travel_hours'] >= median_travel_hours]
low_travel = df_clean[df_clean['total_travel_hours'] < median_travel_hours]

# ttest_ind propagates NaN by default, which would yield a NaN p-value and a
# misleading "NO" verdict; drop missing values from each sample first.
t_stat, p_val = ttest_ind(high_travel['is_win'].dropna(), low_travel['is_win'].dropna())
sig = "YES (p<0.05)" if p_val < 0.05 else "NO (p>0.05)"
print(f"   Travel Hours vs Win Rate - Significant? {sig}")
print(f"      T-statistic: {t_stat:.3f}, P-value: {p_val:.4f}")

t_stat, p_val = ttest_ind(high_travel['goal_diff'].dropna(), low_travel['goal_diff'].dropna())
sig = "YES (p<0.05)" if p_val < 0.05 else "NO (p>0.05)"
print(f"   Travel Hours vs Goal Differential - Significant? {sig}")
print(f"      T-statistic: {t_stat:.3f}, P-value: {p_val:.4f}")

print("\n" + "="*80)
print(" " * 25 + "END OF ANALYSIS REPORT")
print("="*80)