In [5]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from src.data.data_loader import load_nfl_data, preprocess_data
from src.features.feature_engineering import create_features



ModuleNotFoundError: No module named 'src'

In [6]:
# Load and Preprocess Data
# The pipeline runs in three explicit stages (load -> preprocess -> feature
# engineering); each intermediate result keeps its own name so a stage can be
# inspected on its own. `df` remains the final, feature-enriched frame.
# NOTE(review): the CSV path below looks like a placeholder -- confirm it is
# pointed at a real file before running.
games_raw = load_nfl_data('local', local_path='path/to/your/local/data.csv')
games_clean = preprocess_data(games_raw)
df = create_features(games_clean)
df.head()



NameError: name 'load_nfl_data' is not defined

In [None]:
# Data Overview
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called directly; wrapping it in print() would emit a stray "None"
# line after the report.
df.info()
print(df.describe())

# Visualize Key Statistics
# Distribution of points scored
# Both histograms are drawn on the same Axes so the home and away
# distributions overlay each other.
score_fig, score_ax = plt.subplots(figsize=(12, 6))
for score_col, series_label in (('home_score', 'Home'), ('away_score', 'Away')):
    sns.histplot(data=df, x=score_col, label=series_label, kde=True, ax=score_ax)
score_ax.set_title('Distribution of Points Scored')
score_ax.legend()
plt.show()

# Correlation heatmap
# Only numeric columns can be correlated. Since pandas 2.0, DataFrame.corr()
# defaults to numeric_only=False and raises when the frame contains
# non-numeric columns (this notebook uses 'home_team', 'away_team' and 'date'
# as labels elsewhere), so the numeric restriction is requested explicitly.
plt.figure(figsize=(12, 10))
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap of Features')
plt.show()

# Team Performance Analysis
# Average points scored by each team
# Each team's per-game mean is computed separately for its home and away
# games; the two per-team means are then averaged together (a mean of means,
# not weighted by game count) and ranked from highest to lowest.
home_avg = df.groupby('home_team')['home_score'].mean()
away_avg = df.groupby('away_team')['away_score'].mean()
team_points = (
    pd.concat([home_avg, away_avg])
    .groupby(level=0)
    .mean()
    .sort_values(ascending=False)
)

team_fig, team_ax = plt.subplots(figsize=(12, 6))
team_points.plot(kind='bar', ax=team_ax)
team_ax.set_title('Average Points Scored by Team')
team_ax.set_ylabel('Average Points')
plt.xticks(rotation=45)
plt.show()

# Time Series Analysis
# Total points scored over time
# Adds a 'total_points' column to df (combined score of both teams), then
# averages it over weekly bins keyed on the 'date' column.
df['total_points'] = df['home_score'].add(df['away_score'])
df_time = df.set_index('date')['total_points'].resample('W').mean()

fig = px.line(
    df_time,
    x=df_time.index,
    y='total_points',
    title='Average Total Points Scored per Week',
)
fig.show()

# Feature Importance (Preview)
# Fits a random forest on the engineered features and plots the 15 features
# with the highest impurity-based importance.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Assuming 'winner' is our target variable
y = df['winner']

# The target and the identifier/label columns are never features. This drop
# is strict on purpose: a missing column here means the schema changed.
X = df.drop(['winner', 'date', 'home_team', 'away_team'], axis=1)

# Columns computed from the final score reveal the result outright
# (presumably 'winner' is derived from the scores -- verify against the
# feature-engineering code). Leaving them in lets the model read the answer
# and makes the importance ranking meaningless. 'total_points' and
# 'point_diff' are added to df by other sections of this notebook, so whether
# they exist depends on what has already been run; errors='ignore' keeps this
# step idempotent either way.
# NOTE(review): other in-game statistics (e.g. 'home_turnovers') may also be
# unavailable before kickoff -- confirm which columns are legitimate
# pre-game predictors.
outcome_columns = ['home_score', 'away_score', 'total_points', 'point_diff']
X = X.drop(columns=outcome_columns, errors='ignore')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

plt.figure(figsize=(12, 8))
sns.barplot(x='importance', y='feature', data=feature_importance.head(15))
plt.title('Top 15 Most Important Features')
plt.show()

# Home vs Away Performance
# Each value is the fraction of games (mean of a boolean comparison) in which
# the home side outscored, was outscored by, or tied the away side.
home_wins = df['home_score'].gt(df['away_score']).mean()
away_wins = df['away_score'].gt(df['home_score']).mean()
ties = df['home_score'].eq(df['away_score']).mean()

outcome_shares = [home_wins, away_wins, ties]
outcome_labels = ['Home Wins', 'Away Wins', 'Ties']

outcome_fig, outcome_ax = plt.subplots(figsize=(10, 6))
outcome_ax.pie(outcome_shares, labels=outcome_labels, autopct='%1.1f%%')
outcome_ax.set_title('Home vs Away Win Percentage')
plt.show()

print(f"Home Win Percentage: {home_wins:.2%}")
print(f"Away Win Percentage: {away_wins:.2%}")
print(f"Tie Percentage: {ties:.2%}")

# Scoring Trends
# Adds a 'year' column to df (taken from 'date') and plots the per-year mean
# of home and away scores.
df['year'] = df['date'].dt.year
yearly_scores = df.groupby('year')[['home_score', 'away_score']].mean()

# DataFrame.plot() opens a brand-new figure unless it is handed an Axes, so a
# preceding bare plt.figure(figsize=...) would be left behind as an empty
# figure and its size would never reach the chart. Creating the Axes up front
# and passing it in keeps everything on one correctly sized figure.
trend_fig, trend_ax = plt.subplots(figsize=(12, 6))
yearly_scores.plot(kind='line', ax=trend_ax)
trend_ax.set_title('Average Scores Over the Years')
trend_ax.set_ylabel('Average Score')
trend_ax.legend(['Home Score', 'Away Score'])
plt.show()

# Team Performance Home vs Away
# Mean points per team when playing at home and when playing away, each
# ranked from highest to lowest.
home_performance = (
    df.groupby('home_team')['home_score']
    .mean()
    .sort_values(ascending=False)
)
away_performance = (
    df.groupby('away_team')['away_score']
    .mean()
    .sort_values(ascending=False)
)

# Subtraction aligns the two Series on team name, not on position.
performance_diff = home_performance.sub(away_performance)

diff_fig, diff_ax = plt.subplots(figsize=(12, 8))
performance_diff.sort_values().plot(kind='bar', ax=diff_ax)
diff_ax.set_title('Difference in Home vs Away Performance (Average Points)')
diff_ax.set_ylabel('Home Score - Away Score')
plt.xticks(rotation=45, ha='right')
diff_fig.tight_layout()
plt.show()

# Win Streak Analysis
# Histogram (with KDE overlay) of the 'home_win_streak' column, followed by
# the ten teams with the largest per-team maximum of that column.
streak_fig, streak_ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=df, x='home_win_streak', kde=True, ax=streak_ax)
streak_ax.set_title('Distribution of Home Team Win Streaks')
streak_ax.set_xlabel('Win Streak Length')
plt.show()

longest_by_team = df.groupby('home_team')['home_win_streak'].max()
top_streaks = longest_by_team.sort_values(ascending=False).head(10)
print("Top 10 Longest Home Win Streaks:")
print(top_streaks)

# Correlation between Offensive and Defensive Performance
# Pairwise correlation of the four rolling-average columns (points and total
# yards, for the home and the away side), rendered as an annotated heatmap.
rolling_avg_columns = [
    'home_points_rolling_avg',
    'home_total_yards_rolling_avg',
    'away_points_rolling_avg',
    'away_total_yards_rolling_avg',
]
offense_defense_corr = df[rolling_avg_columns].corr()

corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(offense_defense_corr, annot=True, cmap='coolwarm', ax=corr_ax)
corr_ax.set_title('Correlation between Offensive and Defensive Performance')
plt.show()

# Turnover Analysis
# Adds two home-minus-away differential columns to df, scatter-plots one
# against the other, and reports their correlation coefficient.
df['turnover_diff'] = df['home_turnovers'].sub(df['away_turnovers'])
df['point_diff'] = df['home_score'].sub(df['away_score'])

turnover_fig, turnover_ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=df, x='turnover_diff', y='point_diff', alpha=0.5, ax=turnover_ax)
turnover_ax.set_title('Impact of Turnover Differential on Point Differential')
turnover_ax.set_xlabel('Turnover Differential (Home - Away)')
turnover_ax.set_ylabel('Point Differential (Home - Away)')
plt.show()

correlation = df['turnover_diff'].corr(df['point_diff'])
print(f"Correlation between turnover differential and point differential: {correlation:.2f}")