In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [9]:
# Load the datasets
df_2021 = pd.read_csv('2021data.csv')
df_2022 = pd.read_csv('2022data.csv')
df_2023 = pd.read_csv('2023data.csv')

# Combine the datasets
data = pd.concat([df_2021, df_2022, df_2023])

# Save the combined dataset
data.to_csv('combined_dataset.csv', index=False)

In [10]:
X = data.drop('Result', axis=1)
y = data['Result'].apply(lambda x: 1 if x == 'W' else 0)  # Convert Win to 1 and anything else to 0

categorical_columns = X.select_dtypes(include=['object']).columns
numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns

preprocessor = make_column_transformer(
    (make_pipeline(
        SimpleImputer(strategy='constant', fill_value='missing'), 
        OneHotEncoder(handle_unknown='ignore')), categorical_columns),
    (SimpleImputer(strategy='mean'), numeric_columns)
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_pipeline = make_pipeline(preprocessor, RandomForestClassifier(n_estimators=100, random_state=42))

rf_pipeline.fit(X_train, y_train)

score = rf_pipeline.score(X_test, y_test)

score

TypeError: Encoders require their input argument must be uniformly strings or numbers. Got ['float', 'str']

In [13]:
# Filter the dataset for Detroit Lions games
lions_games = data[data['Team'] == 'Lions']

# For simplicity, we will create a "typical" game for the Lions based on their historical data's mean
typical_lions_game = lions_games[numeric_columns].apply(lambda x: x.mean(), axis=0)
typical_lions_game = typical_lions_game.to_frame().transpose()

# We'll need to handle categorical data separately as we can't just take a mean
# For this example, we'll assume the mode (most common category) for categorical attributes
typical_lions_categorical = lions_games[categorical_columns].mode().iloc[0]

# Combine numerical and categorical "typical" game data
typical_lions_game = pd.concat([typical_lions_game, typical_lions_categorical], axis=1)

# We assume that the 'Result' column (our label) is not present in the new data we want to predict
typical_lions_game = typical_lions_game[X_train.columns]

# Now we use our trained model to make a prediction
# Note: This prediction is speculative and based on the "average" past game features
lions_next_game_pred = rf_pipeline.predict(typical_lions_game)

# Convert prediction back to 'W' or 'L'
lions_next_game_result = 'W' if lions_next_game_pred[0] == 1 else 'L'
lions_next_game_result


IndexError: single positional indexer is out-of-bounds

In [12]:
# Check how many Lions games are available in the dataset
lions_games_count = lions_games.shape[0]

# Check for any columns that might have all null values, which could cause the mode to fail
null_columns = lions_games[categorical_columns].isnull().all()

# Display the number of Lions games and any categorical columns that have only null values
(lions_games_count, null_columns)

NameError: name 'lions_games' is not defined