In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold


In [None]:
# Load the raw training frame and discard its 'id' column in one step;
# the test frame keeps its 'id' because the submission cell reads it later.
train_data = pd.read_csv('train_data.csv').drop(columns=['id'])
test_data = pd.read_csv('2024_test_data (2).csv')

In [None]:
# Collect the names of the object/category-typed columns of train_data.
non_numeric_train = train_data.select_dtypes(include=['object', 'category'])
categorical_columns = list(non_numeric_train.columns)

# Show which training columns are categorical
print("Categorical Columns in train Data:", categorical_columns)

Categorical Columns in train Data: ['home_team_abbr', 'away_team_abbr', 'date', 'is_night_game', 'home_pitcher', 'away_pitcher', 'home_team_season', 'away_team_season']


In [None]:
# Collect the names of the object/category-typed columns of test_data.
non_numeric_test = test_data.select_dtypes(include=['object', 'category'])
categorical_columns_test = list(non_numeric_test.columns)

# Show which test columns are categorical
print("Categorical Columns in Test Data:", categorical_columns_test)

Categorical Columns in Test Data: ['home_team_abbr', 'away_team_abbr', 'is_night_game', 'home_pitcher', 'away_pitcher', 'home_team_season', 'away_team_season']


In [None]:
# Add `fatigue_factor_home` / `fatigue_factor_away` to both frames:
# the side's `*_pitcher_rest` column divided by (`*_team_rest` + 1).
for frame in (train_data, test_data):
    for side in ('home', 'away'):
        frame[f'fatigue_factor_{side}'] = (
            frame[f'{side}_pitcher_rest'] / (frame[f'{side}_team_rest'] + 1)
        )

In [None]:
# Add `home_performance` / `away_performance` to both frames: the side's
# `*_batting_batting_avg_mean` column minus its `*_pitching_earned_run_avg_mean` column.
for frame in (train_data, test_data):
    for side in ('home', 'away'):
        batting = frame[f'{side}_batting_batting_avg_mean']
        pitching = frame[f'{side}_pitching_earned_run_avg_mean']
        frame[f'{side}_performance'] = batting - pitching


In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns that get label-encoded below. This replaces the auto-detected list
# built earlier; note that 'date' is intentionally not part of it.
categorical_columns = [
    'home_team_abbr',
    'away_team_abbr',
    'is_night_game',
    'home_pitcher',
    'away_pitcher',
    'home_team_season',
    'away_team_season',
]

# Registry of fitted encoders, keyed by column name (filled by the encoding loops).
label_encoders = dict()



In [None]:
# Label-encode every configured categorical column of train_data and keep the
# fitted encoder in `label_encoders` under the column's name.
for col in categorical_columns:
    if col not in train_data.columns:
        print(f"Column {col} not found in train_data.")
        continue

    encoder = LabelEncoder()
    # Values are cast to str first, so missing values become the label 'nan'.
    as_text = train_data[col].astype(str)
    train_data[col] = encoder.fit_transform(as_text)
    label_encoders[col] = encoder
    print(f"Label encoding applied to column: {col}")



Label encoding applied to column: home_team_abbr
Label encoding applied to column: away_team_abbr
Label encoding applied to column: is_night_game
Label encoding applied to column: home_pitcher
Label encoding applied to column: away_pitcher
Label encoding applied to column: home_team_season
Label encoding applied to column: away_team_season


In [None]:
# Encode the test data with the encoders that were fitted on the TRAINING data.
# Fitting a fresh LabelEncoder on test_data would assign different integer codes
# to the same category (e.g. the same pitcher) in train and test, and would also
# overwrite the train encoders stored in `label_encoders`.
# NOTE: like the training loop, this cell is not idempotent - re-running it on
# already-encoded columns maps everything to the unseen-label sentinel.
for col in categorical_columns:
    if col not in test_data.columns:
        print(f"Column {col} not found in test_data.")
        continue

    le = label_encoders.get(col)
    if le is None:
        # No encoder was fitted for this column on the training data.
        print(f"No fitted encoder for column {col}; leaving it unencoded.")
        continue

    # classes_ is sorted; a label's position in it is the code used for train_data.
    code_by_label = {label: code for code, label in enumerate(le.classes_)}
    test_data[col] = (
        test_data[col]
        .astype(str)           # same string cast as applied to the training column
        .map(code_by_label)    # labels never seen in training become NaN ...
        .fillna(-1)            # ... and are given the sentinel code -1
        .astype(int)
    )
    print(f"Label encoding applied to column: {col}")



Label encoding applied to column: home_team_abbr
Label encoding applied to column: away_team_abbr
Label encoding applied to column: is_night_game
Label encoding applied to column: home_pitcher
Label encoding applied to column: away_pitcher
Label encoding applied to column: home_team_season
Label encoding applied to column: away_team_season


In [None]:
# Inspect the column as loaded; the rendered output contains negative values,
# so the column is not stored as a plain 0-1 win rate at this point.
train_data['home_team_wins_mean']

Unnamed: 0,home_team_wins_mean
0,0.311179
1,-0.282099
2,-0.273817
3,0.275165
4,0.417835
...,...
11062,0.092905
11063,-1.952311
11064,-1.598944
11065,0.906471


In [None]:
# Bounds of the original (unscaled) data.
# Both are placeholders: replace with the actual minimum / maximum values.
min_val, max_val = 0, 1


def reverse_scaling(scaled_value, min_val, max_val):
    """Invert a min-max scaling.

    Args:
        scaled_value: value (or array-like) in scaled units.
        min_val: minimum of the original data.
        max_val: maximum of the original data.

    Returns:
        ``scaled_value`` stretched by ``max_val - min_val`` and shifted by
        ``min_val``. With bounds 0 and 1 the input is returned unchanged.
    """
    span = max_val - min_val
    return min_val + span * scaled_value

# Add a `home_team_wins_mean_original` column to both frames by undoing the
# min-max scaling of `home_team_wins_mean`.
# reverse_scaling is plain arithmetic, so it is applied to the whole Series at
# once instead of element by element through `.apply(lambda ...)`.
# NOTE: with the placeholder bounds min_val=0 / max_val=1 this is an identity
# copy of the source column.
train_data['home_team_wins_mean_original'] = reverse_scaling(
    train_data['home_team_wins_mean'], min_val, max_val
)
test_data['home_team_wins_mean_original'] = reverse_scaling(
    test_data['home_team_wins_mean'], min_val, max_val
)

In [None]:
# Inspect the column produced by reverse_scaling; with min_val=0 and max_val=1
# it shows the same values as `home_team_wins_mean`.
train_data['home_team_wins_mean_original']

Unnamed: 0,home_team_wins_mean_original
0,0.311179
1,-0.282099
2,-0.273817
3,0.275165
4,0.417835
...,...
11062,0.092905
11063,-1.952311
11064,-1.598944
11065,0.906471


In [None]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# One scaler per column: scalers[0] handles the home column, scalers[1] the away column.
scalers = [MinMaxScaler(), MinMaxScaler()]
wins_mean_columns = ['home_team_wins_mean', 'away_team_wins_mean']

# Each scaler is fitted on the training column only and then reused for the
# test column, so both frames share the same min/max.
# Fit and transform both receive a one-column DataFrame: fitting on a bare
# ndarray and transforming a DataFrame makes scikit-learn warn about
# mismatched feature names. np.ravel flattens the (n, 1) result so a 1-D
# array is assigned to the column.
# NOTE: this overwrites the columns in place, so re-running the cell rescales
# already-scaled data.
for scaler, column in zip(scalers, wins_mean_columns):
    train_data[column] = np.ravel(scaler.fit_transform(train_data[[column]]))
    test_data[column] = np.ravel(scaler.transform(test_data[[column]]))

# Rescale the win-skew columns by std / mean (modify if required)
def reverse_scale(df):
    """Multiply each side's ``*_team_wins_skew`` by ``*_team_wins_std / *_team_wins_mean``.

    The frame is modified in place and also returned.

    A mean of exactly 0 is treated as missing: dividing by it would produce
    +/-inf, which the IterativeImputer used later in this notebook rejects.
    Those rows get NaN instead. After min-max scaling the training minimum is
    exactly 0, so this case does occur.

    Args:
        df: DataFrame holding the ``{home,away}_team_wins_{skew,std,mean}`` columns.

    Returns:
        The same DataFrame object, with both skew columns overwritten.
    """
    for side in ('home', 'away'):
        safe_mean = df[f'{side}_team_wins_mean'].replace(0, np.nan)
        ratio = df[f'{side}_team_wins_std'] / safe_mean
        df[f'{side}_team_wins_skew'] = df[f'{side}_team_wins_skew'] * ratio
    return df

# Rewrite the win-skew columns of both frames via reverse_scale.
# reverse_scale mutates the frame it receives and returns that same frame, so
# re-running this cell applies the std/mean factor a second time.
test_data = reverse_scale(test_data)
train_data = reverse_scale(train_data)





In [None]:
# Inspect the column again after the MinMaxScaler step overwrote it.
train_data['home_team_wins_mean']

Unnamed: 0,home_team_wins_mean
0,0.536585
1,0.468750
2,0.469697
3,0.532468
4,0.548780
...,...
11062,0.511628
11063,0.277778
11064,0.318182
11065,0.604651


In [None]:
# The experimental flag must be imported before IterativeImputer itself.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Numeric feature columns to impute. The target `home_team_win` is excluded
# explicitly: if it is stored as a number it would otherwise be fed into the
# imputer (leaking the label into the imputed features) and the same column
# list could not be applied to test_data, which has no target column.
numeric_columns = (
    train_data.select_dtypes(include=[np.number])
    .columns
    .drop('home_team_win', errors='ignore')
)

# Fit on the training data only; the fitted imputer is reused for test_data.
imputer = IterativeImputer(random_state=42)
train_data[numeric_columns] = imputer.fit_transform(train_data[numeric_columns])



In [None]:
# Fill the test frame with the imputer fitted on the training data (transform
# only, no refit). `numeric_columns` was derived from train_data, so every one
# of those columns must also exist in test_data.
test_data[numeric_columns] = imputer.transform(test_data[numeric_columns])

In [None]:
# Total number of missing cells left in each frame; both are expected to be 0.
missing_in_train = train_data.isna().sum().sum()
missing_in_test = test_data.isna().sum().sum()

print("Missing values in the training data after preprocessing:")
print(missing_in_train)
print("\nMissing values in the test data after preprocessing:")
print(missing_in_test)

Missing values in the training data after preprocessing:
0

Missing values in the test data after preprocessing:
0


In [None]:
# Features: every training column except the target and the raw 'date' string.
X_train = train_data.drop(['home_team_win', 'date'], axis=1)
# Target: the home_team_win column.
y_train = train_data.loc[:, 'home_team_win']
# Test features: everything except the row identifier 'id'.
X_test = test_data.drop('id', axis=1)


In [None]:
from sklearn.model_selection import RandomizedSearchCV

# NOTE(review): `model` and `X_train_selected` are not defined in any cell
# visible in this notebook, so this cell fails on a fresh kernel unless they
# are created elsewhere - confirm where they come from.
# NOTE(review): `learning_rate`, `subsample` and `colsample_bytree` look like
# gradient-boosting parameters rather than RandomForestClassifier ones -
# confirm which estimator `model` is.

# Candidate hyper-parameter values sampled by the random search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# 10 sampled configurations, each scored by accuracy with 3-fold CV,
# using all available cores.
random_search = RandomizedSearchCV(
    model,
    param_grid,
    n_iter=10,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    random_state=42,
)

# fit() returns the search object itself, so the best estimator can be read off directly.
best_model = random_search.fit(X_train_selected, y_train).best_estimator_


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Predict the test labels with the best estimator found by the random search.
# NOTE(review): `X_test_selected` is not defined in any cell visible here -
# confirm where it is created. The two metrics imported above are not used in
# this cell.
y_pred = best_model.predict(X_test_selected)

In [None]:
# Build the submission: one row per test game with its id and predicted label.
# The output path lives in one variable so the confirmation message always
# names the file that was actually written (it used to say 'submission.csv').
submission_path = 'submission2.2_stage_2.csv'

submission = pd.DataFrame({
    'id': test_data['id'],  # requires the 'id' column to still be present in test_data
    'home_team_win': y_pred
})

# Save to CSV without the DataFrame index
submission.to_csv(submission_path, index=False)
print(f"Submission file '{submission_path}' created.")