In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        #print(os.path.join(dirname, filename))
        pass

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install statsbombpy --quiet
!pip install optuna --quiet
!pip install joblib --quiet
!pip install xgboost --quiet
!pip install kaleido --quiet
!pip install mplsoccer --quiet
import warnings
warnings.filterwarnings('ignore')
from statsbombpy import sb

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as matplotlib

In [4]:
competitions_df = sb.competitions()
comp_names = competitions_df['competition_name'].unique()
print(comp_names)
target = {name: competitions_df.loc[competitions_df['competition_name'] == name, ['competition_id', 'season_id']] for name in comp_names}
target_df = pd.concat(target, axis=0, keys=target.keys())



['1. Bundesliga' 'African Cup of Nations' 'Champions League'
 'Copa America' 'Copa del Rey' "FA Women's Super League"
 'FIFA U20 World Cup' 'FIFA World Cup' 'Indian Super league' 'La Liga'
 'Liga Profesional' 'Ligue 1' 'Major League Soccer'
 'North American League' 'NWSL' 'Premier League' 'Serie A' 'UEFA Euro'
 'UEFA Europa League' "UEFA Women's Euro" "Women's World Cup"]


In [5]:
all_matches = []

# Iterate through each row of the target_df DataFrame
for _, row in target_df.droplevel(0).iterrows():  # Use droplevel to remove the multi-index created by pd.concat
    competition_id = row['competition_id']
    season_id = row['season_id']
    print(f'{competition_id} {season_id}')
    try:
        # Fetch matches for the competition and season
        matches = sb.matches(competition_id=int(competition_id), season_id=int(season_id))
        all_matches.append(matches)
    except Exception as e:
        print(f"Error fetching matches for competition_id={competition_id} and season_id={season_id}: {e}")

# Concatenate all matches into a single DataFrame
if all_matches:
    matches_df = pd.concat(all_matches, ignore_index=True)
else:
    matches_df = pd.DataFrame()  # Return an empty DataFrame if no matches are fetched


9 281
9 27
1267 107
16 4
16 1
16 2
16 27
16 26
16 25
16 24
16 23
16 22
16 21
16 41
16 39
16 37
16 44
16 76
16 277
16 71
16 276
223 282
87 84
87 268
87 279
37 90
37 42
37 4
1470 274
43 106
43 3
43 55
43 54
43 51
43 272
43 270
43 269
1238 108
11 90
11 42
11 4
11 1
11 2
11 27
11 26
11 25
11 24
11 23
11 22
11 21
11 41
11 40
11 39
11 38
11 37
11 278
81 48
81 275
7 235
7 108
7 27
44 107
116 68
49 3
2 27
2 44
12 27
12 86
55 282
55 43
35 75
53 106
72 107
72 30


In [6]:
# Fetch event data for a match
def fetch_events(match_id):
    try:
        events = sb.events(match_id=match_id)
        # Filter for shots and select only necessary columns
        shots = events[events['type'] == 'Shot'][['location', 'shot_end_location', 'shot_statsbomb_xg']]
        return shots
    except Exception as e:
        print(f"Error fetching events for match_id={match_id}: {e}")
        return None  # Return None instead of an empty DataFrame on error

# Collect all event data from the matches
all_events = [fetch_events(match_id) for match_id in matches_df['match_id']]

# Filter out any None values (i.e., matches with errors)
all_events = [df for df in all_events if df is not None]

# Concatenate all DataFrames at once (more efficient)
if all_events:
    events_df = pd.concat(all_events, ignore_index=True)
else:
    events_df = pd.DataFrame()  # In case no valid data was fetched

events_df.head()


Unnamed: 0,location,shot_end_location,shot_statsbomb_xg
0,"[100.4, 35.1]","[101.6, 35.2]",0.056644
1,"[114.6, 33.5]","[118.1, 35.7, 0.2]",0.143381
2,"[106.2, 55.8]","[113.4, 46.8]",0.038188
3,"[113.9, 47.4]","[114.1, 46.8]",0.052781
4,"[89.2, 42.5]","[101.4, 41.3]",0.021272


In [7]:
events_df.describe()

Unnamed: 0,shot_statsbomb_xg
count,87111.0
mean,0.106297
std,0.149382
min,0.00018
25%,0.027673
50%,0.05479
75%,0.11006
max,0.995122


In [8]:
events_df.rename(columns={'shot_statsbomb_xg': 'xG'}, inplace=True)

In [9]:
events_df.rename(columns={
        
        'location': 'shot_location',
        'shot_end_location': 'end_location'
        
    }, inplace=True)

In [10]:


def determine_target_goal(shot_location, end_location=None):
    """
    Determine which goal the shot is targeting.
    Returns goal_center, left_post, right_post for the target goal.
    """
    shot_x, shot_y = shot_location[:2]
    
    # Define both goals based on your coordinate system
    # Left goal (at x=0)
    left_goal_center = np.array([0.0, 40.0])
    left_goal_left_post = np.array([0.0, 30.0])   # Top post (y=30)
    left_goal_right_post = np.array([0.0, 50.0])  # Bottom post (y=50)
    
    # Right goal (at x=120)
    right_goal_center = np.array([120.0, 40.0])
    right_goal_left_post = np.array([120.0, 30.0])   # Top post (y=30)
    right_goal_right_post = np.array([120.0, 50.0])  # Bottom post (y=50)
    
    # Calculate distances to both goals
    dist_to_left = np.linalg.norm(left_goal_center - np.array([shot_x, shot_y]))
    dist_to_right = np.linalg.norm(right_goal_center - np.array([shot_x, shot_y]))
    
    # If we have end location, use trajectory to determine target
    if end_location is not None and len(end_location) >= 2:
        end_x, end_y = end_location[:2]
        # Check if ball is moving towards left goal (decreasing x) or right goal (increasing x)
        x_direction = end_x - shot_x
        
        if x_direction < 0:  # Moving towards left goal
            return left_goal_center, left_goal_left_post, left_goal_right_post
        else:  # Moving towards right goal
            return right_goal_center, right_goal_left_post, right_goal_right_post
    else:
        # No end location, use nearest goal
        if dist_to_left < dist_to_right:
            return left_goal_center, left_goal_left_post, left_goal_right_post
        else:
            return right_goal_center, right_goal_left_post, right_goal_right_post

def calculate_distance_to_goal(shot_location, end_location=None):
    """Calculate the Euclidean distance from shot location to the center of the target goal."""
    goal_center, _, _ = determine_target_goal(shot_location, end_location)
    shot_loc = np.array(shot_location[:2])  # Take only x, y from shot_location
    return np.linalg.norm(goal_center - shot_loc)

def calculate_angle_to_goal(shot_location, end_location=None):
    """Calculate the angle between the shot location and the goalposts of the target goal."""
    goal_center, left_post, right_post = determine_target_goal(shot_location, end_location)
    shot_loc = np.array(shot_location[:2])
    
    # Calculate distances
    left_dist = np.linalg.norm(left_post - shot_loc)
    right_dist = np.linalg.norm(right_post - shot_loc)
    goal_width = np.linalg.norm(left_post - right_post)
    
    # Use cosine rule to calculate angle
    if left_dist == 0 or right_dist == 0:
        return 0  # Prevent division by zero for rare edge cases
    
    cos_angle = (left_dist**2 + right_dist**2 - goal_width**2) / (2 * left_dist * right_dist)
    # Clamp to valid range to avoid numerical errors
    cos_angle = np.clip(cos_angle, -1.0, 1.0)
    angle = np.arccos(cos_angle)
    return np.degrees(angle)

def calculate_shot_displacement(start_x, start_y, end_x, end_y):
    """Calculate the displacement between start and end positions."""
    displacement = np.sqrt((end_x - start_x)**2 + (end_y - start_y)**2)
    return displacement

def calculate_shot_trajectory(start_x, start_y, end_x, end_y):
    """Calculate the trajectory angle in degrees."""
    return np.arctan2((end_y - start_y), (end_x - start_x)) * 180 / np.pi

def process_shot_features(events_df):
    """
    Process all shot features for the events dataframe.
    This function handles both goals automatically.
    """
    # Extract starting positions
    events_df['starting_x'] = events_df['shot_location'].apply(lambda loc: loc[0])
    events_df['starting_y'] = events_df['shot_location'].apply(lambda loc: loc[1])
    
    # Extract ending positions (ignoring z if present)
    events_df[['end_x', 'end_y']] = pd.DataFrame(
        events_df['end_location'].apply(lambda loc: loc[:2]).tolist(), 
        index=events_df.index
    )
    
    # Calculate distance and angle to target goal (using both start and end positions)
    events_df['distance_to_goal'] = events_df.apply(
        lambda row: calculate_distance_to_goal(
            [row['starting_x'], row['starting_y']], 
            [row['end_x'], row['end_y']]
        ), axis=1
    )
    
    events_df['angle_to_goal'] = events_df.apply(
        lambda row: calculate_angle_to_goal(
            [row['starting_x'], row['starting_y']], 
            [row['end_x'], row['end_y']]
        ), axis=1
    )
    
    # Calculate shot displacement and trajectory
    events_df['shot_displacement'] = events_df.apply(
        lambda row: calculate_shot_displacement(
            row['starting_x'], row['starting_y'], 
            row['end_x'], row['end_y']
        ), axis=1
    )
    
    events_df['shot_trajectory'] = events_df.apply(
        lambda row: calculate_shot_trajectory(
            row['starting_x'], row['starting_y'], 
            row['end_x'], row['end_y']
        ), axis=1
    )
    
    return events_df

def debug_shot_analysis(shot_location, end_location=None):
    """
    Debug function to show which goal is being targeted and why.
    Useful for validating the goal selection logic.
    """
    shot_x, shot_y = shot_location[:2]
    
    # Calculate distances to both goals
    left_dist = np.linalg.norm(np.array([0.0, 40.0]) - np.array([shot_x, shot_y]))
    right_dist = np.linalg.norm(np.array([120.0, 40.0]) - np.array([shot_x, shot_y]))
    
    goal_center, left_post, right_post = determine_target_goal(shot_location, end_location)
    distance = calculate_distance_to_goal(shot_location, end_location)
    angle = calculate_angle_to_goal(shot_location, end_location)
    
    target_goal = "Left" if goal_center[0] == 0 else "Right"
    
    print(f"Shot Analysis:")
    print(f"  Start: ({shot_x:.1f}, {shot_y:.1f})")
    if end_location:
        end_x, end_y = end_location[:2]
        print(f"  End: ({end_x:.1f}, {end_y:.1f})")
        x_direction = end_x - shot_x
        print(f"  X Direction: {x_direction:.1f} ({'towards left goal' if x_direction < 0 else 'towards right goal'})")
    print(f"  Distance to left goal: {left_dist:.1f}m")
    print(f"  Distance to right goal: {right_dist:.1f}m")
    print(f"  Target goal: {target_goal}")
    print(f"  Distance to target: {distance:.1f}m")
    print(f"  Angle: {angle:.1f}°")
    
    return {
        'target_goal': target_goal,
        'distance': distance,
        'angle': angle,
        'left_goal_distance': left_dist,
        'right_goal_distance': right_dist
    }


print(debug_shot_analysis([11.77, 39.65], [1.17, 37.66]))

Shot Analysis:
  Start: (11.8, 39.6)
  End: (1.2, 37.7)
  X Direction: -10.6 (towards left goal)
  Distance to left goal: 11.8m
  Distance to right goal: 108.2m
  Target goal: Left
  Distance to target: 11.8m
  Angle: 80.7°
{'target_goal': 'Left', 'distance': 11.775202758339237, 'angle': 80.67459776542395, 'left_goal_distance': 11.775202758339237, 'right_goal_distance': 108.23056592294064}


In [11]:
events_df = process_shot_features(events_df)
events_df.head()

Unnamed: 0,shot_location,end_location,xG,starting_x,starting_y,end_x,end_y,distance_to_goal,angle_to_goal,shot_displacement,shot_trajectory
0,"[100.4, 35.1]","[101.6, 35.2]",0.056644,100.4,35.1,101.6,35.2,20.203218,51.827413,1.204159,4.763642
1,"[114.6, 33.5]","[118.1, 35.7, 0.2]",0.143381,114.6,33.5,118.1,35.7,8.450444,104.827355,4.134005,32.152295
2,"[106.2, 55.8]","[113.4, 46.8]",0.038188,106.2,55.8,113.4,46.8,20.978084,39.061877,11.525624,-51.340192
3,"[113.9, 47.4]","[114.1, 46.8]",0.052781,113.9,47.4,114.1,46.8,9.590099,93.765758,0.632456,-71.565051
4,"[89.2, 42.5]","[101.4, 41.3]",0.021272,89.2,42.5,101.4,41.3,30.901294,35.775033,12.258874,-5.617581


In [12]:
events_df.drop('shot_location', axis = 1, inplace = True)
events_df.drop('end_location', axis = 1, inplace = True)
events_df.head()

Unnamed: 0,xG,starting_x,starting_y,end_x,end_y,distance_to_goal,angle_to_goal,shot_displacement,shot_trajectory
0,0.056644,100.4,35.1,101.6,35.2,20.203218,51.827413,1.204159,4.763642
1,0.143381,114.6,33.5,118.1,35.7,8.450444,104.827355,4.134005,32.152295
2,0.038188,106.2,55.8,113.4,46.8,20.978084,39.061877,11.525624,-51.340192
3,0.052781,113.9,47.4,114.1,46.8,9.590099,93.765758,0.632456,-71.565051
4,0.021272,89.2,42.5,101.4,41.3,30.901294,35.775033,12.258874,-5.617581


In [13]:
events_df.to_csv('version1_shots_data.csv')

In [14]:
data =  pd.read_csv('/kaggle/working/version1_shots_data.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,xG,starting_x,starting_y,end_x,end_y,distance_to_goal,angle_to_goal,shot_displacement,shot_trajectory
0,0,0.056644,100.4,35.1,101.6,35.2,20.203218,51.827413,1.204159,4.763642
1,1,0.143381,114.6,33.5,118.1,35.7,8.450444,104.827355,4.134005,32.152295
2,2,0.038188,106.2,55.8,113.4,46.8,20.978084,39.061877,11.525624,-51.340192
3,3,0.052781,113.9,47.4,114.1,46.8,9.590099,93.765758,0.632456,-71.565051
4,4,0.021272,89.2,42.5,101.4,41.3,30.901294,35.775033,12.258874,-5.617581


In [15]:
data.drop('Unnamed: 0', axis =1 , inplace = True)

In [16]:
zone_boundaries = {
            'horizontal': [0, 20, 40, 60, 80, 100, 120],  # 6 zones
            'vertical': [0, 26.67, 53.33, 80]  # 3 zones
}
        
# Special zones of interest
zone_14_left = {'x': (20, 40), 'y': (26.67, 53.33)}  # Left Zone 14
zone_14_right = {'x': (80, 100), 'y': (26.67, 53.33)}  # Right Zone 14

# Goal definitions
left_goal = {'center': [0, 40], 'posts': [[0, 30], [0, 50]]}
right_goal = {'center': [120, 40], 'posts': [[120, 30], [120, 50]]}
        
def assign_tactical_zone(x, y):
    """Assign tactical zone (1-18) based on x,y coordinates"""
    # Determine horizontal zone (1-6)
    h_zone = 1
    for i, boundary in enumerate(zone_boundaries['horizontal'][1:], 1):
        if x <= boundary:
            h_zone = i
            break
    
    # Determine vertical zone (1-3)
    v_zone = 1
    for i, boundary in enumerate(zone_boundaries['vertical'][1:], 1):
        if y <= boundary:
            v_zone = i
            break
    
    # Calculate final zone (1-18)
    zone = (v_zone - 1) * 6 + h_zone
    return zone

def is_zone_14( x, y):
    """Check if position is in Zone 14 (either side)"""
    left_14 = (zone_14_left['x'][0] <= x <= zone_14_left['x'][1] and 
              zone_14_left['y'][0] <= y <= zone_14_left['y'][1])
    right_14 = (zone_14_right['x'][0] <= x <= zone_14_right['x'][1] and 
               zone_14_right['y'][0] <= y <= zone_14_right['y'][1])
    return left_14 or right_14
    
def calculate_advanced_features(df):
    """Calculate advanced tactical and spatial features"""
    enhanced_df = df.copy()
    
    # Basic zone assignment
    enhanced_df['tactical_zone'] = enhanced_df.apply(
        lambda row: assign_tactical_zone(row['starting_x'], row['starting_y']), axis=1
    )
    
    # Zone 14 indicator
    enhanced_df['is_zone_14'] = enhanced_df.apply(
        lambda row: is_zone_14(row['starting_x'], row['starting_y']), axis=1
    )
    
    # Central corridor (middle third vertically)
    enhanced_df['is_central_corridor'] = (
        (enhanced_df['starting_y'] >= 26.67) & 
        (enhanced_df['starting_y'] <= 53.33)
    ).astype(int)
    
    # Penalty box features
    enhanced_df['is_in_left_penalty_box'] = (
        (enhanced_df['starting_x'] <= 18) & 
        (enhanced_df['starting_y'] >= 18) & 
        (enhanced_df['starting_y'] <= 62)
    ).astype(int)
    
    enhanced_df['is_in_right_penalty_box'] = (
        (enhanced_df['starting_x'] >= 102) & 
        (enhanced_df['starting_y'] >= 18) & 
        (enhanced_df['starting_y'] <= 62)
    ).astype(int)
        
    # Six-yard box features
    enhanced_df['is_in_left_six_yard'] = (
        (enhanced_df['starting_x'] <= 6) & 
        (enhanced_df['starting_y'] >= 30) & 
        (enhanced_df['starting_y'] <= 50)
    ).astype(int)
    
    enhanced_df['is_in_right_six_yard'] = (
        (enhanced_df['starting_x'] >= 114) & 
        (enhanced_df['starting_y'] >= 30) & 
        (enhanced_df['starting_y'] <= 50)
    ).astype(int)
    
    # Distance to penalty spot
    enhanced_df['distance_to_left_penalty_spot'] = np.sqrt(
        (enhanced_df['starting_x'] - 12)**2 + 
        (enhanced_df['starting_y'] - 40)**2
    )
    
    enhanced_df['distance_to_right_penalty_spot'] = np.sqrt(
        (enhanced_df['starting_x'] - 108)**2 + 
        (enhanced_df['starting_y'] - 40)**2
    )
        
    # Wing position indicators
    enhanced_df['is_left_wing'] = (enhanced_df['starting_y'] <= 26.67).astype(int)
    enhanced_df['is_right_wing'] = (enhanced_df['starting_y'] >= 53.33).astype(int)
    
    # Shot precision (how close to goal center the shot ended)
    enhanced_df['shot_precision_left'] = np.sqrt(
        (enhanced_df['end_x'] - 0)**2 + 
        (enhanced_df['end_y'] - 40)**2
    )
    
    enhanced_df['shot_precision_right'] = np.sqrt(
        (enhanced_df['end_x'] - 120)**2 + 
        (enhanced_df['end_y'] - 40)**2
    )
    
    # Goal mouth targeting (how well centered the shot was)
    enhanced_df['goal_mouth_accuracy'] = enhanced_df.apply(
        calculate_goal_mouth_accuracy, axis=1
    )
    
    # Shot power proxy (displacement/time would be better, but using displacement)
    enhanced_df['shot_power_proxy'] = enhanced_df['shot_displacement']
    
    # Angle quality (normalized angle - better angles get higher scores)
    max_angle = enhanced_df['angle_to_goal'].max()
    enhanced_df['angle_quality'] = enhanced_df['angle_to_goal'] / max_angle
    
    # Distance quality (inverse relationship - closer is better)
    max_distance = enhanced_df['distance_to_goal'].max()
    enhanced_df['distance_quality'] = 1 - (enhanced_df['distance_to_goal'] / max_distance)
        
    # Combined position quality score
    enhanced_df['position_quality'] = (
        enhanced_df['angle_quality'] * 0.4 + 
        enhanced_df['distance_quality'] * 0.4 + 
        enhanced_df['is_zone_14'].astype(int) * 0.2
    )
    
    return enhanced_df
    
def calculate_goal_mouth_accuracy(row):
    """Calculate how accurately the shot was aimed at goal center"""
    # Determine target goal based on shot direction
    if row['end_x'] < row['starting_x']:  # Shooting towards left goal
        target_center_y = 40
    else:  # Shooting towards right goal
        target_center_y = 40
    
    # Distance from goal center (vertically)
    accuracy = abs(row['end_y'] - target_center_y)
    return accuracy
    
def create_zone_heatmap(df, save_path='tactical_zones_heatmap.png'):
    """Create heatmap showing xG by tactical zone"""
    zone_xg = df.groupby('tactical_zone')['xG'].agg(['mean', 'count']).reset_index()
    
    # Create 3x6 grid for zones
    heatmap_data = np.zeros((3, 6))
    
    for _, row in zone_xg.iterrows():
        zone = int(row['tactical_zone'])
        if row['count'] >= 5:  # Only show zones with sufficient data
            v_idx = (zone - 1) // 6
            h_idx = (zone - 1) % 6
            heatmap_data[v_idx, h_idx] = row['mean']
    
    plt.figure(figsize=(12, 6))
    sns.heatmap(heatmap_data, annot=True, cmap='YlOrRd', 
               xticklabels=[f'H{i+1}' for i in range(6)],
               yticklabels=[f'V{i+1}' for i in range(3)])
    plt.title('Average xG by Tactical Zone')
    plt.xlabel('Horizontal Zones (Left to Right)')
    plt.ylabel('Vertical Zones (Top to Bottom)')
    plt.tight_layout()
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close()
    

print("Starting enhanced feature engineering...")

# Calculate advanced features
enhanced_df = calculate_advanced_features(data)
        
# Create interaction features
enhanced_df['distance_angle_interaction'] = (
    enhanced_df['distance_to_goal'] * enhanced_df['angle_to_goal']
)

enhanced_df['zone14_distance_interaction'] = (
    enhanced_df['is_zone_14'].astype(int) * enhanced_df['distance_to_goal']
)

enhanced_df['penalty_box_angle'] = (
    (enhanced_df['is_in_left_penalty_box'] + enhanced_df['is_in_right_penalty_box']) * 
    enhanced_df['angle_to_goal']
)
        
# Shot trajectory categorization
enhanced_df['shot_direction_category'] = pd.cut(
    enhanced_df['shot_trajectory'], 
    bins=[-180, -90, -30, 30, 90, 180], 
    labels=['Sharp_Left', 'Left', 'Center', 'Right', 'Sharp_Right']
)

# One-hot encode categorical features
direction_dummies = pd.get_dummies(enhanced_df['shot_direction_category'], prefix='direction')
enhanced_df = pd.concat([enhanced_df, direction_dummies], axis=1)

print(f"Feature engineering complete. Dataset shape: {enhanced_df.shape}")
       

create_zone_heatmap(enhanced_df)
    
# Select features for modeling
feature_cols = [
    # Original features
    'distance_to_goal', 'angle_to_goal', 'starting_x', 'starting_y', 
    'end_x', 'end_y', 'shot_displacement', 'shot_trajectory',
    
    # New tactical features
    'tactical_zone', 'is_zone_14', 'is_central_corridor',
    'is_in_left_penalty_box', 'is_in_right_penalty_box',
    'is_in_left_six_yard', 'is_in_right_six_yard',
    'distance_to_left_penalty_spot', 'distance_to_right_penalty_spot',
    'is_left_wing', 'is_right_wing',
    'goal_mouth_accuracy', 'angle_quality', 'distance_quality',
    'position_quality',
    
    # Interaction features
    'distance_angle_interaction', 'zone14_distance_interaction',
    'penalty_box_angle',
    
    # Direction features
    'direction_Center', 'direction_Left', 'direction_Right', 
    'direction_Sharp_Left', 'direction_Sharp_Right'
]

target_col = 'xG'
print(enhanced_df.head())
# Handle missing columns
available_features = [col for col in feature_cols if col in enhanced_df.columns]
print(f"Using {len(available_features)} features for modeling")

features = enhanced_df[available_features]
Y = enhanced_df[target_col]



Starting enhanced feature engineering...
Feature engineering complete. Dataset shape: (87111, 36)
         xG  starting_x  starting_y  end_x  end_y  distance_to_goal  \
0  0.056644       100.4        35.1  101.6   35.2         20.203218   
1  0.143381       114.6        33.5  118.1   35.7          8.450444   
2  0.038188       106.2        55.8  113.4   46.8         20.978084   
3  0.052781       113.9        47.4  114.1   46.8          9.590099   
4  0.021272        89.2        42.5  101.4   41.3         30.901294   

   angle_to_goal  shot_displacement  shot_trajectory  tactical_zone  ...  \
0      51.827413           1.204159         4.763642             12  ...   
1     104.827355           4.134005        32.152295             12  ...   
2      39.061877          11.525624       -51.340192             18  ...   
3      93.765758           0.632456       -71.565051             12  ...   
4      35.775033          12.258874        -5.617581             11  ...   

   position_qualit

In [17]:
enhanced_df.to_csv('enhanced_version1_shots_data.csv')

In [18]:
!pip install kaleido --quiet

In [19]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import plotly.express as px
import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.distributions.empirical_distribution import ECDF
import warnings
warnings.filterwarnings('ignore')

# High-quality plot configuration
PLOT_CONFIG = {
    'toImageButtonOptions': {
        'format': 'png',
        'filename': 'plot',
        'height': 1200,
        'width': 1200,
        'scale': 3  # High DPI
    }
}

# Modern color palette
COLORS = {
    'primary': '#6366f1',      # Indigo
    'secondary': '#ec4899',    # Pink
    'tertiary': '#10b981',     # Emerald
    'background': '#f8fafc',   # Slate-50
    'text': '#1e293b',         # Slate-800
    'light': '#e2e8f0',       # Slate-200
    'accent': '#f59e0b'        # Amber
}

def safe_kde(data, x_range=None):
    """
    Safely create KDE, handling edge cases that cause LinAlgError.
    """
    try:
        # Check if data has sufficient variance
        if len(np.unique(data)) < 2 or np.std(data) < 1e-10:
            # If data is constant or nearly constant, return uniform distribution
            if x_range is None:
                x_range = np.linspace(data.min() - 0.1, data.max() + 0.1, 100)
            return x_range, np.ones_like(x_range) / len(x_range)
        
        # Try to create KDE
        kde = stats.gaussian_kde(data)
        if x_range is None:
            data_range = data.max() - data.min()
            padding = data_range * 0.1 if data_range > 0 else 1
            x_range = np.linspace(data.min() - padding, data.max() + padding, 200)
        
        y_kde = kde(x_range)
        return x_range, y_kde
        
    except (np.linalg.LinAlgError, ValueError):
        # Fallback: create a simple histogram-like density
        if x_range is None:
            x_range = np.linspace(data.min() - 0.1, data.max() + 0.1, 100)
        
        hist, bin_edges = np.histogram(data, bins=min(30, len(np.unique(data))))
        # Interpolate histogram to x_range
        bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
        y_kde = np.interp(x_range, bin_centers, hist)
        y_kde = y_kde / np.trapz(y_kde, x_range)  # Normalize
        
        return x_range, y_kde

def create_comprehensive_plot(data, column):
    """
    Create a comprehensive, high-quality plot for a single variable.
    """
    col_data = data[column].dropna()
    
    if len(col_data) == 0:
        print(f"Warning: No valid data for column '{column}'")
        return None
    
    # Create subplots with improved layout
    fig = make_subplots(
        rows=3, cols=2,
        subplot_titles=[
            '📊 Distribution & Density', '📦 Box Plot Analysis',
            '🌧️ Raincloud Visualization', '📈 Cumulative Distribution',
            '🎯 Normality Assessment', '📋 Statistical Summary'
        ],
        specs=[
            [{'type': 'xy'}, {'type': 'xy'}],
            [{'type': 'xy'}, {'type': 'xy'}],
            [{'type': 'xy'}, {'type': 'table'}]
        ],
        vertical_spacing=0.08,
        horizontal_spacing=0.1
    )

    # 1. Enhanced Histogram + KDE
    try:
        # Create histogram
        fig.add_trace(
            go.Histogram(
                x=col_data,
                name='Distribution',
                nbinsx=min(50, max(10, len(np.unique(col_data)))),
                marker=dict(
                    color=COLORS['light'],
                    line=dict(color=COLORS['primary'], width=1)
                ),
                opacity=0.7,
                histnorm='probability density'
            ),
            row=1, col=1
        )
        
        # Add KDE if possible
        x_range, y_kde = safe_kde(col_data)
        fig.add_trace(
            go.Scatter(
                x=x_range,
                y=y_kde,
                mode='lines',
                name='Density Curve',
                line=dict(color=COLORS['primary'], width=3),
                fill='tonexty',
                fillcolor=f"rgba(99, 102, 241, 0.1)"
            ),
            row=1, col=1
        )
        
    except Exception as e:
        print(f"Warning: Could not create KDE for {column}: {e}")
        # Fallback to histogram only
        fig.add_trace(
            go.Histogram(
                x=col_data,
                name='Distribution',
                marker=dict(color=COLORS['primary']),
                opacity=0.7
            ),
            row=1, col=1
        )

    # 2. Enhanced Box Plot
    fig.add_trace(
        go.Box(
            y=col_data,
            name=column,
            marker=dict(color=COLORS['secondary']),
            line=dict(color=COLORS['text']),
            boxmean='sd',  # Show mean and standard deviation
            notched=True,
            boxpoints='outliers'
        ),
        row=1, col=2
    )

    # 3. Raincloud Plot
    try:
        x_kde, y_kde = safe_kde(col_data)
        
        # Mirrored KDE (cloud part)
        fig.add_trace(
            go.Scatter(
                x=x_kde,
                y=y_kde,
                mode='lines',
                name='Upper Density',
                line=dict(color=COLORS['tertiary'], width=2),
                fill='tozeroy',
                fillcolor=f"rgba(16, 185, 129, 0.3)",
                showlegend=False
            ),
            row=2, col=1
        )
        
        fig.add_trace(
            go.Scatter(
                x=x_kde,
                y=-y_kde,
                mode='lines',
                name='Lower Density',
                line=dict(color=COLORS['tertiary'], width=2),
                fill='tozeroy',
                fillcolor=f"rgba(16, 185, 129, 0.3)",
                showlegend=False
            ),
            row=2, col=1
        )
        
        # Rain (jittered points)
        n_points = min(1000, len(col_data))  # Limit points for performance
        if len(col_data) > n_points:
            sample_idx = np.random.choice(len(col_data), n_points, replace=False)
            sample_data = col_data.iloc[sample_idx]
        else:
            sample_data = col_data
            
        jitter = np.random.normal(0, max(y_kde) * 0.05, size=len(sample_data))
        
        fig.add_trace(
            go.Scatter(
                x=sample_data,
                y=jitter,
                mode='markers',
                name='Data Points',
                marker=dict(
                    color=COLORS['accent'],
                    size=4,
                    opacity=0.6,
                    line=dict(width=0.5, color=COLORS['text'])
                ),
                showlegend=False
            ),
            row=2, col=1
        )
        
    except Exception as e:
        print(f"Warning: Could not create raincloud plot for {column}: {e}")
        # Fallback to simple scatter
        fig.add_trace(
            go.Scatter(
                x=col_data,
                y=np.zeros_like(col_data),
                mode='markers',
                name='Data Points',
                marker=dict(color=COLORS['primary'], size=4)
            ),
            row=2, col=1
        )

    # 4. ECDF
    ecdf = ECDF(col_data)
    fig.add_trace(
        go.Scatter(
            x=ecdf.x,
            y=ecdf.y,
            mode='lines',
            name='ECDF',
            line=dict(color=COLORS['primary'], width=3),
            fill='tonexty',
            fillcolor=f"rgba(99, 102, 241, 0.1)"
        ),
        row=2, col=2
    )

    # 5. Q-Q Plot for normality assessment
    try:
        qq = stats.probplot(col_data, dist="norm")
        
        # Data points
        fig.add_trace(
            go.Scatter(
                x=qq[0][0],
                y=qq[0][1],
                mode='markers',
                name='Observed',
                marker=dict(
                    color=COLORS['secondary'],
                    size=6,
                    opacity=0.7,
                    line=dict(width=1, color=COLORS['text'])
                ),
                showlegend=False
            ),
            row=3, col=1
        )
        
        # Reference line
        fig.add_trace(
            go.Scatter(
                x=qq[0][0],
                y=qq[1][0] * qq[0][0] + qq[1][1],
                mode='lines',
                name='Expected Normal',
                line=dict(color=COLORS['accent'], width=3, dash='dash'),
                showlegend=False
            ),
            row=3, col=1
        )
        
    except Exception as e:
        print(f"Warning: Could not create Q-Q plot for {column}: {e}")

    # 6. Statistical Summary Table
    stats_summary = col_data.describe()
    skewness = stats.skew(col_data)
    kurtosis = stats.kurtosis(col_data)
    
    # Calculate additional statistics
    cv = stats_summary['std'] / stats_summary['mean'] if stats_summary['mean'] != 0 else 0
    range_val = stats_summary['max'] - stats_summary['min']
    
    # Outlier detection
    Q1, Q3 = stats_summary['25%'], stats_summary['75%']
    IQR = Q3 - Q1
    outliers = col_data[(col_data < Q1 - 1.5 * IQR) | (col_data > Q3 + 1.5 * IQR)]
    
    # Create summary table
    summary_data = [
        ['Count', f"{len(col_data):,}"],
        ['Mean', f"{stats_summary['mean']:.3f}"],
        ['Median', f"{stats_summary['50%']:.3f}"],
        ['Std Dev', f"{stats_summary['std']:.3f}"],
        ['Min', f"{stats_summary['min']:.3f}"],
        ['Max', f"{stats_summary['max']:.3f}"],
        ['Range', f"{range_val:.3f}"],
        ['Coeff. of Var.', f"{cv:.3f}"],
        ['Skewness', f"{skewness:.3f}"],
        ['Kurtosis', f"{kurtosis:.3f}"],
        ['Outliers', f"{len(outliers)} ({len(outliers)/len(col_data)*100:.1f}%)"],
        ['Normality', interpret_normality(col_data)]
    ]
    
    fig.add_trace(
        go.Table(
            header=dict(
                values=['<b>Statistic</b>', '<b>Value</b>'],
                fill_color=COLORS['primary'],
                font=dict(color='white', size=12),
                align='left'
            ),
            cells=dict(
                values=list(zip(*summary_data)),
                fill_color=[COLORS['background'], 'white'],
                font=dict(color=COLORS['text'], size=11),
                align='left'
            )
        ),
        row=3, col=2
    )

    # Update layout with modern styling
    fig.update_layout(
        height=1200,
        width=1200,
        title=dict(
            text=f"<b>Comprehensive Analysis: {column}</b>",
            font=dict(size=20, color=COLORS['text']),
            x=0.5
        ),
        showlegend=False,
        plot_bgcolor='white',
        paper_bgcolor='white',
        font=dict(family="Arial, sans-serif", color=COLORS['text'])
    )
    
    # Update axes styling
    for i in range(1, 6):  # 5 subplot axes
        fig.update_xaxes(
            showgrid=True,
            gridwidth=1,
            gridcolor=COLORS['light'],
            zeroline=False,
            row=(i-1)//2 + 1,
            col=(i-1)%2 + 1
        )
        fig.update_yaxes(
            showgrid=True,
            gridwidth=1,
            gridcolor=COLORS['light'],
            zeroline=False,
            row=(i-1)//2 + 1,
            col=(i-1)%2 + 1
        )

    return fig

def interpret_normality(data):
    """Assess normality using Shapiro-Wilk test."""
    if len(data) > 5000:
        # Use Kolmogorov-Smirnov for large samples
        _, p_value = stats.kstest(data, 'norm', args=(data.mean(), data.std()))
        test_name = "K-S"
    else:
        # Use Shapiro-Wilk for smaller samples
        _, p_value = stats.shapiro(data)
        test_name = "S-W"
    
    if p_value > 0.05:
        return f"Normal ({test_name} p={p_value:.3f})"
    else:
        return f"Non-normal ({test_name} p={p_value:.3f})"

def create_correlation_heatmap(data, columns):
    """Create an enhanced correlation heatmap."""
    # Calculate correlation matrix
    corr_matrix = data[columns].corr()
    
    # Create mask for upper triangle
    mask = np.triu(np.ones_like(corr_matrix))
    corr_matrix_masked = corr_matrix.mask(mask.astype(bool))
    
    # Create heatmap
    fig = go.Figure(data=go.Heatmap(
        z=corr_matrix.values,
        x=corr_matrix.columns,
        y=corr_matrix.columns,
        colorscale=[
            [0, '#d73027'],      # Strong negative
            [0.25, '#fc8d59'],   # Moderate negative
            [0.5, '#ffffbf'],    # Neutral
            [0.75, '#91bfdb'],   # Moderate positive
            [1, '#4575b4']       # Strong positive
        ],
        zmin=-1, zmax=1,
        text=np.round(corr_matrix.values, 2),
        texttemplate="%{text}",
        textfont=dict(size=10, color='black'),
        hoverongaps=False,
        hovertemplate='<b>%{y}</b> vs <b>%{x}</b><br>Correlation: %{z:.3f}<extra></extra>'
    ))
    
    fig.update_layout(
        title=dict(
            text='<b>Feature Correlation Matrix</b>',
            font=dict(size=18, color=COLORS['text']),
            x=0.5
        ),
        height=600,
        width=700,
        xaxis=dict(
            tickangle=45,
            side='bottom',
            title='Features'
        ),
        yaxis=dict(
            title='Features',
            autorange='reversed'
        ),
        plot_bgcolor='white',
        paper_bgcolor='white'
    )
    
    return fig

def analyze_dataset(data, columns_to_analyze=None):
    """
    Analyze entire dataset with improved error handling and performance.
    """
    if columns_to_analyze is None:
        columns_to_analyze = data.select_dtypes(include=[np.number]).columns.tolist()
    
    print(f"🔍 Analyzing {len(columns_to_analyze)} numerical columns...")
    
    successful_plots = 0
    failed_plots = []
    
    # Analyze each column
    for i, column in enumerate(columns_to_analyze, 1):
        try:
            print(f"📊 Processing {column} ({i}/{len(columns_to_analyze)})...")
            fig = create_comprehensive_plot(data, column)
            
            if fig:
                # Save with high quality
                filename = f"{column.replace('/', '_').replace(' ', '_')}_analysis.png"
                fig.write_image(filename, width=1200, height=1200, scale=3)
                successful_plots += 1
                print(f"✅ Saved: {filename}")
            else:
                failed_plots.append(column)
                
        except Exception as e:
            print(f"❌ Error analyzing {column}: {str(e)}")
            failed_plots.append(column)
    
    # Create correlation heatmap
    try:
        print("📈 Creating correlation heatmap...")
        corr_fig = create_correlation_heatmap(data, columns_to_analyze)
        corr_fig.write_image("correlation_heatmap.png", width=700, height=600, scale=3)
        print("✅ Saved: correlation_heatmap.png")
    except Exception as e:
        print(f"❌ Error creating correlation heatmap: {str(e)}")
    
    # Summary
    print(f"\n📋 Analysis Summary:")
    print(f"✅ Successfully analyzed: {successful_plots} columns")
    if failed_plots:
        print(f"❌ Failed to analyze: {len(failed_plots)} columns")
        print(f"   Failed columns: {', '.join(failed_plots)}")
    
    return successful_plots, failed_plots


successful, failed = analyze_dataset(enhanced_df)

🔍 Analyzing 29 numerical columns...
📊 Processing xG (1/29)...
✅ Saved: xG_analysis.png
📊 Processing starting_x (2/29)...
✅ Saved: starting_x_analysis.png
📊 Processing starting_y (3/29)...
✅ Saved: starting_y_analysis.png
📊 Processing end_x (4/29)...
✅ Saved: end_x_analysis.png
📊 Processing end_y (5/29)...
✅ Saved: end_y_analysis.png
📊 Processing distance_to_goal (6/29)...
✅ Saved: distance_to_goal_analysis.png
📊 Processing angle_to_goal (7/29)...
✅ Saved: angle_to_goal_analysis.png
📊 Processing shot_displacement (8/29)...
✅ Saved: shot_displacement_analysis.png
📊 Processing shot_trajectory (9/29)...
✅ Saved: shot_trajectory_analysis.png
📊 Processing tactical_zone (10/29)...
✅ Saved: tactical_zone_analysis.png
📊 Processing is_central_corridor (11/29)...
✅ Saved: is_central_corridor_analysis.png
📊 Processing is_in_left_penalty_box (12/29)...
✅ Saved: is_in_left_penalty_box_analysis.png
📊 Processing is_in_right_penalty_box (13/29)...
✅ Saved: is_in_right_penalty_box_analysis.png
📊 Process

In [20]:
enhanced_df.columns

Index(['xG', 'starting_x', 'starting_y', 'end_x', 'end_y', 'distance_to_goal',
       'angle_to_goal', 'shot_displacement', 'shot_trajectory',
       'tactical_zone', 'is_zone_14', 'is_central_corridor',
       'is_in_left_penalty_box', 'is_in_right_penalty_box',
       'is_in_left_six_yard', 'is_in_right_six_yard',
       'distance_to_left_penalty_spot', 'distance_to_right_penalty_spot',
       'is_left_wing', 'is_right_wing', 'shot_precision_left',
       'shot_precision_right', 'goal_mouth_accuracy', 'shot_power_proxy',
       'angle_quality', 'distance_quality', 'position_quality',
       'distance_angle_interaction', 'zone14_distance_interaction',
       'penalty_box_angle', 'shot_direction_category', 'direction_Sharp_Left',
       'direction_Left', 'direction_Center', 'direction_Right',
       'direction_Sharp_Right'],
      dtype='object')

In [21]:
# features = [col for col in enhanced_df.columns if col != 'xG']
# Y = 'xG'

In [22]:
# print(Y)

In [23]:
# features

In [24]:
!pip install lightgbm --config-settings=cmake.define.USE_CUDA=ON
!pip install xgboost[gpu]



In [25]:
# import matplotlib.pyplot as plt
# import seaborn as sns
# from sklearn.model_selection import train_test_split, cross_val_score, KFold
# from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# from sklearn.inspection import permutation_importance
# from sklearn.metrics import make_scorer
# from xgboost import XGBRegressor
# from lightgbm import LGBMRegressor
# import json
# import joblib
# import optuna
# import numpy as np
# import pandas as pd

# def check_gpu_availability():
#     """Check if GPU is available for XGBoost and LightGBM"""
#     print("Checking GPU availability...")
    
#     # Check XGBoost GPU support
#     try:
#         import xgboost as xgb
#         xgb_gpu = xgb.XGBRegressor(device='cuda', n_estimators=1)
#         print("✓ XGBoost GPU support available")
#         xgb_device = 'cuda'
#     except:
#         print("✗ XGBoost GPU support not available, using CPU")
#         xgb_device = 'cpu'
    
#     # Check LightGBM GPU support
#     try:
#         import lightgbm as lgb
#         lgb_gpu = lgb.LGBMRegressor(device='gpu', n_estimators=1)
#         print("✓ LightGBM GPU support available")
#         lgb_device = 'gpu'
#     except:
#         print("✗ LightGBM GPU support not available, using CPU")
#         lgb_device = 'cpu'
    
#     return xgb_device, lgb_device

# def prepare_data(data, features, target):
#     """
#     Prepare data for training with proper feature selection, encoding, and validation
#     """
#     from sklearn.preprocessing import LabelEncoder
    
#     print(f"Data shape: {data.shape}")
#     print(f"Features type: {type(features)}")
#     print(f"Target type: {type(target)}")
    
#     # Convert features to list if it's not already
#     if isinstance(features, pd.DataFrame):
#         feature_list = features.columns.tolist()
#         print("Features was a DataFrame, using column names")
#     elif isinstance(features, pd.Series):
#         feature_list = features.tolist()
#         print("Features was a Series, converting to list")
#     elif isinstance(features, (list, tuple)):
#         feature_list = list(features)
#         print("Features is already a list/tuple")
#     else:
#         raise ValueError(f"Features must be a list, tuple, Series, or DataFrame. Got {type(features)}")
    
#     # Convert target to string if it's not already
#     if isinstance(target, pd.Series):
#         target_name = target.name if target.name else str(target.iloc[0])
#         print(f"Target was a Series, using name: {target_name}")
#     elif isinstance(target, (list, tuple)):
#         target_name = target[0] if len(target) == 1 else str(target)
#         print(f"Target was a list/tuple, using: {target_name}")
#     else:
#         target_name = str(target)
#         print(f"Target converted to string: {target_name}")
    
#     # Validate features exist in data
#     missing_features = [f for f in feature_list if f not in data.columns]
#     if missing_features:
#         print(f"Missing features in data: {missing_features}")
#         print(f"Available columns: {list(data.columns)}")
#         raise ValueError(f"Features not found in data: {missing_features}")
    
#     # Validate target exists in data
#     if target_name not in data.columns:
#         print(f"Target '{target_name}' not found in data")
#         print(f"Available columns: {list(data.columns)}")
#         raise ValueError(f"Target '{target_name}' not found in data")
    
#     print(f"Using {len(feature_list)} features: {feature_list[:5]}..." if len(feature_list) > 5 else f"Using features: {feature_list}")
#     print(f"Using target: {target_name}")
    
#     # Select features and target
#     X = data[feature_list].copy()
#     y = data[target_name].copy()
    
#     # Check data types and identify categorical columns
#     print("\nData type analysis:")
#     categorical_columns = []
#     numeric_columns = []
    
#     for col in X.columns:
#         dtype = X[col].dtype
#         unique_vals = X[col].nunique()
#         print(f"{col}: {dtype}, unique values: {unique_vals}")
        
#         if dtype == 'object' or dtype == 'category':
#             categorical_columns.append(col)
#         elif dtype in ['int64', 'float64', 'bool']:
#             numeric_columns.append(col)
#         else:
#             # Handle other dtypes case by case
#             if X[col].dtype.name.startswith('int') or X[col].dtype.name.startswith('float'):
#                 numeric_columns.append(col)
#             else:
#                 categorical_columns.append(col)
    
#     print(f"\nCategorical columns ({len(categorical_columns)}): {categorical_columns}")
#     print(f"Numeric columns ({len(numeric_columns)}): {numeric_columns}")
    
#     # Encode categorical variables
#     label_encoders = {}
#     if categorical_columns:
#         print("\nEncoding categorical variables...")
#         for col in categorical_columns:
#             print(f"Encoding {col}: {list(X[col].unique())}")
#             le = LabelEncoder()
#             X[col] = le.fit_transform(X[col].astype(str))
#             label_encoders[col] = le
    
#     # Check for missing values
#     if X.isnull().sum().sum() > 0:
#         print("Warning: Missing values found in features")
#         print(X.isnull().sum()[X.isnull().sum() > 0])
#         # Fill missing values with median for numeric, mode for categorical
#         for col in X.columns:
#             if X[col].isnull().sum() > 0:
#                 if col in categorical_columns:
#                     X[col].fillna(X[col].mode()[0], inplace=True)
#                 else:
#                     X[col].fillna(X[col].median(), inplace=True)
    
#     if y.isnull().sum() > 0:
#         print(f"Warning: {y.isnull().sum()} missing values found in target")
#         # Remove rows with missing target values
#         mask = ~y.isnull()
#         X = X[mask]
#         y = y[mask]
#         print(f"Removed {(~mask).sum()} rows with missing target values")
    
#     # Ensure all data is numeric
#     print("\nFinal data type check:")
#     for col in X.columns:
#         if not pd.api.types.is_numeric_dtype(X[col]):
#             print(f"Warning: {col} is still not numeric, converting...")
#             X[col] = pd.to_numeric(X[col], errors='coerce')
#             if X[col].isnull().sum() > 0:
#                 X[col].fillna(X[col].median(), inplace=True)
    
#     # Split the data
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
#     print(f"\nData shapes after split:")
#     print(f"X_train: {X_train.shape}")
#     print(f"X_test: {X_test.shape}")
#     print(f"y_train: {y_train.shape}")
#     print(f"y_test: {y_test.shape}")
    
#     # Scale the features
#     scaler = StandardScaler()
#     X_train_scaled = scaler.fit_transform(X_train)
#     X_test_scaled = scaler.transform(X_test)
    
#     print(f"Training set shape after scaling: {X_train_scaled.shape}")
#     print(f"Test set shape after scaling: {X_test_scaled.shape}")
    
#     return X_train_scaled, X_test_scaled, y_train, y_test, scaler, feature_list, label_encoders

# def custom_score(y_true, y_pred):
#     mse = mean_squared_error(y_true, y_pred)
#     mae = mean_absolute_error(y_true, y_pred)
#     r2 = r2_score(y_true, y_pred)
#     # Avoid division by zero
#     score = r2  # Start with R²
#     if mse > 0:
#         score += 1 / (1 + mse)  # Add inverse MSE (normalized)
#     if mae > 0:
#         score += 1 / (1 + mae)  # Add inverse MAE (normalized)
#     return score

# def optimize_xgboost(X_train_scaled, y_train, n_trials, device='cuda'):
#     def objective(trial):
#         params = {
#             'max_depth': trial.suggest_int('max_depth', 3, 20),
#             'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.5, log=True),
#             'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
#             'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
#             'subsample': trial.suggest_float('subsample', 0.5, 1.0),
#             'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
#             'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
#             'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
#             'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
#             'max_leaves': trial.suggest_int('max_leaves', 0, 1000),
#             'max_bin': trial.suggest_int('max_bin', 200, 1000),
#             'device': device,
#             'tree_method': 'hist' if device == 'cuda' else 'auto'
#         }
      
#         model = XGBRegressor(**params, random_state=42)
#         try:
#             scores = cross_val_score(
#                 model, X_train_scaled, y_train,
#                 cv=5,
#                 scoring=make_scorer(custom_score, greater_is_better=True),
#                 n_jobs=1 if device == 'cuda' else -1
#             )
#             return -scores.mean()
#         except Exception as e:
#             print(f"Error in XGBoost trial: {e}")
#             return float('inf')

#     study = optuna.create_study(direction='minimize')
#     study.optimize(objective, n_trials=n_trials, show_progress_bar=True)
#     return study.best_params

# def optimize_lightgbm(X_train_scaled, y_train, n_trials, device='gpu'):
#     def objective(trial):
#         params = {
#             'num_leaves': trial.suggest_int('num_leaves', 5, 100),
#             'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.5, log=True),
#             'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
#             'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
#             'subsample': trial.suggest_float('subsample', 0.5, 1.0),
#             'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
#             'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
#             'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
#             'max_depth': trial.suggest_int('max_depth', -1, 20),
#             'min_split_gain': trial.suggest_float('min_split_gain', 1e-8, 1.0, log=True),
#             'max_bin': trial.suggest_int('max_bin', 200, 1000),
#             'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'goss']),
#             'device': device,
#             'gpu_use_dp': True if device == 'gpu' else False,
#             'force_col_wise': True if device == 'gpu' else False
#         }
        
#         model = LGBMRegressor(**params, random_state=42, verbose=-1)
#         try:
#             scores = cross_val_score(
#                 model, X_train_scaled, y_train,
#                 cv=5,
#                 scoring=make_scorer(custom_score, greater_is_better=True),
#                 n_jobs=1 if device == 'gpu' else -1
#             )
#             return -scores.mean()
#         except Exception as e:
#             print(f"Error in LightGBM trial: {e}")
#             return float('inf')

#     study = optuna.create_study(direction='minimize')
#     study.optimize(objective, n_trials=n_trials, show_progress_bar=True)
#     return study.best_params

# def train_and_evaluate_model(model_class, best_params, X_train_scaled, y_train, X_test_scaled, y_test, features, model_name):
#     # Add verbose=-1 for LightGBM to reduce output
#     if model_class == LGBMRegressor:
#         best_params['verbose'] = -1
    
#     model = model_class(**best_params, random_state=42)
    
#     print(f"Training {model_name} on {'GPU' if ('device' in best_params and best_params['device'] in ['cuda', 'gpu']) else 'CPU'}...")
#     model.fit(X_train_scaled, y_train)
    
#     y_pred = model.predict(X_test_scaled)
#     mse = mean_squared_error(y_test, y_pred)
#     rmse = np.sqrt(mse)
#     r2 = r2_score(y_test, y_pred)
    
#     print(f'{model_name} Results:')
#     print(f'Mean Squared Error: {mse:.6f}')
#     print(f'Root Mean Squared Error: {rmse:.6f}')
#     print(f'R-squared Score: {r2:.6f}')
    
#     joblib.dump(model, f'best_{model_name.lower()}_model.pkl')
    
#     plot_feature_importance(model, features, model_name)
#     plot_predicted_vs_true(y_test, y_pred, mse, rmse, r2, model_name)
    
#     return model, mse, rmse, r2

# def plot_feature_importance(model, features, model_name):
#     feature_importance = model.feature_importances_
    
#     plt.figure(figsize=(12, 8))
#     sns.barplot(x=feature_importance, y=features, orient='h', palette='viridis')
#     plt.title(f'Feature Importance - {model_name}', fontsize=16)
#     plt.xlabel('Importance', fontsize=12)
#     plt.ylabel('Features', fontsize=12)
#     plt.tight_layout()
#     plt.savefig(f'{model_name.lower()}_feature_importance.png', dpi=300, bbox_inches='tight')
#     plt.close()

# def plot_predicted_vs_true(y_test, y_pred, mse, rmse, r2, model_name):
#     plt.figure(figsize=(10, 8))
#     plt.scatter(y_test, y_pred, alpha=0.5)
#     plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
#     plt.xlabel('True xG', fontsize=12)
#     plt.ylabel('Predicted xG', fontsize=12)
#     plt.title(f'{model_name} - Predicted vs. True xG', fontsize=16)
#     plt.annotate(f'MSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR²: {r2:.3f}',
#                  xy=(0.05, 0.95), xycoords='axes fraction',
#                  bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=0.8),
#                  fontsize=10)
#     plt.tight_layout()
#     plt.savefig(f'{model_name.lower()}_predicted_vs_true.png', dpi=300, bbox_inches='tight')
#     plt.close()

# def save_model(model, model_name, accuracy, error):
#     filename = f"{model_name}_acc_{accuracy:.4f}_err_{error:.4f}.pkl"
#     joblib.dump(model, filename)
#     print(f"Model saved as {filename}")

# def save_metrics_and_params(model_name, metrics, params, n_trials, n_features):
#     data = {
#         "model_name": model_name,
#         "metrics": metrics,
#         "hyperparameters": params,
#         "n_trials": n_trials,
#         "n_features": n_features
#     }
#     filename = f"{model_name}_metrics_and_params.json"
#     with open(filename, 'w') as f:
#         json.dump(data, f, indent=4)
#     print(f"Metrics and parameters saved as {filename}")

# def compare_models(data, features, target):
#     """
#     Compare XGBoost and LightGBM models with proper error handling
#     """
#     print("Starting model comparison...")
    
#     # Check GPU availability first
#     xgb_device, lgb_device = check_gpu_availability()
    
#     # Prepare data with enhanced error handling
#     try:
#         X_train_scaled, X_test_scaled, y_train, y_test, scaler, feature_list, label_encoders = prepare_data(data, features, target)
#     except Exception as e:
#         print(f"Error in data preparation: {e}")
#         print("\nDebugging information:")
#         print(f"Data columns: {list(data.columns) if hasattr(data, 'columns') else 'Not a DataFrame'}")
#         print(f"Data shape: {data.shape if hasattr(data, 'shape') else 'No shape attribute'}")
#         print(f"Features: {features}")
#         print(f"Target: {target}")
#         raise
    
#     n_trials = 100 
#     n_features = len(feature_list)
    
#     try:
#         print(f"\nOptimizing XGBoost on {xgb_device.upper()}...")
#         xgb_best_params = optimize_xgboost(X_train_scaled, y_train, n_trials, xgb_device)
#         print("XGBoost Best Parameters:", xgb_best_params)
        
#         print(f"\nOptimizing LightGBM on {lgb_device.upper()}...")
#         lgb_best_params = optimize_lightgbm(X_train_scaled, y_train, n_trials, lgb_device)
#         print("LightGBM Best Parameters:", lgb_best_params)
        
#         print("\nTraining and Evaluating XGBoost...")
#         xgb_model, xgb_mse, xgb_rmse, xgb_r2 = train_and_evaluate_model(
#             XGBRegressor, xgb_best_params, X_train_scaled, y_train, X_test_scaled, y_test, feature_list, "XGBoost"
#         )
        
#         print("\nTraining and Evaluating LightGBM...")
#         lgb_model, lgb_mse, lgb_rmse, lgb_r2 = train_and_evaluate_model(
#             LGBMRegressor, lgb_best_params, X_train_scaled, y_train, X_test_scaled, y_test, feature_list, "LightGBM"
#         )
        
#         print("\nModel Comparison:")
#         print(f"XGBoost - MSE: {xgb_mse:.4f}, RMSE: {xgb_rmse:.4f}, R²: {xgb_r2:.4f}")
#         print(f"LightGBM - MSE: {lgb_mse:.4f}, RMSE: {lgb_rmse:.4f}, R²: {lgb_r2:.4f}")
        
#         # Determine the best and second-best models
#         models = [
#             ("XGBoost_with_zone14", xgb_model, xgb_r2, xgb_rmse, xgb_best_params),
#             ("LightGBM_with_zone14", lgb_model, lgb_r2, lgb_rmse, lgb_best_params)
#         ]
#         models.sort(key=lambda x: x[2], reverse=True)
        
#         # Save models and metrics
#         for i, (model_name, model, r2, rmse, params) in enumerate(models):
#             save_model(model, model_name, r2, rmse)
            
#             metrics = {
#                 "mse": mean_squared_error(y_test, model.predict(X_test_scaled)),
#                 "rmse": rmse,
#                 "r2": r2
#             }
#             save_metrics_and_params(model_name, metrics, params, n_trials, n_features)
        
#         # Compare model performance
#         if np.isclose(models[0][2], models[1][2], rtol=1e-4):
#             print("\nBoth models performed equally in terms of R² score.")
#         else:
#             print(f"\n{models[0][0]} performed better with R² score of {models[0][2]:.4f}")
#             print(f"{models[1][0]} came second with R² score of {models[1][2]:.4f}")
        
#         # Save preprocessing objects
#         joblib.dump(scaler, 'xgboost&lightgbm_feature_scaler_with_zone14.pkl')
#         joblib.dump(label_encoders, 'label_encoders_with_zone14.pkl')
#         print("Feature scaler and label encoders saved successfully.")
        
#         return models[0][1], models[1][1], scaler, label_encoders  # Return best models and preprocessors
        
#     except Exception as e:
#         print(f"Error during model training/evaluation: {e}")
#         raise

# features = [col for col in enhanced_df.columns if col != 'xG']
# Y = 'xG'  # Target variable

# print(f"Number of features: {len(features)}")
# print(f"Target variable: {Y}")
# print(f"First 10 features: {features[:10]}")


# compare_models(enhanced_df, features, Y)

In [26]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.inspection import permutation_importance
from sklearn.metrics import make_scorer
from sklearn.datasets import make_regression  # Added missing import
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import json
import joblib
import optuna
import numpy as np
import pandas as pd
import warnings
import gc  # Added missing import
import psutil  # Added missing import
warnings.filterwarnings('ignore')

def check_gpu_availability():
    """
    Robustly check if GPU is available for XGBoost, LightGBM, and CatBoost
    by actually testing model training, not just initialization
    """
    print("Checking GPU availability...")
    
    # Create small test data for actual GPU testing
    X_test, y_test = make_regression(n_samples=100, n_features=5, noise=0.1, random_state=42)
    
    # Check XGBoost GPU support
    xgb_device = 'cpu'
    try:
        import xgboost as xgb
        
        # Test actual GPU training
        model = xgb.XGBRegressor(
            device='cuda',
            tree_method='hist',
            n_estimators=10,
            max_depth=3,
            random_state=42
        )
        model.fit(X_test, y_test)
        _ = model.predict(X_test)  # Test prediction too
        
        print("✓ XGBoost GPU support available and working")
        xgb_device = 'cuda'
        
    except ImportError:
        print("✗ XGBoost not installed")
    except Exception as e:
        print(f"✗ XGBoost GPU support not available: {str(e)[:100]}")
        print("  → Using CPU for XGBoost")
    
    # Check LightGBM GPU support
    lgb_device = 'cpu'
    try:
        import lightgbm as lgb
        
        # Test actual GPU training
        model = lgb.LGBMRegressor(
            device='gpu',
            n_estimators=10,
            max_depth=3,
            verbose=-1,
            random_state=42
        )
        model.fit(X_test, y_test)
        _ = model.predict(X_test)
        
        print("✓ LightGBM GPU support available and working")
        lgb_device = 'gpu'
        
    except ImportError:
        print("✗ LightGBM not installed")
    except Exception as e:
        print(f"✗ LightGBM GPU support not available: {str(e)[:100]}")
        print("  → Using CPU for LightGBM")
    
    # Check CatBoost GPU support
    cb_device = 'CPU'
    try:
        import catboost as cb
        
        # Test actual GPU training
        model = cb.CatBoostRegressor(
            task_type='GPU',
            iterations=10,
            depth=3,
            verbose=False,
            random_seed=42
        )
        model.fit(X_test, y_test)
        _ = model.predict(X_test)
        
        print("✓ CatBoost GPU support available and working")
        cb_device = 'GPU'
        
    except ImportError:
        print("✗ CatBoost not installed")
    except Exception as e:
        print(f"✗ CatBoost GPU support not available: {str(e)[:100]}")
        print("  → Using CPU for CatBoost")
    
    print(f"\nFinal device configuration:")
    print(f"  XGBoost: {xgb_device.upper()}")
    print(f"  LightGBM: {lgb_device.upper()}")
    print(f"  CatBoost: {cb_device}")
    
    return xgb_device, lgb_device, cb_device

def get_memory_usage():
    """Get current memory usage in MB"""
    try:
        process = psutil.Process()
        return process.memory_info().rss / 1024 / 1024
    except:
        return 0.0  # Return 0 if psutil fails

def prepare_data_with_memory_management(data, features, target, scale_for_tree_models=True):
    """
    Enhanced data preparation with consistent scaling and memory management
    """
    print(f"Initial memory usage: {get_memory_usage():.1f} MB")
    print(f"Data shape: {data.shape}")
    
    # Convert features to list if needed
    if isinstance(features, pd.DataFrame):
        feature_list = features.columns.tolist()
    elif isinstance(features, pd.Series):
        feature_list = features.tolist()
    elif isinstance(features, (list, tuple)):
        feature_list = list(features)
    else:
        raise ValueError(f"Features must be a list, tuple, Series, or DataFrame. Got {type(features)}")
    
    # Convert target to string
    if isinstance(target, pd.Series):
        target_name = target.name if target.name else str(target.iloc[0])
    elif isinstance(target, (list, tuple)):
        target_name = target[0] if len(target) == 1 else str(target)
    else:
        target_name = str(target)
    
    # Validate features and target exist
    missing_features = [f for f in feature_list if f not in data.columns]
    if missing_features:
        raise ValueError(f"Features not found in data: {missing_features}")
    
    if target_name not in data.columns:
        raise ValueError(f"Target '{target_name}' not found in data")
    
    print(f"Using {len(feature_list)} features and target: {target_name}")
    
    # Create copies to avoid modifying original data
    X = data[feature_list].copy()
    y = data[target_name].copy()
    
    print(f"Memory after data selection: {get_memory_usage():.1f} MB")
    
    # Identify categorical and numeric columns
    categorical_columns = []
    numeric_columns = []
    
    for col in X.columns:
        if X[col].dtype in ['object', 'category']:
            categorical_columns.append(col)
        elif pd.api.types.is_numeric_dtype(X[col]):
            numeric_columns.append(col)
        else:
            # Handle edge cases
            try:
                pd.to_numeric(X[col])
                numeric_columns.append(col)
            except:
                categorical_columns.append(col)
    
    print(f"Categorical columns ({len(categorical_columns)}): {categorical_columns[:5]}...")
    print(f"Numeric columns ({len(numeric_columns)}): {numeric_columns[:5]}...")
    
    # Encode categorical variables
    label_encoders = {}
    if categorical_columns:
        print("Encoding categorical variables...")
        for col in categorical_columns:
            le = LabelEncoder()
            X[col] = le.fit_transform(X[col].astype(str))
            label_encoders[col] = le
    
    # Handle missing values
    if X.isnull().sum().sum() > 0:
        print("Handling missing values...")
        for col in X.columns:
            if X[col].isnull().sum() > 0:
                if col in categorical_columns:
                    X[col].fillna(X[col].mode()[0], inplace=True)
                else:
                    X[col].fillna(X[col].median(), inplace=True)
    
    # Remove rows with missing target values
    if y.isnull().sum() > 0:
        print(f"Removing {y.isnull().sum()} rows with missing target values")
        mask = ~y.isnull()
        X = X[mask]
        y = y[mask]
    
    # Ensure all data is numeric
    for col in X.columns:
        if not pd.api.types.is_numeric_dtype(X[col]):
            X[col] = pd.to_numeric(X[col], errors='coerce')
            if X[col].isnull().sum() > 0:
                X[col].fillna(X[col].median(), inplace=True)
    
    print(f"Memory after preprocessing: {get_memory_usage():.1f} MB")
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    
    print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")
    
    # Prepare different versions for different model types
    data_versions = {
        'original': {
            'X_train': X_train.copy(),
            'X_test': X_test.copy(),
            'description': 'Original data (best for CatBoost)'
        }
    }
    
    # Scale data if requested (for XGBoost and LightGBM)
    if scale_for_tree_models:
        print("Creating scaled versions for tree models...")
        scaler = StandardScaler()
        X_train_scaled = pd.DataFrame(
            scaler.fit_transform(X_train),
            columns=X_train.columns,
            index=X_train.index
        )
        X_test_scaled = pd.DataFrame(
            scaler.transform(X_test),
            columns=X_test.columns,
            index=X_test.index
        )
        
        data_versions['scaled'] = {
            'X_train': X_train_scaled,
            'X_test': X_test_scaled,
            'scaler': scaler,
            'description': 'Scaled data (best for XGBoost/LightGBM if using scaled features)'
        }
    else:
        data_versions['scaled'] = data_versions['original']
        print("Skipping scaling - using original data for all models")
    
    print(f"Final memory usage: {get_memory_usage():.1f} MB")
    
    return {
        'data_versions': data_versions,
        'y_train': y_train,
        'y_test': y_test,
        'feature_list': feature_list,
        'label_encoders': label_encoders,
        'categorical_columns': categorical_columns,
        'target_name': target_name,
        'scaler': data_versions['scaled'].get('scaler', None) 
    }

def custom_score(y_true, y_pred):
    """Enhanced scoring function combining multiple metrics"""
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    # Combine metrics for a comprehensive score
    score = r2  # Start with R²
    if mse > 0:
        score += 1 / (1 + mse)  # Add inverse MSE (normalized)
    if mae > 0:
        score += 1 / (1 + mae)  # Add inverse MAE (normalized)
    return score

def optimize_xgboost(X_train_scaled, y_train, n_trials, device='cuda'):
    """Optimize XGBoost hyperparameters using Optuna"""
    def objective(trial):
        params = {
            'max_depth': trial.suggest_int('max_depth', 3, 20),
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.5, log=True),
            'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
            'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
            'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
            'max_leaves': trial.suggest_int('max_leaves', 0, 1000),
            'max_bin': trial.suggest_int('max_bin', 200, 1000),
            'device': device,
            'tree_method': 'hist' if device == 'cuda' else 'auto'
        }
      
        model = XGBRegressor(**params, random_state=42)
        try:
            scores = cross_val_score(
                model, X_train_scaled, y_train,
                cv=5,
                scoring=make_scorer(custom_score, greater_is_better=True),
                n_jobs=1 if device == 'cuda' else -1
            )
            return -scores.mean()
        except Exception as e:
            print(f"Error in XGBoost trial: {e}")
            return float('inf')

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)
    return study.best_params

def optimize_lightgbm(X_train_scaled, y_train, n_trials, device='gpu'):
    """Optimize LightGBM hyperparameters using Optuna"""
    def objective(trial):
        params = {
            'num_leaves': trial.suggest_int('num_leaves', 5, 100),
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.5, log=True),
            'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
            'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
            'max_depth': trial.suggest_int('max_depth', -1, 20),
            'min_split_gain': trial.suggest_float('min_split_gain', 1e-8, 1.0, log=True),
            'max_bin': trial.suggest_int('max_bin', 200, 1000),
            'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'goss']),
            'device': device,
            'gpu_use_dp': True if device == 'gpu' else False,
            'force_col_wise': True if device == 'gpu' else False
        }
        
        model = LGBMRegressor(**params, random_state=42, verbose=-1)
        try:
            scores = cross_val_score(
                model, X_train_scaled, y_train,
                cv=5,
                scoring=make_scorer(custom_score, greater_is_better=True),
                n_jobs=1 if device == 'gpu' else -1
            )
            return -scores.mean()
        except Exception as e:
            print(f"Error in LightGBM trial: {e}")
            return float('inf')

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)
    return study.best_params

def optimize_catboost(X_train, y_train, n_trials, device='GPU', cat_features=None):
    """Optimize CatBoost hyperparameters using Optuna"""
    def objective(trial):
        params = {
            'iterations': trial.suggest_int('iterations', 100, 3000),
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.5, log=True),
            'depth': trial.suggest_int('depth', 3, 16),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True),
            'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),
            'random_strength': trial.suggest_float('random_strength', 1e-8, 10.0, log=True),
            'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
            'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
            'od_wait': trial.suggest_int('od_wait', 10, 50),
            'task_type': device,
            'verbose': False,
            'random_seed': 42
        }
        
        # Add bootstrap-specific parameters
        if params['bootstrap_type'] == 'Bayesian':
            params['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0.0, 1.0)
        elif params['bootstrap_type'] == 'Bernoulli':
            params['subsample'] = trial.suggest_float('subsample', 0.5, 1.0)
        
        model = CatBoostRegressor(**params)
        try:
            scores = cross_val_score(
                model, X_train, y_train,
                cv=5,
                scoring=make_scorer(custom_score, greater_is_better=True),
                n_jobs=1
            )
            return -scores.mean()
        except Exception as e:
            print(f"Error in CatBoost trial: {e}")
            return float('inf')

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)
    return study.best_params

def create_feature_importance_plot(model, feature_names, model_name, top_n=20):
    """Create and save feature importance plot"""
    if not hasattr(model, 'feature_importances_'):
        print(f"Model {model_name} doesn't have feature_importances_ attribute")
        return None
    
    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False).head(top_n)
    
    plt.figure(figsize=(12, 8))
    sns.barplot(data=importance_df, x='importance', y='feature', palette='viridis')
    plt.title(f'{model_name} - Top {top_n} Feature Importance')
    plt.xlabel('Feature Importance')
    plt.tight_layout()
    
    plot_filename = f'{model_name.lower()}_feature_importance.png'
    plt.savefig(plot_filename, dpi=300, bbox_inches='tight')
    plt.close()  # Important for memory management
    
    print(f"Feature importance plot saved to {plot_filename}")
    return plot_filename



def train_model_with_memory_management(model_class, params, data_prep, model_name, use_scaled=True):
    """
    Train a single model with proper memory management
    """
    print(f"\n{'='*50}")
    print(f"Training {model_name}")
    print(f"{'='*50}")
    
    initial_memory = get_memory_usage()
    print(f"Initial memory: {initial_memory:.1f} MB")
    
    # Choose appropriate data version
    if use_scaled and 'scaled' in data_prep['data_versions']:
        data_version = data_prep['data_versions']['scaled']
        print(f"Using scaled data for {model_name}")
    else:
        data_version = data_prep['data_versions']['original']
        print(f"Using original data for {model_name}")
    
    X_train = data_version['X_train']
    X_test = data_version['X_test']
    y_train = data_prep['y_train']
    y_test = data_prep['y_test']
    
    # Add model-specific verbose settings
    if model_class.__name__ == 'LGBMRegressor':
        params['verbose'] = -1
    elif model_class.__name__ == 'CatBoostRegressor':
        params['verbose'] = False
    
    # Initialize and train model
    model = model_class(**params, random_state=42)
    
    print(f"Training {model_name}...")
    training_memory = get_memory_usage()
    
    # Train with categorical features for CatBoost
    if model_class.__name__ == 'CatBoostRegressor' and data_prep['categorical_columns']:
        # Map categorical column names to indices
        cat_feature_indices = [i for i, col in enumerate(data_prep['feature_list']) 
                              if col in data_prep['categorical_columns']]
        model.fit(X_train, y_train, cat_features=cat_feature_indices)
    else:
        model.fit(X_train, y_train)
    
    post_training_memory = get_memory_usage()
    print(f"Memory during training: {training_memory:.1f} MB")
    print(f"Memory after training: {post_training_memory:.1f} MB")
    print(f"Training memory increase: {post_training_memory - training_memory:.1f} MB")
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Calculate metrics
    metrics = {
        'mse': mean_squared_error(y_test, y_pred),
        'rmse': np.sqrt(mean_squared_error(y_test, y_pred)),
        'mae': mean_absolute_error(y_test, y_pred),
        'r2': r2_score(y_test, y_pred),
        'mape': np.mean(np.abs((y_test - y_pred) / np.maximum(np.abs(y_test), 1e-8))) * 100,  # Fixed division by zero
        'max_error': np.max(np.abs(y_test - y_pred)),
        'explained_variance': 1 - np.var(y_test - y_pred) / np.var(y_test)
    }

    # Create feature importance plot
    plot_filename = create_feature_importance_plot(
        model, 
        data_prep['feature_list'], 
        model_name, 
        top_n=min(20, len(data_prep['feature_list']))
    )
    
    # Add to result dictionary:
   
    print(f"\n{model_name} Results:")
    for metric, value in metrics.items():
        print(f"{metric.upper()}: {value:.6f}")
    
    final_memory = get_memory_usage()
    print(f"Final memory: {final_memory:.1f} MB")
    print(f"Total memory increase: {final_memory - initial_memory:.1f} MB")
    
    # Save model with memory-efficient approach
    model_filename = f'best_{model_name.lower()}_model.pkl'
    joblib.dump(model, model_filename)
    print(f"Model saved to {model_filename}")

    preprocessing_objects = {
        'scaler': data_prep.get('scaler'),
        'label_encoders': data_prep['label_encoders'],
        'feature_list': data_prep['feature_list'],
        'categorical_columns': data_prep['categorical_columns'],
        'target_name': data_prep['target_name']
    }
    preprocessing_filename = f'{model_name.lower()}_preprocessing.pkl'
    joblib.dump(preprocessing_objects, preprocessing_filename)
    print(f"Preprocessing objects saved to {preprocessing_filename}")
    
    result = {
        'model_name': model_name,
        'metrics': metrics,
        'params': params,
        'feature_importance': model.feature_importances_ if hasattr(model, 'feature_importances_') else None,
        'model_file': model_filename,
        'preprocessing_file': preprocessing_filename,  
        'importance_plot': plot_filename  
    }
    

    del model, X_train, X_test
    gc.collect()
    
    cleanup_memory = get_memory_usage()
    print(f"Memory after cleanup: {cleanup_memory:.1f} MB")
    
    return result

def compare_models_memory_efficient(data, features, target, models_to_compare=None, n_trials=100):
    """
    Memory-efficient model comparison that processes one model at a time
    """
    if models_to_compare is None:
        models_to_compare = ['XGBoost', 'LightGBM', 'CatBoost']
    
    print("Starting memory-efficient model comparison...")
    print(f"Initial system memory: {get_memory_usage():.1f} MB")
    
    # Check GPU availability
    xgb_device, lgb_device, cb_device = check_gpu_availability()
    
    # Prepare data once
    print("\nPreparing data...")
    data_prep = prepare_data_with_memory_management(data, features, target)
    
    # Store results without keeping models in memory
    all_results = {}
    
    # Process each model individually
    for model_name in models_to_compare:
        print(f"\n{'='*60}")
        print(f"Processing {model_name}")
        print(f"{'='*60}")
        
        try:
            if model_name == 'XGBoost':
                # Optimize hyperparameters
                print(f"Optimizing XGBoost hyperparameters (n_trials={n_trials})...")
                best_params = optimize_xgboost(
                    data_prep['data_versions']['scaled']['X_train'],
                    data_prep['y_train'],
                    n_trials,
                    xgb_device
                )
                result = train_model_with_memory_management(
                    XGBRegressor, best_params, data_prep, model_name, use_scaled=True
                )
            
            elif model_name == 'LightGBM':
                # Optimize hyperparameters
                print(f"Optimizing LightGBM hyperparameters (n_trials={n_trials})...")
                best_params = optimize_lightgbm(
                    data_prep['data_versions']['scaled']['X_train'],
                    data_prep['y_train'],
                    n_trials,
                    lgb_device
                )
                result = train_model_with_memory_management(
                    LGBMRegressor, best_params, data_prep, model_name, use_scaled=True
                )
            
            elif model_name == 'CatBoost':
                # Optimize hyperparameters
                print(f"Optimizing CatBoost hyperparameters (n_trials={n_trials})...")
                best_params = optimize_catboost(
                    data_prep['data_versions']['original']['X_train'],
                    data_prep['y_train'],
                    n_trials,
                    cb_device,
                    data_prep['categorical_columns']
                )
                result = train_model_with_memory_management(
                    CatBoostRegressor, best_params, data_prep, model_name, use_scaled=False
                )
            
            all_results[model_name] = result
            
        except Exception as e:
            print(f"Error training {model_name}: {e}")
            continue
        
        # Force garbage collection between models
        gc.collect()
        print(f"Memory after {model_name} cleanup: {get_memory_usage():.1f} MB")
    
    # Rank models by performance
    if all_results:
        print(f"\n{'='*60}")
        print("FINAL RESULTS")
        print(f"{'='*60}")
        
        model_rankings = sorted(all_results.items(), key=lambda x: x[1]['metrics']['r2'], reverse=True)
        
        for i, (model_name, results) in enumerate(model_rankings, 1):
            print(f"{i}. {model_name}: R² = {results['metrics']['r2']:.6f}")
        
        best_model_name = model_rankings[0][0]
        best_metrics = model_rankings[0][1]['metrics']
        
        print(f"\nBest Model: {best_model_name}")
        print(f"Best R² Score: {best_metrics['r2']:.6f}")
        print(f"Best RMSE: {best_metrics['rmse']:.6f}")
        
        # Save results summary
        results_summary = {
            'best_model': best_model_name,
            'model_rankings': [(name, results['metrics']['r2']) for name, results in model_rankings],
            'detailed_results': {name: results['metrics'] for name, results in all_results.items()}
        }
        
        with open('model_comparison_results.json', 'w') as f:
            json.dump(results_summary, f, indent=4)
        
        print("\nResults saved to 'model_comparison_results.json'")
        
        return all_results, best_model_name, best_metrics
    
    else:
        print("No models were successfully trained!")
        return {}, None, None

results, best_model, best_metrics = compare_models_memory_efficient(
    data=enhanced_df,
    features=[col for col in enhanced_df.columns if col != 'xG'],
    target='xG',
    n_trials=50  
)



print(f"\nComparison completed!")
if best_model:
    print(f"Best model: {best_model}")
    print(f"Best R²: {best_metrics['r2']:.6f}")

Starting memory-efficient model comparison...
Initial system memory: 729.4 MB
Checking GPU availability...
✓ XGBoost GPU support available and working




✓ LightGBM GPU support available and working
✓ CatBoost GPU support available and working

Final device configuration:
  XGBoost: CUDA
  LightGBM: GPU
  CatBoost: GPU

Preparing data...
Initial memory usage: 1126.6 MB
Data shape: (87111, 36)
Using 35 features and target: xG
Memory after data selection: 1126.6 MB
Categorical columns (1): ['shot_direction_category']...
Numeric columns (34): ['starting_x', 'starting_y', 'end_x', 'end_y', 'distance_to_goal']...
Encoding categorical variables...
Memory after preprocessing: 1128.1 MB
Train shape: (69688, 35), Test shape: (17423, 35)
Creating scaled versions for tree models...


[I 2025-05-26 21:05:50,905] A new study created in memory with name: no-name-59154b93-7eac-4855-8ece-02b80066a3ad


Final memory usage: 1129.7 MB

Processing XGBoost
Optimizing XGBoost hyperparameters (n_trials=50)...


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-05-26 21:06:30,889] Trial 0 finished with value: -2.4788470198529313 and parameters: {'max_depth': 12, 'learning_rate': 0.3761877427722887, 'n_estimators': 1906, 'min_child_weight': 2, 'subsample': 0.8046678859191494, 'colsample_bytree': 0.6460625868180361, 'gamma': 7.949845943523055e-05, 'alpha': 4.593450425693394e-06, 'lambda': 8.941522374306882e-08, 'max_leaves': 269, 'max_bin': 211}. Best is trial 0 with value: -2.4788470198529313.
[I 2025-05-26 21:07:34,131] Trial 1 finished with value: -2.61233095627862 and parameters: {'max_depth': 17, 'learning_rate': 0.0195215336492488, 'n_estimators': 2334, 'min_child_weight': 13, 'subsample': 0.9723216895159164, 'colsample_bytree': 0.7350412654841763, 'gamma': 0.002645803182249833, 'alpha': 0.001653702589707541, 'lambda': 8.588486766274157e-07, 'max_leaves': 902, 'max_bin': 839}. Best is trial 1 with value: -2.61233095627862.
[I 2025-05-26 21:10:32,896] Trial 2 finished with value: -2.593151838472597 and parameters: {'max_depth': 20,

[I 2025-05-26 21:56:20,871] A new study created in memory with name: no-name-76a6d480-35b5-4318-9261-ed21ae9afe88


Memory after cleanup: 1700.0 MB
Memory after XGBoost cleanup: 1700.0 MB

Processing LightGBM
Optimizing LightGBM hyperparameters (n_trials=50)...


  0%|          | 0/50 [00:00<?, ?it/s]

[LightGBM] [Fatal] bin size 833 cannot run on GPU
[LightGBM] [Fatal] bin size 833 cannot run on GPU
[LightGBM] [Fatal] bin size 833 cannot run on GPU
[LightGBM] [Fatal] bin size 833 cannot run on GPU
[LightGBM] [Fatal] bin size 833 cannot run on GPU


Error in LightGBM trial: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1092, in fit
    super().fit(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/basic.py", line 3437, in __init__
    _safe_cal

[LightGBM] [Fatal] bin size 557 cannot run on GPU
[LightGBM] [Fatal] bin size 557 cannot run on GPU
[LightGBM] [Fatal] bin size 557 cannot run on GPU
[LightGBM] [Fatal] bin size 557 cannot run on GPU
[LightGBM] [Fatal] bin size 557 cannot run on GPU
[LightGBM] [Fatal] bin size 323 cannot run on GPU


Error in LightGBM trial: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1092, in fit
    super().fit(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/basic.py", line 3437, in __init__
    _safe_cal

[LightGBM] [Fatal] bin size 323 cannot run on GPU
[LightGBM] [Fatal] bin size 323 cannot run on GPU
[LightGBM] [Fatal] bin size 323 cannot run on GPU
[LightGBM] [Fatal] bin size 323 cannot run on GPU
[LightGBM] [Fatal] bin size 345 cannot run on GPU


Error in LightGBM trial: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1092, in fit
    super().fit(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/basic.py", line 3437, in __init__
    _safe_cal

[LightGBM] [Fatal] bin size 345 cannot run on GPU
[LightGBM] [Fatal] bin size 345 cannot run on GPU
[LightGBM] [Fatal] bin size 345 cannot run on GPU
[LightGBM] [Fatal] bin size 345 cannot run on GPU
[LightGBM] [Fatal] bin size 504 cannot run on GPU


Error in LightGBM trial: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1092, in fit
    super().fit(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/basic.py", line 3437, in __init__
    _safe_cal

[LightGBM] [Fatal] bin size 504 cannot run on GPU
[LightGBM] [Fatal] bin size 504 cannot run on GPU
[LightGBM] [Fatal] bin size 504 cannot run on GPU
[LightGBM] [Fatal] bin size 504 cannot run on GPU
[LightGBM] [Fatal] bin size 575 cannot run on GPU


Error in LightGBM trial: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1092, in fit
    super().fit(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/basic.py", line 3437, in __init__
    _safe_cal

[LightGBM] [Fatal] bin size 575 cannot run on GPU
[LightGBM] [Fatal] bin size 575 cannot run on GPU
[LightGBM] [Fatal] bin size 575 cannot run on GPU
[LightGBM] [Fatal] bin size 575 cannot run on GPU


Error in LightGBM trial: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1092, in fit
    super().fit(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/basic.py", line 3437, in __init__
    _safe_cal

[LightGBM] [Fatal] bin size 420 cannot run on GPU
[LightGBM] [Fatal] bin size 420 cannot run on GPU
[LightGBM] [Fatal] bin size 420 cannot run on GPU
[LightGBM] [Fatal] bin size 420 cannot run on GPU
[LightGBM] [Fatal] bin size 420 cannot run on GPU
[LightGBM] [Fatal] bin size 417 cannot run on GPU


Error in LightGBM trial: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1092, in fit
    super().fit(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/basic.py", line 3437, in __init__
    _safe_cal

[LightGBM] [Fatal] bin size 417 cannot run on GPU
[LightGBM] [Fatal] bin size 417 cannot run on GPU
[LightGBM] [Fatal] bin size 417 cannot run on GPU
[LightGBM] [Fatal] bin size 417 cannot run on GPU


Error in LightGBM trial: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1092, in fit
    super().fit(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/basic.py", line 3437, in __init__
    _safe_cal

[LightGBM] [Fatal] bin size 1001 cannot run on GPU
[LightGBM] [Fatal] bin size 1001 cannot run on GPU
[LightGBM] [Fatal] bin size 1001 cannot run on GPU
[LightGBM] [Fatal] bin size 1001 cannot run on GPU
[LightGBM] [Fatal] bin size 1001 cannot run on GPU
[LightGBM] [Fatal] bin size 270 cannot run on GPU


Error in LightGBM trial: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1092, in fit
    super().fit(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/basic.py", line 3437, in __init__
    _safe_cal

[LightGBM] [Fatal] bin size 270 cannot run on GPU
[LightGBM] [Fatal] bin size 270 cannot run on GPU
[LightGBM] [Fatal] bin size 270 cannot run on GPU
[LightGBM] [Fatal] bin size 270 cannot run on GPU


Error in LightGBM trial: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1092, in fit
    super().fit(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/basic.py", line 3437, in __init__
    _safe_cal

[LightGBM] [Fatal] bin size 837 cannot run on GPU
[LightGBM] [Fatal] bin size 837 cannot run on GPU
[LightGBM] [Fatal] bin size 837 cannot run on GPU
[LightGBM] [Fatal] bin size 837 cannot run on GPU
[LightGBM] [Fatal] bin size 837 cannot run on GPU


Error in LightGBM trial: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1092, in fit
    super().fit(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/basic.py", line 3437, in __init__
    _safe_cal

[LightGBM] [Fatal] bin size 714 cannot run on GPU
[LightGBM] [Fatal] bin size 714 cannot run on GPU
[LightGBM] [Fatal] bin size 714 cannot run on GPU
[LightGBM] [Fatal] bin size 714 cannot run on GPU
[LightGBM] [Fatal] bin size 714 cannot run on GPU


Error in LightGBM trial: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1092, in fit
    super().fit(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/basic.py", line 3437, in __init__
    _safe_cal

[LightGBM] [Fatal] bin size 705 cannot run on GPU
[LightGBM] [Fatal] bin size 705 cannot run on GPU
[LightGBM] [Fatal] bin size 705 cannot run on GPU
[LightGBM] [Fatal] bin size 705 cannot run on GPU
[LightGBM] [Fatal] bin size 705 cannot run on GPU


Error in LightGBM trial: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1092, in fit
    super().fit(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/basic.py", line 3437, in __init__
    _safe_cal

[LightGBM] [Fatal] bin size 876 cannot run on GPU
[LightGBM] [Fatal] bin size 876 cannot run on GPU
[LightGBM] [Fatal] bin size 876 cannot run on GPU
[LightGBM] [Fatal] bin size 876 cannot run on GPU
[LightGBM] [Fatal] bin size 876 cannot run on GPU


Error in LightGBM trial: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1092, in fit
    super().fit(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/basic.py", line 3437, in __init__
    _safe_cal

[LightGBM] [Fatal] bin size 705 cannot run on GPU
[LightGBM] [Fatal] bin size 705 cannot run on GPU
[LightGBM] [Fatal] bin size 705 cannot run on GPU
[LightGBM] [Fatal] bin size 705 cannot run on GPU
[LightGBM] [Fatal] bin size 705 cannot run on GPU


Error in LightGBM trial: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1092, in fit
    super().fit(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/basic.py", line 3437, in __init__
    _safe_cal

[LightGBM] [Fatal] bin size 610 cannot run on GPU
[LightGBM] [Fatal] bin size 610 cannot run on GPU
[LightGBM] [Fatal] bin size 610 cannot run on GPU
[LightGBM] [Fatal] bin size 610 cannot run on GPU
[LightGBM] [Fatal] bin size 610 cannot run on GPU


Error in LightGBM trial: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1092, in fit
    super().fit(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/basic.py", line 3437, in __init__
    _safe_cal

[LightGBM] [Fatal] bin size 844 cannot run on GPU
[LightGBM] [Fatal] bin size 844 cannot run on GPU
[LightGBM] [Fatal] bin size 844 cannot run on GPU
[LightGBM] [Fatal] bin size 844 cannot run on GPU
[LightGBM] [Fatal] bin size 844 cannot run on GPU


Error in LightGBM trial: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1092, in fit
    super().fit(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/basic.py", line 3437, in __init__
    _safe_cal

[LightGBM] [Fatal] bin size 1001 cannot run on GPU
[LightGBM] [Fatal] bin size 1001 cannot run on GPU
[LightGBM] [Fatal] bin size 1001 cannot run on GPU
[LightGBM] [Fatal] bin size 1001 cannot run on GPU
[LightGBM] [Fatal] bin size 1001 cannot run on GPU


Error in LightGBM trial: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1092, in fit
    super().fit(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/basic.py", line 3437, in __init__
    _safe_cal

[LightGBM] [Fatal] bin size 588 cannot run on GPU
[LightGBM] [Fatal] bin size 588 cannot run on GPU
[LightGBM] [Fatal] bin size 588 cannot run on GPU
[LightGBM] [Fatal] bin size 588 cannot run on GPU
[LightGBM] [Fatal] bin size 588 cannot run on GPU


Error in LightGBM trial: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1092, in fit
    super().fit(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/basic.py", line 3437, in __init__
    _safe_cal



[I 2025-05-26 21:58:01,903] Trial 19 finished with value: -2.6207093271534863 and parameters: {'num_leaves': 90, 'learning_rate': 0.034354471227977065, 'n_estimators': 828, 'min_child_samples': 154, 'subsample': 0.9282738932069547, 'colsample_bytree': 0.7786405502709826, 'reg_alpha': 0.00045010402417663476, 'reg_lambda': 0.353882202225327, 'max_depth': 17, 'min_split_gain': 4.972677397406122e-07, 'max_bin': 205, 'boosting_type': 'goss'}. Best is trial 19 with value: -2.6207093271534863.
[I 2025-05-26 21:59:17,402] Trial 20 finished with value: -2.6215763420459197 and parameters: {'num_leaves': 95, 'learning_rate': 0.03202852781642406, 'n_estimators': 839, 'min_child_samples': 157, 'subsample': 0.9248431312249448, 'colsample_bytree': 0.7527012336433787, 'reg_alpha': 0.0004029336969510896, 'reg_lambda': 8.616938322896777, 'max_depth': 19, 'min_split_gain': 1.0178141320079594e-06, 'max_bin': 203, 'boosting_type': 'goss'}. Best is trial 20 with value: -2.6215763420459197.
[I 2025-05-26 22:

[LightGBM] [Fatal] bin size 279 cannot run on GPU
[LightGBM] [Fatal] bin size 279 cannot run on GPU
[LightGBM] [Fatal] bin size 279 cannot run on GPU
[LightGBM] [Fatal] bin size 279 cannot run on GPU
[LightGBM] [Fatal] bin size 279 cannot run on GPU


Error in LightGBM trial: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1092, in fit
    super().fit(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/basic.py", line 3437, in __init__
    _safe_cal

[LightGBM] [Fatal] bin size 384 cannot run on GPU
[LightGBM] [Fatal] bin size 384 cannot run on GPU
[LightGBM] [Fatal] bin size 384 cannot run on GPU
[LightGBM] [Fatal] bin size 384 cannot run on GPU
[LightGBM] [Fatal] bin size 384 cannot run on GPU


Error in LightGBM trial: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1092, in fit
    super().fit(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/basic.py", line 3437, in __init__
    _safe_cal

[LightGBM] [Fatal] bin size 283 cannot run on GPU
[LightGBM] [Fatal] bin size 283 cannot run on GPU
[LightGBM] [Fatal] bin size 283 cannot run on GPU
[LightGBM] [Fatal] bin size 283 cannot run on GPU
[LightGBM] [Fatal] bin size 283 cannot run on GPU


Error in LightGBM trial: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1092, in fit
    super().fit(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/basic.py", line 3437, in __init__
    _safe_cal

[LightGBM] [Fatal] bin size 478 cannot run on GPU
[LightGBM] [Fatal] bin size 478 cannot run on GPU
[LightGBM] [Fatal] bin size 478 cannot run on GPU
[LightGBM] [Fatal] bin size 478 cannot run on GPU
[LightGBM] [Fatal] bin size 478 cannot run on GPU


Error in LightGBM trial: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1092, in fit
    super().fit(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/basic.py", line 3437, in __init__
    _safe_cal

[LightGBM] [Fatal] bin size 329 cannot run on GPU
[LightGBM] [Fatal] bin size 329 cannot run on GPU
[LightGBM] [Fatal] bin size 329 cannot run on GPU
[LightGBM] [Fatal] bin size 329 cannot run on GPU
[LightGBM] [Fatal] bin size 329 cannot run on GPU


Error in LightGBM trial: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1092, in fit
    super().fit(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/basic.py", line 3437, in __init__
    _safe_cal

[LightGBM] [Fatal] bin size 269 cannot run on GPU
[LightGBM] [Fatal] bin size 269 cannot run on GPU
[LightGBM] [Fatal] bin size 269 cannot run on GPU
[LightGBM] [Fatal] bin size 269 cannot run on GPU
[LightGBM] [Fatal] bin size 269 cannot run on GPU


Error in LightGBM trial: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1092, in fit
    super().fit(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/basic.py", line 3437, in __init__
    _safe_cal

[LightGBM] [Fatal] bin size 372 cannot run on GPU
[LightGBM] [Fatal] bin size 372 cannot run on GPU
[LightGBM] [Fatal] bin size 372 cannot run on GPU
[LightGBM] [Fatal] bin size 372 cannot run on GPU
[LightGBM] [Fatal] bin size 372 cannot run on GPU


Error in LightGBM trial: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1092, in fit
    super().fit(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/basic.py", line 3437, in __init__
    _safe_cal

[LightGBM] [Fatal] bin size 293 cannot run on GPU
[LightGBM] [Fatal] bin size 293 cannot run on GPU
[LightGBM] [Fatal] bin size 293 cannot run on GPU
[LightGBM] [Fatal] bin size 293 cannot run on GPU
[LightGBM] [Fatal] bin size 293 cannot run on GPU


Error in LightGBM trial: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1092, in fit
    super().fit(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/basic.py", line 3437, in __init__
    _safe_cal

[LightGBM] [Fatal] bin size 340 cannot run on GPU
[LightGBM] [Fatal] bin size 340 cannot run on GPU
[LightGBM] [Fatal] bin size 340 cannot run on GPU
[LightGBM] [Fatal] bin size 340 cannot run on GPU
[LightGBM] [Fatal] bin size 340 cannot run on GPU


Error in LightGBM trial: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1092, in fit
    super().fit(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/basic.py", line 3437, in __init__
    _safe_cal



[I 2025-05-26 22:08:01,881] Trial 36 finished with value: -2.626859415232989 and parameters: {'num_leaves': 83, 'learning_rate': 0.017157538783294967, 'n_estimators': 1000, 'min_child_samples': 172, 'subsample': 0.8982672515806632, 'colsample_bytree': 0.8116433906032052, 'reg_alpha': 0.00011860154001995859, 'reg_lambda': 1.940762280472631, 'max_depth': 19, 'min_split_gain': 1.1014668880836937e-05, 'max_bin': 234, 'boosting_type': 'gbdt'}. Best is trial 36 with value: -2.626859415232989.


[LightGBM] [Fatal] bin size 468 cannot run on GPU
[LightGBM] [Fatal] bin size 468 cannot run on GPU
[LightGBM] [Fatal] bin size 468 cannot run on GPU
[LightGBM] [Fatal] bin size 468 cannot run on GPU
[LightGBM] [Fatal] bin size 468 cannot run on GPU


Error in LightGBM trial: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1092, in fit
    super().fit(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/basic.py", line 3437, in __init__
    _safe_cal

[LightGBM] [Fatal] bin size 414 cannot run on GPU
[LightGBM] [Fatal] bin size 414 cannot run on GPU
[LightGBM] [Fatal] bin size 414 cannot run on GPU
[LightGBM] [Fatal] bin size 414 cannot run on GPU
[LightGBM] [Fatal] bin size 414 cannot run on GPU


Error in LightGBM trial: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1092, in fit
    super().fit(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/basic.py", line 3437, in __init__
    _safe_cal

[LightGBM] [Fatal] bin size 309 cannot run on GPU
[LightGBM] [Fatal] bin size 309 cannot run on GPU
[LightGBM] [Fatal] bin size 309 cannot run on GPU
[LightGBM] [Fatal] bin size 309 cannot run on GPU
[LightGBM] [Fatal] bin size 309 cannot run on GPU


Error in LightGBM trial: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1092, in fit
    super().fit(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/basic.py", line 3437, in __init__
    _safe_cal

[LightGBM] [Fatal] bin size 355 cannot run on GPU
[LightGBM] [Fatal] bin size 355 cannot run on GPU
[LightGBM] [Fatal] bin size 355 cannot run on GPU
[LightGBM] [Fatal] bin size 355 cannot run on GPU
[LightGBM] [Fatal] bin size 355 cannot run on GPU


Error in LightGBM trial: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1092, in fit
    super().fit(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/basic.py", line 3437, in __init__
    _safe_cal

[LightGBM] [Fatal] bin size 327 cannot run on GPU
[LightGBM] [Fatal] bin size 327 cannot run on GPU
[LightGBM] [Fatal] bin size 327 cannot run on GPU
[LightGBM] [Fatal] bin size 327 cannot run on GPU
[LightGBM] [Fatal] bin size 327 cannot run on GPU


Error in LightGBM trial: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1092, in fit
    super().fit(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/basic.py", line 3437, in __init__
    _safe_cal

[LightGBM] [Fatal] bin size 532 cannot run on GPU
[LightGBM] [Fatal] bin size 532 cannot run on GPU
[LightGBM] [Fatal] bin size 532 cannot run on GPU
[LightGBM] [Fatal] bin size 532 cannot run on GPU
[LightGBM] [Fatal] bin size 532 cannot run on GPU


Error in LightGBM trial: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1092, in fit
    super().fit(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/basic.py", line 3437, in __init__
    _safe_cal

[LightGBM] [Fatal] bin size 300 cannot run on GPU
[LightGBM] [Fatal] bin size 300 cannot run on GPU
[LightGBM] [Fatal] bin size 300 cannot run on GPU
[LightGBM] [Fatal] bin size 300 cannot run on GPU
[LightGBM] [Fatal] bin size 300 cannot run on GPU


Error in LightGBM trial: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1092, in fit
    super().fit(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/basic.py", line 3437, in __init__
    _safe_cal

[LightGBM] [Fatal] bin size 391 cannot run on GPU
[LightGBM] [Fatal] bin size 391 cannot run on GPU
[LightGBM] [Fatal] bin size 391 cannot run on GPU
[LightGBM] [Fatal] bin size 391 cannot run on GPU
[LightGBM] [Fatal] bin size 391 cannot run on GPU


Error in LightGBM trial: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1092, in fit
    super().fit(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/basic.py", line 3437, in __init__
    _safe_cal

[LightGBM] [Fatal] bin size 642 cannot run on GPU
[LightGBM] [Fatal] bin size 642 cannot run on GPU
[LightGBM] [Fatal] bin size 642 cannot run on GPU
[LightGBM] [Fatal] bin size 642 cannot run on GPU
[LightGBM] [Fatal] bin size 642 cannot run on GPU


Error in LightGBM trial: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1092, in fit
    super().fit(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/basic.py", line 3437, in __init__
    _safe_cal

[I 2025-05-26 22:10:19,133] A new study created in memory with name: no-name-6f0bcc0c-0100-4c1c-8ec6-0ad44e5c4174


Memory after cleanup: 1719.7 MB
Memory after LightGBM cleanup: 1719.7 MB

Processing CatBoost
Optimizing CatBoost hyperparameters (n_trials=50)...


  0%|          | 0/50 [00:00<?, ?it/s]

Error in CatBoost trial: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/catboost/core.py", line 5873, in fit
    return self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight, None, None, None, None, baseline,
  File "/opt/conda/lib/python3.10/site-packages/catboost/core.py", line 2395, in _fit
    train_params = self._prepare_train_params(
  File "/opt/conda/lib/python3.10/site-packages/catboost/core.py", line 2321, in _prepare_train_params
    _check