<a href="https://colab.research.google.com/github/moglee2699/projects/blob/main/web_scrapping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Function to map the StatsBomb penalty data to the required CSV format
from datetime import datetime

def _goal_section(y, goal_left=30, goal_width=20, sections=6):
    """
    Return the 1-based goal section (1..sections) that a y-coordinate falls in.

    The span [goal_left, goal_left + goal_width] is cut into `sections` equal
    strips from left to right; values outside the span are clamped to the
    first or last strip.
    """
    # NOTE(review): 30/20 are the original approximations of the goal mouth.
    # StatsBomb's pitch spec appears to place the posts at y=36 and y=44 --
    # confirm before changing, since it would re-bin every shot.
    raw_section = int(((y - goal_left) / goal_width) * sections) + 1
    return min(sections, max(1, raw_section))


def _shot_end_y(shot, default_y=40):
    """
    Return the y-coordinate of the shot's end location.

    Falls back to `default_y` when `end_location` is missing, None, shorter
    than two elements, or not indexable at all.
    """
    end_location = shot.get('end_location')
    try:
        return end_location[1]
    except (TypeError, IndexError, KeyError):
        return default_y


def map_penalty_to_csv_format(penalty, id_number):
    """
    Map a penalty event from StatsBomb to the required CSV format.

    Args:
        penalty: dict describing one penalty. The keys read here are
            'match_date', 'shot' (with nested 'outcome' -> 'name' and
            'end_location'), 'is_home', 'minute' and 'season_name'.
            Missing or None values fall back to defaults.
        id_number: value stored in the 'ID' column.

    Returns:
        dict with the la1..la6 one-hot shot-placement flags, the ID/position/
        foot/age fields, location flags, minute flags, competition flags,
        outcome flags, 'Choice' (1=Goal, 2=Saved, 3=anything else), plus
        'season' and 'match_date' for reference.

    Note:
        Age comes from `calculate_age`, which is defined elsewhere in the
        notebook; it is assumed to return a number (it is passed to round()).
    """
    # Calculate Messi's age
    messi_birthdate = "1987-06-24"
    match_date = penalty.get('match_date')
    age = calculate_age(messi_birthdate, match_date)

    # 'shot' / 'outcome' may be absent or None; treat both as "no data"
    shot = penalty.get('shot')
    if not isinstance(shot, dict):
        shot = {}
    outcome_info = shot.get('outcome')
    if isinstance(outcome_info, dict):
        outcome = outcome_info.get('name', 'Unknown')
    else:
        outcome = 'Unknown'

    is_home = penalty.get('is_home')
    season = penalty.get('season_name')

    # A None minute would break the range comparisons below
    minute = penalty.get('minute')
    if minute is None:
        minute = 0

    # Map outcome to Choice variable (1=Goal, 2=Saved, 3=Missed)
    if outcome == 'Goal':
        choice = 1
    elif outcome == 'Saved':
        choice = 2
    else:  # Off T, Wayward, etc.
        choice = 3

    # Map shot location to the six goal areas (la1-la6); only the
    # y-coordinate is used for this simplified left-to-right mapping
    section_num = _goal_section(_shot_end_y(shot))

    # One-hot encode the chosen goal area
    row = {f'la{i}': 0 for i in range(1, 7)}
    row[f'la{section_num}'] = 1

    row.update({
        # Base fields
        'ID': id_number,
        'ppos': 10,  # Attacker position
        'foot': 1,  # Left-footed (Messi)
        'Age': round(age, 2),
        'ClubNT': 1,  # Club match

        # Location variables
        'loc': 1,  # Always a match
        'locH': 1 if is_home else 0,
        'locA': 0 if is_home else 1,
        'locN': 0,  # Not neutral

        # Minute variables (Min90 overlaps Mint30 by construction)
        'Min': minute,
        'Minf30': 1 if minute < 30 else 0,
        'Mins30': 1 if 30 <= minute < 60 else 0,
        'Mint30': 1 if minute >= 60 else 0,
        'Min90': 1 if minute >= 90 else 0,

        # Competition variables
        'comp': 1,  # Always a competition
        'comLea': 1,  # League match
        'comCup': 0,  # Not a cup match
        'comECC': 0,  # Not a European match

        # Goal outcome variables
        'lpbg': 1 if outcome == 'Goal' else 0,  # Goal achieved
        'lGKd': 1 if outcome == 'Saved' else 0,  # Goalkeeper save

        # Choice variable
        'Choice': choice,

        # Season and match date for reference
        'season': season,
        'match_date': match_date,
    })

    return row

# Build one CSV-format row per penalty; IDs are 1-based
penalty_dataset = [
    map_penalty_to_csv_format(penalty_event, row_id)
    for row_id, penalty_event in enumerate(all_messi_penalties, start=1)
]

# Load the rows into a DataFrame
penalty_df = pd.DataFrame(penalty_dataset)

# Inspect which columns the mapping produced before padding the schema
print(f"Columns in dataset so far: {len(penalty_df.columns)}")
print(sorted(penalty_df.columns))

# Column layout of the user's sample CSV, in the order the export must follow.
# NOTE(review): 'omi' (next to 'omid0') and the absence of 'MPGK5' (between
# 'MP5' and 'MP6') break the pattern of their neighbours -- confirm against
# the sample file before "fixing" them, since they define the output schema.
sample_columns = [
    'ID', 'ppos', 'def', 'mf', 'st', 'foot', 'Age', 'RepGK', 'HeiGK', 'ClubNT',
    'tpos', 'tup0', 'tup', 'tmid0', 'tmid', 'tbot0', 'tbot', 'otpos', 'oup0', 'oup',
    'omid0', 'omi', 'obot0', 'obot', 'fav', 'favU', 'favF', 'loc', 'locH', 'locN',
    'locA', 'rno', 'rno1', 'rno2', 'rnoGroup', 'rnoLS', 'rnoQF', 'rnoSF', 'rnoF',
    'rko', 'comp', 'comLea', 'comCup', 'comECC', 'comQuali', 'comNTC', 'comFrie',
    'SpecM', 'Imp', 'Importantness Game', 'IngSo', 'iLdbb', 'bb3', 'bb2o', 'bb2',
    'bb1', 'd0', 'il1', 'il2', 'Min', 'Minf30', 'Minf3', 'Mins30', 'Mins3', 'Mint30',
    'Mint3', 'Min900', 'Min90', 'Dec', 'GKSo', 'GKS', 'moveGK', 'notmovingGK', 'nmovGK',
    'cjGrey', 'cjBlack', 'cjBlue', 'cjGreen', 'cjYellow', 'cjOrange', 'cjRed', 'la1',
    'la2', 'la3', 'la4', 'la5', 'la6', 'lpbg', 'lGKd', 'tlpmon', 'Stlsbg', 'SlGKd',
    'Solsbg', 'SR1', 'SRGK1', 'SR2', 'SRGK2', 'SR3', 'SRGK3', 'SR4', 'SRGK4', 'SR5',
    'SRGK5', 'SR6', 'SRGK6', 'MP1', 'MPGK1', 'MP2', 'MPGK2', 'MP3', 'MPGK3', 'MP4',
    'MPGK4', 'MP5', 'MP6', 'MPGK6', 'perc1', 'perc2', 'perc3', 'perc4', 'perc5',
    'perc6', 'av_1', 'av_2', 'av_3', 'av_4', 'av_5', 'av_6', 'move', 'still', 'Choice'
]

# Keep match_date and season at the end of the final dataset even though
# they're not in the original sample (only if the mapping produced them)
for col in ['match_date', 'season']:
    if col in penalty_df.columns:
        sample_columns.append(col)

# Put the columns in sample order and create every missing one filled with 0
# in a single step. Inserting ~100 columns one at a time fragments the frame
# and makes pandas emit a PerformanceWarning.
penalty_df = penalty_df.reindex(columns=sample_columns, fill_value=0)

# Now let's see the final dataset
total_penalties = len(penalty_df)
print(f"\nFinal dataset has {total_penalties} rows and {len(penalty_df.columns)} columns")
print(penalty_df.head())

# Prepare for CSV export (index=False keeps the pandas row index out of the file)
penalty_df.to_csv("messi_penalties.csv", index=False)
print("\nCSV file created successfully: messi_penalties.csv")

# Display a summary of the dataset; Choice is 1=Goal, 2=Saved, 3=Missed
goal_count = int((penalty_df['Choice'] == 1).sum())
saved_count = int((penalty_df['Choice'] == 2).sum())
missed_count = int((penalty_df['Choice'] == 3).sum())
# Guard the ratio: an empty dataset would otherwise raise ZeroDivisionError
success_rate = goal_count / total_penalties * 100 if total_penalties else 0.0

print("\nSummary Statistics:")
print(f"Total penalties: {total_penalties}")
print(f"Goals: {goal_count}")
print(f"Saved: {saved_count}")
print(f"Missed: {missed_count}")
print(f"Success rate: {success_rate:.2f}%")

# Show the distribution of shot placement (count of rows flagged per goal area)
goal_areas = ['la1', 'la2', 'la3', 'la4', 'la5', 'la6']
shot_placement = {
    # int() keeps plain Python ints so the printed dict stays {'la1': 3, ...}
    area: int((penalty_df[area] == 1).sum())
    for area in goal_areas
}
print("\nShot placement distribution:")
print(shot_placement)

# Let's try to calculate Messi's age range in the dataset
min_age = penalty_df['Age'].min()
max_age = penalty_df['Age'].max()
print(f"\nAge range: {min_age:.2f} to {max_age:.2f} years")

# Display a few examples of the actual data for verification
print("\nSample data:")
for i in range(min(5, total_penalties)):
    row = penalty_df.iloc[i]
    # First flagged area among la1..la5; la6 is the fallback, as before
    placement = next(
        (area for area in goal_areas[:-1] if row[area] == 1),
        'la6',
    )
    print(f"Penalty {i+1}: Age {row['Age']}, Choice {row['Choice']}, " +
          f"Home {row['locH']}, Min {row['Min']}, Shot placement: " +
          f"{placement}"
         )