In [46]:
import numpy as np
import os
import joblib
import pandas as pd
from collections import Counter
from jinja2 import Template
from IPython.display import display, HTML

# Two-digit season suffixes ("23" -> 2023) for which a page is generated
years = ["23", "22", "21", "20", "19", "18", "17"]

# Pitch types are derived from the 2021 season data.
# NOTE(review): the recorded cell output is a MemoryError for a (48, 763191) float64 array,
# which matches this file's row count. If no other cell needs the remaining columns of
# pitches21, passing usecols=['pitch_type'] to read_csv would avoid that allocation - confirm.
pitches21 = pd.read_csv("C:\\Users\\noahb\\Desktop\\Master's Thesis\\2021_01_01_2021_12_31.csv")
count_of_pitch_type_in_Train_Data = Counter(pitches21['pitch_type'])  # Appearances of each pitch type

# Pitch types that are never shown, even when they are frequent enough.
# Filtering against a set (instead of list.remove) cannot raise ValueError when one
# of these types is missing from the data or already fell below the count threshold.
excluded_pitch_types = {'CS', 'EP'}

# Pitch types that have reg. models available: at least 100 appearances,
# ignoring the NaN key (rows without a pitch type) and the excluded types.
pitch_types = [
    pitch_type
    for pitch_type, appearances in count_of_pitch_type_in_Train_Data.items()
    if not pd.isna(pitch_type)
    and appearances >= 100
    and pitch_type not in excluded_pitch_types
]

# Directory that holds the per-season pitchers_dict{year}.joblib files
base_dir = "C:\\Users\\noahb\\Desktop\\Master's Thesis\\Regression Models\\Pitcher"

# Jinja2 template for one season's "Best Pitchers by Pitch" page.
# It is rendered with two variables:
#   year - season suffix that is printed directly after the literal "20" in the
#          title and the main heading
#   data - mapping of pitch type -> list of pitcher rows; one table is emitted per
#          pitch type and one table row per list entry, in iteration order.
#          Each pitcher row is read through the keys 'Pitcher Name',
#          'pitch_score_ALL', 'Number of Pitches', 'Statistic 1', 'Statistic 2'
#          and 'Statistic 3' (shown under the release speed, release spin rate
#          and vz0 column headers, respectively).
html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Best Pitchers by Pitch for 20{{ year }}</title>
    <style>
        body {
            font-family: Arial, sans-serif;
        }
        .home-button {
            position: fixed;
            top: 10px;
            right: 10px;
            background-color: #4CAF50;
            color: white;
            padding: 10px 20px;
            text-decoration: none;
            border-radius: 5px;
        }
        .home-button:hover {
            background-color: #45a049;
        }
        table {
            width: 100%;
            border-collapse: collapse;
            margin-bottom: 20px;
        }
        table, th, td {
            border: 1px solid black;
        }
        th, td {
            padding: 10px;
            text-align: left;
        }
        h2, h3 {
            text-align: center;
        }
        .separator {
            border: 0;
            height: 1px;
            background: #333;
            background-image: linear-gradient(to right, #ccc, #333, #ccc);
            margin: 20px 0;
        }
    </style>
</head>
<body>
    <a href="/index.html" class="home-button">Main Page</a>
    <h1>Best Pitchers by Pitch for 20{{ year }}</h1>
    <p> On this Page are Tables, that show the different pitchers for each pitch type. The boundary for the number of pitches is 200, because otherwise could it be possible, that pitchers are represented, who disturb the order. 
    The pitchers are order after their average pitch score for the corresponding pitch type. Further displayed is the number of pitches. The chosen features that are provided as well in the table, show the average release speed, the average release spin rate, 
    and the average velocity of the pitch, in feet per second, in z-dimension. 
    </p>
    <div>
        {% for pitch_type, pitchers in data.items() %}
            <h2>Pitch Type: {{ pitch_type }}</h2>
            <table>
                <tr>
                    <th>Pitcher Name</th>
                    <th>Avg. Pitch Score</th>
                    <th>Number of Pitches</th>
                    <th>Avg. Release Speed</th>
                    <th>Avg. Release Spin Rate</th>
                    <th>Avg. vz0</th>
                </tr>
                {% for pitcher in pitchers %}
                    <tr>
                        <td>{{ pitcher['Pitcher Name'] }}</td>
                        <td>{{ pitcher['pitch_score_ALL'] }}</td>
                        <td>{{ pitcher['Number of Pitches'] }}</td>
                        <td>{{ pitcher['Statistic 1'] }}</td>
                        <td>{{ pitcher['Statistic 2'] }}</td>
                        <td>{{ pitcher['Statistic 3'] }}</td>
                    </tr>
                {% endfor %}
            </table>
            <hr class="separator">
        {% endfor %}
    </div>
</body>
</html>
"""

def filter_pitchers_data(pitchers_dict, pitch_types_to_include, min_pitches=200):
    """Build a per-pitch-type leaderboard of pitchers ranked by average pitch score.

    Args:
        pitchers_dict: Mapping whose keys are used as the pitcher name and whose
            values are DataFrames with the columns 'pitch_type',
            'pitch_score_ALL', 'release_speed', 'release_spin_rate' and 'vz0'.
        pitch_types_to_include: Iterable of pitch type values to build tables for.
            Repeated values are ignored.
        min_pitches: Minimum number of pitches of a given type a pitcher must
            have to be listed for that type (default 200).

    Returns:
        Dict mapping each requested pitch type to a list of entries with the keys
        'Pitcher Name', 'pitch_score_ALL', 'Number of Pitches', 'Statistic 1'
        (mean release speed), 'Statistic 2' (mean release spin rate) and
        'Statistic 3' (mean vz0). Means are rounded to 4 decimals. Each list is
        sorted by 'pitch_score_ALL' in descending order; entries whose score is
        NaN are placed last.
    """
    # dict.fromkeys removes repeated pitch types (keeping first-seen order) so a
    # pitcher cannot be appended to the same table more than once.
    wanted_types = list(dict.fromkeys(pitch_types_to_include))
    filtered_data = {pitch_type: [] for pitch_type in wanted_types}

    for pitcher_name, df in pitchers_dict.items():
        for pitch_type in wanted_types:
            pitch_df = df[df['pitch_type'] == pitch_type]
            num_pitches = len(pitch_df)
            # The explicit zero check keeps empty selections out even when the
            # caller lowers min_pitches to 0.
            if num_pitches == 0 or num_pitches < min_pitches:
                continue

            filtered_data[pitch_type].append({
                "Pitcher Name": pitcher_name,
                "pitch_score_ALL": round(pitch_df['pitch_score_ALL'].mean(), 4),
                "Number of Pitches": num_pitches,
                "Statistic 1": round(pitch_df['release_speed'].mean(), 4),
                "Statistic 2": round(pitch_df['release_spin_rate'].mean(), 4),
                "Statistic 3": round(pitch_df['vz0'].mean(), 4),
            })

    for entries in filtered_data.values():
        # Comparing against NaN makes the sort order undefined, so NaN scores are
        # pushed to the end; all other entries are ordered best score first and
        # ties keep their insertion order.
        entries.sort(
            key=lambda entry: (
                pd.isna(entry['pitch_score_ALL']),
                -entry['pitch_score_ALL'],
            )
        )

    return filtered_data
        

def generate_html_for_year(year, base_dir, pitch_types):
    """Render and save the "best pitchers by pitch" page for one season.

    Loads ``pitchers_dict{year}.joblib`` from ``base_dir``, aggregates it with
    ``filter_pitchers_data`` and writes the rendered template to
    ``<output root>/20{year}/best-pitchers-by-pitch.html``, creating the season
    directory if it does not exist yet.

    Args:
        year: Season suffix that is appended to "20" (e.g. "21").
        base_dir: Directory containing the per-season joblib dictionaries.
        pitch_types: Pitch types to build tables for.
    """
    file_path = os.path.join(base_dir, f'pitchers_dict{year}.joblib')
    # NOTE: joblib.load unpickles the file, so only load files created by this project.
    pitchers_dict = joblib.load(file_path)

    filtered_data = filter_pitchers_data(pitchers_dict, pitch_types)

    # Generate the content using Jinja2 template
    html_content = Template(html_template).render(year = year, data = filtered_data)

    # display(HTML(html_content))

    # Save the HTML content to a file
    output_dir = "C:\\Users\\noahb\\Desktop\\GithubPagesFiles\\Pitching_Pages"
    output_year = "20" + year
    output_dir = os.path.join(output_dir, output_year)
    os.makedirs(output_dir, exist_ok=True)
    html_file_path = os.path.join(output_dir, 'best-pitchers-by-pitch.html')

    # The template declares <meta charset="UTF-8">, so the file has to be written
    # as UTF-8 explicitly; the platform default encoding (e.g. cp1252 on Windows)
    # would garble or reject non-ASCII characters in pitcher names.
    with open(html_file_path, 'w', encoding='utf-8') as html_file:
        html_file.write(html_content)

    print(f"HTML file created: {html_file_path}")


# Build and save one page per season listed in `years`, reporting each finished season
for year in years:
    generate_html_for_year(year=year, base_dir=base_dir, pitch_types=pitch_types)
    print(f"Generated .html for {year}")

MemoryError: Unable to allocate 279. MiB for an array with shape (48, 763191) and data type float64

In [41]:
# Print a summary of the 2021 pitch DataFrame: index range, per-column non-null counts and dtypes
pitches21.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 763191 entries, 0 to 763190
Data columns (total 94 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   Unnamed: 0                       763191 non-null  int64  
 1   pitch_type                       744028 non-null  object 
 2   game_date                        763191 non-null  object 
 3   release_speed                    744028 non-null  float64
 4   release_pos_x                    743818 non-null  float64
 5   release_pos_z                    743818 non-null  float64
 6   player_name                      763191 non-null  object 
 7   batter                           763191 non-null  int64  
 8   pitcher                          763191 non-null  int64  
 9   events                           200060 non-null  object 
 10  description                      763191 non-null  object 
 11  spin_dir                         0 non-null       float64
 12  sp