<a href="https://colab.research.google.com/github/loganbnelson/Fantasy-Football/blob/main/NFL_Scrape_2_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [32]:
import copy
import io
import os
from datetime import date

import pandas as pd
import requests
from google.colab import drive

In [33]:
# Checking to see if the code here is being run in Google Colab and mounts the drive as will be needed.

try:
    __IPYTHON__
    _in_ipython_session = True
    drive.mount('/content/drive')
except NameError:
    _in_ipython_session = False

try:
    from google.colab import drive
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

# Define Google Drive base path if in Colab, otherwise current directory
if IN_COLAB:
    drive.mount('/content/drive')
    gdrive_path = '/content/drive/MyDrive'
else:
    gdrive_path = '.'  # Current directory

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [34]:
# Attempt to create a directory named "Fantasy_data" in the current working directory.

try:
    os.mkdir('Fantasy_data')  # Try to create the directory "Fantasy_data"
except OSError as error:
    # If an error occurs during directory creation, catch the exception and handle it.
    # Print the error message associated with the OSError.
    print(error)


[Errno 17] File exists: 'Fantasy_data'


In [35]:
# BASE_URL = 'https://www.fftoday.com/stats/playerstats.php?Season='
# seasons_desired = list(range(2013, 2025))
# gameweekcombos = list(range(1, 19))
# pages = list(range(3))
# PosIDs = {'QB': 10, 'RB': 20, 'WR': 30, 'TE': 40, 'K': 80, 'DEF': 99}
# PosIDvalues = PosIDs.values()
# combinedvaluesdict = {}
# iteration = seasons_desired[0]
# urlstopull = []

In [36]:
# Calculate the current year and 10 years prior
current_year = date.today().year
ten_years_ago = current_year - 10

# Constants
BASE_URL = 'https://www.fftoday.com/stats/playerstats.php?Season='
SEASON_START = ten_years_ago
SEASON_END = current_year

# Lists and Dictionaries
seasons_desired = list(range(SEASON_START, SEASON_END + 1))
gameweekcombos = list(range(1, 19))
pages = list(range(3))
position_ids = {'QB': 10, 'RB': 20, 'WR': 30, 'TE': 40, 'K': 80, 'DEF': 99}

# Initialize variables
iteration = seasons_desired[0]
urlstopull = []
combinedvaluesdict = {}
position_tables = {}
position_season_week = []

In [37]:
def fetch_data(url, timeout=30, table_index=7):
    """
    Download a page and return one of the HTML tables found on it.

    Args:
        url (str): The URL to fetch data from.
        timeout (float): Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.
        table_index (int): Zero-based position of the wanted table in the list
            returned by ``pd.read_html``. Defaults to 7 (the eighth table).

    Returns:
        pd.DataFrame or None: The table at ``table_index``, or None when the
        page contains no tables or fewer than ``table_index + 1`` of them.

    Raises:
        Exception: If the HTTP request fails or returns an error status. The
        underlying ``requests`` exception is attached as ``__cause__``.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        raise Exception(f"Error fetching data from {url}: {str(e)}") from e

    try:
        # Wrap the markup in StringIO: passing literal HTML directly to
        # read_html is deprecated in recent pandas releases.
        tables = pd.read_html(io.StringIO(response.text))
    except ValueError:
        # read_html raises ValueError when the document has no tables at all;
        # treat that the same as "the wanted table is missing".
        return None

    if len(tables) > table_index:
        return tables[table_index]
    return None

def generate_urls(base_url, seasons, gameweeks):
    """
    Build one URL per (season, gameweek) pair.

    Each URL is ``base_url`` followed by the season and a ``&GameWeek=``
    query parameter. URLs are ordered season by season, with the gameweeks
    of a season kept in the order given.

    Args:
        base_url (str): Prefix that the season is appended to.
        seasons (list): Seasons to cover.
        gameweeks (list): Gameweeks to cover within every season.

    Returns:
        list: The generated URL strings.
    """
    return [
        f"{base_url}{year}&GameWeek={week_no}"
        for year in seasons
        for week_no in gameweeks
    ]

def generate_combined_values_list(iteration, gameweeks):
    """
    Pair a single value with every gameweek.

    Args:
        iteration (int): The value placed first in every pair.
        gameweeks (list): Gameweeks to pair it with.

    Returns:
        list: Two-element lists ``[iteration, gameweek]``, one per gameweek,
        in the order the gameweeks were given.
    """
    return [[iteration, week_no] for week_no in gameweeks]

def generate_urls_with_pos(base_urls, pos_id):
    """
    Append a ``&PosID=`` query parameter to every URL.

    Args:
        base_urls (list): URLs to extend.
        pos_id (int): Position ID written into each URL.

    Returns:
        list: The input URLs, in the same order, each suffixed with the
        position ID parameter.
    """
    return [f"{address}&PosID={pos_id}" for address in base_urls]

def fetch_and_process_position_data(position, urls, combined_values_list,
                                    table_index=7, skip_rows=2):
    """
    Read the stats table from each URL and stack the results into one DataFrame.

    Every retrieved table is tagged with 'Position', 'Season' and 'Week'
    columns before being stacked.

    Args:
        position (str): Position label written to the 'Position' column.
        urls (list): Page URLs to read.
        combined_values_list (list): Sequence aligned with ``urls``: for the
            URL at index ``i``, entry ``i`` supplies the season at index 1 and
            the week at index 2.
        table_index (int): Zero-based position of the stats table in the list
            returned by ``pd.read_html``. Defaults to 7.
        skip_rows (int): Number of leading rows of that table to discard
            (presumably header rows - confirm against the page layout).
            Defaults to 2.

    Returns:
        pd.DataFrame or None: All non-empty tables concatenated with a fresh
        index, or None if no page yielded any rows.
    """
    table_list = []

    for iteration, url in enumerate(urls):
        try:
            tables = pd.read_html(url)
        except ValueError as error:
            # read_html raises ValueError when a page contains no tables.
            # Skip that page rather than aborting the whole scrape.
            print(f"Skipping {url}: {error}")
            continue

        if len(tables) <= table_index:
            # Page layout differs from what is expected; nothing to extract.
            print(f"Skipping {url}: expected at least {table_index + 1} tables, found {len(tables)}")
            continue

        # Copy so the added columns never write into the parsed table.
        page_rows = tables[table_index].iloc[skip_rows:, :].copy()
        if page_rows.empty:
            continue

        season_week = combined_values_list[iteration]
        page_rows['Position'] = position
        page_rows['Season'] = season_week[1]
        page_rows['Week'] = season_week[2]
        table_list.append(page_rows)

    if table_list:
        return pd.concat(table_list, ignore_index=True)
    return None  # No page produced any rows

def clean_dataframe(df, position):
    """
    Cleans and prepares the DataFrame for a specific position.

    Steps, applied to a copy of ``df``:
      1. Column labels are converted to strings, lowercased, spaces become
         underscores and periods are removed.
      2. The name column is renamed to 'player_name': a leading 'player'
         column, else a 'name' column, else whatever the first column is.
      3. Every column except 'player_name' and 'position' is converted to
         numeric. A column in which no value at all is numeric (e.g. team
         codes) is left as it is instead of being blanked out to NaN.
      4. For 'QB', columns whose name contains 'rank' are dropped.

    Args:
        df (pd.DataFrame): The DataFrame to clean.
        position (str): The player position.

    Returns:
        pd.DataFrame: The cleaned DataFrame, or None if ``df`` is None.
    """
    if df is None:
        return None

    df = df.copy()  # Avoid modifying the original DataFrame

    # str() first: scraped tables can carry integer column labels, which have
    # no .lower() method.
    df.columns = [str(col).lower().replace(' ', '_').replace('.', '') for col in df.columns]

    # Name the player column (assuming the first column is the name when no
    # explicit 'player' / 'name' column is present).
    if len(df.columns) > 0:
        if df.columns[0] == 'player':
            df = df.rename(columns={'player': 'player_name'})
        elif 'name' in df.columns:
            df = df.rename(columns={'name': 'player_name'})
        elif 'player_name' not in df.columns:
            df = df.rename(columns={df.columns[0]: 'player_name'})

    # errors='ignore' so a frame lacking either label does not raise KeyError.
    numeric_cols_to_try = df.columns.drop(['player_name', 'position'], errors='ignore')
    for col in numeric_cols_to_try:
        try:
            converted = pd.to_numeric(df[col], errors='coerce')
        except (TypeError, ValueError) as e:
            # e.g. duplicate column names make df[col] a DataFrame.
            print(f"Could not convert column '{col}' for {position} to numeric: {e}")
            continue
        # Keep text columns intact: only adopt the conversion when at least one
        # value parsed, or when the column had no values to lose.
        if converted.notna().any() or not df[col].notna().any():
            df[col] = converted

    # Position-specific cleaning. Only QB has a rule so far; add further
    # positions here as needed.
    if position == 'QB':
        rank_cols = df.filter(regex='rank').columns
        df = df.drop(columns=rank_cols)

    return df

def save_to_csv(df, position):
    """
    Write a position's DataFrame to ``<gdrive_path>/Fantasy_data`` as CSV.

    The base folder comes from the module-level ``gdrive_path``. The target
    directory is created when missing. Success and failure are reported with
    ``print``; a failed write does not raise.

    Args:
        df (pd.DataFrame): The data to write. ``None`` or an empty frame is
            reported and skipped.
        position (str): The player position, used in the filename.
    """
    # Guard clause: nothing to write.
    if df is None or df.empty:
        print(f"No data to save for position: {position}")
        return

    filepath = os.path.join(
        gdrive_path, 'Fantasy_data', f"fantasy_football_{position}_stats.csv"
    )
    # Create the destination folder (and any missing parents) up front.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)

    try:
        df.to_csv(filepath, index=False)
    except Exception as e:
        print(f"Error saving {position} data to CSV: {e}")
    else:
        print(f"Successfully saved {position} data to: {filepath}")

def clean_player_names(df_dict):
    """
    Cleans the player names in a dictionary of pandas DataFrames.

    Args:
        df_dict (dict): A dictionary where keys are player positions (strings)
                         and values are pandas DataFrames of weekly player
                         performances, or None for a position with no data.
                         Each DataFrame's first column may contain the weekly
                         rank, followed by a period, whitespace, and the
                         player's full name.

    Returns:
        dict: A new dictionary with the same keys as the input. Each DataFrame
              is a copy whose first column has had any leading "<rank>. "
              prefix and surrounding whitespace removed. Names without a rank
              prefix are kept. None values, and first columns that do not hold
              strings, are passed through unchanged.
    """
    cleaned_df_dict = {}
    for position, df in df_dict.items():
        # A position with no scraped rows is represented by None; keep it so
        # the keys of the result still match the input.
        if df is None:
            cleaned_df_dict[position] = None
            continue

        # Make a copy of the DataFrame to avoid modifying the original
        df_copy = df.copy()

        if not df_copy.empty and len(df_copy.columns) > 0:
            first_column = df_copy.iloc[:, 0]

            # The .str accessor only exists for string-like data.
            if first_column.dtype == object or pd.api.types.is_string_dtype(first_column):
                # Remove the prefix instead of splitting on it, so a name that
                # has no rank prefix is preserved rather than turned into NaN.
                cleaned_names = (
                    first_column
                    .str.replace(r'^\s*\d+\.\s+', '', regex=True)
                    .str.strip()
                )
                df_copy.iloc[:, 0] = cleaned_names

        cleaned_df_dict[position] = df_copy
    return cleaned_df_dict

In [38]:
# Generate URLs for each season and game week
urlstopull = generate_urls(BASE_URL, seasons_desired, gameweekcombos)

# One [season, week] pair per URL in `urlstopull`, in the same order.
# generate_combined_values_list pairs a single value with every game week, so it
# is called once per season; passing the whole seasons list would put that list
# inside every pair.
# NOTE(review): this name is not referenced by the cells visible in this
# notebook - confirm whether it is still needed.
combinedvalueslist = [
    season_week
    for season in seasons_desired
    for season_week in generate_combined_values_list(season, gameweekcombos)
]

# Generate URLs for different positions using position_ids dictionary
urlstopullQB = generate_urls_with_pos(urlstopull, position_ids['QB'])
urlstopullRB = generate_urls_with_pos(urlstopull, position_ids['RB'])
urlstopullWR = generate_urls_with_pos(urlstopull, position_ids['WR'])
urlstopullTE = generate_urls_with_pos(urlstopull, position_ids['TE'])
urlstopullDEF = generate_urls_with_pos(urlstopull, position_ids['DEF'])
urlstopullK = generate_urls_with_pos(urlstopull, position_ids['K'])

In [39]:
#urlstopullQB

['https://www.fftoday.com/stats/playerstats.php?Season=2015&GameWeek=1&PosID=10',
 'https://www.fftoday.com/stats/playerstats.php?Season=2015&GameWeek=2&PosID=10',
 'https://www.fftoday.com/stats/playerstats.php?Season=2015&GameWeek=3&PosID=10',
 'https://www.fftoday.com/stats/playerstats.php?Season=2015&GameWeek=4&PosID=10',
 'https://www.fftoday.com/stats/playerstats.php?Season=2015&GameWeek=5&PosID=10',
 'https://www.fftoday.com/stats/playerstats.php?Season=2015&GameWeek=6&PosID=10',
 'https://www.fftoday.com/stats/playerstats.php?Season=2015&GameWeek=7&PosID=10',
 'https://www.fftoday.com/stats/playerstats.php?Season=2015&GameWeek=8&PosID=10',
 'https://www.fftoday.com/stats/playerstats.php?Season=2015&GameWeek=9&PosID=10',
 'https://www.fftoday.com/stats/playerstats.php?Season=2015&GameWeek=10&PosID=10',
 'https://www.fftoday.com/stats/playerstats.php?Season=2015&GameWeek=11&PosID=10',
 'https://www.fftoday.com/stats/playerstats.php?Season=2015&GameWeek=12&PosID=10',
 'https://www

In [40]:
# Create a session for requests
# NOTE(review): the helper functions visible in this notebook do not take or use
# this session - confirm whether it is still needed.
session = requests.Session()

# Define position-specific URLs
position_urls = {
    'QB': urlstopullQB,
    'RB': urlstopullRB,
    'WR': urlstopullWR,
    'TE': urlstopullTE,
    'K': urlstopullK,
    'DEF': urlstopullDEF
}

# One [pos_id, season, week] entry per position/season/week combination.
# The list is rebuilt from scratch rather than appended to, so re-running this
# cell does not duplicate entries.
position_season_week = [
    [pos_id, season, week]
    for pos_id in position_ids.values()
    for season in seasons_desired
    for week in gameweekcombos
]

In [41]:
# Scrape every position's pages and store the stacked result under its position key.
for position in position_urls:
    urls = position_urls[position]
    position_tables[position] = fetch_and_process_position_data(
        position, urls, position_season_week
    )

In [42]:
# Strip the rank prefix from the player-name column of every position table.
position_tables_clean = clean_player_names(df_dict=position_tables)

In [43]:
# Show the cleaned tables as this cell's output (a dict keyed by position).
position_tables_clean

{'QB':                    0    1  2   3   4    5  6  7  8   9 10    11    12  \
 0          Tom Brady   NE  1  25  32  288  4  0  3   1  0  30.5  30.5   
 1          Tony Romo  DAL  1  36  45  356  3  2  1  -1  0  29.7  29.7   
 2      Carson Palmer  ARI  1  19  32  307  3  0  3  14  0  28.8  28.8   
 3      Philip Rivers   SD  1  35  42  404  2  2  2  -2  0  28.0  28.0   
 4     Marcus Mariota  TEN  1  13  16  209  4  0  2   6  0  27.1  27.1   
 ...              ...  ... ..  ..  ..  ... .. .. ..  .. ..   ...   ...   
 6184  Desmond Ridder   LV  1   1   2   11  0  0  0   0  0   0.6   0.6   
 6185  Chris Oladokun   KC  1   0   0    0  0  0  1   5  0   0.5   0.5   
 6186    Clayton Tune  ARI  1   0   0    0  0  0  1   2  0   0.2   0.2   
 6187      Josh Allen  BUF  1   0   0    0  0  0  0   0  0   0.0   0.0   
 6188      Drake Maye   NE  1   0   1    0  0  0  0   0  0   0.0   0.0   
 
      Position  Season  Week  
 0          QB    2015     1  
 1          QB    2015     1  
 2         

In [44]:
# Write each position's table to its own CSV file.
for position in position_tables_clean:
    df = position_tables_clean[position]
    print(f"Cleaning and saving data for position: {position}")
    # cleaned_df = clean_dataframe(df, position)
    save_to_csv(df, position)

Cleaning and saving data for position: QB
Successfully saved QB data to: /content/drive/MyDrive/Fantasy_data/fantasy_football_QB_stats.csv
Cleaning and saving data for position: RB
Successfully saved RB data to: /content/drive/MyDrive/Fantasy_data/fantasy_football_RB_stats.csv
Cleaning and saving data for position: WR
Successfully saved WR data to: /content/drive/MyDrive/Fantasy_data/fantasy_football_WR_stats.csv
Cleaning and saving data for position: TE
Successfully saved TE data to: /content/drive/MyDrive/Fantasy_data/fantasy_football_TE_stats.csv
Cleaning and saving data for position: K
Successfully saved K data to: /content/drive/MyDrive/Fantasy_data/fantasy_football_K_stats.csv
Cleaning and saving data for position: DEF
Successfully saved DEF data to: /content/drive/MyDrive/Fantasy_data/fantasy_football_DEF_stats.csv


Next steps: The column headers are not being picked up correctly, and, at a minimum, the first column needs its leading numbers and extra whitespace removed.