In [38]:
import mysql_connection
import pandas as pd

def convert_to_decimal(overs_str):
    """Convert an "overs.balls" value into decimal overs.

    The part before the point is taken as completed overs and the part after
    it as a ball count, which is divided by 6 (so ``"3.3"`` becomes ``3.5``).
    A value without a point is returned as a plain ``int``.

    Args:
        overs_str: The overs figure. Usually a string, but numeric values
            (``int``, ``float``, ``Decimal``) are accepted as well and are
            normalised through ``str()`` first, so a value handed back by the
            database driver as a number no longer raises ``TypeError``.

    Returns:
        ``int`` when there is no fractional part in the text, otherwise
        ``float``.

    Raises:
        ValueError: If either part cannot be parsed as an integer, or the text
            contains more than one point.
    """
    # Normalise to text so the '.' membership test below is valid for numbers too.
    text = str(overs_str).strip()
    if '.' in text:
        overs, balls = text.split('.')
        return int(overs) + int(balls) / 6
    return int(text)

def fetch_data(team, opposite_team, players, filter_field):
    """Fetch bowling rows per player and add a per-over ratio column.

    For each entry in ``players`` the function queries
    ``cricket_info.bowling`` joined to ``cricket_info.master`` on ``match_id``,
    builds a DataFrame, casts ``filter_field`` to float, converts ``overs``
    with ``convert_to_decimal`` and adds ``ratio`` = ``filter_field`` divided
    by the overs summed over that match.

    Args:
        team: Value compared for equality with ``b.team``.
        opposite_team: Value compared for equality with ``b.opposite_team``.
            The sentinel ``'All'`` drops this condition from the query.
        players: Iterable of player names; each one is matched with
            ``LIKE '%<name>%'``.
        filter_field: Name of the ``bowling`` column to select (e.g.
            ``'runs'``). It is spliced into the SQL text, so only a plain
            identifier (letters, digits, underscore) is accepted.

    Returns:
        A list with one DataFrame per player (columns ``match_id``, ``team``,
        ``opposite_team``, ``player``, ``overs``, ``filter_field``, ``date``,
        ``ratio``), or ``None`` when no database connection is available.

    Raises:
        ValueError: If ``filter_field`` is not a plain identifier.
    """
    import re  # local import: only needed for the identifier check below

    # Column names cannot be bound as query parameters, so validate the name
    # before it is interpolated into the statement (prevents SQL injection).
    if not isinstance(filter_field, str) or not re.fullmatch(r'[A-Za-z_][A-Za-z0-9_]*', filter_field):
        raise ValueError(f"Invalid filter_field: {filter_field!r}")

    # Get MySQL connection
    connection = mysql_connection.get_mysql_connection()
    if not connection:
        print("Connection to the database failed.")
        return None

    # Build the WHERE clause once; only the opponent condition is optional.
    conditions = ["b.team = %s"]
    fixed_params = [team]
    if opposite_team != 'All':
        conditions.append("b.opposite_team = %s")
        fixed_params.append(opposite_team)
    conditions.append("b.player LIKE %s")

    sql_query = f"""
    SELECT DISTINCT b.match_id, b.team, b.opposite_team, b.player, b.overs, b.{filter_field}, m.date
    FROM cricket_info.bowling AS b
    JOIN cricket_info.master AS m ON b.match_id = m.match_id
    WHERE {' AND '.join(conditions)};
    """
    column_names = ['match_id', 'team', 'opposite_team', 'player', 'overs', filter_field, 'date']

    dfs = []
    try:
        for player in players:
            cursor = connection.cursor()
            try:
                cursor.execute(sql_query, tuple(fixed_params) + (f'%{player}%',))
                rows = cursor.fetchall()
            finally:
                # Release the cursor even if the query fails.
                cursor.close()

            df = pd.DataFrame(rows, columns=column_names)

            # Print the column names
            print(f"Columns for player {player}: {df.columns}")

            df[filter_field] = df[filter_field].astype(float)
            df['overs'] = df['overs'].apply(convert_to_decimal)

            # Total overs bowled in each match, then looked up row by row.
            # `.values` strips the match_id index so the division is positional.
            total_overs_per_match = df.groupby('match_id')['overs'].sum()
            df['ratio'] = df[filter_field] / total_overs_per_match[df['match_id']].values

            dfs.append(df)
    finally:
        # Always hand the connection back, including when a query or a
        # conversion step raises part-way through the player loop.
        connection.close()

    return dfs


if __name__ == "__main__":
    # Query parameters for this run.
    team = 'England'
    opposite_team = 'Sri Lanka'  # set to 'All' to skip the opponent filter
    players = ['Adil Rashid']
    filter_field = 'runs'

    # Fetch data for multiple players (one DataFrame per player).
    dfs = fetch_data(team, opposite_team, players, filter_field)

    # fetch_data returns None when the connection fails and an empty list when
    # `players` is empty; pd.concat would raise an unhelpful error for both.
    if not dfs:
        raise RuntimeError("No bowling data was fetched; cannot build master_df.")

    # Merge all players' DataFrames into master_df.
    master_df = pd.concat(dfs, ignore_index=True)

    # Parse dates so the sort below is chronological.
    master_df['date'] = pd.to_datetime(master_df['date'])
    master_df = master_df.sort_values(by='date', ascending=True)

    # Kept inside the guard: master_df only exists when the block above ran.
    print(master_df)


Connected to MySQL database
Columns for player Adil Rashid: Index(['match_id', 'team', 'opposite_team', 'player', 'overs', 'runs', 'date'], dtype='object')
      match_id     team opposite_team       player  overs  runs       date  \
6   T20I # 551  England     Sri Lanka  Adil Rashid      2  31.0 2016-03-26   
0   T20I # 561  England     Sri Lanka  Adil Rashid      4  25.0 2016-07-05   
7   T20I # 703  England     Sri Lanka  Adil Rashid      4  11.0 2018-10-27   
4  T20I # 1165  England     Sri Lanka  Adil Rashid      4  17.0 2021-06-23   
3  T20I # 1168  England     Sri Lanka  Adil Rashid      4  24.0 2021-06-24   
1  T20I # 1174  England     Sri Lanka  Adil Rashid      1   9.0 2021-06-26   
5  T20I # 1382  England     Sri Lanka  Adil Rashid      4  19.0 2021-11-01   
2  T20I # 1867  England     Sri Lanka  Adil Rashid      4  16.0 2022-11-05   

   ratio  
6  15.50  
0   6.25  
7   2.75  
4   4.25  
3   6.00  
1   9.00  
5   4.75  
2   4.00  


In [39]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

# Compare several regressors on the bowling data held in `master_df`
# (built by the previous cell).
#
# NOTE(review): `ratio` is computed upstream as the filter field divided by the
# overs summed per match, so with 'runs' as the filter field the target is
# derived from the very two features used here. Confirm that this is the
# intended modelling task before reading anything into the scores.

SEED = 42

# Seed Python, NumPy and TensorFlow before the network is constructed, so the
# Keras results are reproducible like the seeded scikit-learn models.
keras.utils.set_random_seed(SEED)

# Splitting the data into features (X) and target variable (y)
X = master_df[['overs', 'runs']]  # Features
y = master_df['ratio']             # Target

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Initialize regression models
models = {
    "Linear Regression": LinearRegression(),
    "Support Vector Regression": make_pipeline(StandardScaler(), SVR()),
    "Random Forest Regression": RandomForestRegressor(random_state=SEED),
    "Gradient Boosting Regression": GradientBoostingRegressor(random_state=SEED),
    "Neural Network Regression": keras.Sequential([
        layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
        layers.Dense(64, activation='relu'),
        layers.Dense(1)
    ])
}

# Train and evaluate models
for name, model in models.items():
    if isinstance(model, keras.Model):
        # Keras needs an explicit compile step and returns a 2-D prediction
        # array, which is flattened to match the scikit-learn output shape.
        model.compile(optimizer='adam', loss='mean_squared_error')
        history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)
        predictions = model.predict(X_test).flatten()
    else:
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

    # Shared reporting for every model type.
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    print(f'{name} Predictions: {predictions}')
    print(f'{name} Mean Squared Error: {mse}')
    print(f'{name} RMSE: {rmse}')
    print('\n')


Linear Regression Predictions: [ 6.25  13.875]
Linear Regression Mean Squared Error: 11.882812499999973
Linear Regression RMSE: 3.4471455582844155


Support Vector Regression Predictions: [5.95219802 5.40514919]
Support Vector Regression Mean Squared Error: 6.505819164391004
Support Vector Regression RMSE: 2.550650733517038


Random Forest Regression Predictions: [5.8025 7.28  ]
Random Forest Regression Mean Squared Error: 1.5793281249999995
Random Forest Regression RMSE: 1.2567132230544882


Gradient Boosting Regression Predictions: [ 5.99998749 10.49366444]
Gradient Boosting Regression Mean Squared Error: 1.1467698520379788
Gradient Boosting Regression RMSE: 1.070873406168058


Neural Network Regression Predictions: [8.959228  3.4777777]
Neural Network Regression Mean Squared Error: 18.91742644885474
Neural Network Regression RMSE: 4.349416794106394


