In [75]:
import mysql_connection
import pandas as pd

# Function to convert string representation of overs to decimal
def convert_to_decimal(overs_str):
    """Convert an "overs.balls" value into a decimal number of overs.

    The digits after the dot are treated as a ball count and divided by 6,
    so "3.3" becomes 3.5 rather than 3.3.

    Args:
        overs_str: Overs as text such as "4" or "3.2". Numeric values
            (int, float, Decimal) are also accepted and are converted via
            their text form first.

    Returns:
        int when the value has no fractional part marker (no '.'),
        otherwise a float equal to overs + balls / 6.

    Raises:
        ValueError: if either side of the dot is not an integer literal.
    """
    # Database drivers may return numbers instead of strings; the original
    # "'.' in value" test raised TypeError on those, so normalise to text.
    text = str(overs_str).strip()

    whole, dot, balls = text.partition('.')
    if not dot:
        return int(whole)
    return int(whole) + int(balls) / 6

def fetch_data(team, opposite_team, players, filter_field):
    """Fetch bowling rows per player and add a per-over ratio column.

    For every name in ``players`` one query is run against
    ``cricket_info.bowling`` joined to ``cricket_info.master`` on
    ``match_id``; the player name is matched with ``LIKE %name%``.

    Args:
        team: Value compared against ``b.team``.
        opposite_team: Value compared against ``b.opposite_team``; the
            literal 'All' disables that condition.
        players: Iterable of player names (one query and one DataFrame each).
        filter_field: Name of the bowling column to select (e.g. 'runs').
            It is spliced into the SQL text, so only letters, digits,
            '_' and '$' are accepted.

    Returns:
        A list with one DataFrame per player, with columns match_id, team,
        opposite_team, player, overs (converted with convert_to_decimal),
        <filter_field> (as float), date and ratio, where ratio is
        <filter_field> divided by the summed overs of the rows sharing the
        same match_id in that player's result.
        None if no database connection could be obtained.

    Raises:
        ValueError: if ``filter_field`` is not a plain column name.
    """
    # Placeholders can bind values but not identifiers, so the column name
    # has to be interpolated; reject anything that could alter the query.
    if not (
        isinstance(filter_field, str)
        and filter_field
        and all(ch.isalnum() or ch in "_$" for ch in filter_field)
    ):
        raise ValueError(f"Invalid filter_field: {filter_field!r}")

    # Get MySQL connection
    connection = mysql_connection.get_mysql_connection()

    if not connection:
        print("Connection to the database failed.")
        return None

    # Build the WHERE clause once; only the opposite-team condition is optional.
    conditions = ["b.team = %s"]
    leading_params = [team]
    if opposite_team != 'All':
        conditions.append("b.opposite_team = %s")
        leading_params.append(opposite_team)
    conditions.append("b.player LIKE %s")
    where_sql = " AND ".join(conditions)

    sql_query = f"""
    SELECT DISTINCT b.match_id, b.team, b.opposite_team, b.player, b.overs, b.{filter_field}, m.date
    FROM cricket_info.bowling AS b
    JOIN cricket_info.master AS m ON b.match_id = m.match_id
    WHERE {where_sql};
    """
    columns = ['match_id', 'team', 'opposite_team', 'player', 'overs', filter_field, 'date']

    # One DataFrame per player
    dfs = []

    try:
        for player in players:
            cursor = connection.cursor()
            try:
                cursor.execute(sql_query, (*leading_params, f'%{player}%'))
                rows = cursor.fetchall()
            finally:
                # Release the cursor even when the query fails
                cursor.close()

            df = pd.DataFrame(rows, columns=columns)

            # Print the column names
            print(f"Columns for player {player}: {df.columns}")

            # Convert the selected field to float
            df[filter_field] = df[filter_field].astype(float)

            # Convert overs notation to decimal overs
            df['overs'] = df['overs'].apply(convert_to_decimal)

            # Total overs per match, broadcast back onto each row
            overs_per_match = df.groupby('match_id')['overs'].transform('sum')

            # Ratio of the selected field to the total overs of its match
            df['ratio'] = df[filter_field] / overs_per_match

            dfs.append(df)
    finally:
        # Always hand the connection back, including on errors
        connection.close()

    return dfs


if __name__ == "__main__":
    # Query parameters for this run
    team = 'New Zealand'
    opposite_team = 'Sri Lanka'
    # opposite_team = 'All'   # use 'All' to drop the opposite-team condition
    players = ['Tim Southee']
    filter_field = 'runs'

    # Fetch data for multiple players
    dfs = fetch_data(team, opposite_team, players, filter_field)

    # fetch_data returns None when the connection fails and an empty list
    # when no players are given; pd.concat fails obscurely on both.
    if not dfs:
        raise RuntimeError(
            "No bowling data was fetched; check the database connection and the players list."
        )

    # Merge all players DataFrames into master_df
    master_df = pd.concat(dfs, ignore_index=True)

    master_df['date'] = pd.to_datetime(master_df['date'])

    # Sort DataFrame by 'date'
    master_df = master_df.sort_values(by='date', ascending=True)

    # # Plot data for multiple players
    # plot_data(master_df, players)

    # Kept inside the guard so master_df is always defined when printed
    print(master_df)


Connected to MySQL database
Columns for player Tim Southee: Index(['match_id', 'team', 'opposite_team', 'player', 'overs', 'runs', 'date'], dtype='object')
      match_id         team opposite_team       player  overs  runs  \
3   T20I # 151  New Zealand     Sri Lanka  Tim Southee      4  21.0   
6   T20I # 180  New Zealand     Sri Lanka  Tim Southee      3  10.0   
5   T20I # 181  New Zealand     Sri Lanka  Tim Southee      1  15.0   
4   T20I # 275  New Zealand     Sri Lanka  Tim Southee      4  44.0   
8   T20I # 718  New Zealand     Sri Lanka  Tim Southee      2  21.0   
7   T20I # 878  New Zealand     Sri Lanka  Tim Southee      4  20.0   
1   T20I # 879  New Zealand     Sri Lanka  Tim Southee      4  18.0   
2   T20I # 880  New Zealand     Sri Lanka  Tim Southee      4  16.0   
0  T20I # 1850  New Zealand     Sri Lanka  Tim Southee      4  12.0   

        date      ratio  
3 2010-04-30   5.250000  
6 2010-05-22   3.333333  
5 2010-05-23  15.000000  
4 2012-09-27  11.000000  
8 2

In [76]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

# Baseline comparison: predict the per-match 'ratio' from 'overs' and 'runs'.
# Requires 'master_df' from the data-loading cell to be defined in the kernel.
# NOTE(review): 'ratio' is computed from the filter field and overs in
# fetch_data, so with filter_field='runs' the target is derived from these two
# features — confirm this is intended before reading much into the scores.

# Seed Python, NumPy and TensorFlow before the network is created so its
# weight initialisation and training are repeatable across runs.
keras.utils.set_random_seed(42)

# Splitting the data into features (X) and target variable (y)
X = master_df[['overs', 'runs']]  # Features
y = master_df['ratio']             # Target

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize regression models
models = {
    "Linear Regression": LinearRegression(),
    "Support Vector Regression": make_pipeline(StandardScaler(), SVR()),
    "Random Forest Regression": RandomForestRegressor(random_state=42),
    "Gradient Boosting Regression": GradientBoostingRegressor(random_state=42),
    "Decision Tree Regression": DecisionTreeRegressor(random_state=42),
    "Neural Network Regression": keras.Sequential([
        layers.Dense(64, activation='relu', input_shape=(2,)),
        layers.Dense(64, activation='relu'),
        layers.Dense(1)
    ])
}

# Train and evaluate models
for name, model in models.items():
    if name == "Neural Network Regression":
        # Keras models need compiling and return 2-D predictions
        model.compile(optimizer='adam', loss='mean_squared_error')
        history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)
        predictions = model.predict(X_test).flatten()
    else:
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

    # Reporting is identical for every model
    print(f'{name} Predictions: {predictions}')
    mse = mean_squared_error(y_test, predictions)
    print(f'{name} Mean Squared Error: {mse}')
    rmse = np.sqrt(mse)
    print(f'{name} RMSE: {rmse}')
    print('\n')


Linear Regression Predictions: [3.95791771 5.93298005]
Linear Regression Mean Squared Error: 3.3799669851590193
Linear Regression RMSE: 1.838468652210045


Support Vector Regression Predictions: [4.99265958 6.55046649]
Support Vector Regression Mean Squared Error: 5.667659394409081
Support Vector Regression RMSE: 2.3806846482491295


Random Forest Regression Predictions: [5.49  8.325]
Random Forest Regression Mean Squared Error: 13.56841805555555
Random Forest Regression RMSE: 3.6835333656091063


Gradient Boosting Regression Predictions: [ 4.50020132 12.36155137]
Gradient Boosting Regression Mean Squared Error: 40.87946117892694
Gradient Boosting Regression RMSE: 6.393704808554031


Decision Tree Regression Predictions: [ 4.5 15. ]
Decision Tree Regression Mean Squared Error: 68.18055555555554
Decision Tree Regression RMSE: 8.257151782276715


Neural Network Regression Predictions: [4.7161126 2.4366243]
Neural Network Regression Mean Squared Error: 0.6584521932244376
Neural Network Re

In [77]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

# Second comparison: same models, with day/month/year of the match date added
# as features. Requires 'master_df' (with a datetime 'date' column) from the
# data-loading cell to be defined in the kernel.

# Seed Python, NumPy and TensorFlow before the network is created so its
# weight initialisation and training are repeatable across runs.
keras.utils.set_random_seed(42)

# Feature Engineering: Extracting relevant information from the 'date'
master_df['day'] = master_df['date'].dt.day
master_df['month'] = master_df['date'].dt.month
master_df['year'] = master_df['date'].dt.year

# Splitting the data into features (X) and target variable (y)
X = master_df[['overs', 'runs', 'day', 'month', 'year']]  # Features
y = master_df['ratio']                                     # Target

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize regression models
models = {
    "Linear Regression": LinearRegression(),
    "Support Vector Regression": make_pipeline(StandardScaler(), SVR()),
    "Random Forest Regression": RandomForestRegressor(random_state=42),
    "Gradient Boosting Regression": GradientBoostingRegressor(random_state=42),
    "Decision Tree Regression": DecisionTreeRegressor(random_state=42),
    "Neural Network Regression": keras.Sequential([
        layers.Dense(64, activation='relu', input_shape=(5,)),
        layers.Dense(64, activation='relu'),
        layers.Dense(1)
    ])
}

# Train and evaluate models
for name, model in models.items():
    if name == "Neural Network Regression":
        # Keras models need compiling and return 2-D predictions
        model.compile(optimizer='adam', loss='mean_squared_error')
        history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)
        predictions = model.predict(X_test).flatten()
    else:
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

    # Reporting is identical for every model
    print(f'{name} Predictions: {predictions}')
    mse = mean_squared_error(y_test, predictions)
    print(f'{name} Mean Squared Error: {mse}')
    rmse = np.sqrt(mse)
    print(f'{name} RMSE: {rmse}')
    print('\n')


Linear Regression Predictions: [4.10395671 6.75444302]
Linear Regression Mean Squared Error: 5.857399248903028
Linear Regression RMSE: 2.4202064475790133


Support Vector Regression Predictions: [4.8655555  5.98678568]
Support Vector Regression Mean Squared Error: 3.894997836970061
Support Vector Regression RMSE: 1.9735748876011927


Random Forest Regression Predictions: [ 5.785  10.2125]
Random Forest Regression Mean Squared Error: 25.25457951388889
Random Forest Regression RMSE: 5.0253934685643165


Gradient Boosting Regression Predictions: [ 4.50012522 14.1368286 ]
Gradient Boosting Regression Mean Squared Error: 58.482817590957126
Gradient Boosting Regression RMSE: 7.647405938679934


Decision Tree Regression Predictions: [ 4.5 15. ]
Decision Tree Regression Mean Squared Error: 68.18055555555554
Decision Tree Regression RMSE: 8.257151782276715


Neural Network Regression Predictions: [8.579054 8.053057]
Neural Network Regression Mean Squared Error: 21.62176162118663
Neural Network 

In [78]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

# Third comparison: team names are one-hot encoded and numeric columns are
# standardised before every model, including the neural network.
# Requires 'master_df' (with a datetime 'date' column) from the data-loading
# cell to be defined in the kernel.

# Feature Engineering: Extracting relevant information from the 'date'
master_df['day'] = master_df['date'].dt.day
master_df['month'] = master_df['date'].dt.month
master_df['year'] = master_df['date'].dt.year

# Splitting the data into features (X) and target variable (y)
X = master_df[['team', 'opposite_team', 'overs', 'runs', 'day', 'month', 'year']]  # Features
y = master_df['ratio']                                                             # Target

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing: One-hot encode categorical features and scale numeric features
numeric_features = ['overs', 'runs', 'day', 'month', 'year']
categorical_features = ['team', 'opposite_team']

numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

# sparse_threshold=0 forces a dense array: the one-hot step can otherwise make
# the combined output sparse, which the Keras model below is fed directly.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    sparse_threshold=0)

# Initialize regression models
models = [
    ('Linear Regression', LinearRegression()),
    ('Support Vector Regression', SVR()),
    ('Random Forest Regression', RandomForestRegressor(random_state=42)),
    ('Gradient Boosting Regression', GradientBoostingRegressor(random_state=42)),
    ('Decision Tree Regression', DecisionTreeRegressor(random_state=42)),
]

# Train and evaluate models
for name, model in models:
    # The shared preprocessor is refitted on X_train by each pipeline
    regressor = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('regressor', model)])
    regressor.fit(X_train, y_train)
    predictions = regressor.predict(X_test)
    print(f'{name} Predictions: {predictions}')
    mse = mean_squared_error(y_test, predictions)
    print(f'{name} Mean Squared Error: {mse}')
    rmse = mse ** 0.5
    print(f'{name} RMSE: {rmse}')
    print('\n')


# Neural Network Regression
# Keras cannot sit inside the sklearn Pipeline, so preprocess explicitly.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Seed Python, NumPy and TensorFlow before the network is created so its
# weight initialisation and training are repeatable across runs.
keras.utils.set_random_seed(42)

model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train_processed.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)
])

model.compile(optimizer='adam', loss='mean_squared_error')
history = model.fit(X_train_processed, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)
predictions = model.predict(X_test_processed).flatten()
print(f'Neural Network Regression Predictions: {predictions}')
mse = mean_squared_error(y_test, predictions)
print(f'Neural Network Regression Mean Squared Error: {mse}')
rmse = mse ** 0.5
print(f'Neural Network Regression RMSE: {rmse}')
print('\n')

Linear Regression Predictions: [4.10395671 6.75444302]
Linear Regression Mean Squared Error: 5.857399248903022
Linear Regression RMSE: 2.4202064475790124


Support Vector Regression Predictions: [4.83697126 5.93111689]
Support Vector Regression Mean Squared Error: 3.724500157996721
Support Vector Regression RMSE: 1.92989641120883


Random Forest Regression Predictions: [ 5.715 10.415]
Random Forest Regression Mean Squared Error: 26.54561388888888
Random Forest Regression RMSE: 5.152243578179207


Gradient Boosting Regression Predictions: [ 4.50012522 14.62641435]
Gradient Boosting Regression Mean Squared Error: 63.89190207909517
Gradient Boosting Regression RMSE: 7.993241024709262


Decision Tree Regression Predictions: [ 4.5 15. ]
Decision Tree Regression Mean Squared Error: 68.18055555555554
Decision Tree Regression RMSE: 8.257151782276715


Neural Network Regression Predictions: [ 4.9296565 10.695584 ]
Neural Network Regression Mean Squared Error: 27.53350023651987
Neural Network Re