Import Libraries

In [58]:
import os
import sqlite3
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import joblib
from sklearn import set_config

set_config(transform_output='pandas')

Set Path and Load Data

In [83]:
# Get the current working directory
dataset_dir = os.getcwd()

# Get the path to the SQLite database file
data_paths = [os.path.join(pth, f) for pth, dirs, files in os.walk(dataset_dir) for f in files]
database_path = next((path for path in data_paths if path.endswith('.sqlite')), None)
print(data_paths)

# Check if the database file was found
if database_path is None:
    raise FileNotFoundError("SQLite database file not found in the specified directory")

# Connect to the SQLite database
cnx = sqlite3.connect(database_path)

# Load the 'Player' table into a pandas DataFrame
df_player = pd.read_sql_query("SELECT * FROM Player", cnx)

# Load the 'Player_Attributes' table into a pandas DataFrame
df_player_attributes = pd.read_sql_query("SELECT * FROM Player_Attributes", cnx)

# Close the database connection
cnx.close()

# Merge the two datasets on 'player_api_id'
df_merged = pd.merge(df_player_attributes, df_player, on='player_api_id', how='inner')

# Remove duplicates based on 'player_fifa_api_id_x'
df_unique = df_merged.drop_duplicates(subset=['player_fifa_api_id_x'])

#Remove rows with missing values in the 'potential' column
df_unique = df_unique.dropna(subset=['potential'])

# Display the first few rows of the unique dataframe
df_unique.head()

['c:\\Users\\morit\\OneDrive\\Dokumente\\ESADE\\Term3\\AI Prototypes\\Assignment 1\\European Soccer Database\\app.py', 'c:\\Users\\morit\\OneDrive\\Dokumente\\ESADE\\Term3\\AI Prototypes\\Assignment 1\\European Soccer Database\\best_model.pkl', 'c:\\Users\\morit\\OneDrive\\Dokumente\\ESADE\\Term3\\AI Prototypes\\Assignment 1\\European Soccer Database\\database.db', 'c:\\Users\\morit\\OneDrive\\Dokumente\\ESADE\\Term3\\AI Prototypes\\Assignment 1\\European Soccer Database\\database.sqlite', 'c:\\Users\\morit\\OneDrive\\Dokumente\\ESADE\\Term3\\AI Prototypes\\Assignment 1\\European Soccer Database\\data_exploration_page.py', 'c:\\Users\\morit\\OneDrive\\Dokumente\\ESADE\\Term3\\AI Prototypes\\Assignment 1\\European Soccer Database\\fifa_players.ipynb', 'c:\\Users\\morit\\OneDrive\\Dokumente\\ESADE\\Term3\\AI Prototypes\\Assignment 1\\European Soccer Database\\fifa_stats.py', 'c:\\Users\\morit\\OneDrive\\Dokumente\\ESADE\\Term3\\AI Prototypes\\Assignment 1\\European Soccer Database\\playe

Unnamed: 0,id_x,player_fifa_api_id_x,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,...,gk_handling,gk_kicking,gk_positioning,gk_reflexes,id_y,player_name,player_fifa_api_id_y,birthday,height,weight
0,1,218353,505942,2016-02-18 00:00:00,67.0,71.0,right,medium,medium,49.0,...,11.0,10.0,8.0,8.0,1,Aaron Appindangoye,218353,1992-02-29 00:00:00,182.88,187
5,6,189615,155782,2016-04-21 00:00:00,74.0,76.0,left,high,medium,80.0,...,7.0,9.0,9.0,12.0,2,Aaron Cresswell,189615,1989-12-15 00:00:00,170.18,146
38,39,186170,162549,2016-01-07 00:00:00,65.0,67.0,right,medium,medium,64.0,...,11.0,12.0,9.0,13.0,3,Aaron Doran,186170,1991-05-13 00:00:00,170.18,163
64,65,140161,30572,2016-04-21 00:00:00,69.0,69.0,right,medium,medium,57.0,...,12.0,13.0,12.0,11.0,4,Aaron Galindo,140161,1982-05-08 00:00:00,182.88,198
87,88,17725,23780,2015-12-24 00:00:00,70.0,70.0,right,medium,medium,46.0,...,6.0,16.0,12.0,11.0,5,Aaron Hughes,17725,1979-11-08 00:00:00,182.88,154


In [22]:
#check for Nan values in potential
df_unique['potential'].isnull().sum()

0

Data Preprocessing

In [93]:
# Select features and target variable
X = df_unique.drop(columns=[
    'id_x', 'player_fifa_api_id_x', 'player_api_id', 'date', 'id_y',
    'player_name', 'player_fifa_api_id_y', 'potential'
])
y = df_unique['potential']

# Convert 'birthday' to age
X['birthday'] = pd.to_datetime(X['birthday'])
current_date = pd.Timestamp.now()
X['age'] = (current_date - X['birthday']).dt.days / 365.25  # Convert age to years

# Drop 'birthday' as we now have 'age'
X = X.drop(columns=['birthday'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Define numerical features
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns

# define categorical features
categorical_features = X.select_dtypes(include=['object']).columns

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# preprocess categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value= -1))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

Define and Train the Model

In [94]:
# Define the model with manually set hyperparameters
model = RandomForestRegressor(n_estimators=100, max_features='sqrt', max_depth=30, random_state=42)

# Create a pipeline that includes preprocessing and the model
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])

# Train the model
pipeline.fit(X_train, y_train)

# Evaluate the model
y_pred = pipeline.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = mse ** 0.5
print(f'Root Mean Squared Error: {rmse}')

# Save the model to a file
model_filename = 'best_model.pkl'
joblib.dump(pipeline, model_filename)
print(f'Model saved to {model_filename}')

Root Mean Squared Error: 2.382930454594749
Model saved to best_model.pkl


In [95]:
# look at feature importances
importances = pipeline.named_steps['model'].feature_importances_
features = X_train.columns
importances_df = pd.DataFrame({'feature': features, 'importance': importances})
importances_df = importances_df.sort_values(by='importance', ascending=False)
print(importances_df)


                feature  importance
0        overall_rating    0.231924
36          gk_reflexes    0.101199
10                curve    0.074809
14         acceleration    0.074057
6      heading_accuracy    0.049829
4              crossing    0.048477
27               vision    0.035780
26          positioning    0.027646
23           long_shots    0.024742
12         long_passing    0.020835
9             dribbling    0.020742
22             strength    0.018761
28            penalties    0.018742
16              agility    0.017550
11   free_kick_accuracy    0.016127
2   attacking_work_rate    0.015771
24           aggression    0.015426
33          gk_handling    0.015026
20              jumping    0.015005
3   defensive_work_rate    0.014185
29              marking    0.013986
30      standing_tackle    0.012745
32            gk_diving    0.010067
1        preferred_foot    0.008673
31       sliding_tackle    0.007957
18              balance    0.007878
39                  age    0

In [96]:
# create a list of the top 10 features
top_features = importances_df['feature'].head(10).tolist()

# Select only top features
X_train_filter = X_train[top_features]
X_val_filter = X_val[top_features]
X_test_filter = X_test[top_features]

# Define numerical features
numerical_features = X_train_filter.select_dtypes(include=['int64', 'float64']).columns

# define categorical features
categorical_features = X_train_filter.select_dtypes(include=['object']).columns

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# preprocess categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value= -1))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])
''

''

In [97]:
# Define the model with manually set hyperparameters
model = RandomForestRegressor(n_estimators=100, max_features='sqrt', max_depth=30, random_state=42)

# Create a pipeline that includes preprocessing and the model
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])

# Train the model
pipeline.fit(X_train_filter, y_train)

# Evaluate the model
y_pred = pipeline.predict(X_val_filter)
mse = mean_squared_error(y_val, y_pred)
rmse = mse ** 0.5
print(f'Root Mean Squared Error: {rmse}')

# Save the model to a file
model_filename = 'best_model.pkl'
joblib.dump(pipeline, model_filename)
print(f'Model saved to {model_filename}')

Root Mean Squared Error: 3.39980553561416
Model saved to best_model.pkl


In [98]:
# Generalization score
y_pred = pipeline.predict(X_test_filter)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print(f'Root Mean Squared Error on Test Set: {rmse}')

Root Mean Squared Error on Test Set: 3.418481720582516


In [62]:
importances = pipeline.named_steps['model'].feature_names_in_
print(importances)


['num__overall_rating' 'num__gk_reflexes' 'num__curve' 'num__acceleration'
 'num__crossing' 'num__heading_accuracy' 'num__vision' 'num__positioning'
 'num__long_shots' 'num__dribbling']


Define the Streamlit App

In [109]:
%%writefile app.py

import streamlit as st
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from agents.agent import strenght_weakness_agent, potential_vs_rating

# Load the trained model
model = joblib.load('best_model.pkl')

# Set up page configuration
st.set_page_config(page_title="Soccer Player Potential Predictor", page_icon="⚽", layout="wide")

# Custom CSS for theming
st.markdown(
    """
    <style>
    .main {
        background-color: #f0f0f0;
        font-family: Arial, sans-serif;
    }
    .sidebar .sidebar-content {
        background-color: #004d66;
        color: white;
    }
    .stButton>button {
        background-color: #004d66;
        color: white;
        border-radius: 10px;
        border: none;
    }
    .stButton>button:hover {
        background-color: #00334d;
        color: white;
    }
    .stSlider>div>div {
        color: #004d66;
    }
    h1, h2, h3, h4, h5, h6 {
        color: #004d66;
    }
    .block-container {
        padding-top: 2rem;
    }
    </style>
    """,
    unsafe_allow_html=True
)

# Navigation with radio buttons for better visual appeal
st.sidebar.title("Navigation")
st.sidebar.markdown("### Go to:")
page = st.sidebar.radio(
    "",
    ["Introduction", "Data Exploration", "Predictions", "Feature Importance"]
)

# Initialize session state
if 'page' not in st.session_state:
    st.session_state.page = "Introduction"

# Set the current page based on user selection
st.session_state.page = page

# Introduction Page
if st.session_state.page == "Introduction":
    st.title("Welcome to the Soccer Player Potential Predictor App")
    st.write("""
    #### This app is designed to assist football scouts in evaluating and predicting the future potential of soccer players.
    By leveraging machine learning models and advanced data analytics, scouts can make informed decisions about player development and recruitment.

    **Key Features:**
    - **Data Exploration:** Upload player data to explore various attributes and statistics.
    - **Potential Prediction:** Predict the future potential of players based on their current attributes.
    - **Strengths and Weaknesses Analysis:** Get detailed insights into the strengths and weaknesses of players.
    - **Feature Importance:** Understand the importance of different attributes in predicting player potential.

    **How to Use:**
    - Navigate through different sections using the sidebar.
    - Explore player data in the Data Exploration section to visualize and analyze attributes.
    - Input player attributes manually in the Predictions section to get potential predictions.
    - View the importance of different features in the Feature Importance section.

    **Why Use This App?**
    This tool provides football scouts with:
    - Objective insights into player potential.
    - Data-driven analysis for better decision-making.
    - Enhanced ability to identify promising talent.


    **Get Started:**
    Use the sidebar to navigate to the different sections of the app and start exploring player data and predictions.
    """)

# Data Exploration Page
elif st.session_state.page == "Data Exploration":
    st.title("Data Exploration")
    # File uploader for data exploration
    uploaded_file = 'unique_player_data.csv'
    if uploaded_file is not None:
        df = pd.read_csv(uploaded_file)
        
        # Sidebar filters
        st.sidebar.header("Filters")
        numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
        feature = st.sidebar.selectbox("Feature to analyze", numeric_columns)
        min_rating = st.sidebar.slider("Minimum Potential", int(df['potential'].min()), int(df['potential'].max()), int(df['potential'].min()))
        max_rating = st.sidebar.slider("Maximum Potential", int(df['potential'].min()), int(df['potential'].max()), int(df['potential'].max()))

        # Apply filters to the dataframe
        filtered_df = df[(df['potential'] >= min_rating) & (df['potential'] <= max_rating)]

        # Display filtered data
        st.write(f"Data filtered by Potential between {min_rating} and {max_rating}")
        st.write(filtered_df.head())

        # Interactive plots
        st.write(f"Analysis of {feature}")

        # Histogram
        st.write("Histogram")
        fig = px.histogram(filtered_df, x=feature)
        st.plotly_chart(fig)

        # Box Plot
        st.write("Box Plot")
        fig = px.box(filtered_df, y=feature)
        st.plotly_chart(fig)

        # Scatter Plot
        st.write("Scatter Plot")
        scatter_x = st.selectbox("X-axis", numeric_columns, index=0)
        scatter_y = st.selectbox("Y-axis", numeric_columns, index=1)
        fig = px.scatter(filtered_df, x=scatter_x, y=scatter_y)
        st.plotly_chart(fig)

        # Add a dropdown with Player names
        player_names = df['player_name'].unique()
        player_name = st.selectbox("Select Player", player_names, index=0)
        player_data = df[df['player_name'] == player_name]
        st.write("Player Data:")
        st.write(player_data)
        st.write(strenght_weakness_agent(player_data))

# Predictions Page
elif st.session_state.page == "Predictions":
    st.title("Predictions")
    st.header("Input Player Attributes Manually")
    st.write("The generalization score of the model is RSME = 3.42")

    # Input attributes for prediction. Attributes are the top 10 features
    importances = model.named_steps['model'].feature_names_in_
    attributes = {}
    for f in importances:
        attributes[f] = st.slider(f, value=50, min_value=0, max_value=100, step=1)
    

    # Predict button for manual input
    if st.button("Predict Potential"):
        input_data = pd.DataFrame([attributes])
        input_data.columns = [col.split('__')[-1] for col in input_data.columns.to_list()]
        prediction = model.predict(input_data)[0]
        rounded_prediction = round(prediction)
        st.subheader(f"Predicted Potential: {rounded_prediction}")
        st.write(potential_vs_rating({'overall_rating': input_data['overall_rating'][0], 'potential': rounded_prediction}))


# Feature Importance Page
elif st.session_state.page == "Feature Importance":
    st.title("Feature Importance")
    # Assuming you have a feature importance array or similar from your model
    importances = model.named_steps['model'].feature_importances_
    feature_names = model.named_steps['model'].feature_names_in_
    feature_importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
    feature_importance_df['feature'] = feature_importance_df['feature'].str.split('__').str[-1]
    feature_importance_df = feature_importance_df.sort_values(by='importance', ascending=True) 
     
    # Plot feature importances
    st.write("Feature Importance Plot")
    fig = px.bar(feature_importance_df, x='importance', y='feature', orientation='h', color='importance', color_continuous_scale='Viridis')
    st.plotly_chart(fig)


Overwriting app.py
