In [1]:
import sqlite3
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pickle
import os

In [2]:
# Open the project's SQLite database file; the path is relative to the
# directory the notebook is run from.
DB_PATH = "fifa23_dashboard/data/fifa23.db"
conn = sqlite3.connect(DB_PATH)

In [3]:
# Ask SQLite's catalog which tables exist before choosing one to load.
table_list_query = "SELECT name FROM sqlite_master WHERE type='table';"
tables = pd.read_sql_query(table_list_query, conn)
print(tables)

                           name
0                       players
1  players_fifa23_with_clusters
2                players_fifa23


In [4]:
# Pull every row and column of the players_fifa23 table into a DataFrame.
players_query = "SELECT * FROM players_fifa23"
df = pd.read_sql_query(players_query, conn)

In [5]:
# Display the loaded frame as the cell result (the rendered view is truncated).
df

Unnamed: 0,player_id,Known As,Full Name,Overall,Potential,Value(in Euro),Positions Played,Best Position,Nationality,Image Link,...,CM Rating,RM Rating,LWB Rating,CDM Rating,RWB Rating,LB Rating,CB Rating,RB Rating,GK Rating,GroupedPosition
0,1,L. Messi,Lionel Messi,91,91,54000000,RW,CAM,Argentina,https://cdn.sofifa.net/players/158/023/23_60.png,...,88,91,67,66,67,62,53,62,22,CAM
1,2,K. Benzema,Karim Benzema,91,91,64000000,"CF,ST",CF,France,https://cdn.sofifa.net/players/165/153/23_60.png,...,84,89,67,67,67,63,58,63,21,ST_CF
2,3,R. Lewandowski,Robert Lewandowski,91,91,84000000,ST,ST,Poland,https://cdn.sofifa.net/players/188/545/23_60.png,...,83,86,67,69,67,64,63,64,22,ST_CF
3,4,K. De Bruyne,Kevin De Bruyne,91,91,107500000,"CM,CAM",CM,Belgium,https://cdn.sofifa.net/players/192/985/23_60.png,...,91,91,82,82,82,78,72,78,24,CM
4,5,K. Mbappé,Kylian Mbappé,91,95,190500000,"ST,LW",ST,France,https://cdn.sofifa.net/players/231/747/23_60.png,...,84,92,70,66,70,66,57,66,21,ST_CF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18534,18535,D. Collins,Darren Collins,47,56,110000,"ST,RM",CAM,Republic of Ireland,https://cdn.sofifa.net/players/243/725/23_60.png,...,44,50,41,38,41,40,36,40,15,CAM
18535,18536,Yang Dejiang,Dejiang Yang,47,57,90000,CDM,CDM,China PR,https://cdn.sofifa.net/players/261/933/23_60.png,...,45,45,47,48,47,49,49,49,15,CDM
18536,18537,L. Mullan,Liam Mullan,47,67,130000,CM,RM,Northern Ireland,https://cdn.sofifa.net/players/267/823/23_60.png,...,49,52,46,44,46,46,42,46,17,RM_RW
18537,18538,D. McCallion,Daithí McCallion,47,61,100000,CB,CB,Republic of Ireland,https://cdn.sofifa.net/players/267/824/23_60.png,...,33,33,44,42,44,47,49,47,15,CB


In [6]:
# The data has been read into `df`, so release the database connection.
conn.close()

In [7]:
# Maps each GroupedPosition label to the five attribute columns of `df` that
# are used as regression inputs when modelling "Overall" for that position.
# NOTE(review): how these five features per position were chosen is not shown
# in this notebook — presumably an earlier feature-selection step; confirm.
top5_features_by_position = {
    "ST_CF": ["Finishing", "Positioning", "BallControl", "Shot Power", "Dribbling"],
    "CAM": ["BallControl", "Dribbling", "Short Passing", "Physicality Total", "Vision"],
    "CM": ["Short Passing", "BallControl", "Vision", "LongPassing", "Defending Total"],
    "RM_RW": ["Crossing", "Dribbling", "Finishing", "Physicality Total", "Shooting Total"],
    "GK": ["Reactions", "Dribbling Total", "Goalkeeper Reflexes", "Goalkeeper Positioning", "Physicality Total"],
    "CB": ["Standing Tackle", "Marking", "Interceptions", "Strength", "Heading Accuracy"],
    "LM_LW": ["Dribbling", "BallControl", "Crossing", "Positioning", "Short Passing"],
    "CDM": ["Interceptions", "BallControl", "Marking", "Standing Tackle", "Short Passing"],
    "LB_LWB": ["Crossing", "Sliding Tackle", "Interceptions", "Standing Tackle", "Short Passing"],
    "RB_RWB": ["Crossing", "Sliding Tackle", "Interceptions", "Pace Total", "Standing Tackle"],
}

In [8]:
# Dictionaries holding, per position, the fitted model and its hold-out metrics
models = {}
metrics = {}

for position, features in top5_features_by_position.items():
    # Keep only players of this position with complete feature/target values.
    position_mask = df["GroupedPosition"] == position
    df_filtered = df.loc[position_mask].dropna(subset=features + ["Overall"])

    # train_test_split raises ValueError when the train or test side would be
    # empty (0 or 1 usable rows); skip that position instead of aborting the
    # loop for every remaining position.
    if len(df_filtered) < 2:
        print(f"\n⚠️ Skipping position {position}: only {len(df_filtered)} usable row(s)")
        continue

    X = df_filtered[features]
    y = df_filtered["Overall"]

    # Fixed seed so the 75/25 split, and therefore the metrics, are reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    model = LinearRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    # Evaluate on the held-out 25%: RMSE is in "Overall" units, R² is unitless.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    models[position] = model
    metrics[position] = {"RMSE": rmse, "R2": r2}

    print(f"\n📌 Model for the position: {position}")
    print(f"RMSE: {rmse:.2f}")
    print(f"R²: {r2:.2f}")


📌 Model for the position: ST_CF
RMSE: 1.18
R²: 0.97

📌 Model for the position: CAM
RMSE: 1.53
R²: 0.96

📌 Model for the position: CM
RMSE: 1.17
R²: 0.97

📌 Model for the position: RM_RW
RMSE: 1.80
R²: 0.92

📌 Model for the position: GK
RMSE: 1.00
R²: 0.98

📌 Model for the position: CB
RMSE: 1.13
R²: 0.97

📌 Model for the position: LM_LW
RMSE: 1.38
R²: 0.95

📌 Model for the position: CDM
RMSE: 1.34
R²: 0.95

📌 Model for the position: LB_LWB
RMSE: 1.38
R²: 0.95

📌 Model for the position: RB_RWB
RMSE: 1.30
R²: 0.95


In [9]:
def predict_overall(position, input_features_dict):
    """Predict a player's Overall rating with the model trained for `position`.

    Args:
        position: Grouped position label; must be a key of `models`.
        input_features_dict: Mapping from feature name to value. It must
            contain every feature listed for the position in
            `top5_features_by_position`; extra keys are ignored.

    Returns:
        A human-readable string: the rounded prediction on success, or a
        message describing the unknown position or the missing inputs.
    """
    if position not in models:
        return f"❌ Position '{position}' is not in the system."

    model = models[position]
    required_features = top5_features_by_position[position]

    missing = [feat for feat in required_features if feat not in input_features_dict]
    if missing:
        return f"❗ The following inputs are missing: {missing}"

    # Build a one-row DataFrame using the training column names and order.
    # The models were fitted on DataFrames, and passing a bare ndarray makes
    # scikit-learn warn that X does not have valid feature names.
    input_row = pd.DataFrame(
        [[input_features_dict[feat] for feat in required_features]],
        columns=required_features,
    )

    predicted_overall = model.predict(input_row)[0]
    return f"✅ Overall expected to {position}: {predicted_overall:.0f}"

In [12]:

# Example query: attribute ratings for a hypothetical CDM, paired positionally
# with the feature names the CDM model expects.
cdm_features = ["Interceptions", "BallControl", "Marking", "Standing Tackle", "Short Passing"]
cdm_ratings = [83, 87, 88, 89, 90]
input_data = dict(zip(cdm_features, cdm_ratings))

cdm_prediction = predict_overall("CDM", input_data)
print(cdm_prediction)

✅ Overall expected to CDM: 87




In [41]:
# Create the output folder for the pickled models. exist_ok=True keeps the
# cell safe to re-run and avoids the check-then-create race of a separate
# os.path.exists() test.
os.makedirs("models", exist_ok=True)

In [42]:
# Persist every fitted regressor to its own pickle file, named after the
# position it was trained for.
for position, model in models.items():
    with open(f"models/model_{position}.pkl", "wb") as model_file:
        pickle.dump(model, model_file)

print("✅ Todos os modelos foram salvos com sucesso!")

✅ Todos os modelos foram salvos com sucesso!
