In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
import pickle

In [4]:
## scale the features
scaler = MinMaxScaler()

X_norm = scaler.fit_transform(X)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X_norm, y, test_size=0.2, random_state=42)

In [6]:
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

In [7]:
rf = RandomForestRegressor(random_state=42)

In [8]:
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')

In [9]:
grid_search.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)


In [10]:
best_params = grid_search.best_params_
print(f'Best parameters found: {best_params}')

Best parameters found: {'bootstrap': True, 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}


In [11]:
# Train the model with the best parameters
best_rf = RandomForestRegressor(
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf'],
    bootstrap=best_params['bootstrap'],
    random_state=42
)

In [12]:
best_rf.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)


In [13]:
# Predict the target values for the test set
predicted_ratings_best = best_rf.predict(X_test)

# Calculate Mean Squared Error (MSE) and R-squared (R²)
mse_best = mean_squared_error(y_test, predicted_ratings_best)
r_squared_best = r2_score(y_test, predicted_ratings_best)

# Print the results
print("Improved Random Forest Regression")
print(f"Mean Squared Error: {mse_best}")
print(f"R-squared: {r_squared_best}")

Improved Random Forest Regression
Mean Squared Error: 0.5891782035318649
R-squared: 0.987875045675604


In [26]:
# Save the trained model to a file using pickle
with open('best_random_forest_model.pkl', 'wb') as model_file:
    pickle.dump(best_rf, model_file)

In [27]:
# Save the fitted MinMaxScaler to a file using pickle
with open('minmax_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [28]:
data = pd.read_csv("players_22-1.csv", low_memory=False)
data.head()

Unnamed: 0,sofifa_id,player_url,short_name,long_name,player_positions,overall,potential,value_eur,wage_eur,age,...,lcb,cb,rcb,rb,gk,player_face_url,club_logo_url,club_flag_url,nation_logo_url,nation_flag_url
0,158023,https://sofifa.com/player/158023/lionel-messi/...,L. Messi,Lionel Andrés Messi Cuccittini,"RW, ST, CF",93,93,78000000.0,320000.0,34,...,50+3,50+3,50+3,61+3,19+3,https://cdn.sofifa.net/players/158/023/22_120.png,https://cdn.sofifa.net/teams/73/60.png,https://cdn.sofifa.net/flags/fr.png,https://cdn.sofifa.net/teams/1369/60.png,https://cdn.sofifa.net/flags/ar.png
1,188545,https://sofifa.com/player/188545/robert-lewand...,R. Lewandowski,Robert Lewandowski,ST,92,92,119500000.0,270000.0,32,...,60+3,60+3,60+3,61+3,19+3,https://cdn.sofifa.net/players/188/545/22_120.png,https://cdn.sofifa.net/teams/21/60.png,https://cdn.sofifa.net/flags/de.png,https://cdn.sofifa.net/teams/1353/60.png,https://cdn.sofifa.net/flags/pl.png
2,20801,https://sofifa.com/player/20801/c-ronaldo-dos-...,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,"ST, LW",91,91,45000000.0,270000.0,36,...,53+3,53+3,53+3,60+3,20+3,https://cdn.sofifa.net/players/020/801/22_120.png,https://cdn.sofifa.net/teams/11/60.png,https://cdn.sofifa.net/flags/gb-eng.png,https://cdn.sofifa.net/teams/1354/60.png,https://cdn.sofifa.net/flags/pt.png
3,190871,https://sofifa.com/player/190871/neymar-da-sil...,Neymar Jr,Neymar da Silva Santos Júnior,"LW, CAM",91,91,129000000.0,270000.0,29,...,50+3,50+3,50+3,62+3,20+3,https://cdn.sofifa.net/players/190/871/22_120.png,https://cdn.sofifa.net/teams/73/60.png,https://cdn.sofifa.net/flags/fr.png,,https://cdn.sofifa.net/flags/br.png
4,192985,https://sofifa.com/player/192985/kevin-de-bruy...,K. De Bruyne,Kevin De Bruyne,"CM, CAM",91,91,125500000.0,350000.0,30,...,69+3,69+3,69+3,75+3,21+3,https://cdn.sofifa.net/players/192/985/22_120.png,https://cdn.sofifa.net/teams/10/60.png,https://cdn.sofifa.net/flags/gb-eng.png,https://cdn.sofifa.net/teams/1325/60.png,https://cdn.sofifa.net/flags/be.png


In [29]:
cols_required = ["value_eur", 
                 "age",	
                 "potential",	
                 "wage_eur",	
                 "movement_reactions",	
                 "defending",	
                 "mentality_composure",	
                 "skill_ball_control", 
                 "overall"]

In [30]:
df = data[cols_required]

In [31]:
df.head()

Unnamed: 0,value_eur,age,potential,wage_eur,movement_reactions,defending,mentality_composure,skill_ball_control,overall
0,78000000.0,34,93,320000.0,94,34.0,96,96,93
1,119500000.0,32,92,270000.0,93,44.0,88,88,92
2,45000000.0,36,91,270000.0,94,34.0,95,88,91
3,129000000.0,29,91,270000.0,89,37.0,93,95,91
4,125500000.0,30,91,350000.0,91,64.0,89,91,91


In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19239 entries, 0 to 19238
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   value_eur            19165 non-null  float64
 1   age                  19239 non-null  int64  
 2   potential            19239 non-null  int64  
 3   wage_eur             19178 non-null  float64
 4   movement_reactions   19239 non-null  int64  
 5   defending            17107 non-null  float64
 6   mentality_composure  19239 non-null  int64  
 7   skill_ball_control   19239 non-null  int64  
 8   overall              19239 non-null  int64  
dtypes: float64(3), int64(6)
memory usage: 1.3 MB


In [33]:
# Remove null values 
df.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)


In [34]:
y = df['overall']

X = df.drop(columns='overall')

In [35]:
# Load the trained Random Forest model
with open('best_random_forest_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Load the fitted MinMaxScaler
with open('minmax_scaler.pkl', 'rb') as scaler_file:
    loaded_scaler = pickle.load(scaler_file)

# Scale the new data using the loaded MinMaxScaler
X_norm = loaded_scaler.transform(X)

# Use the loaded model to make predictions
predictions = loaded_model.predict(X_norm)

# Add predictions to the new data DataFrame
df['Predicted'] = predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Predicted'] = predictions


In [36]:
df.head()

Unnamed: 0,value_eur,age,potential,wage_eur,movement_reactions,defending,mentality_composure,skill_ball_control,overall,Predicted
0,78000000.0,34,93,320000.0,94,34.0,96,96,93,92.65
1,119500000.0,32,92,270000.0,93,44.0,88,88,92,91.72
2,45000000.0,36,91,270000.0,94,34.0,95,88,91,90.84
3,129000000.0,29,91,270000.0,89,37.0,93,95,91,90.97
4,125500000.0,30,91,350000.0,91,64.0,89,91,91,90.923333


In [37]:
df.tail()

Unnamed: 0,value_eur,age,potential,wage_eur,movement_reactions,defending,mentality_composure,skill_ball_control,overall,Predicted
19234,70000.0,22,52,1000.0,53,42.0,37,49,47,47.273333
19235,110000.0,19,59,500.0,49,41.0,47,42,47,47.296667
19236,100000.0,21,55,500.0,46,41.0,36,49,47,47.146667
19237,110000.0,19,60,500.0,48,15.0,47,45,47,47.52
19238,110000.0,19,60,500.0,54,36.0,36,38,47,47.163333
