In [47]:
import joblib
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, cross_validate, RandomizedSearchCV

In [2]:
#Step 1: Read the scoutium_attributes.csv and scoutium_potential_labels.csv files.

df_sa = pd.read_csv("scoutium_attributes.csv")
df_spl = pd.read_csv("scoutium_potential_labels.csv")

In [3]:
#Step 2: Combine the csv files we have read using the merge function. ("task_response_id", 'match_id', 'evaluator_id' "player_id" 4 pcs
# perform the concatenation over the variable.)

In [4]:
df_sa.head()

Unnamed: 0,task_response_id;match_id;evaluator_id;player_id;position_id;analysis_id;attribute_id;attribute_value
0,4915;62935;177676;1361061;2;12818495;4322;56.00
1,4915;62935;177676;1361061;2;12818495;4323;56.00
2,4915;62935;177676;1361061;2;12818495;4324;67.00
3,4915;62935;177676;1361061;2;12818495;4325;56.00
4,4915;62935;177676;1361061;2;12818495;4326;45.00


In [5]:
df_sa.columns = ["dfsa"]

In [6]:
def split_col(dataframe, column, num, col_name_list):
    dataframe = pd.DataFrame(dataframe[column].str.split(";",num).tolist(), columns=col_name_list)
    return dataframe

In [7]:
df_sa = split_col(
    df_sa,
    column="dfsa",
    num = 7,
    col_name_list=['task_response_id','match_id',"evaluator_id","player_id","position_id","analysis_id","attribute_id","attribute_value"]
    )

In [8]:
df_sa.head()

Unnamed: 0,task_response_id,match_id,evaluator_id,player_id,position_id,analysis_id,attribute_id,attribute_value
0,4915,62935,177676,1361061,2,12818495,4322,56.0
1,4915,62935,177676,1361061,2,12818495,4323,56.0
2,4915,62935,177676,1361061,2,12818495,4324,67.0
3,4915,62935,177676,1361061,2,12818495,4325,56.0
4,4915,62935,177676,1361061,2,12818495,4326,45.0


In [9]:
df_spl.head()

Unnamed: 0,task_response_id;match_id;evaluator_id;player_id;potential_label
0,4915;62935;177676;1361061;average
1,4915;62935;177676;1361626;highlighted
2,4915;62935;177676;1361858;average
3,4915;62935;177676;1362220;highlighted
4,4915;62935;177676;1364951;highlighted


In [10]:
df_spl.columns = ["dfspl"]

In [11]:
df_spl = split_col(
    df_spl,
    column="dfspl",
    num = 4,
    col_name_list=['task_response_id','match_id',"evaluator_id","player_id","potential_label"]
    )

In [12]:
df_spl.head()

Unnamed: 0,task_response_id,match_id,evaluator_id,player_id,potential_label
0,4915,62935,177676,1361061,average
1,4915,62935,177676,1361626,highlighted
2,4915,62935,177676,1361858,average
3,4915,62935,177676,1362220,highlighted
4,4915,62935,177676,1364951,highlighted


In [13]:
df = df_sa.merge(df_spl, how="inner", on=["task_response_id","match_id","evaluator_id","player_id"])

In [14]:
df.head()

Unnamed: 0,task_response_id,match_id,evaluator_id,player_id,position_id,analysis_id,attribute_id,attribute_value,potential_label
0,4915,62935,177676,1361061,2,12818495,4322,56.0,average
1,4915,62935,177676,1361061,2,12818495,4323,56.0,average
2,4915,62935,177676,1361061,2,12818495,4324,67.0,average
3,4915,62935,177676,1361061,2,12818495,4325,56.0,average
4,4915,62935,177676,1361061,2,12818495,4326,45.0,average


In [15]:
df.tail()

Unnamed: 0,task_response_id,match_id,evaluator_id,player_id,position_id,analysis_id,attribute_id,attribute_value,potential_label
10725,5642,63032,151191,1909728,7,12825756,4357,67.0,highlighted
10726,5642,63032,151191,1909728,7,12825756,4407,78.0,highlighted
10727,5642,63032,151191,1909728,7,12825756,4408,67.0,highlighted
10728,5642,63032,151191,1909728,7,12825756,4423,67.0,highlighted
10729,5642,63032,151191,1909728,7,12825756,4426,78.0,highlighted


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10730 entries, 0 to 10729
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   task_response_id  10730 non-null  object
 1   match_id          10730 non-null  object
 2   evaluator_id      10730 non-null  object
 3   player_id         10730 non-null  object
 4   position_id       10730 non-null  object
 5   analysis_id       10730 non-null  object
 6   attribute_id      10730 non-null  object
 7   attribute_value   10730 non-null  object
 8   potential_label   10730 non-null  object
dtypes: object(9)
memory usage: 838.3+ KB


In [17]:
#Step 3: Remove the Keeper (1) class in position_id from the dataset.

In [18]:
df = df.loc[df["position_id"] != "1"]

In [19]:
#Step 4: Remove the below average class in the potential label from the data set.

In [20]:
df = df.loc[df["potential_label"] != "below_average"]

In [21]:
# Step 5: Create a table from the data set you created using the "pivot_table" function. One per row in this pivot table
# Manipulate to be a player.

In [22]:
#Step 5.1: Scouts score players in “player_id”, “position_id” and “potential_label” in index, “attribute_id” in columns and values
# Create the pivot table as "attribute_value".

In [64]:
df["attribute_value"] = df["attribute_value"].astype(float)
df["position_id"] = df["position_id"].astype(float)

In [65]:
df_pt = pd.pivot_table(data=df,index=["player_id","position_id","potential_label"], columns=["attribute_id"], values="attribute_value")

In [66]:
df_pt.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,attribute_id,4322,4323,4324,4325,4326,4327,4328,4329,4330,4332,...,4352,4353,4354,4355,4356,4357,4407,4408,4423,4426
player_id,position_id,potential_label,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1355710,7.0,average,50.5,50.5,34.0,50.5,45.0,45.0,45.0,45.0,50.5,56.0,...,56.0,34.0,39.5,50.5,34.0,34.0,56.0,34.0,34.0,56.0
1356362,9.0,average,67.0,67.0,67.0,67.0,67.0,67.0,67.0,67.0,67.0,67.0,...,67.0,67.0,78.0,67.0,67.0,67.0,67.0,67.0,56.0,78.0
1356375,3.0,average,67.0,67.0,67.0,67.0,67.0,67.0,67.0,78.0,67.0,67.0,...,67.0,67.0,78.0,67.0,67.0,67.0,67.0,67.0,56.0,78.0
1356375,4.0,average,67.0,78.0,67.0,67.0,67.0,78.0,78.0,78.0,56.0,67.0,...,78.0,67.0,67.0,67.0,56.0,67.0,56.0,67.0,45.0,56.0
1356411,9.0,average,67.0,67.0,78.0,78.0,67.0,67.0,67.0,67.0,89.0,78.0,...,67.0,56.0,67.0,67.0,56.0,67.0,89.0,56.0,67.0,78.0


In [26]:
#Step 5.2: Using the "reset_index" function, assign the indexes as variables and convert the names of the "attribute_id" columns to strings.

In [67]:
df_pt = df_pt.reset_index()

In [68]:
df_pt.columns = [str(col) for col in df_pt.columns]

In [29]:
# Step 6: Express the “potential_label” categories (average, highlighted) numerically using the Label Encoder function.

In [69]:
def label_encoder(dataframe, app_label_encode):
    labelencoder = LabelEncoder()
    dataframe[app_label_encode] = labelencoder.fit_transform(dataframe[app_label_encode])
    return dataframe

In [70]:
label_encoder(df_pt, "potential_label")

Unnamed: 0,player_id,position_id,potential_label,4322,4323,4324,4325,4326,4327,4328,...,4352,4353,4354,4355,4356,4357,4407,4408,4423,4426
0,1355710,7.0,0,50.5,50.5,34.0,50.5,45.0,45.0,45.0,...,56.0,34.0,39.5,50.5,34.0,34.0,56.0,34.0,34.0,56.0
1,1356362,9.0,0,67.0,67.0,67.0,67.0,67.0,67.0,67.0,...,67.0,67.0,78.0,67.0,67.0,67.0,67.0,67.0,56.0,78.0
2,1356375,3.0,0,67.0,67.0,67.0,67.0,67.0,67.0,67.0,...,67.0,67.0,78.0,67.0,67.0,67.0,67.0,67.0,56.0,78.0
3,1356375,4.0,0,67.0,78.0,67.0,67.0,67.0,78.0,78.0,...,78.0,67.0,67.0,67.0,56.0,67.0,56.0,67.0,45.0,56.0
4,1356411,9.0,0,67.0,67.0,78.0,78.0,67.0,67.0,67.0,...,67.0,56.0,67.0,67.0,56.0,67.0,89.0,56.0,67.0,78.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266,1907347,5.0,0,67.0,78.0,78.0,56.0,78.0,67.0,78.0,...,67.0,67.0,67.0,67.0,56.0,67.0,78.0,78.0,78.0,56.0
267,1907347,6.0,0,67.0,67.0,67.0,67.0,67.0,67.0,67.0,...,67.0,67.0,78.0,67.0,67.0,67.0,67.0,67.0,67.0,78.0
268,1909532,10.0,0,56.0,45.0,45.0,45.0,56.0,56.0,56.0,...,45.0,45.0,56.0,34.0,34.0,56.0,34.0,34.0,34.0,23.0
269,1909724,2.0,1,78.0,78.0,78.0,78.0,78.0,78.0,78.0,...,78.0,78.0,78.0,67.0,67.0,67.0,78.0,78.0,67.0,78.0


In [71]:
# Step 7: Assign the numeric variable columns to a list with the name “num_cols”.
num_cols = [col for col in df_pt.columns if df_pt[col].dtypes!="O"]

In [72]:
num_cols.remove("potential_label")

In [73]:
num_cols

['position_id',
 '4322',
 '4323',
 '4324',
 '4325',
 '4326',
 '4327',
 '4328',
 '4329',
 '4330',
 '4332',
 '4333',
 '4335',
 '4338',
 '4339',
 '4340',
 '4341',
 '4342',
 '4343',
 '4344',
 '4345',
 '4348',
 '4349',
 '4350',
 '4351',
 '4352',
 '4353',
 '4354',
 '4355',
 '4356',
 '4357',
 '4407',
 '4408',
 '4423',
 '4426']

In [74]:
# Step 8: Apply StandardScaler to scale the data in all num_cols variables you have saved.
for i in num_cols:
    df_pt[i] = StandardScaler().fit_transform(df_pt[[i]])

In [75]:
df_pt.head()

Unnamed: 0,player_id,position_id,potential_label,4322,4323,4324,4325,4326,4327,4328,...,4352,4353,4354,4355,4356,4357,4407,4408,4423,4426
0,1355710,0.51129,0,-0.542606,-0.559398,-1.404764,-0.437916,-0.766602,-0.794802,-0.907351,...,0.012848,-1.281899,-1.132245,-0.494569,-1.234869,-1.519578,-0.14265,-1.487256,-0.955381,-0.252741
1,1356362,1.256234,0,0.595327,0.560776,0.678677,0.682837,0.722639,0.722929,0.60119,...,0.786581,0.813645,1.052769,0.631718,0.914913,0.768231,0.529664,0.668713,0.403717,1.041611
2,1356375,-0.978598,0,0.595327,0.560776,0.678677,0.682837,0.722639,0.722929,0.60119,...,0.786581,0.813645,1.052769,0.631718,0.914913,0.768231,0.529664,0.668713,0.403717,1.041611
3,1356375,-0.606126,0,0.595327,1.30756,0.678677,0.682837,0.722639,1.481795,1.355461,...,1.560314,0.813645,0.428479,0.631718,0.198319,0.768231,-0.14265,0.668713,-0.275832,-0.252741
4,1356411,1.256234,0,0.595327,0.560776,1.373158,1.430006,0.722639,0.722929,0.60119,...,0.786581,0.11513,0.428479,0.631718,0.198319,0.768231,1.874292,-0.049944,1.083267,1.041611


In [76]:
df_pt.isnull().sum()

player_id          0
position_id        0
potential_label    0
4322               0
4323               0
4324               0
4325               0
4326               0
4327               0
4328               0
4329               0
4330               0
4332               0
4333               0
4335               0
4338               0
4339               0
4340               0
4341               0
4342               0
4343               0
4344               0
4345               0
4348               0
4349               0
4350               0
4351               0
4352               0
4353               0
4354               0
4355               0
4356               0
4357               0
4407               0
4408               0
4423               0
4426               0
dtype: int64

In [38]:
# Step9: A machine learning model that predicts potential tags of football players with minimum error from the data set we have
# improve. (Print the roc_auc, f1, precision, recall, accuracy metrics.)

In [77]:
y = df_pt["potential_label"]
X = df_pt.drop(["potential_label","player_id"], axis=1)


In [62]:
df_pt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271 entries, 0 to 270
Data columns (total 37 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   player_id        271 non-null    object 
 1   position_id      271 non-null    object 
 2   potential_label  271 non-null    int32  
 3   4322             271 non-null    float64
 4   4323             271 non-null    float64
 5   4324             271 non-null    float64
 6   4325             271 non-null    float64
 7   4326             271 non-null    float64
 8   4327             271 non-null    float64
 9   4328             271 non-null    float64
 10  4329             271 non-null    float64
 11  4330             271 non-null    float64
 12  4332             271 non-null    float64
 13  4333             271 non-null    float64
 14  4335             271 non-null    float64
 15  4338             271 non-null    float64
 16  4339             271 non-null    float64
 17  4340            

In [78]:

def base_models(X, y, scoring="roc_auc"):
    print("Base Models....")
    classifiers = [('LR', LogisticRegression()),
                   ('KNN', KNeighborsClassifier()),
                   ("SVC", SVC()),
                   ("CART", DecisionTreeClassifier()),
                   ("RF", RandomForestClassifier()),
                   ('Adaboost', AdaBoostClassifier()),
                   ('GBM', GradientBoostingClassifier()),
                   ('XGBoost', XGBClassifier(use_label_encoder=False, eval_metric='logloss')),
                   ('LightGBM', LGBMClassifier()),
                   # ('CatBoost', CatBoostClassifier(verbose=False))
                   ]

    for name, classifier in classifiers:
        cv_results = cross_validate(classifier, X, y, cv=3, scoring=scoring)
        print(f"{scoring}: {round(cv_results['test_score'].mean(), 4)} ({name}) ")

In [79]:
base_models(X, y, scoring="accuracy")


Base Models....
accuracy: 0.8523 (LR) 
accuracy: 0.8597 (KNN) 
accuracy: 0.845 (SVC) 
accuracy: 0.812 (CART) 




accuracy: 0.8781 (RF) 
accuracy: 0.834 (Adaboost) 
accuracy: 0.8561 (GBM) 
accuracy: 0.8488 (XGBoost) 




accuracy: 0.845 (LightGBM) 


In [80]:

rf_model = RandomForestClassifier()

In [81]:
import numpy as np
rf_random_params = {"max_depth": np.random.randint(5, 50, 10),
                    "max_features": [3, 5, 7, "auto", "sqrt"],
                    "min_samples_split": np.random.randint(2, 50, 20),
                    "n_estimators": [int(x) for x in np.linspace(start=200, stop=1500, num=10)]}

In [82]:
rf_random = RandomizedSearchCV(estimator=rf_model,
                               param_distributions=rf_random_params,
                               n_iter=100,  # denenecek parametre sayısı
                               cv=3,
                               verbose=True,
                               random_state=42,
                               n_jobs=-1)

In [83]:
rf_random.fit(X, y)


Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'max_depth': array([ 9, 10, 21, 16, 26, 46, 17, 32, 41, 10]),
                                        'max_features': [3, 5, 7, 'auto',
                                                         'sqrt'],
                                        'min_samples_split': array([30, 42, 23, 28, 35, 19, 30, 11,  6, 14, 37, 36, 46, 48, 37, 30, 23,
       11, 20,  4]),
                                        'n_estimators': [200, 344, 488, 633,
                                                         777, 922, 1066, 1211,
                                                         1355, 1500]},
                   random_state=42, verbose=True)

In [84]:
rf_random.best_params_




{'n_estimators': 1500,
 'min_samples_split': 30,
 'max_features': 'sqrt',
 'max_depth': 32}

In [85]:
rf_random_final = rf_model.set_params(**rf_random.best_params_).fit(X, y)



In [86]:
cv_results = cross_validate(rf_random_final, X, y, cv=5, scoring=["accuracy", "f1", "roc_auc"])

In [87]:
cv_results['test_accuracy'].mean()


0.8781144781144782

In [88]:
cv_results['test_f1'].mean()


0.5842366712571316

In [89]:
cv_results['test_roc_auc'].mean()

0.9009161381254405