# Setup and Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Data Loading and Initial Exploration

## Load Data

In [None]:
# Read the training set from train.csv in the current working directory.
df = pd.read_csv('train.csv')

## Preview Data

In [None]:
# Show the first rows as a quick sanity check of the loaded columns.
df.head()

## Basic Information

In [None]:
# (number of rows, number of columns) of the training set.
df.shape

In [None]:
# Summary statistics of the columns.
df.describe()

In [None]:
# Column dtypes and non-null counts; useful for spotting missing values.
df.info()

# Exploratory Data Analysis (EDA)

## Correlation Analysis

In [None]:
# Lower-triangle correlation heatmap, adapted from the seaborn gallery example:
# https://seaborn.pydata.org/examples/many_pairwise_correlations.html
corr = df.corr(numeric_only=True)

# Mask the upper triangle (diagonal included); it only mirrors the lower one.
mask = np.triu(np.ones_like(corr, dtype=bool))

f, ax = plt.subplots(figsize=(15, 13))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
_ = sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmax=0.3,
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.5},
)

## Data Cleaning

In [None]:
# Peek at the object-dtype (text) columns, which still need numeric encoding.
df.select_dtypes(include='object').head()

### Caps / Goals Column Separation

In [None]:
# Missing "Caps / Goals" entries are treated as zero caps and zero goals.
# The filled column is assigned back instead of calling fillna(inplace=True)
# on df[...]: that chained in-place call operates on an intermediate object,
# emits a FutureWarning in pandas 2.x and leaves df unchanged under
# Copy-on-Write.
df['Caps / Goals'] = df['Caps / Goals'].fillna('0 / 0')

In [None]:
df['Caps'] = df['Caps / Goals'].str.split('/').apply(lambda x: int(x[0]))

In [None]:
df['Goals'] = df['Caps / Goals'].str.split('/').apply(lambda x: int(x[1]))

In [None]:
del df['Caps / Goals']

### Height and Weight Columns' Conversion to Integer

In [None]:
df['Height'] = df['Height'].replace('CM', '', regex=True).astype(int)

In [None]:
df['Weight'] = df['Weight'].replace('KG', '', regex=True).astype(int)

### Foot Column is Made Numeric

In [None]:
df['Foot'].value_counts()

In [None]:
df['Foot'] = df['Foot'].map({'Left': 0, 'Right': 1})

### Positions Column Is One-Hot Encoded

In [None]:
positions = []

In [None]:
for x in df['Positions'].str.split().values:
    positions.extend(x)

In [None]:
unique_positions = list(set(positions))

In [None]:
unique_positions

In [None]:
position_matrix = []

In [None]:
for row in df.iterrows():
    row_positions = row[1]['Positions'].split()
    dict_positions = {x:0 for x in unique_positions}
    for position in row_positions:
        dict_positions[position] += 1
        position_matrix.append({row[1]['id']: dict_positions})

In [None]:
flattened_data = []

for item in position_matrix:
    for id_value, attributes in item.items():
        row = {'id': id_value}
        row.update(attributes)
        flattened_data.append(row)

positions_df = pd.DataFrame(flattened_data)

In [None]:
positions_df = positions_df.drop_duplicates()

In [None]:
df = df.merge(positions_df, on='id', how='left')

In [None]:
df = df.drop(columns=['Positions'])

### Filling Null Values with 0

In [None]:
# Replace every remaining missing value with 0. This also covers 'Foot'
# entries that were neither 'Left' nor 'Right' (NaN after the mapping above),
# which therefore end up with the same code as 'Left'.
df = df.fillna(0)

In [None]:
# Re-check the first rows after cleaning.
df.head()

In [None]:
# Re-check dtypes and non-null counts after cleaning.
df.info()

# Model Development

## Class Imbalance Check

In [None]:
# Relative frequency of each target class, to gauge how imbalanced they are.
df.value_increased.value_counts(normalize=True)

### Train and Test Split

In [None]:
X = df.drop(columns = ['id', 'value_increased'])
y = df['value_increased'].astype(int)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,stratify=y, test_size=0.3, random_state=42)

In [None]:
X_train.head()

In [None]:
y_train.head()

## XGBoost

In [None]:
import xgboost as xgb

# Carve a validation set out of the training data for early stopping. Using
# the test split for early stopping would leak it into model selection and
# make the classification report on that same split optimistic.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, stratify=y_train, test_size=0.2, random_state=42)

# Use "hist" for constructing the trees, with early stopping enabled.
# NOTE(review): scale_pos_weight=9 presumably mirrors the negative/positive
# class ratio shown earlier — confirm against the value_counts output.
clf = xgb.XGBClassifier(tree_method="hist", scale_pos_weight=9, early_stopping_rounds=2)
# Fit the model; the validation split (not the test split) drives early stopping.
clf.fit(X_fit, y_fit, eval_set=[(X_val, y_val)])
# Save model into JSON format.
clf.save_model("clf.json")

## Classification Report

In [None]:
from sklearn.metrics import classification_report

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

In [None]:
y_pred

## Preprocess Pipeline Preparation

In [None]:
def prep_pipe(data, positions=None):
    """Apply the training-time cleaning steps to a raw player table.

    Args:
        data: DataFrame with the raw columns 'id', 'Caps / Goals', 'Height',
            'Weight', 'Foot' and 'Positions'. It is copied, never modified.
        positions: Optional iterable of position labels that become the
            count columns, in this order. Defaults to the module-level
            ``unique_positions`` collected from the training data.

    Returns:
        A new DataFrame without 'id', 'Caps / Goals' and 'Positions'. It has
        integer 'Caps', 'Goals', 'Height' and 'Weight' columns, 'Foot'
        encoded as 0 ('Left') / 1 ('Right'), one count column per position
        label, and every remaining missing value replaced by 0.

    Position labels that are not in ``positions`` are ignored, and a missing
    'Positions' value counts as "no positions", so unseen or absent labels
    do not abort the whole transformation.
    """
    if positions is None:
        # Fall back to the vocabulary built from the training data.
        positions = unique_positions
    position_columns = list(positions)

    df = data.copy()

    # Missing "Caps / Goals" entries count as zero caps and zero goals. The
    # filled values are used directly instead of fillna(inplace=True) on
    # df[...], which is a chained in-place call that does nothing under
    # pandas Copy-on-Write.
    caps_goals = df['Caps / Goals'].fillna('0 / 0').str.split('/')
    df['Caps'] = caps_goals.apply(lambda x: int(x[0]))
    df['Goals'] = caps_goals.apply(lambda x: int(x[1]))
    del df['Caps / Goals']

    # Strip the unit suffixes and parse the remaining digits.
    df['Height'] = df['Height'].replace('CM', '', regex=True).astype(int)
    df['Weight'] = df['Weight'].replace('KG', '', regex=True).astype(int)

    # Labels other than 'Left'/'Right' become NaN here and 0 after fillna.
    df['Foot'] = df['Foot'].map({'Left': 0, 'Right': 1})

    # One record per input row: the id plus a count for every known position.
    encoded_rows = []
    for player_id, raw_positions in zip(df['id'], df['Positions']):
        counts = dict.fromkeys(position_columns, 0)
        labels = raw_positions.split() if isinstance(raw_positions, str) else []
        for label in labels:
            if label in counts:
                counts[label] += 1
        encoded_rows.append({'id': player_id, **counts})

    # Explicit columns keep the frame well-formed even when there are no rows.
    positions_df = pd.DataFrame(encoded_rows, columns=['id', *position_columns])
    positions_df = positions_df.drop_duplicates()

    df = df.merge(positions_df, on='id', how='left')

    df = df.drop(columns=['Positions', 'id'])

    df = df.fillna(0)

    return df

## Test Data Preparation

In [None]:
# Read the raw test set from test.csv in the current working directory.
test_df = pd.read_csv('test.csv')

In [None]:
test_df.head()

In [None]:
# Run the test set through the same cleaning steps as the training data.
# prep_pipe drops the 'id' column, so test_df carries no row identifiers
# after this cell.
test_df = prep_pipe(test_df)

In [None]:
## PyCaret

from pycaret.classification import ClassificationExperiment
# Cleaned training table without the row identifier; the target stays in.
data = df.drop(columns=['id'])
s = ClassificationExperiment()
# NOTE(review): session_id presumably fixes the experiment's random seed —
# confirm against the PyCaret setup() documentation.
s.setup(data, target = 'value_increased',session_id = 123)

In [None]:
# Compare candidate models, ranked by F1, keeping up to 16 of them.
best = s.compare_models(n_select=16,sort='f1')

In [None]:
# NOTE(review): index 1 selects the second entry of the returned list, not
# the first — confirm that skipping the first-ranked model is intended.
best = best[1]

## Model Tuning

In [None]:
# Tune the selected model's hyperparameters, optimising for F1.
tuned_model = s.tune_model(best,optimize='f1')

## Model Finalization

In [None]:
# Finalize the tuned model so that the tuning step actually takes effect;
# finalizing the untuned `best` estimator left `tuned_model` unused.
final_model = s.finalize_model(tuned_model)