This script performs data preprocessing, following these steps:
1. drop irrelevant columns (e.g. "id", "url"...)
2. handle null values
3. feature engineering
4. categorical encoding
5. nonlinear transformations
6. feature standardisation

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the train and test datasets (not preprocessed yet)
# insert the correct paths to the datasets!
raw_train_path = '../../data/train.csv'
raw_test_path = '../../data/test.csv'

train = pd.read_csv(f"{raw_train_path}", index_col=0)
test = pd.read_csv(f'{raw_test_path}', index_col=0)

In [3]:
# define the output path (the folder where you want to save the preprocessed data)
preprocessed_folder = "../../data/preprocessed/"

0. Handle column names

In [4]:
# Standardize column names (lowercase, snake_case)

def clean_cols(df):
    df.columns = (df.columns.str.strip().str.lower().str.replace(r"[ \-]+", "_", regex=True))
    return df

train = clean_cols(train)
test = clean_cols(test)

1. Drop irrelevant columns

In [5]:
# Drop columns that are not needed for the analysis
columns_to_drop = [
    "id",
    "sofifa_id",
    "short_name",
    "long_name",
    "player_url",
    "player_face_url",
    "club_logo_url",
    "nation_flag_url",
    "team_jersey_number",
    "nation_jersey_number",
    "club_jersey_number",
    "club_loaned_from",
    "real_face",
    "player_tags",
    "player_traits",
    "club_team_id",
    "nationality_id",
    "dob"
]
train.drop(columns=columns_to_drop, inplace=True, errors='ignore')
test.drop(columns=columns_to_drop, inplace=True, errors='ignore')

2. Handle null values:
- drop all obaservations for which the label is None
- drop columns with >90% null values
- numeric: set to 0 and add a flag for all other null values (informative nulls)
- categorical: set to "Unknown"

We are using missing flags because for variables with >5% null rate, missing is not due to measurement errors, but is structural (= informative). It is important especially for tree-based models which can create splits based on the missing flags.

In [6]:
# Drop rows with missing target & log-transform target once
train.dropna(subset=["value_eur"], inplace=True)
train["log_value_eur"] = np.log1p(train["value_eur"])

In [7]:
# see columns with the largest proportion of missing values
pd.set_option('display.max_rows', None)
missing_values = train.isnull().mean().sort_values(ascending=False)
print(missing_values)
pd.reset_option('display.max_rows')

nation_position                0.963738
goalkeeping_speed              0.887628
physic                         0.112372
defending                      0.112372
pace                           0.112372
shooting                       0.112372
passing                        0.112372
dribbling                      0.112372
release_clause_eur             0.057132
club_joined                    0.057132
player_positions               0.000000
power_strength                 0.000000
power_jumping                  0.000000
power_shot_power               0.000000
movement_balance               0.000000
movement_reactions             0.000000
movement_agility               0.000000
movement_sprint_speed          0.000000
movement_acceleration          0.000000
power_stamina                  0.000000
mentality_aggression           0.000000
power_long_shots               0.000000
skill_long_passing             0.000000
mentality_interceptions        0.000000
mentality_positioning          0.000000


In [8]:
# show lines for which shooting is null and position is not "GK"
print(train[train["shooting"].isnull() & (train["player_positions"] != "GK")].head(10))

# so all the players with null shooting, defending, pace ... are goalkeepers --> informative nulls

Empty DataFrame
Columns: [player_positions, overall, potential, value_eur, wage_eur, age, height_cm, weight_kg, club_name, league_name, league_level, club_position, club_joined, club_contract_valid_until, nationality_name, nation_position, preferred_foot, weak_foot, skill_moves, international_reputation, work_rate, body_type, release_clause_eur, pace, shooting, passing, dribbling, defending, physic, attacking_crossing, attacking_finishing, attacking_heading_accuracy, attacking_short_passing, attacking_volleys, skill_dribbling, skill_curve, skill_fk_accuracy, skill_long_passing, skill_ball_control, movement_acceleration, movement_sprint_speed, movement_agility, movement_reactions, movement_balance, power_shot_power, power_jumping, power_stamina, power_strength, power_long_shots, mentality_aggression, mentality_interceptions, mentality_positioning, mentality_vision, mentality_penalties, mentality_composure, defending_marking_awareness, defending_standing_tackle, defending_sliding_tackle,

In [9]:
# drop columns with too many missing values
new_columns_to_drop = ["nation_position"]
train.drop(columns=new_columns_to_drop, inplace=True, errors='ignore')
test.drop(columns=new_columns_to_drop, inplace=True, errors='ignore')

In [10]:
# handle missing values

def handle_missing(train, test):
    train = train.copy()
    test  = test.copy()

    # Numeric: any missing → flag + fill 0
    num_cols = train.select_dtypes(include=["int64","float64"]).columns
    num_missing = {c: 0 for c in num_cols if train[c].isnull().any()}
    flag_cols = {c: train[c].isnull().astype(int) for c in num_cols if train[c].isnull().any()}

    train.fillna(num_missing, inplace=True)
    test.fillna(num_missing, inplace=True)

    for col_name, data in flag_cols.items():
        train[f'{col_name}_missing_flag'] = data
        if col_name in test.columns:
            test[f'{col_name}_missing_flag'] = test[col_name].isnull().astype(int)

    # Categorical: fill with 'Unknown'
    cat_cols = train.select_dtypes(include="object").columns.intersection(test.columns)
    cat_missing = {c: 'Unknown' for c in cat_cols if train[c].isnull().any()}

    train.fillna(cat_missing, inplace=True)
    test.fillna(cat_missing, inplace=True)

    return train, test

train, test = handle_missing(train, test)

3. Feature Engineering
Added these features:
- position_group: "Attacker", "Midfielder", "Defender", "Goalkeeper"
- body mass index (bmi): weight / (height/100)**2
- contract_remaining
- years_at_club
- priority: 1 if the player is a reserve, 2 if substitute, 3 if in the starting team
- attack_work_rate and defense_work_rate: 1 for low, 2 for medium, 3 for high

In [11]:
# Feature engineering (position, BMI, contract, years at club)
def categorize_position(pos):
    if pd.isna(pos): return "Unknown"
    pos = pos.lower()
    if "gk" in pos:   return "Goalkeeper"
    if any(p in pos for p in ["cb","rb","lb","rwb","lwb"]): return "Defender"
    if any(p in pos for p in ["cm","cdm","cam","rm","lm"]):    return "Midfielder"
    if any(p in pos for p in ["st","cf","rw","lw"]):          return "Attacker"
    return "Unknown"

for df in (train, test):
    df["position_group"] = df["player_positions"].apply(categorize_position)
    df["bmi"] = df["weight_kg"] / (df["height_cm"]/100)**2

if "club_contract_valid_until" in train.columns:
    min_year = train["club_contract_valid_until"].min()
    train["contract_remaining"] = train["club_contract_valid_until"] - min_year
    test ["contract_remaining"] = test["club_contract_valid_until"]  - min_year

if "club_joined" in train.columns:
    for df in (train, test):
        df["joined_date"] = pd.to_datetime(df["club_joined"], errors="coerce")
    ref = pd.Timestamp("2022-01-01")
    train["years_at_club"] = (ref - train["joined_date"]).dt.days/365.25
    test ["years_at_club"] = (ref -  test["joined_date"]).dt.days/365.25
    train["years_at_club"].fillna(0, inplace=True)
    test ["years_at_club"].fillna(0, inplace=True)

# add an ordinal variable that says whether the player is starting or not and whether he is a sub
def categorize_priority(pos):
    pos = str(pos).lower()
    if "res" in pos:
        return 1
    elif "sub" in pos:
        return 2
    else:
        return 3

train["priority"] = train["club_position"].apply(categorize_priority)
test["priority"] = test["club_position"].apply(categorize_priority)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train["years_at_club"].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test ["years_at_club"].fillna(0, inplace=True)


In [12]:
# print frequency table of the variable "priority"
print(train["priority"].value_counts())

priority
2    6633
3    6165
1    2535
Name: count, dtype: int64


In [13]:
# Drop intermediate columns not needed anymore
train.drop(columns=["player_positions","club_contract_valid_until","club_joined","joined_date", "club_position"], inplace=True, errors='ignore')
test.drop(columns=["player_positions","club_contract_valid_until","club_joined","joined_date", "club_position"], inplace=True, errors='ignore')

4. Categorical encoding
- integer encoding for categorical features with an ordinal interpretation (e.g. work_rate or priority as discussed above)
- one-hot encoding for low cardinality features (<15 classes)
- cross-validation target encoding for high-cardinality features

In [14]:
# print all categorical columns, along with its cardinality and unique values
for col in train.select_dtypes(include=["object"]).columns:
    print(f"{col}: {train[col].nunique()} unique values")
    print(train[col].unique())
    print()

club_name: 701 unique values
['SV Zulte Waregem' 'Huddersfield Town' 'Middlesbrough' 'Gangwon FC'
 'Ross County FC' 'SC Braga' 'Demir Grup Sivasspor' 'Sunderland'
 'Houston Dynamo' 'Al Nassr' 'Nashville SC' 'Fulham' 'Portsmouth'
 'Sevilla FC' 'Göztepe SK' 'Sagan Tosu' 'Unión Deportiva Las Palmas'
 'Club Atlético Aldosivi' 'IFK Göteborg' 'FC Nordsjælland' 'Sporting CP'
 '12 de Octubre FC' 'Sporting Kansas City' 'SK Sturm Graz'
 'BSC Young Boys' 'FC Viktoria Plzeň' 'Djurgårdens IF' 'OGC Nice'
 'Atlético Nacional' 'CS Mioveni' 'Yeni Malatyaspor' 'Livingston FC'
 'Hajduk Split' 'Getafe CF' 'RSC Anderlecht' 'SD Eibar'
 'RCD Espanyol de Barcelona' 'CD Leganés' 'Cambridge United'
 'Burton Albion' 'Carlos A. Mannucci' 'Santos' 'TSG Hoffenheim' 'Damac FC'
 'Rosario Central' 'FC Barcelona' 'Boca Juniors' 'Gamba Osaka'
 'Sport Club Corinthians Paulista' 'Gimnasia y Esgrima La Plata'
 'Radomiak Radom' 'LOSC Lille' 'Warta Poznań' 'Leyton Orient' 'Al Raed'
 'Napoli' 'Mansfield Town' 'Górnik Łęczna' 

In [15]:
# define two separate mappings for the variable "work_rate", the first one is defense_wr, the second one is attack_wr.
def split_and_encode_work_rate(df):
    # Define the ordinal mapping
    rate_map = {'Low': 0, 'Medium': 1, 'High': 2}

    # Split the work_rate column
    work_split = df['work_rate'].str.split('/', expand=True)
    df['defense_work_rate'] = work_split[0].map(rate_map)
    df['attack_work_rate'] = work_split[1].map(rate_map)

    df.drop(columns=['work_rate'], inplace=True)

    return df

train = split_and_encode_work_rate(train)
test = split_and_encode_work_rate(test)

train.head()

Unnamed: 0,overall,potential,value_eur,wage_eur,age,height_cm,weight_kg,club_name,league_name,league_level,...,defending_missing_flag,physic_missing_flag,goalkeeping_speed_missing_flag,position_group,bmi,contract_remaining,years_at_club,priority,defense_work_rate,attack_work_rate
0,58,76,525000.0,952.712115,18,180,70,SV Zulte Waregem,Belgian Jupiler Pro League,1.0,...,0,0,1,Attacker,21.604938,1.0,1.333333,2,1,1
1,55,75,350000.0,902.232819,19,179,68,Huddersfield Town,English League Championship,2.0,...,0,0,1,Midfielder,21.222808,3.0,2.42026,1,2,1
2,53,66,230000.0,2873.818374,21,180,65,Middlesbrough,English League Championship,2.0,...,0,0,1,Midfielder,20.061728,1.0,3.504449,1,1,1
3,60,71,475000.0,678.608124,21,185,86,Gangwon FC,Korean K League 1,1.0,...,1,1,0,Goalkeeper,25.127831,1.0,2.992471,2,1,1
4,63,65,475000.0,1911.674228,28,185,70,Ross County FC,Scottish Premiership,1.0,...,0,0,1,Defender,20.452885,2.0,0.353183,2,1,1


In [16]:
# encoding for categorical features
def encode_categoricals(train, test, target_col, low_card_thresh=15, n_splits=5, random_state=42):
    """
    Encodes categorical features in train and test sets.
    - One-hot encodes low-cardinality columns
    - Cross-validated target encodes high-cardinality columns

    Returns:
        train_encoded, test_encoded
    """
    train = train.copy()
    test = test.copy()

    cat_cols = train.select_dtypes(include='object').columns
    low_card = [col for col in cat_cols if train[col].nunique() <= low_card_thresh]
    high_card = [col for col in cat_cols if train[col].nunique() > low_card_thresh]

    # --- One-hot encoding (low cardinality)
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    train_ohe = pd.DataFrame(ohe.fit_transform(train[low_card]),
                             columns=ohe.get_feature_names_out(low_card),
                             index=train.index)
    test_ohe = pd.DataFrame(ohe.transform(test[low_card]),
                             columns=ohe.get_feature_names_out(low_card),
                             index=test.index)

    # --- CV Target encoding (high cardinality)
    global_mean = train[target_col].mean()
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    for col in high_card:
        col_encoded = pd.Series(index=train.index, dtype=float)

        for train_idx, val_idx in kf.split(train):
            fold_train, fold_val = train.iloc[train_idx], train.iloc[val_idx]
            means = fold_train.groupby(col)[target_col].mean()
            encoded_vals = fold_val[col].map(means).fillna(global_mean)
            col_encoded.iloc[val_idx] = encoded_vals

        train[col + "_te"] = col_encoded
        full_means = train.groupby(col)[target_col].mean()
        test[col + "_te"] = test[col].map(full_means).fillna(global_mean)

    # Drop original categorical columns
    train.drop(columns=cat_cols, inplace=True)
    test.drop(columns=cat_cols, inplace=True)

    # Combine with encoded features
    train_encoded = pd.concat([train, train_ohe], axis=1)
    test_encoded = pd.concat([test, test_ohe], axis=1)

    return train_encoded, test_encoded


train_encoded, test_encoded = encode_categoricals(train, test, target_col="value_eur")

In [17]:
# Display all columns in the DataFrame
pd.set_option('display.max_columns', None)       # Show all columns
pd.set_option('display.expand_frame_repr', False)  # Prevent line wrapping
pd.set_option('display.max_colwidth', None)      # Show full content in each column (if it's a string)

train_encoded.head()

Unnamed: 0,overall,potential,value_eur,wage_eur,age,height_cm,weight_kg,league_level,weak_foot,skill_moves,international_reputation,release_clause_eur,pace,shooting,passing,dribbling,defending,physic,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,skill_long_passing,skill_ball_control,movement_acceleration,movement_sprint_speed,movement_agility,movement_reactions,movement_balance,power_shot_power,power_jumping,power_stamina,power_strength,power_long_shots,mentality_aggression,mentality_interceptions,mentality_positioning,mentality_vision,mentality_penalties,mentality_composure,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,goalkeeping_speed,log_value_eur,release_clause_eur_missing_flag,pace_missing_flag,shooting_missing_flag,passing_missing_flag,dribbling_missing_flag,defending_missing_flag,physic_missing_flag,goalkeeping_speed_missing_flag,bmi,contract_remaining,years_at_club,priority,defense_work_rate,attack_work_rate,club_name_te,league_name_te,nationality_name_te,preferred_foot_Left,preferred_foot_Right,body_type_Lean (170-),body_type_Lean (170-185),body_type_Lean (185+),body_type_Normal (170-),body_type_Normal (170-185),body_type_Normal (185+),body_type_Stocky (170-),body_type_Stocky (170-185),body_type_Stocky (185+),body_type_Unique,position_group_Attacker,position_group_Defender,position_group_Goalkeeper,position_group_Midfielder
0,58,76,525000.0,952.712115,18,180,70,1.0,3,2,1,1100000.0,79.0,58.0,46.0,63.0,20.0,62.0,46,62,54,53,51,64,43,44,31,63,78,80,74,39,68,56,73,59,68,53,46,15,48,47,58,43,13,19,17,7,6,8,10,7,0.0,13.171155,0,0,0,0,0,0,0,1,21.604938,1.0,1.333333,2,1,1,1606250.0,2145450.0,4682778.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,55,75,350000.0,902.232819,19,179,68,2.0,2,2,1,998000.0,66.0,38.0,53.0,55.0,45.0,51.0,37,34,51,64,36,53,33,38,58,56,67,65,59,54,61,42,50,65,45,39,51,43,53,56,46,48,40,48,45,14,8,5,14,11,0.0,12.765691,0,0,0,0,0,0,0,1,21.222808,3.0,2.42026,1,2,1,1064583.0,1772780.0,2738451.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,53,66,230000.0,2873.818374,21,180,65,2.0,3,2,1,581000.0,65.0,51.0,52.0,55.0,26.0,41.0,57,50,39,57,45,54,43,37,46,56,67,63,57,42,71,59,45,40,45,44,31,23,48,51,55,52,26,23,31,14,8,13,9,6,0.0,12.345839,0,0,0,0,0,0,0,1,20.061728,1.0,3.504449,1,1,1,1557647.0,1677065.0,2742315.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,60,71,475000.0,678.608124,21,185,86,1.0,2,1,1,689000.0,0.0,0.0,0.0,0.0,0.0,0.0,10,8,11,25,7,11,12,13,24,10,20,28,37,49,43,41,57,26,64,8,27,12,6,33,14,26,8,14,12,60,60,54,63,61,23.0,13.071072,0,1,1,1,1,1,1,0,25.127831,1.0,2.992471,2,1,1,567352.9,755292.8,1308178.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,63,65,475000.0,1911.674228,28,185,70,1.0,4,2,1,867000.0,63.0,40.0,51.0,55.0,62.0,71.0,47,42,61,59,27,52,39,21,57,54,61,64,63,60,62,46,75,72,75,33,61,61,40,49,32,67,62,63,62,11,12,6,11,14,0.0,13.071072,0,0,0,0,0,0,0,1,20.452885,2.0,0.353183,2,1,1,593333.3,1309190.0,2738451.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


5. Nonlinear transformations
- added log transformation of: "wage_eur", "release_clause_eur", "overall", "international_reputation", "potential", given their highly nonlinear relationship with the target observed in the EDA
- added square for "age", given the typical quadratic relationship between age and wage (so we supposed value for players), observed in many domains

In [18]:
# Log-transform & align
def transform_and_align(train_df, test_df, log_transform_cols, sq_transform_cols):
    t = train_df.copy()
    s = test_df.copy()

    # single log1p pass
    for c in log_transform_cols:
        if c in t.columns:
            t[f"log_{c}"] = np.log1p(t[c])
            s[f"log_{c}"] = np.log1p(s[c])
    
    # single square pass
    for c in sq_transform_cols:
        if c in t.columns:
            t[f"sq_{c}"] = np.square(t[c])
            s[f"sq_{c}"] = np.square(s[c])

    # ensure exact column match
    t_aligned, s_aligned = t.align(s, join="inner", axis=1)
    return t_aligned, s_aligned

log_cols = ["wage_eur", "release_clause_eur", "overall", "international_reputation", "potential"]
squared_cols = ['age']

X_train, X_test = transform_and_align(train_encoded, test_encoded, log_cols, squared_cols)

6. Feature standardisation

In [19]:
# Scaling
num_feats = (X_train.select_dtypes(include=["float64","int64"])
            .columns
            .difference(["value_eur","log_value_eur"]))

scaler = StandardScaler()
X_train[num_feats] = scaler.fit_transform(X_train[num_feats])
X_test [num_feats] = scaler.transform(X_test[num_feats])

In [20]:
X_test.head()

Unnamed: 0,overall,potential,wage_eur,age,height_cm,weight_kg,league_level,weak_foot,skill_moves,international_reputation,release_clause_eur,pace,shooting,passing,dribbling,defending,physic,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,skill_long_passing,skill_ball_control,movement_acceleration,movement_sprint_speed,movement_agility,movement_reactions,movement_balance,power_shot_power,power_jumping,power_stamina,power_strength,power_long_shots,mentality_aggression,mentality_interceptions,mentality_positioning,mentality_vision,mentality_penalties,mentality_composure,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,goalkeeping_speed,release_clause_eur_missing_flag,pace_missing_flag,shooting_missing_flag,passing_missing_flag,dribbling_missing_flag,defending_missing_flag,physic_missing_flag,goalkeeping_speed_missing_flag,bmi,contract_remaining,years_at_club,priority,defense_work_rate,attack_work_rate,club_name_te,league_name_te,nationality_name_te,preferred_foot_Left,preferred_foot_Right,body_type_Lean (170-),body_type_Lean (170-185),body_type_Lean (185+),body_type_Normal (170-),body_type_Normal (170-185),body_type_Normal (185+),body_type_Stocky (170-),body_type_Stocky (170-185),body_type_Stocky (185+),body_type_Unique,position_group_Attacker,position_group_Defender,position_group_Goalkeeper,position_group_Midfielder,log_wage_eur,log_release_clause_eur,log_overall,log_international_reputation,log_potential,sq_age
0,-0.399837,0.313192,-0.360151,-0.885229,-0.19532,-0.707795,2.222693,0.084566,-0.452831,-0.255067,-0.205446,0.313836,0.260414,0.498294,0.390931,0.634876,0.242034,0.142349,0.209723,-0.09931,0.495498,0.313285,0.341603,0.322595,0.106758,0.732449,0.33643,0.42423,0.09096,0.846831,-0.15741,0.418884,-0.21008,-0.81375,0.432548,-0.631852,0.483598,1.027436,0.749687,0.292463,0.66641,0.012232,0.256752,0.81733,0.615566,0.588403,-0.253469,-0.370439,-0.308272,-0.1352,-0.199549,-0.339543,-0.246157,-0.355806,-0.355806,-0.355806,-0.355806,-0.355806,-0.355806,-2.810518,-0.851748,-0.629817,1.700635,-1.729491,-0.447309,-0.180667,-0.416749,-0.59257,0.532647,-0.556835,0.556835,-0.158336,1.933079,-0.336188,-0.189588,-0.733952,-0.529545,-0.069165,-0.169252,-0.139818,-0.08578,-0.410591,-0.76387,-0.355806,1.291331,-0.349134,0.263575,-0.352854,-0.266567,0.350558,-0.867153
1,-1.270429,-0.831311,-0.437997,-0.674002,0.534631,0.565339,-0.471848,-1.405845,-0.452831,-0.255067,-0.294053,-0.273081,-0.257965,-0.186921,-0.390646,0.411049,0.197551,-0.576297,0.007945,-0.09931,-0.190262,-0.418988,-0.824851,-0.609748,-0.417037,-0.262493,-0.379347,-0.563388,-0.843775,-0.698019,-0.15741,-0.136993,-1.041852,-0.316821,-0.678591,-0.002979,-0.953599,0.617038,0.653177,-0.16371,-0.867209,-0.430403,-1.546067,0.37271,0.380683,0.39621,-0.08356,-0.311334,-0.368402,-0.485147,-0.533431,-0.339543,-0.246157,-0.355806,-0.355806,-0.355806,-0.355806,-0.355806,-0.355806,-2.810518,0.216912,0.195707,-0.523896,-0.331069,-0.447309,-0.180667,-0.44869,-0.538179,-0.814476,-0.556835,0.556835,-0.158336,-0.51731,-0.336188,-0.189588,-0.733952,1.888415,-0.069165,-0.169252,-0.139818,-0.08578,-0.410591,1.309124,-0.355806,-0.774395,-1.365828,0.001547,-1.29752,-0.266567,-0.817002,-0.694521
2,-1.995923,0.640193,-0.438558,-1.941363,-0.487301,-0.141958,-0.471848,-1.405845,-0.452831,-0.255067,-0.308624,0.816909,0.260414,-0.284809,-0.298696,-0.931914,-0.380732,-0.189334,0.209723,-0.789106,-0.601718,0.369614,-0.400686,-0.445217,-0.649835,-0.992118,-0.737236,1.016801,1.025695,0.040823,-3.027416,-0.345447,0.394846,-1.724786,-2.036649,-0.081588,0.072971,-1.141814,-1.132256,-0.011653,-0.575091,0.138699,-1.054389,-1.208163,-1.169541,-0.853046,-0.310105,-0.193125,-0.608924,-0.543471,-0.199549,-0.339543,-0.246157,-0.355806,-0.355806,-0.355806,-0.355806,-0.355806,-0.355806,-2.810518,0.414425,-1.45534,-0.487406,-1.729491,-0.447309,-0.180667,-0.312991,-0.334375,0.689248,-0.556835,0.556835,-0.158336,-0.51731,-0.336188,-0.189588,1.362487,-0.529545,-0.069165,-0.169252,-0.139818,-0.08578,2.435516,-0.76387,-0.355806,-0.774395,-1.381836,-0.080858,-2.162642,-0.266567,0.663891,-1.609873
3,-0.399837,-0.013809,-0.412963,-0.885229,-0.19532,0.42388,0.875423,0.084566,0.850823,-0.255067,-0.205446,0.607295,0.590292,0.596182,0.390931,-0.842383,-0.113832,0.971556,0.865501,-0.09931,0.358346,0.5386,0.553686,0.925876,0.106758,0.533461,0.33643,0.687595,0.691861,0.242325,-1.150874,-0.067509,0.84854,-2.718644,-0.123021,-0.081588,-0.337657,-1.434956,-1.37353,0.140405,0.593381,1.150435,-0.398818,-1.455174,-0.605823,-0.708901,-0.140196,-0.25223,-0.248141,-0.426822,-0.36649,-0.339543,-0.246157,-0.355806,-0.355806,-0.355806,-0.355806,-0.355806,-0.355806,-2.810518,0.934026,-0.629817,0.904391,-1.729491,1.433817,-0.180667,-0.351951,-0.52783,-0.747116,-0.556835,0.556835,-0.158336,-0.51731,-0.336188,-0.189588,1.362487,-0.529545,-0.069165,-0.169252,-0.139818,-0.08578,2.435516,-0.76387,-0.355806,-0.774395,-0.870791,0.263575,-0.352854,-0.266567,0.028638,-0.867153
4,-0.399837,-1.158312,-0.412535,-0.251549,1.410573,-0.000498,-0.471848,0.084566,-0.452831,-0.255067,-0.281771,0.271914,-0.540717,0.057799,-0.06882,0.679642,0.642383,0.363471,-0.698277,0.475519,0.015466,-0.644303,-0.082562,-0.50006,-0.300638,0.201813,-0.319699,0.029183,0.224494,-0.630851,-0.267795,-0.970809,-1.344315,0.26293,0.617738,0.861722,-0.850942,0.265267,0.653177,0.089719,-0.867209,-0.367169,-0.152979,0.718525,0.662542,0.684499,-0.196833,-0.134021,-0.248141,-0.426822,-0.589078,-0.339543,-0.246157,-0.355806,-0.355806,-0.355806,-0.355806,-0.355806,-0.355806,-2.810518,-1.608466,-0.629817,-0.28411,-0.331069,-0.447309,-2.17073,-0.425143,-0.424231,-0.143172,1.795865,-1.795865,-0.158336,-0.51731,-0.336188,-0.189588,-0.733952,1.888415,-0.069165,-0.169252,-0.139818,-0.08578,-0.410591,1.309124,-0.355806,-0.774395,-0.864661,0.056037,-0.352854,-0.266567,-1.17307,-0.325168


In [21]:
# Export datasets
out_train = X_train.copy()
out_train[["value_eur", "log_value_eur"]] = train[["value_eur", "log_value_eur"]]

out_train.to_csv(f"{preprocessed_folder}/training.csv",index=False)
out_test = X_test.copy()
out_test.to_csv(f"{preprocessed_folder}/testing.csv",index=False)