In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
import warnings

In [2]:
# Show every column when a DataFrame is displayed (no truncation)
pd.options.display.max_columns = None
# Silence pandas' DtypeWarning messages
warnings.filterwarnings(action = 'ignore', category = pd.errors.DtypeWarning)

In [3]:
def clean(X, y, X_train):
    """Coerce, filter and impute the feature frame ``X``, keeping ``y`` in step.

    Steps, in order (each one only touches columns actually present in ``X``):
      1. Convert rank / set-score columns that were read as strings to numbers;
         values that cannot be parsed become NaN.
      2. Drop every row missing 'Best of', 'P1 Rank' or 'P2 Rank', drop the same
         index labels from ``y``, then re-index both from 0.
      3. Fill missing set scores with 0.
      4. Fill missing points / odds with the column mean taken from ``X_train``.

    Parameters
    ----------
    X : pandas.DataFrame
        Features to clean. The caller's object is left untouched; a cleaned
        copy is returned.
    y : pandas.Series
        Targets. Rows are removed by the index labels of the rows dropped from
        ``X``, so those labels must exist in ``y``.
    X_train : pandas.DataFrame
        Frame whose column means are used for the mean imputation in step 4.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The cleaned features and the correspondingly filtered targets.
    """
    # Work on a copy so the caller's frame (possibly a slice of a bigger frame) is never written to
    X = X.copy()
    # These features are strings when they should be numeric
    wrong_fts = [feature for feature in ['P1 Rank', 'P2 Rank', 'P1 Set 2', 'P2 Set 2', 'P1 Set 3', 'P2 Set 3'] if feature in X]
    for feature in wrong_fts:
        # Plain column assignment replaces the column, so the numeric dtype is kept
        # (an in-place `.loc[:, feature] = ...` can leave the column as object dtype)
        X[feature] = pd.to_numeric(X[feature], errors = 'coerce')
    # These features are very rarely missing so we can just drop the rows
    drop_fts = [feature for feature in ['Best of', 'P1 Rank', 'P2 Rank'] if feature in X]
    drop_rows = X[drop_fts].isna().any(axis = 1)
    drop_rows_indices = X.index[drop_rows]
    X = X.drop(drop_rows_indices).reset_index(drop = True)
    y = y.drop(drop_rows_indices).reset_index(drop = True)
    # These features are usually missing due to a walkover so it doesn't make much sense to fill them with anything other than 0
    fill_fts = [feature for feature in ['P1 Set 1', 'P2 Set 1', 'P1 Set 2', 'P2 Set 2',
                                        'P1 Set 3', 'P2 Set 3', 'P1 Set 4', 'P2 Set 4'] if feature in X]
    if fill_fts:
        X[fill_fts] = X[fill_fts].fillna(0)
    # These features are usually missing due to time so we just give a mean value
    mean_fts = [feature for feature in ['P1 Pts', 'P2 Pts', 'B365 P1', 'B365 P2', 'PS P1', 'PS P2'] if feature in X]
    for feature in mean_fts:
        # The mean always comes from X_train, never from the frame being cleaned
        X[feature] = X[feature].fillna(X_train[feature].mean())
    return X, y

In [4]:
def ordinal_encode(X, X_train):
    """Ordinal-encode the categorical columns of ``X`` with an encoder fitted on ``X_train``.

    Categories that do not occur in ``X_train`` are encoded as -1.

    Parameters
    ----------
    X : pandas.DataFrame
        Categorical columns to encode.
    X_train : pandas.DataFrame
        Categorical columns the encoder is fitted on; its column names label the result.

    Returns
    -------
    pandas.DataFrame
        Encoded values, labelled with ``X_train``'s columns and ``X``'s row index.
    """
    encoder = OrdinalEncoder(handle_unknown = 'use_encoded_value', unknown_value = -1)
    encoder.fit(X_train)
    encoded = encoder.transform(X)
    # Keep X's row labels: callers assign the result back with `X[cols] = ...`, which
    # aligns on index, so a fresh RangeIndex would turn rows into NaN for any
    # frame that is not already indexed 0..n-1.
    return pd.DataFrame(encoded, columns = X_train.columns, index = getattr(X, 'index', None))

In [5]:
def scale(X, X_train):
    """Standardise ``X`` with a ``StandardScaler`` fitted on ``X_train``.

    Parameters
    ----------
    X : pandas.DataFrame
        Fully numeric (already encoded) features to scale; cast to float first.
    X_train : pandas.DataFrame
        Frame the scaler is fitted on; its column names label the result.

    Returns
    -------
    pandas.DataFrame
        Scaled values, labelled with ``X_train``'s columns and ``X``'s row index.
    """
    scaler = StandardScaler()
    scaler.fit(X_train)
    # Remember the row labels before the scaler hands back a bare array
    row_index = getattr(X, 'index', None)
    scaled = scaler.transform(X.astype(float))
    # Preserve the index so a later index-aligned join (e.g. pd.concat(axis=1) with
    # the targets) pairs each row with the right label
    return pd.DataFrame(scaled, columns = X_train.columns, index = row_index)

In [6]:
# Categorical (string-valued) columns, ordinal-encoded before scaling
cat_fts = [
    'Location', 'Tournament', 'Series', 'Court', 'Surface', 'Round',
    'Player 1', 'Player 2', 'Comment',
]
# Numeric columns known before the match starts
num_fts = [
    'ATP', 'Best of',
    'P1 Rank', 'P2 Rank',
    'P1 Pts', 'P2 Pts',
    'B365 P1', 'B365 P2',
    'PS P1', 'PS P2',
    'Day', 'Month', 'Year',
]
# Per-set scores, only available once a match is under way
mid_match_fts = [
    'P1 Set 1', 'P2 Set 1',
    'P1 Set 2', 'P2 Set 2',
    'P1 Set 3', 'P2 Set 3',
    'P1 Set 4', 'P2 Set 4',
]
# To include the mid-match features in the model (only when predicting a live match), uncomment the line below
# num_fts = num_fts + mid_match_fts

In [7]:
df = pd.read_csv('df.csv')
# Take explicit copies so the cleaning / encoding steps never write into a view of df
X = df[num_fts + cat_fts].copy()
y = df['Result']
# Training rows are all rows whose Year is earlier than the latest Year in the data
train_rows = df['Year'] < df['Year'].max()
X_train = X.loc[train_rows].copy()
# Slice the matching targets now, while y still shares df's index labels with X_train
y_train_rows = y.loc[train_rows]
# We must clean X, y, and X_train to ensure our data is consistent and to prepare for encoding
X, y = clean(X, y, X_train)
# clean() drops rows from its y argument by X_train's index labels, so it must receive targets
# that still carry those labels (the cleaned, re-indexed y above no longer does).
# We want to keep y at its current state so we store the y output of the function to the variable 'unused'
X_train, unused = clean(X_train, y_train_rows, X_train)
# We must encode X and X_train to prepare for scaling
X[cat_fts] = ordinal_encode(X[cat_fts], X_train[cat_fts])
X_train[cat_fts] = ordinal_encode(X_train[cat_fts], X_train[cat_fts])
# We only have to scale X as we are done with X_train
X = scale(X, X_train)
# X and y were both re-indexed from 0 by clean(), so the column-wise concat pairs rows correctly
df = pd.concat([X, y], axis = 1)
df.to_csv('preprocessed_df_logreg.csv', index = False)