# Collecting the data
First we have to get all the needed data. We will use web3py with Pancake bet contract (0x0E3A8078EDD2021dadcdE733C6b4a86E51EE8f07) and requests with bscscan API.

Add needed modules folders:

In [None]:
import os
import sys

# Absolute path of the parent of the current working directory.
module_path = os.path.abspath(os.path.join('..'))

# Make the parent directory and its 'src' sub-folder importable, adding each
# entry only when it is not already present on sys.path.
for extra_path in (module_path, module_path + '/src'):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

# Show the current working directory.
print(os.getcwd())

## Getting active players
Use get_active_players.py main() with start and end time. It may be required to change the main function loop to iterate through more past transactions.
Also change CURRENT_BLOCK_NUMBER to the current one - to start checking transactions before this block.

In [None]:
import datetime

import get_active_players

# The same timeframe as for the training set.
# NOTE: these datetimes are naive, so .timestamp() interprets them in the local
# timezone of the machine running the notebook.
time_from_active_players = int(datetime.datetime(2022, 10, 28, 0, 0).timestamp())
time_to_active_players = int(datetime.datetime(2022, 12, 28, 0, 0).timestamp())
# CSV file with the active players (it is read back with pandas in the next cell).
active_players_file = f'../data/active_players_2months.csv'

# NOTE(review): the meaning of the third argument (70) is not visible from this
# notebook - confirm against get_active_players.make_active_players_file.
get_active_players.make_active_players_file(time_from_active_players, time_to_active_players, 70, active_players_file)

List of active players and the number of their occurrences within the given timeframe:

In [None]:
import pandas as pd
active_players_df = pd.read_csv(active_players_file, index_col=False)

active_players_df

Let's select only players that have made more than 100 bets in the selected timeframe.

In [None]:
# List of players to download data of (you can get them from the "Getting active players" section or from the Pancake Prediction leaderboard)
main_wallets_list = active_players_df.loc[active_players_df['count'] > 100, 'player'].tolist()
print(f"Selected {len(main_wallets_list)} wallets")


## Downloading and preparing the data

### All needed variables

In [None]:
# Timeframe to do all the computing (Training + Validation + Test sets).
# NOTE: naive datetimes, so the timestamps depend on the local timezone.
time_from_get_bets = int(datetime.datetime(2022, 10, 28, 0, 0).timestamp())
time_to_get_bets = int(datetime.datetime(2023, 1, 28, 0, 0).timestamp())

# Block range used to get all the transactions from the bscscan API
# (has to be chosen according to time_from_get_bets / time_to_get_bets).
block_from_get_bets = 22394098
block_to_get_bets = 25624791

# Downloading players bet history settings
players_data_folder = '../data/players_data/'

# Download rounds settings (check the rounds range according to the
# time_from_get_bets and time_to_get_bets values in the blockchain)
from_round = 36000
to_round = 65000
rounds_file = '../data/rounds_data/final_rounds_data.csv'

# Folder with the final merged data
final_data_folder = '../data/merged_data/'

### Downloading players bet history
We have to collect every needed player transaction data and save to separate JSON files.

In [None]:
import download_players_bets
print(len(main_wallets_list))
download_players_bets.main_concurrent(main_wallets_list, players_data_folder, block_from_get_bets, block_to_get_bets)

### Downloading rounds history
Use web3py to download all the rounds info and save it to the file.

In [None]:
import download_rounds
downloader = download_rounds.RoundsDownloader()

data = downloader.download_rounds(from_round, to_round)  # Download rounds FROM-TO, we only need the last few months
downloader.save_rounds(rounds_file, data)

## Preparing the data
Before using all the data we have to create dataframes in CSV files with all the needed stuff in one place.
The data will be stored in 2 separate CSV files - one containing the info about the bet type of each analyzed player, the other one about the bet size.

In [None]:
import analyze_players

rounds_df = analyze_players.load_rounds_data(rounds_file)
    
analyze_players.create_final_csv_files(players_data_folder, final_data_folder, rounds_df)

The final_bet_amount.csv file contains data about all the rounds and a separate column for each player's bet amount.

In [None]:
final_bet_amount_df = pd.read_csv(f'{final_data_folder}final_bet_amount.csv')
final_bet_amount_df

The final_player_bet.csv file contains data about all the rounds and a separate column for each player's bet type (Bull/Bear).

In [None]:
final_player_bet_df = pd.read_csv(f'{final_data_folder}final_player_bet.csv', low_memory=False)
final_player_bet_df

## Selecting the best players
We have collected the data about the active players in the given dataframe, but not all players are worth following. Let's check what players have high win ratio or made the most money

### Selecting the timeframe for training&validation&test sets
We have downloaded the data in timeframe:
2022.10.28 - 2023.01.28

Let's make the first 2 months to be the training set, 1/2 month validation set and 1/2 month test set

In [None]:
import datetime
import pandas as pd

time_from_training = int(datetime.datetime(2022, 10, 28, 0, 0).timestamp())
time_to_training = int(datetime.datetime(2022, 12, 28, 0, 0).timestamp())

time_from_validation = int(datetime.datetime(2022, 12, 28, 0, 0).timestamp())
time_to_validation = int(datetime.datetime(2023, 1, 13, 0, 0).timestamp())

time_from_test = int(datetime.datetime(2023, 1, 13, 0, 0).timestamp())
time_to_test = int(datetime.datetime(2023, 1, 28, 0, 0).timestamp())


### Players metrics
Let's check how many bets were actually good for each player (win ratio) and how many CAKE tokens each player earned in total and per bet. To not affect the test set, consider only data in the training set timeframe.

In [None]:
import check_players_results
import utils

player_bet_df, bet_amount_df = utils.load_players_data(time_from_training, time_to_training)

players_metrics_df = check_players_results.get_players_metrics(player_bet_df, bet_amount_df)
players_metrics_df.sort_values(by='total_profit', ascending=False)

In [None]:
players_metrics_df[['total_profit', 'profit_per_bet']].hist(bins=20)

# Training the models

Now we can use all the collected data for simulation purposes.


## Simulating the environment
Since placing a bet affects the game environment (when you add money to the pool, the payout multipliers change) we have to use a simulator

### Copytrading each player
Copytrading one player at a time, works similar to just calculating his overall profit - but if we place a bet we make changes to the environment and change the pool. So if we place the same bet as player X, both of us will make less money (assuming we win) than if he played alone.

In [None]:
import simulator
# Load the bet-type and bet-size frames for the training timeframe.
player_bet_df, bet_amount_df = utils.load_players_data(time_from_training, time_to_training)

# Get all wallets from player_bet_df: every column that is not one of the
# round-level columns listed below is treated as a player's wallet.
wallets = player_bet_df.columns.to_list()
not_players_list = ['epoch', 'start_timestamp', 'lock_timestamp', 'close_timestamp', 'lock_price', 'close_price',
                    'total_amount', 'bull_amount', 'bear_amount', 'position']
wallets = [wallet for wallet in wallets if wallet not in not_players_list]

# Copy-trade one wallet at a time and keep the sum of the 'profit' column
# returned by the simulator for that wallet.
data = []
for wallet in wallets:
    trades_data = simulator.copy_trade_player(player_bet_df, bet_amount_df, wallet)
    data.append({'wallet': wallet, 'profit': trades_data['profit'].sum()})

df = pd.DataFrame(data)

# Display the wallets ordered from the most to the least profitable to copy.
df.sort_values(by='profit', ascending=False)

In [None]:
# Histogram with the profit results, we can see the curve is Gaussian-like
df['profit'].hist(bins=50)

## Preparing the data to be used in the ml

### Exploring the data
For learning process we will use the data from the 2 previous CSV files. 

In [None]:
train_player_bet_df, train_bet_amount_df = utils.load_players_data(time_from_training, time_to_training)
train_player_bet_df = train_player_bet_df.drop(columns=['start_timestamp', 'lock_timestamp', 'close_timestamp', 'lock_price', 'close_price', 'total_amount', 'bull_amount', 'bear_amount', 'position'])
train_df = train_player_bet_df.merge(train_bet_amount_df, on = 'epoch', suffixes=['_bet','_amount'])
train_df

In [None]:
validate_player_bet_df, validate_bet_amount_df = utils.load_players_data(time_from_validation, time_to_validation)
validate_player_bet_df = validate_player_bet_df.drop(columns=['start_timestamp', 'lock_timestamp', 'close_timestamp', 'lock_price', 'close_price', 'total_amount', 'bull_amount', 'bear_amount', 'position'])
validate_df = validate_player_bet_df.merge(validate_bet_amount_df, on = 'epoch', suffixes=['_bet','_amount'])
validate_df

In [None]:
test_player_bet_df, test_bet_amount_df = utils.load_players_data(time_from_test, time_to_test)
test_player_bet_df = test_player_bet_df.drop(columns=['start_timestamp', 'lock_timestamp', 'close_timestamp', 'lock_price', 'close_price', 'total_amount', 'bull_amount', 'bear_amount', 'position'])
test_df = test_player_bet_df.merge(test_bet_amount_df, on = 'epoch', suffixes=['_bet','_amount'])
test_df

Let's check if the dataset is balanced

In [None]:
train_df['position'].value_counts()

In [None]:
validate_df['position'].value_counts()

In [None]:
test_df['position'].value_counts()

The dataset seems to be well balanced

### Cleaning the data before the training
1. Because NaN values provide no information we can delete all the rows that contain only NaNs. This would work in the production environment as if we get no information about the current round we can simply pass that round (so we don't place any bet).
2. For simplicity we will remove all the rows with 'position' == 'House' so we end up with a binary classification problem. Later, when finally testing the model, we will add this category once again (when it's House you lose no matter if you place a bet for Bull or Bear).

### Choosing the feature sets
We will check a few types of feature sets to see which one provides the most useful information about the label.
- total pool sizes for Bear/Bull bets (no additional info about the players): 2 columns in total (these features will be added to ALL feature sets below)
- all 247 players bet info (Bull/Bear): 247 columns in total
- all 247 players bet info (Bull/Bear) + their bet size info (in CAKE tokens): 247*2 = 494 columns in total
- the best 10% of players bet info (Bull/Bear) according to their overall profit
- the best 10% of players bet info (Bull/Bear) according to their overall profit + their bet size info (in CAKE tokens)
- the best 20% of players bet info (Bull/Bear) according to their overall profit
- the best 20% of players bet info (Bull/Bear) according to their overall profit + their bet size info (in CAKE tokens)
- total amount of CAKE tokens bets for Bull/Bear of the best 10% of players + total number of bets (Bull/Bear) of the best 10% of players: 4 columns in total
- total amount of CAKE tokens bets for Bull/Bear of the best 20% of players + total number of bets (Bull/Bear) of the best 20% of players: 4 columns in total

In [None]:
# Lists of the best players (by total profit)
# There are 247 wallets in total, lets check how many is 10% and 20%
p10 = int(len(players_metrics_df)*0.1)
p20 = int(len(players_metrics_df)*0.2)
print(p10, p20)

best_10p_players_list = players_metrics_df.sort_values(by='total_profit', ascending=False).reset_index(drop=True).loc[:p10-1, 'player'].values.tolist()
best_20p_players_list = players_metrics_df.sort_values(by='total_profit', ascending=False).reset_index(drop=True).loc[:p20-1, 'player'].values.tolist()

In [None]:
# Get the needed columns names in the lists
all_players_bet_cols = [col+'_bet' for col in main_wallets_list]
all_players_amount_cols = [col+'_amount' for col in main_wallets_list]

best_10p_players_bet_cols = [col+'_bet' for col in best_10p_players_list]
best_10p_players_amount_cols = [col+'_amount' for col in best_10p_players_list]

best_20p_players_bet_cols = [col+'_bet' for col in best_20p_players_list]
best_20p_players_amount_cols = [col+'_amount' for col in best_20p_players_list]

In [None]:
def prepare_data(df, cols_to_leave_arg):
    """Turn the merged rounds/players frame into a purely numeric frame.

    Steps, in order:
      1. drop rounds whose ``position`` is ``"House"`` (binary problem) and
         rounds whose ``total_amount`` is 0 (nobody bet);
      2. keep only the requested columns;
      3. drop rounds where every requested column other than ``position``,
         ``bull_amount`` and ``bear_amount`` is NaN;
      4. one-hot encode every categorical column except ``position`` and
         discard the resulting ``<col>_House`` column when there is one;
      5. map ``"Bull"`` -> 1, ``"Bear"`` -> 0, ``"House"`` -> NaN and cast the
         whole frame to float.

    Args:
        df: Frame containing ``position``, ``total_amount`` and every column
            named in ``cols_to_leave_arg``.
        cols_to_leave_arg: Names of the columns to keep. The list itself is
            never modified.

    Returns:
        A new float DataFrame with a fresh ``0..n-1`` index.
    """
    # Imported locally because this function is called before the notebook's
    # own ``import numpy as np`` cell is executed.
    import numpy as np

    # Columns that never take part in the "all NaN" row filter.
    not_nan_cols = ('position', 'bull_amount', 'bear_amount')

    df = df[df["position"] != "House"]  # To make the problem binary classification
    df = df[df["total_amount"] != 0]  # Do not consider the rounds where no one bet
    df = df[list(cols_to_leave_arg)]

    # Remove nan rows (where none of the selected players placed any bet)
    player_cols = [col for col in cols_to_leave_arg if col not in not_nan_cols]
    if player_cols:
        df = df.dropna(subset=player_cols, how='all')

    # One-hot encode the categorical player columns; 'position' is the label.
    categorical_cols = [col for col in df.select_dtypes(include=['category']).columns
                        if col != 'position']
    for col in categorical_cols:
        one_hot_encoded_cols = pd.get_dummies(df[col], prefix_sep='_', prefix=col)

        # A player can only bet on Bull or Bear, so the 'House' indicator
        # carries no information. It may be absent when the column has no
        # such category, hence errors='ignore'.
        one_hot_encoded_cols = one_hot_encoded_cols.drop(columns=[col + '_House'], errors='ignore')
        df = df.drop(columns=[col]).join(one_hot_encoded_cols)

    # Replacing values inside a categorical column depends on deprecated
    # pandas behaviour, so turn what is still categorical into plain objects.
    remaining_categorical = df.select_dtypes(include=['category']).columns.to_list()
    if remaining_categorical:
        df = df.astype({col: object for col in remaining_categorical})

    df = df.replace("Bull", 1)
    df = df.replace("Bear", 0)
    df = df.replace("House", np.nan)  # No 'House' rows are left, but the label must not survive the float cast
    df = df.astype(float)

    return df.reset_index(drop=True)

In [None]:
# Build the train / validation / test frames for feature sets 1-7.
# Each *_test_df is prepared from test_df so the test frames never contain
# training rounds.

# Feature set 1: total pool sizes only (plus the 'position' label).
feature_set1 = ['bull_amount', 'bear_amount', 'position']
feature_set1_train_df = prepare_data(train_df, feature_set1)
feature_set1_validate_df = prepare_data(validate_df, feature_set1)
feature_set1_test_df = prepare_data(test_df, feature_set1)

# Feature set 2: bet type of all players.
feature_set2 = all_players_bet_cols.copy()
feature_set2.extend(feature_set1)  # Keep the features from feature_set1 as they have predicting power
feature_set2_train_df = prepare_data(train_df, feature_set2)
feature_set2_validate_df = prepare_data(validate_df, feature_set2)
feature_set2_test_df = prepare_data(test_df, feature_set2)

# Feature set 3: bet type + bet size of all players.
feature_set3 = all_players_bet_cols.copy() + all_players_amount_cols.copy()
feature_set3.extend(feature_set1)
feature_set3_train_df = prepare_data(train_df, feature_set3)
feature_set3_validate_df = prepare_data(validate_df, feature_set3)
feature_set3_test_df = prepare_data(test_df, feature_set3)

# Feature set 4: bet type of the best 10% of players.
feature_set4 = best_10p_players_bet_cols.copy()
feature_set4.extend(feature_set1)
feature_set4_train_df = prepare_data(train_df, feature_set4)
feature_set4_validate_df = prepare_data(validate_df, feature_set4)
feature_set4_test_df = prepare_data(test_df, feature_set4)

# Feature set 5: bet type + bet size of the best 10% of players.
feature_set5 = best_10p_players_bet_cols.copy() + best_10p_players_amount_cols.copy()
feature_set5.extend(feature_set1)
feature_set5_train_df = prepare_data(train_df, feature_set5)
feature_set5_validate_df = prepare_data(validate_df, feature_set5)
feature_set5_test_df = prepare_data(test_df, feature_set5)

# Feature set 6: bet type of the best 20% of players.
feature_set6 = best_20p_players_bet_cols.copy()
feature_set6.extend(feature_set1)
feature_set6_train_df = prepare_data(train_df, feature_set6)
feature_set6_validate_df = prepare_data(validate_df, feature_set6)
feature_set6_test_df = prepare_data(test_df, feature_set6)

# Feature set 7: bet type + bet size of the best 20% of players.
feature_set7 = best_20p_players_bet_cols.copy() + best_20p_players_amount_cols.copy()
feature_set7.extend(feature_set1)
feature_set7_train_df = prepare_data(train_df, feature_set7)
feature_set7_validate_df = prepare_data(validate_df, feature_set7)
feature_set7_test_df = prepare_data(test_df, feature_set7)

In [None]:
def create_aggregated_df(df, bet_type_columns, bet_size_columns, additional_columns):
    """Aggregate the selected players' bets into four per-round columns.

    For every round the function adds:
      * ``best_bear_number`` / ``best_bull_number`` - how many of the selected
        players bet Bear / Bull;
      * ``best_bear_amount`` / ``best_bull_amount`` - the summed bet size of
        the selected players who bet Bear / Bull.

    Args:
        df: Frame holding the per-player columns together with ``epoch``,
            ``total_amount`` and ``additional_columns``.
        bet_type_columns: Per-player bet type columns, named ``<wallet>_bet``.
        bet_size_columns: Per-player bet size columns, named
            ``<wallet>_amount``.
        additional_columns: Extra columns to carry over unchanged.

    Returns:
        A copy of the selected columns plus the four aggregated columns.
        Missing bet sizes are replaced with 0, and rounds in which every
        selected bet size is 0 are removed. The input frame is not modified.
    """
    # Imported locally because this function is called before the notebook's
    # own ``import numpy as np`` cell is executed.
    import numpy as np

    filtered_df = df[bet_size_columns + bet_type_columns + ['epoch', 'total_amount'] + additional_columns].copy()

    # Number of selected players betting on each side in every round
    filtered_df['best_bear_number'] = filtered_df[bet_type_columns].isin(['Bear']).sum(axis=1)
    filtered_df['best_bull_number'] = filtered_df[bet_type_columns].isin(['Bull']).sum(axis=1)

    # Replace NaN with 0 in the bet size columns
    filtered_df[bet_size_columns] = filtered_df[bet_size_columns].fillna(0)

    # Float accumulators, so adding the bet sizes never changes the dtype
    bear_total = np.zeros(len(filtered_df))
    bull_total = np.zeros(len(filtered_df))

    # Add each wallet's bet size to the side that wallet bet on
    for bet_type_column in bet_type_columns:
        bet_size_column = bet_type_column.replace('_bet', '_amount')

        bet_types = filtered_df[bet_type_column]
        bet_sizes = filtered_df[bet_size_column]

        bear_total = bear_total + np.where(bet_types == 'Bear', bet_sizes, 0)
        bull_total = bull_total + np.where(bet_types == 'Bull', bet_sizes, 0)

    filtered_df['best_bear_amount'] = bear_total
    filtered_df['best_bull_amount'] = bull_total

    # Drop the rounds where every selected player's bet size is 0
    filtered_df = filtered_df[(filtered_df[bet_size_columns] != 0).any(axis=1)]

    return filtered_df

In [None]:
# We need separate function for these 2
feature_set8 = ['best_bear_number', 'best_bull_number', 'best_bear_amount', 'best_bull_amount']
feature_set8.extend(feature_set1)
feature_set8_train_df = create_aggregated_df(train_df, best_10p_players_bet_cols, best_10p_players_amount_cols, feature_set1.copy())
feature_set8_validate_df = create_aggregated_df(validate_df, best_10p_players_bet_cols, best_10p_players_amount_cols, feature_set1.copy())
feature_set8_test_df = create_aggregated_df(test_df, best_10p_players_bet_cols, best_10p_players_amount_cols, feature_set1.copy())

feature_set8_train_df = prepare_data(feature_set8_train_df, feature_set8)
feature_set8_validate_df = prepare_data(feature_set8_validate_df, feature_set8)
feature_set8_test_df = prepare_data(feature_set8_test_df, feature_set8)

feature_set9 = feature_set8
feature_set9_train_df = create_aggregated_df(train_df, best_20p_players_bet_cols, best_20p_players_amount_cols, feature_set1.copy())
feature_set9_validate_df = create_aggregated_df(validate_df, best_20p_players_bet_cols, best_20p_players_amount_cols, feature_set1.copy())
feature_set9_test_df = create_aggregated_df(test_df, best_20p_players_bet_cols, best_20p_players_amount_cols, feature_set1.copy())

feature_set9_train_df = prepare_data(feature_set9_train_df, feature_set9)
feature_set9_validate_df = prepare_data(feature_set9_validate_df, feature_set9)
feature_set9_test_df = prepare_data(feature_set9_test_df, feature_set9)

Print the feature sets info

In [None]:
feature_sets_dfs_list = [feature_set1_train_df, feature_set2_train_df, feature_set3_train_df, feature_set4_train_df, feature_set5_train_df, feature_set6_train_df, feature_set7_train_df, feature_set8_train_df, feature_set9_train_df]

for i, feature_set_df in enumerate(feature_sets_dfs_list):
    print(f"--- Feature set {i+1} Total: {feature_set_df.shape[1]} ---")
    print(feature_set_df.columns.to_list())

### Removing the outliers
All possible outliers represent natural variations and they will be left in the dataset

In [None]:
train_df.describe()

In [None]:
validate_df.describe()

In [None]:
test_df.describe()

### Feature scaling
The columns containing info about each players bet type (Bull/Bear or None, so categorical data) have been One-Hot encoded. The rest of the columns containing info about each players bet size (so numerical data) need to be scaled. Let's check the distribution of some random columns to choose the scaler.

In [None]:
print(feature_set1_train_df.columns)
feature_set1_train_df.iloc[:, 0].hist(range=(0,100), bins=20)

In [None]:
print(feature_set3_train_df.columns)
feature_set3_train_df.iloc[:, 15].hist(range=(0,5), bins=20)

In [None]:
print(feature_set5_train_df.columns)
feature_set5_train_df.iloc[:, 25].hist(range=(0,100), bins=20)

In [None]:
print(feature_set7_train_df.columns)
feature_set7_train_df.iloc[:, 32].hist(range=(0,15), bins=20)

In [None]:
print(feature_set9_train_df.columns)
feature_set9_train_df.iloc[:, 0].hist(range=(0,15), bins=20)

In [None]:
feature_set9_train_df.iloc[:, 2].hist(range=(0,15), bins=20)

The data distribution does not seem to be normal, so we will use the MinMaxScaler from the scikit-learn library. The scaler has to be fit on the training data only, and then used to scale the training, validation and test data.

In [None]:
from sklearn.preprocessing import MinMaxScaler

def scale_amount_columns(train_df, validate_df, test_df):
    """Min-max scale every column whose name contains 'amount'.

    The scaler is fitted on the training frame only and then applied to the
    validation and test frames. The three frames are modified in place and
    also returned.

    Args:
        train_df: Training frame; it decides which columns are scaled and is
            the only frame the scaler is fitted on.
        validate_df: Validation frame containing the same 'amount' columns.
        test_df: Test frame containing the same 'amount' columns.

    Returns:
        The tuple ``(train_df, validate_df, test_df)`` after scaling.
    """
    # Every column with the 'amount' substring in its name gets scaled
    amount_columns = [name for name in train_df.columns if 'amount' in name]

    # One scaler per call, configured to hand back DataFrames
    amount_scaler = MinMaxScaler().set_output(transform="pandas")

    # Fit on the training data, then reuse the fitted ranges for the other two
    train_df[amount_columns] = amount_scaler.fit_transform(train_df[amount_columns])
    for held_out_df in (validate_df, test_df):
        held_out_df[amount_columns] = amount_scaler.transform(held_out_df[amount_columns])

    return train_df, validate_df, test_df

feature_set1_train_df, feature_set1_validate_df, feature_set1_test_df = scale_amount_columns(feature_set1_train_df, feature_set1_validate_df, feature_set1_test_df)
feature_set2_train_df, feature_set2_validate_df, feature_set2_test_df = scale_amount_columns(feature_set2_train_df, feature_set2_validate_df, feature_set2_test_df)
feature_set3_train_df, feature_set3_validate_df, feature_set3_test_df = scale_amount_columns(feature_set3_train_df, feature_set3_validate_df, feature_set3_test_df)
feature_set4_train_df, feature_set4_validate_df, feature_set4_test_df = scale_amount_columns(feature_set4_train_df, feature_set4_validate_df, feature_set4_test_df)
feature_set5_train_df, feature_set5_validate_df, feature_set5_test_df = scale_amount_columns(feature_set5_train_df, feature_set5_validate_df, feature_set5_test_df)
feature_set6_train_df, feature_set6_validate_df, feature_set6_test_df = scale_amount_columns(feature_set6_train_df, feature_set6_validate_df, feature_set6_test_df)
feature_set7_train_df, feature_set7_validate_df, feature_set7_test_df = scale_amount_columns(feature_set7_train_df, feature_set7_validate_df, feature_set7_test_df)
feature_set8_train_df, feature_set8_validate_df, feature_set8_test_df = scale_amount_columns(feature_set8_train_df, feature_set8_validate_df, feature_set8_test_df)
feature_set9_train_df, feature_set9_validate_df, feature_set9_test_df = scale_amount_columns(feature_set9_train_df, feature_set9_validate_df, feature_set9_test_df)

In [None]:
feature_set2_train_df.describe()

### Threshold
After removing the rounds with the 'House' position we end up with a binary classification problem, and we want to leave it like that. But in a production environment we can bet Bull/Bear or pass the round without betting and wait for the model to give a confident prediction.
We can get the probability for each class with sklearn function predict_proba. 0.5-0.5 means the model is completely not sure which class to choose, while 0.9-0.1 means its high probability that the record will be classified as the first class.
Let's add another hyperparameter - threshold. The model will return a class label only when the probability will be higher than the threshold. Threshold can be in range of 0.5-1, however we have to determine it experimentally.

## Testing the scikit-learn models

In [None]:
import numpy as np
import pandas as pd

def run(x_train, y_train, x_test, y_test, clf, threshold=0.7):
    """Fit one classifier and measure its accuracy on confident predictions.

    A prediction counts as a placed bet only when the probability of one of
    the two classes is strictly greater than ``threshold``; all other rounds
    are skipped.

    Args:
        x_train: Training features, passed to ``clf.fit``.
        y_train: Training labels, passed to ``clf.fit``.
        x_test: Evaluation features.
        y_test: Evaluation labels; must expose ``.values`` and ``len()``.
        clf: Object providing ``fit``, ``score`` and ``predict_proba``.
            The two ``predict_proba`` columns are read as (Bear, Bull), i.e.
            class 0 first - assumes the labels are 0 = Bear and 1 = Bull.
        threshold: Minimal class probability needed to place a bet.

    Returns:
        dict with the keys ``classifier``, ``threshold``, ``total_accuracy``,
        ``total_number_of_bets``, ``accuracy_above_threshold`` and
        ``number_of_bets_above_threshold``. ``accuracy_above_threshold`` is
        0.0 when no prediction exceeds the threshold.
    """
    clf.fit(x_train, y_train)

    total_accuracy = clf.score(x_test, y_test)

    predicted_probability = clf.predict_proba(x_test)

    # Create a df to compare the predicted_probability with the real y_test
    df = pd.DataFrame(predicted_probability, columns=["Bear", "Bull"])
    df["position"] = y_test.values

    # 1 = confident Bull, 0 = confident Bear, NaN = no bet placed.
    # Bull is checked first, exactly as in the original two-step assignment.
    df["predicted_position"] = np.select(
        [df["Bull"] > threshold, df["Bear"] > threshold], [1, 0], default=np.nan)

    # NaN never equals a label, so skipped rounds are never counted as correct
    df["correct"] = np.where(df["position"] == df["predicted_position"], 1, 0)

    # Get accuracy when probability is above threshold (count() ignores NaN).
    # The division is exact; the zero-bets case is handled explicitly instead
    # of padding the denominator with a small epsilon.
    number_of_bets = df['predicted_position'].count()
    if number_of_bets:
        accuracy = df["correct"].sum() / number_of_bets
    else:
        accuracy = 0.0

    return {'classifier': str(clf), 'threshold': threshold, 'total_accuracy': total_accuracy,
            'total_number_of_bets': len(y_test), 'accuracy_above_threshold': accuracy,
            'number_of_bets_above_threshold': number_of_bets}


def main(train_df, validate_df, clfs, threshold_list, cols_to_leave):
    """Evaluate every classifier/threshold pair on one feature set.

    Both frames are passed through ``prepare_data`` with ``cols_to_leave``,
    split into features and the ``position`` label, and every classifier is
    then run once per threshold (trained on the training frame, evaluated on
    the validation frame).

    Args:
        train_df: Raw training frame.
        validate_df: Raw validation frame.
        clfs: Classifiers to evaluate.
        threshold_list: Probability thresholds to try for each classifier.
        cols_to_leave: Column names forming the feature set, label included.

    Returns:
        DataFrame with one row per (classifier, threshold) pair, built from
        the dictionaries returned by ``run``.
    """
    prepared_train = prepare_data(train_df, cols_to_leave)
    prepared_validate = prepare_data(validate_df, cols_to_leave)

    # Separate the label column from the features
    X_train = prepared_train.drop(["position"], axis=1)
    y_train = prepared_train["position"]
    X_validate = prepared_validate.drop(["position"], axis=1)
    y_validate = prepared_validate["position"]

    # Classifiers form the outer loop, thresholds the inner one
    results = [
        run(X_train, y_train, X_validate, y_validate, clf, threshold)
        for clf in clfs
        for threshold in threshold_list
    ]

    return pd.DataFrame(results)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

# Classifiers evaluated in the experiments below: random forests with
# 100/500/1000 trees, a decision tree, an MLP, Gaussian naive Bayes, k-NN with
# k = 3..11 and logistic regression.
# NOTE(review): no random_state is set, so the results of the randomised
# models may differ between runs.
clfs = [RandomForestClassifier(100), RandomForestClassifier(500), RandomForestClassifier(1000),
        DecisionTreeClassifier(criterion='entropy'), MLPClassifier(alpha=1, max_iter=1000), GaussianNB(),
        KNeighborsClassifier(3), KNeighborsClassifier(5), KNeighborsClassifier(7),
        KNeighborsClassifier(9), KNeighborsClassifier(11), LogisticRegression(max_iter=1000)]

# SVMs are kept in a separate list so they can be run as their own experiment.
# probability=True is required because run() calls predict_proba.
svc_clfs = [SVC(kernel='rbf', probability=True), SVC(kernel='linear', probability=True),
            SVC(kernel='rbf', probability=True, C=10), SVC(kernel='linear', probability=True, C=10),
            SVC(kernel='rbf', probability=True, C=50), SVC(kernel='linear', probability=True, C=50)]

# Probability thresholds above which a prediction is treated as a placed bet
threshold_list = [0.6, 0.7, 0.8, 0.9, 0.95]

### Feature set #1: total pool sizes for Bear/Bull bets (no additional info about the players)

In [None]:
experiment1_df = main(train_df=train_df, validate_df=validate_df, clfs=clfs, threshold_list=threshold_list, cols_to_leave=feature_set1)

In [None]:
experiment1_df.sort_values(by='accuracy_above_threshold', ascending=False).head(20)

In [None]:
experiment2_df = main(train_df=train_df, validate_df=validate_df, clfs=svc_clfs, threshold_list=threshold_list, cols_to_leave=feature_set1)

In [None]:
experiment2_df.sort_values(by='accuracy_above_threshold', ascending=False).head(20)

### Feature set #2: all 247 players bet info (Bull/Bear): 247 columns in total

In [None]:
experiment3_df = main(train_df=train_df, validate_df=validate_df, clfs=clfs, threshold_list=threshold_list, cols_to_leave=feature_set2)

In [None]:
# This features set model takes much longer to train
experiment3_df.sort_values(by='accuracy_above_threshold', ascending=False).head(20)