In [0]:
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

import keras
from keras.models import Sequential
from keras.layers import Conv2D, Dense, Flatten, MaxPooling2D, Dropout,Activation,LSTM
from keras.callbacks import EarlyStopping
from keras.losses import sparse_categorical_crossentropy
from keras.optimizers import SGD
from sklearn.svm import *
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, learning_curve, validation_curve, cross_val_score

from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import  roc_auc_score
from sklearn.ensemble import VotingClassifier
import xgboost as xgb

import pandas as pd

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
def add_fields(df, with_y):
  df = df.fillna(0)
  df2 = pd.DataFrame()

  df2['diff_bid1'] = df['last_price'] - df['bid1']
  df2['diff_bid2'] = df['last_price'] - df['bid2']
  df2['diff_bid3'] = df['last_price'] - df['bid3']
  df2['diff_bid4'] = df['last_price'] - df['bid4']
  df2['diff_bid5'] = df['last_price'] - df['bid5']

  df2['diff_ask1'] = df['last_price'] - df['ask1']
  df2['diff_ask2'] = df['last_price'] - df['ask2']
  df2['diff_ask3'] = df['last_price'] - df['ask3']
  df2['diff_ask4'] = df['last_price'] - df['ask4']
  df2['diff_ask5'] = df['last_price'] - df['ask5']

  df2['diff_mid_bid1'] = df['mid'] - df['bid1']
  df2['diff_mid_bid2'] = df['mid'] - df['bid2']
  df2['diff_mid_bid3'] = df['mid'] - df['bid3']
  df2['diff_mid_bid4'] = df['mid'] - df['bid4']
  df2['diff_mid_bid5'] = df['mid'] - df['bid5']

  df2['diff_mid_ask1'] = df['mid'] - df['ask1']
  df2['diff_mid_ask2'] = df['mid'] - df['ask2']
  df2['diff_mid_ask3'] = df['mid'] - df['ask3']
  df2['diff_mid_ask4'] = df['mid'] - df['ask4']
  df2['diff_mid_ask5'] = df['mid'] - df['ask5']

  df2['mid_diff'] = df['last_price'] - df['mid']
  df2['best_offer_last_price_diff'] = df2['diff_bid1'] + df2['diff_ask1']

  df['avg_bid'] = (df['bid1'] * df['bid1vol'] + df['bid2'] * df['bid2vol'] + df['bid3'] * df['bid3vol'] + df['bid4'] * df['bid4vol'] + df['bid5'] * df['bid5vol']) / (df['bid1vol'] + df['bid2vol'] + df['bid3vol'] + df['bid4vol'] + df['bid5vol'])
  df['avg_ask'] = (df['ask1'] * df['ask1vol'] + df['ask2'] * df['ask2vol'] + df['ask3'] * df['ask3vol'] + df['ask4'] * df['ask4vol'] + df['ask5'] * df['ask5vol']) / (df['ask1vol'] + df['ask2vol'] + df['ask3vol'] + df['ask4vol'] + df['ask5vol'])

  df2['avg_bid_last_price_diff'] = df['avg_bid'] - df['last_price']
  df2['avg_bid_mid_diff'] = df['avg_bid'] - df['mid']
  df2['avg_ask_last_price_diff'] = df['avg_ask'] - df['last_price']
  df2['avg_ask_mid_diff'] = df['avg_ask'] - df['mid']

  df2['avg_bid_best_bid_diff'] = df['avg_bid'] - df['bid1']
  df2['avg_ask_best_ask_diff'] = df['avg_ask'] - df['ask1']
  df2['avg_bid_best_ask_diff'] = df['avg_bid'] - df['ask1']
  df2['avg_ask_best_bid_diff'] = df['avg_ask'] - df['bid1']

  df2['best_offer_diff'] = df['bid1'] - df['ask1']
  df2['best_offer_vol_diff'] = df['bid1vol'] - df['ask1vol']
  df2['best_offer_tot_diff'] = df['bid1'] * df['bid1vol'] - df['ask1'] * df['ask1vol']

  df['avg_best_offer'] = (df['bid1'] * df['bid1vol'] + df['ask1'] * df['ask1vol']) / (df['bid1vol'] + df['ask1vol'])
  df2['mid_avg_best_offer_diff'] = df['mid'] - df['avg_best_offer']
  df2['last_price_avg_best_offer_diff'] = df['last_price'] - df['avg_best_offer']

  df2['tot_bid_vol'] = df['bid1vol'] + df['bid2vol'] + df['bid3vol'] + df['bid4vol'] + df['bid5vol'] 
  df2['tot_ask_vol'] = df['ask1vol'] + df['ask2vol'] + df['ask3vol'] + df['ask4vol'] + df['ask5vol'] 
  df2['tot_offer_vol_diff'] = df2['tot_bid_vol'] - df2['tot_ask_vol']

  df2['closed_tot_bid_diff'] = df2['tot_bid_vol'] - df['closed_position_qty']
  df2['closed_tot_ask_diff'] = df2['tot_ask_vol'] - df['closed_position_qty']
  df2['opened_tot_bid_diff'] = df2['tot_bid_vol'] - df['opened_position_qty ']
  df2['opened_tot_ask_diff'] = df2['tot_ask_vol'] - df['opened_position_qty ']

  df2['tot_opened_bids'] = df['opened_position_qty '] + df2['tot_bid_vol']
  df2['tot_closed_asks'] = df['closed_position_qty'] + df2['tot_ask_vol']
  df2['tot_orders_diff'] = df2['tot_offer_vol_diff'] + df['d_open_interest']

  df2['d_open_interest'] = df['d_open_interest'] 
  df2['transacted_qty'] = df['transacted_qty'] 

  if with_y == True:
    y = df['y']
  else:
    y = []

  X = df2.loc[:, df2.columns != 'y']  
  return X, y

# Create Training Set


In [0]:
train_df = pd.read_csv('/content/drive/My Drive/cs155/train.csv', index_col=0)
train_df, y = add_fields(train_df, with_y=True)

X_train, X_test, y_train, y_test = train_test_split(train_df, y, test_size=0.2, random_state=0)


# Train Ensemble Model


In [0]:
import itertools

test_df = pd.read_csv('/content/drive/My Drive/cs155/test.csv', index_col=0)
test_df, __ = add_fields(test_df, with_y=False)

clf1 = GradientBoostingClassifier()
clf2 = RandomForestClassifier()
clf3 = LGBMClassifier()
clf4 = XGBClassifier()
clf5 = AdaBoostClassifier()
clf6 = LogisticRegression()
clf7 = LinearDiscriminantAnalysis()

voting_classifier_list = []
ensemble_id = []
train_roc = []
train_accuracy_score = []

min_num_models = 3
df_record = pd.DataFrame()

model_list = [('gb', clf1), ('rf', clf2), ('lgbm', clf3),('xgb', clf4), ('ada', clf5), ('logreg', clf6), ('lda', clf7)]

for x in range(min_num_models, len(model_list) + 1):
  for model_group in itertools.combinations(model_list, x):



    # model_group is a n-tuple; reformatting to a list before sending to classifier
    estimators = []
    ensemble = ""
    for model in model_group:
      ensemble += (str(model[0]) + "_") # construct ensemble id
      estimators.append(model)
    
    ensemble = ensemble[:-1] # take out last character "_"
    ensemble_id.append(ensemble)

    print(ensemble_id)



    # eclf1 = VotingClassifier(estimators=estimators, voting='soft')
    # eclf1 = eclf1.fit(X_train, y_train)

    # y_prob = eclf1.predict_proba(X_test)
    # roc_train = roc_auc_score(y_test, y_prob[:, 1])
    # train_accuracy_score.append(eclf1.score(X_test, y_test))
    # train_roc.append(roc_train)

    # # fit on entire training set
    # eclf1 = eclf1.fit(train_df, y)

    # # get predicted probabilities on test set
    # probbs = eclf1.predict_proba(test_df)

    # df_results = pd.DataFrame()
    # df_results['id'] = test_df.index
    # df_results['Predicted'] = probbs[:, 1]

    # header = ["id", "Predicted"]
    
    # df_results.to_csv(ensemble + '.csv', columns = header, index=False)

#     voting_classifier_list.append(eclf1)

  

# df_record['id'] = ensemble_id
# df_record['train_roc'] = train_roc
# df_record['train_accuracy_score'] = train_accuracy_score 
# header = ["id", "train_roc", "train_accuracy_score"]
# df_record.to_csv('ensemble_record.csv', columns = header, index = False)

    


['gb_rf_lgbm']
['gb_rf_lgbm', 'gb_rf_xgb']
['gb_rf_lgbm', 'gb_rf_xgb', 'gb_rf_ada']
['gb_rf_lgbm', 'gb_rf_xgb', 'gb_rf_ada', 'gb_rf_logreg']
['gb_rf_lgbm', 'gb_rf_xgb', 'gb_rf_ada', 'gb_rf_logreg', 'gb_rf_lda']
['gb_rf_lgbm', 'gb_rf_xgb', 'gb_rf_ada', 'gb_rf_logreg', 'gb_rf_lda', 'gb_lgbm_xgb']
['gb_rf_lgbm', 'gb_rf_xgb', 'gb_rf_ada', 'gb_rf_logreg', 'gb_rf_lda', 'gb_lgbm_xgb', 'gb_lgbm_ada']
['gb_rf_lgbm', 'gb_rf_xgb', 'gb_rf_ada', 'gb_rf_logreg', 'gb_rf_lda', 'gb_lgbm_xgb', 'gb_lgbm_ada', 'gb_lgbm_logreg']
['gb_rf_lgbm', 'gb_rf_xgb', 'gb_rf_ada', 'gb_rf_logreg', 'gb_rf_lda', 'gb_lgbm_xgb', 'gb_lgbm_ada', 'gb_lgbm_logreg', 'gb_lgbm_lda']
['gb_rf_lgbm', 'gb_rf_xgb', 'gb_rf_ada', 'gb_rf_logreg', 'gb_rf_lda', 'gb_lgbm_xgb', 'gb_lgbm_ada', 'gb_lgbm_logreg', 'gb_lgbm_lda', 'gb_xgb_ada']
['gb_rf_lgbm', 'gb_rf_xgb', 'gb_rf_ada', 'gb_rf_logreg', 'gb_rf_lda', 'gb_lgbm_xgb', 'gb_lgbm_ada', 'gb_lgbm_logreg', 'gb_lgbm_lda', 'gb_xgb_ada', 'gb_xgb_logreg']
['gb_rf_lgbm', 'gb_rf_xgb', 'gb_rf_ada', '

In [0]:
import itertools

test_df = pd.read_csv('/content/drive/My Drive/cs155/test.csv', index_col=0)
test_df, __ = add_fields(test_df, with_y=False)

clf1 = GradientBoostingClassifier()
clf2 = RandomForestClassifier()
clf3 = LGBMClassifier()
clf4 = XGBClassifier()
clf5 = AdaBoostClassifier()
clf6 = LogisticRegression()
clf7 = LinearDiscriminantAnalysis()

voting_classifier_list = []
ensemble_id = []
train_roc = []
train_accuracy_score = []

min_num_models = 3
df_record = pd.DataFrame()

model_list = [('gb', clf1), ('rf', clf2), ('lgbm', clf3),('xgb', clf4), ('ada', clf5), ('logreg', clf6), ('lda', clf7)]

for x in range(min_num_models, len(model_list) + 1):
  for model_group in itertools.combinations(model_list, x):

    # model_group is a n-tuple; reformatting to a list before sending to classifier
    estimators = []
    ensemble = ""
    for model in model_group:
      ensemble += (str(model[0]) + "_") # construct ensemble id
      estimators.append(model)
    
    ensemble = ensemble[:-1] # take out last character "_"
    ensemble_id.append(ensemble)

    eclf1 = VotingClassifier(estimators=estimators, voting='soft')
    eclf1 = eclf1.fit(X_train, y_train)

    y_prob = eclf1.predict_proba(X_test)
    roc_train = roc_auc_score(y_test, y_prob[:, 1])
    train_accuracy_score.append(eclf1.score(X_test, y_test))
    train_roc.append(roc_train)

    # # fit on entire training set
    # eclf1 = eclf1.fit(train_df, y)

    # # get predicted probabilities on test set
    # probbs = eclf1.predict_proba(test_df)

    # df_results = pd.DataFrame()
    # df_results['id'] = test_df.index
    # df_results['Predicted'] = probbs[:, 1]

    # header = ["id", "Predicted"]
    
    # df_results.to_csv(ensemble + '.csv', columns = header, index=False)

    voting_classifier_list.append(eclf1)

  

df_record['id'] = ensemble_id
df_record['train_roc'] = train_roc
df_record['train_accuracy_score'] = train_accuracy_score 
header = ["id", "train_roc", "train_accuracy_score"]
df_record.to_csv('ensemble_record.csv', columns = header, index = False)

    


In [0]:
from google.colab import drive
drive.mount('/content/drive')

# Create .csv of Test Set



In [0]:
# test_df = pd.read_csv('/content/drive/My Drive/CS 155/Kaggle 1/test.csv', index_col=0)
# test_df, __ = add_fields(test_df, with_y=False)

# eclf1 = eclf1.fit(train_df, y)
# probbs = eclf1.predict_proba(test_df)
# test_df['Predicted'] = probbs[:, 1]

# header = ["id", "Predicted"]
# test_df['id']=test_df.index
# test_df.to_csv('output.csv', columns = header,index=False)

In [0]:
# df_t = pd.DataFrame()
# df_t['test'] = [1,2,3]

# header = ['test']
# df_t.to_csv('out.csv', columns = header)
