In [1]:
import itertools
import numpy as np
import pandas as pd
from time import time
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ParameterGrid
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score

In [2]:
train_data = pd.read_csv('../data/train.csv')
train_data.head()

Unnamed: 0,timestamp,open,high,low,close,volume,quote_asset_volume,number_of_trades,taker_buy_base_volume,taker_buy_quote_volume,target
0,1525471260,0.9012,0.9013,0.9012,0.9013,134.98,121.646459,4.0,125.08,112.723589,1.0
1,1525471320,0.90185,0.90195,0.90185,0.90195,1070.54,965.505313,12.0,879.94,793.612703,0.0
2,1525471380,0.9014,0.9014,0.90139,0.90139,2293.06,2066.963991,5.0,0.0,0.0,0.0
3,1525471440,0.90139,0.9014,0.90138,0.90139,6850.59,6175.000909,19.0,1786.3,1610.149485,0.0
4,1525471500,0.90139,0.90139,0.9013,0.9013,832.3,750.222624,3.0,784.82,707.4289,0.0


In [3]:
train_data['close_open_range'] = train_data['close'] - train_data['open']
train_data['high_low_range'] = train_data['high'] - train_data['low']

train_data['taker_buy_combined'] = train_data['taker_buy_base_volume'] + train_data['taker_buy_quote_volume']
train_data['close_open_trades'] = train_data['close_open_range'] * train_data['number_of_trades']
train_data['volume_price_interaction'] = train_data['quote_asset_volume'] * train_data['close_open_range']
train_data['lagged_close_open'] = train_data['close_open_range'].shift(1)
train_data['lagged_volume'] = train_data['quote_asset_volume'].shift(1)

train_data.dropna(inplace=True)

In [4]:
print(train_data.shape)
print(train_data.info())
print(train_data.isnull().sum())

(2122437, 18)
<class 'pandas.core.frame.DataFrame'>
Index: 2122437 entries, 1 to 2122437
Data columns (total 18 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   timestamp                 int64  
 1   open                      float64
 2   high                      float64
 3   low                       float64
 4   close                     float64
 5   volume                    float64
 6   quote_asset_volume        float64
 7   number_of_trades          float64
 8   taker_buy_base_volume     float64
 9   taker_buy_quote_volume    float64
 10  target                    float64
 11  close_open_range          float64
 12  high_low_range            float64
 13  taker_buy_combined        float64
 14  close_open_trades         float64
 15  volume_price_interaction  float64
 16  lagged_close_open         float64
 17  lagged_volume             float64
dtypes: float64(17), int64(1)
memory usage: 307.7 MB
None
timestamp                   0
open      

In [5]:
features = ['close', 'quote_asset_volume', 'taker_buy_combined', 'taker_buy_base_volume', 'number_of_trades', 'volume']
target = 'target'

scaler = StandardScaler()
train_data[features] = scaler.fit_transform(train_data[features])

In [6]:
X = train_data[features]
y = train_data[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=42)

In [7]:
len(X_train)

636731

In [15]:
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [5, 10],
    'min_samples_split': [2, 5],
}

In [16]:
best_score = 0
best_params = None
param_combinations = list(ParameterGrid(param_grid))
total_combinations = len(param_combinations)

In [17]:
for idx, params in enumerate(param_combinations, 1):
    start = time()
    model = RandomForestClassifier(**params, random_state=42, n_jobs=-1)
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    
    if score > best_score:
        best_score = score
        best_params = params
    
    # Log progress
    duration = time() - start
    print(f"Completed {idx}/{total_combinations}: {params} with score {score:.4f} (Time: {duration:.2f}s)")

print("Best Params:", best_params)
print("Best Score:", best_score)

Completed 1/8: {'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 50} with score 0.5260 (Time: 5.71s)
Completed 2/8: {'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 100} with score 0.5260 (Time: 13.11s)
Completed 3/8: {'max_depth': 5, 'min_samples_split': 5, 'n_estimators': 50} with score 0.5260 (Time: 6.00s)
Completed 4/8: {'max_depth': 5, 'min_samples_split': 5, 'n_estimators': 100} with score 0.5260 (Time: 11.00s)
Completed 5/8: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 50} with score 0.5261 (Time: 11.59s)
Completed 6/8: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100} with score 0.5263 (Time: 21.81s)
Completed 7/8: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 50} with score 0.5260 (Time: 10.10s)
Completed 8/8: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 100} with score 0.5260 (Time: 22.42s)
Best Params: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}
Best Score: 0.526323512188818


In [18]:
rf_predictions = model.predict(X_test)

In [19]:
print("Accuracy: ", accuracy_score(y_test, rf_predictions))
print(classification_report(y_test, rf_predictions))

Accuracy:  0.5260408183045636
              precision    recall  f1-score   support

         0.0       0.53      0.93      0.67    779662
         1.0       0.51      0.08      0.14    706044

    accuracy                           0.53   1485706
   macro avg       0.52      0.51      0.41   1485706
weighted avg       0.52      0.53      0.42   1485706



In [20]:
test_data = pd.read_csv('../data/test.csv')

In [21]:
test_data['close_open_range'] = test_data['close'] - test_data['open']
test_data['high_low_range'] = test_data['high'] - test_data['low']

test_data['taker_buy_combined'] = test_data['taker_buy_base_volume'] + test_data['taker_buy_quote_volume']
test_data['close_open_trades'] = test_data['close_open_range'] * test_data['number_of_trades']
test_data['volume_price_interaction'] = test_data['quote_asset_volume'] * test_data['close_open_range']
test_data['lagged_close_open'] = test_data['close_open_range'].shift(1)
test_data['lagged_volume'] = test_data['quote_asset_volume'].shift(1)

test_data.dropna(inplace=True)

In [22]:
test_data[features] = scaler.fit_transform(test_data[features])

In [23]:
pred = model.predict(test_data[features])

In [43]:
sample = pd.read_csv('../data/sample_submission.csv')
sample.head()

Unnamed: 0,row_id,target
0,0,0
1,1,0
2,2,1
3,3,1
4,4,1


In [49]:
pred = (pred > 0.5).astype(int)
# first row missing since we shifted
pred = list(pred)
pred.insert(0, pred[0])

   row_id  target
0       0       1
1       1       1
2       2       0
3       3       0
4       4       0


In [None]:
submission = pd.DataFrame({'row_id': range(len(pred)), 'target': pred})
print(submission.head())

In [46]:
submission.to_csv('../data/submission.csv', index = False)

In [47]:
print(len(submission))

909616
