In [119]:
import torch
import pandas as pd
from utils import df_filter_times, _ohlc

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV


from sklearn.metrics import precision_recall_curve
import plotly.graph_objects as go

# Prep data

In [3]:
df_all = pd.read_csv("./data/1m-10d.csv",parse_dates=["datetime"])
df_market = df_filter_times(df_all)

In [1]:
stable = df_all.groupby('name').first()['open'].sort_values(ascending=True)
stable = stable[stable.between(130,170)]

NameError: name 'df_all' is not defined

In [76]:
def scale_group(group):
    scaler = MinMaxScaler()
    group[_ohlc] = scaler.fit_transform(group[_ohlc])
    return group

In [109]:
df_train = df_market[df_market['name'].isin(('AAPL','NFLX','DLTR'))].copy()


df_test = df_market[df_market['name'].isin(('MSFT',))].copy()

print(len(df_train), len(df_test))

11681 3910


In [100]:
def get_pairs(df):
    df['green'] = df['close'] > df['open']+0.01
    df['label'] = df['green'].shift(-1)
    df = df.dropna()

    df = df.groupby('name').apply(scale_group,include_groups=False).reset_index(level=0)

    X = df[['open','high','low','close']]
    y = df['label'].astype(int)
    return X,y

In [101]:
x_train, y_train = get_pairs(df_train)
x_test, y_test = get_pairs(df_test)
y_train.mean(),1-y_train.mean()

(0.42962328767123287, 0.5703767123287671)

In [102]:
model = RandomForestClassifier(max_depth=10)
model.fit(x_train,y_train)


y_pred = model.predict(x_train)
# Evaluate the classifier
accuracy = accuracy_score(y_train, y_pred)
report = classification_report(y_train, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

Accuracy: 0.6490582191780822
Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.99      0.76      6662
           1       0.93      0.20      0.33      5018

    accuracy                           0.65     11680
   macro avg       0.78      0.59      0.54     11680
weighted avg       0.75      0.65      0.58     11680



In [103]:
y_pred = model.predict(x_test)
# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

Accuracy: 0.5326170376055257
Classification Report:
              precision    recall  f1-score   support

           0       0.54      0.94      0.69      2116
           1       0.42      0.05      0.09      1793

    accuracy                           0.53      3909
   macro avg       0.48      0.50      0.39      3909
weighted avg       0.48      0.53      0.41      3909



In [129]:
# Assuming you have a trained model and test data
y_true = y_train
y_scores = mh.predict_proba(x_train)[:, 1]  # Probability scores from the model

# Generate precision-recall curve
precision, recall, _ = precision_recall_curve(y_true, y_scores)

# Plot precision-recall curve using Plotly
fig = go.Figure()
fig.add_trace(go.Scatter(x=recall, y=precision, mode='lines', name='Precision-Recall Curve'))
fig.update_layout(
    title='Precision-Recall Curve',
    xaxis_title='Recall',
    yaxis_title='Precision',
    yaxis=dict(range=[0, 1.05]),
    xaxis=dict(range=[0, 1.05])
)
fig.show()

In [126]:
# Define the parameter grid
param_grid = {
    'learning_rate': [0.05, 0.2],  # Step size shrinkage used to prevent overfitting
    'n_estimators': [50, 100, 200],  # Number of boosting stages (trees) to be run
    'max_depth': [3, 7],  # Maximum depth of each tree
}

# Initialize the model (replace with your chosen model)
model = GradientBoostingClassifier()

# Initialize GridSearchCV with the model, parameter grid, and cross-validation
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='precision')

# Perform grid search
grid_search.fit(x_train, y_train)  # X: features, y: target variable

# Print best parameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

Best parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 50}
Best cross-validation score: 0.4179060695896396


In [127]:
mh = grid_search.best_estimator_