In [128]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
import joblib 

In [129]:
# ml imports
from sklearn.naive_bayes import MultinomialNB, ComplementNB
'''
from sklearn.tree import DecisionTreeClassifier, ExtraTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC  # for classification
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE, ADASYN
'''

'\nfrom sklearn.tree import DecisionTreeClassifier, ExtraTreeRegressor\nfrom sklearn.ensemble import RandomForestClassifier, RandomForestRegressor\nfrom sklearn.neural_network import MLPClassifier\nfrom sklearn.discriminant_analysis import LinearDiscriminantAnalysis\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.svm import SVC  # for classification\nfrom sklearn.linear_model import LogisticRegression\nfrom imblearn.over_sampling import SMOTE, ADASYN\n'

In [130]:
data = pd.read_csv('/home/defi/Desktop/portfolio/projects/python/pipeline_defi/clean_data/imitate_07.csv')
df = data

In [131]:
df.columns

Index(['Unnamed: 0', 'Unnamed:', 'open', 'high', 'ema-26', 'ema-12', 'low',
       'mean-grad-hist', 'close', 'volume', 'sma-25', 'long_jcrosk',
       'short_kdj', 'buy_imit_short', 'buy_imit_long', 'action'],
      dtype='object')

In [132]:
df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [133]:
df.dropna(inplace=True)

In [134]:
df.columns

Index(['Unnamed:', 'open', 'high', 'ema-26', 'ema-12', 'low', 'mean-grad-hist',
       'close', 'volume', 'sma-25', 'long_jcrosk', 'short_kdj',
       'buy_imit_short', 'buy_imit_long', 'action'],
      dtype='object')

In [135]:
np.unique(df['short_kdj'])

array([0])

In [136]:
# Function to create sequences
def create_sequences(series, seq_length):
    print(series)
    documents = []
    labels = []
    for i in range(len(series) - seq_length):
        doc = ' '.join(map(str, series.iloc[i:i+seq_length]))
        documents.append(doc)
        labels.append(series.iloc[i+seq_length])
    return documents, labels

# Create sequences
seq_length = 3  # Length of each sequence 5 was used originally
documents, labels = create_sequences(df['action'], seq_length)

# Convert to DataFrame
sequence_df = pd.DataFrame({'document': documents, 'next_action': labels})

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    sequence_df['document'], sequence_df['next_action'], 
    test_size=0.25, shuffle=True, random_state=42
)

# Create a bag of words representation
vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 2), min_df=3)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train a Naive Bayes classifier
clf = MultinomialNB(alpha=0.01, class_prior=[0.035, 0.035, 0.93]) # alpha=0.1, class_prior=[0.2, 0.3, 0.4], alpha=0.01, fit_prior=False
#clf = LogisticRegression(class_weight='balanced', max_iter=300)
#clf = RandomForestClassifier(class_weight='balanced', random_state=42)
#clf = DecisionTreeClassifier(class_weight='balanced', random_state=42)
clf.fit(X_train_vec, y_train)

print(f'classes: {clf.classes_}')

# Make predictions
#print(f'xtest: {X_test_vec}')
y_pred = clf.predict(X_test_vec)

# Evaluate the model
print(classification_report(y_test, y_pred, zero_division=True))

# Example of making a prediction
if len(df) >= seq_length:
    new_sequence = df['action'].iloc[-seq_length:].tolist()
    new_document = ' '.join(new_sequence)
    new_vector = vectorizer.transform([new_document])
    prediction = clf.predict(new_vector)

    print(f"\nGiven the sequence: {new_sequence}")
    print(f"The model predicts the next action will be: {prediction[0]}")
else:
    print("\nNot enough data to make a prediction.")


# Fixed Feature importance calculation
feature_names = vectorizer.get_feature_names_out()
feature_importance = np.exp(clf.feature_log_prob_)

for i, label in enumerate(clf.classes_):
    print(f"\nTop 5 features for predicting '{label}':")
    top_features = feature_importance[i].argsort()[::-1][:5]
    for idx in top_features:
        print(f"{feature_names[idx]}: {feature_importance[i][idx]:.3f}")

# Optional: Normalize feature importances
normalized_importance = feature_importance / feature_importance.sum(axis=1, keepdims=True)

print("\nNormalized Feature Importances:")
for i, label in enumerate(clf.classes_):
    print(f"\nTop 5 normalized features for predicting '{label}':")
    top_features = normalized_importance[i].argsort()[::-1][:5]
    for idx in top_features:
        print(f"{feature_names[idx]}: {normalized_importance[i][idx]:.3f}")


0       do_nothing
1       do_nothing
2       do_nothing
3         go_short
4       do_nothing
           ...    
2727    do_nothing
2728    do_nothing
2729    do_nothing
2730    do_nothing
2731    do_nothing
Name: action, Length: 2732, dtype: object
classes: ['do_nothing' 'go_long' 'go_short']
              precision    recall  f1-score   support

  do_nothing       0.97      0.97      0.97        33
     go_long       0.81      0.91      0.86        23
    go_short       1.00      0.99      0.99       627

    accuracy                           0.99       683
   macro avg       0.92      0.96      0.94       683
weighted avg       0.99      0.99      0.99       683


Given the sequence: ['do_nothing', 'do_nothing', 'do_nothing']
The model predicts the next action will be: do_nothing

Top 5 features for predicting 'do_nothing':
do_nothing: 0.570
do_nothing do_nothing: 0.366
go_short: 0.030
go_short do_nothing: 0.016
do_nothing go_short: 0.012

Top 5 features for predicting 'go_long':


In [137]:
try:
    joblib.dump(clf, '/home/defi/Desktop/portfolio/projects/python/pipeline_defi/models/nlpmodel.pkl')
    print('nlp model saved sucesfully!')
except Exception as e:
    print(e)

nlp model saved sucesfully!


In [138]:
try:
    joblib.dump(vectorizer, '/home/defi/Desktop/portfolio/projects/python/pipeline_defi/models/nlpvectorizer.pkl')
    print('nlp vectorizer save!')
except Exception as e:
    print(e)

nlp vectorizer save!


### Testing model explanability

In [139]:
#!pip show scikit-learn

##### Import libs

In [140]:
#from interpret.glassbox import ExplainableBoostingClassifier
#from interpret.blackbox import LimeTabular
#from interpret import show

In [141]:
#test_vec = list(X_test_vec)[1]

In [142]:
'''
for index, val in enumerate(list(X_test)):
    lst = list(val.split(' '))
    #if all(x == lst[0] for x in lst):
    if len(set(lst)) > 1:
        print(index, lst)
    #print()
    #    print(index, val)
'''

"\nfor index, val in enumerate(list(X_test)):\n    lst = list(val.split(' '))\n    #if all(x == lst[0] for x in lst):\n    if len(set(lst)) > 1:\n        print(index, lst)\n    #print()\n    #    print(index, val)\n"

In [143]:
#clf.predict(test_vec)[10]

##### Explain result

In [144]:
'''
# Step 1: Fit an Explainable Model (e.g., Explainable Boosting Machine)
def explain_pred(index: int):
    feature_names = list(X_test)[index]
    test_vec = list(X_test_vec)[index]
    ebm = ExplainableBoostingClassifier(
                                        interactions=0,
                                        smoothing_rounds=5000,
                                        cyclic_progress=0.0,
                                        reg_alpha=10.0
                                       )
    ebm.fit(X_train_vec, y_train)
    
    # Global explanation (feature importances)
    ebm_global = ebm.explain_global()
    
    show(ebm_global, 0)  # Shows which features are most important globally
    
    # Step 2: Local explanation for a specific prediction
    sample_sequence = test_vec  # Example sequence to explain
    ebm_local = ebm.explain_local([sample_sequence], [list(y_test)[index]])
    res = {
        'X': feature_names,
        'next_pred': clf.predict(test_vec)[0]
    }
    print(res)
    return show(ebm_local)  # Shows explanation for a specific prediction
'''

"\n# Step 1: Fit an Explainable Model (e.g., Explainable Boosting Machine)\ndef explain_pred(index: int):\n    feature_names = list(X_test)[index]\n    test_vec = list(X_test_vec)[index]\n    ebm = ExplainableBoostingClassifier(\n                                        interactions=0,\n                                        smoothing_rounds=5000,\n                                        cyclic_progress=0.0,\n                                        reg_alpha=10.0\n                                       )\n    ebm.fit(X_train_vec, y_train)\n    \n    # Global explanation (feature importances)\n    ebm_global = ebm.explain_global()\n    \n    show(ebm_global, 0)  # Shows which features are most important globally\n    \n    # Step 2: Local explanation for a specific prediction\n    sample_sequence = test_vec  # Example sequence to explain\n    ebm_local = ebm.explain_local([sample_sequence], [list(y_test)[index]])\n    res = {\n        'X': feature_names,\n        'next_pred': clf.predic

In [145]:
#explain_pred(41)