## Headers

In [1]:
# Data preprocessing
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

 # Models
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier


# Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import time

In [2]:
from utils import  train_val_split
from utils import  train_datapath, test_datapath

In [3]:
targets_for_test_df = pd.read_csv('data/targets_for_test.csv', index_col=0)['close']


def evaluate_model_performance(model, X_val_seq, y_val_seq):
    # Predict probabilities
    y_pred_prob = model.predict(X_val_seq)

    # Convert probabilities to binary predictions
    y_pred = (y_pred_prob > 0.5).astype(int)
    
    y_val = y_val_seq.copy()
    filled_y_pred = y_pred.copy()
    if len(y_val_seq) == len(y_pred):       # If the lengths are the same, do nothing
        print('Lengths are the same')
        pass
    elif len(y_val_seq) > len(y_pred):      # If the target is longer than the prediction
        print('Target is longer than prediction')
        difference = len(y_val_seq) - len(y_pred)
        filled_y_pred = np.concatenate([np.zeros(difference), filled_y_pred])
    else:                                   # If the prediction is longer than the target 
        print('Prediction is longer than target')
        y_val = np.concatenate([np.zeros(1), y_val])
        
        
    
    # Calculate accuracy
    accuracy = accuracy_score(y_val, filled_y_pred)k
    print(f'Validation Accuracy: {accuracy:.5f}')

    # Calculate F1 macro score
    f1_macro = f1_score(y_val, filled_y_pred, average='macro')
    print(f'Validation F1 Macro Score: {f1_macro:.5f}')

    return y_pred, y_pred_prob


def save_submission(test_df, filled_test_predictions, filename='submission.csv'):
    filename = 'submissions/' + filename
    # Create a new DataFrame for the submission
    submission_df = pd.DataFrame({
        'row_id': test_df['row_id'],
        'target': [0] + filled_test_predictions
    })

    # Save the submission file
    submission_df.to_csv(filename, index=False)
    print(f"Predictions saved to {filename}")





SyntaxError: invalid syntax (3204058389.py, line 27)

# Decision Tree

## Crude

In [None]:
train_df = pd.read_csv(train_datapath)
test_df = pd.read_csv(test_datapath)

In [None]:

start_time = time.time()


# Assuming 'target' is the column to predict and the rest are features
X = train_df.drop(columns=['target'])
y = train_df['target']

# Split the data into training and validation sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Create and train the decision tree model
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# # Predict on the validation set
# y_pred = model.predict(X_val)

end_time = time.time()
minutes = (end_time - start_time) // 60
seconds = (end_time - start_time) % 60

y_pred, ypred = evaluate_model_performance(model, X_val, y_val)

# print(f'Time elapsed: {minutes:.0f}m {seconds:.1f}s')
# print('--------------------------------------')
# # Calculate accuracy
# accuracy = accuracy_score(y_val, y_pred)
# print(f'Validation Accuracy: {accuracy:.5f}')

# # Calculate F1 macro score
# f1_macro = f1_score(y_val, y_pred, average='macro')
# print(f'Validation F1 Macro Score: {f1_macro:.5f}')

Lengths are the same
Validation Accuracy: 0.50210
Validation F1 Macro Score: 0.50204


In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data

X_test = test_df.drop(columns=['row_id'])

# Make predictions on the test data
y_pred = model.predict(X_test)


y_pred, ypred = evaluate_model_performance(model, X_test, targets_for_test_df)

# Save the submission
save_submission(test_df, y_pred, 'crude_dt_submission.csv')



Prediction is longer than target
Validation Accuracy: 0.49465
Validation F1 Macro Score: 0.49460
Predictions saved to crude_dt_submission.csv


In [None]:
del train_df
del test_df

## SVDd data

In [None]:
svd_train_df = pd.read_csv('data/svd_train.csv')
svd_test_df = pd.read_csv('data/svd_test.csv')

In [None]:

start_time = time.time()


# Assuming 'target' is the column to predict and the rest are features
X = svd_train_df.drop(columns=['target'])
y = svd_train_df['target']

# Split the data into training and validation sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Create and train the decision tree model
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

end_time = time.time()
minutes = (end_time - start_time) // 60
seconds = (end_time - start_time) % 60
print(f'Time elapsed: {minutes:.0f}m {seconds:.1f}s')
print('--------------------------------------')

# # Predict on the validation set
# y_pred = model.predict(X_val)

evaluate_model_performance(model, X_val, y_val)

# # Calculate accuracy
# accuracy = accuracy_score(y_val, y_pred)
# print(f'Validation Accuracy: {accuracy:.5f}')

# # Calculate F1 macro score
# f1_macro = f1_score(y_val, y_pred, average='macro')
# print(f'Validation F1 Macro Score: {f1_macro:.5f}')

Time elapsed: 1m 56.0s
--------------------------------------
Lengths are the same
Validation Accuracy: 0.49714
Validation F1 Macro Score: 0.49672


(array([0, 0, 1, ..., 0, 1, 1]), array([0., 0., 1., ..., 0., 1., 1.]))

In [None]:
svd_test_df.shape

(909529, 12)

In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data

X_test = svd_test_df.drop(columns=['row_id'])

# Make predictions on the test data
# y_pred = model.predict(X_test)

# Save the submission
# save_submission(test_df, y_pred, 'crude_rnn_submission.csv')


y_pred, ypred = evaluate_model_performance(model, X_test, targets_for_test_df)
save_submission(svd_test_df, y_pred, 'svd_dt_submission.csv')

# # Calculate accuracy
# accuracy = accuracy_score(targets_for_test_df[:len(test_predictions)], test_predictions[:len(targets_for_test_df)])	
# print(f'Test Accuracy: {accuracy:.5f}')

# # Calculate F1 macro score
# f1_macro = f1_score(targets_for_test_df[:len(test_predictions)], test_predictions[:len(targets_for_test_df)], average='macro')
# print(f'Test F1 Macro Score: {f1_macro:.5f}')

Target is longer than prediction
Validation Accuracy: 0.48812
Validation F1 Macro Score: 0.48473
Predictions saved to svd_dt_submission.csv


## Treated Dataframe

In [None]:
treated_train_df = pd.read_csv(train_datapath)
treated_test_df = pd.read_csv(test_datapath)

In [None]:


start_time = time.time()


# Assuming 'target' is the column to predict and the rest are features
X = treated_train_df.drop(columns=['target'])
y = treated_train_df['target']

# Split the data into training and validation sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Create and train the decision tree model
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict on the validation set
# y_pred = model.predict(X_val)

end_time = time.time()
minutes = (end_time - start_time) // 60
seconds = (end_time - start_time) % 60
print(f'Time elapsed: {minutes:.0f}m {seconds:.1f}s')
print('--------------------------------------')


y_pred, ypred = evaluate_model_performance(model, X_val, y_val)
# # Calculate accuracy
# accuracy = accuracy_score(y_val, y_pred)
# print(f'Validation Accuracy: {accuracy:.5f}')

# # Calculate F1 macro score
# f1_macro = f1_score(y_val, y_pred, average='macro')
# print(f'Validation F1 Macro Score: {f1_macro:.5f}')



Time elapsed: 2m 0.1s
--------------------------------------
Lengths are the same
Validation Accuracy: 0.50210
Validation F1 Macro Score: 0.50204


In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data

X_test = treated_test_df.drop(columns=['row_id'])

# Make predictions on the test data
# y_pred = model.predict(X_test)


y_pred, ypred = evaluate_model_performance(model, X_test, targets_for_test_df)
save_submission(treated_test_df, y_pred, 'treated_dt_submission.csv')



# # Calculate accuracy
# accuracy = accuracy_score(targets_for_test_df[:len(test_predictions)], test_predictions[:len(targets_for_test_df)])	
# print(f'Test Accuracy: {accuracy:.5f}')

# # Calculate F1 macro score
# f1_macro = f1_score(targets_for_test_df[:len(test_predictions)], test_predictions[:len(targets_for_test_df)], average='macro')
# print(f'Test F1 Macro Score: {f1_macro:.5f}')

Prediction is longer than target
Validation Accuracy: 0.49465
Validation F1 Macro Score: 0.49460
Predictions saved to treated_dt_submission.csv


In [None]:
del treated_train_df
del treated_test_df

## Only New Features

In [74]:
new_features_train_df = pd.read_csv('data/new_features_train.csv')
new_features_test_df = pd.read_csv('data/new_features_test.csv')

In [75]:


start_time = time.time()


# Assuming 'target' is the column to predict and the rest are features
X = new_features_train_df.drop(columns=['target'])
y = new_features_train_df['target']

# Split the data into training and validation sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Create and train the decision tree model
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# # Predict on the validation set
# y_pred = model.predict(X_val)

end_time = time.time()
minutes = (end_time - start_time) // 60
seconds = (end_time - start_time) % 60
print(f'Time elapsed: {minutes:.0f}m {seconds:.1f}s')
print('--------------------------------------')

evaluate_model_performance(model, X_val, y_val)

# # Calculate accuracy
# accuracy = accuracy_score(y_val, y_pred)
# print(f'Validation Accuracy: {accuracy:.5f}')

# # Calculate F1 macro score
# f1_macro = f1_score(y_val, y_pred, average='macro')
# print(f'Validation F1 Macro Score: {f1_macro:.5f}')



Time elapsed: 2m 33.6s
--------------------------------------
Lengths are the same
Validation Accuracy: 0.50330
Validation F1 Macro Score: 0.50142


(array([1, 0, 0, ..., 1, 0, 1]), array([1., 0., 0., ..., 1., 0., 1.]))

In [77]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data

X_test = new_features_test_df.drop(columns=['row_id'])

# # Make predictions on the test data
# y_pred = model.predict(X_test)

# Save the submission
y_prob, y_pred = evaluate_model_performance(model, X_test, targets_for_test_df)
save_submission(new_features_test_df, y_pred, 'only_nf_dt_submission.csv')


# # Calculate accuracy
# accuracy = accuracy_score(targets_for_test_df[:len(test_predictions)], test_predictions[:len(targets_for_test_df)])	
# print(f'Test Accuracy: {accuracy:.5f}')

# # Calculate F1 macro score
# f1_macro = f1_score(targets_for_test_df[:len(test_predictions)], test_predictions[:len(targets_for_test_df)], average='macro')
# print(f'Test F1 Macro Score: {f1_macro:.5f}')

Target is longer than prediction
Validation Accuracy: 0.49609
Validation F1 Macro Score: 0.49069
Predictions saved to only_nf_dt_submission.csv


In [78]:
del new_features_train_df
del new_features_test_df

# Random Forest Classifier

## Crude

In [79]:
train_df = pd.read_csv(train_datapath)
test_df = pd.read_csv(test_datapath)

In [80]:
from sklearn.ensemble import RandomForestClassifier

start_time = time.time()

# Assuming 'target' is the column to predict and the rest are features
X = train_df.drop(columns=['target'])
y = train_df['target']

# Split the data into training and validation sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Create and train the Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the validation set
y_pred = rf_model.predict(X_val)

end_time = time.time()
minutes = (end_time - start_time) // 60
seconds = (end_time - start_time) % 60
print(f'Time elapsed: {minutes:.0f}m {seconds:.1f}s')
print('--------------------------------------')
# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro:.5f}')

KeyboardInterrupt: 

In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data

X_test = test_df.drop(columns=['row_id'])

# Make predictions on the test data
test_predictions = rf_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.45986
Test F1 Macro Score: 0.44456


In [None]:
del train_df
del test_df

## Treated Dataframe

In [None]:
treated_train_df = pd.read_csv('data/train.csv')
treated_test_df = pd.read_csv('data/test.csv')

In [None]:
from sklearn.ensemble import RandomForestClassifier

start_time = time.time()

# Assuming 'target' is the column to predict and the rest are features
X = treated_train_df.drop(columns=['target'])
y = treated_train_df['target']

# Split the data into training and validation sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Create and train the Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the validation set
y_pred = rf_model.predict(X_val)

end_time = time.time()
minutes = (end_time - start_time) // 60
seconds = (end_time - start_time) % 60
print(f'Time elapsed: {minutes:.0f}m {seconds:.1f}s')
print('--------------------------------------')
# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro:.5f}')

Time elapsed: 47m 59.0s
--------------------------------------
Validation Accuracy: 0.51910
Validation F1 Macro Score: 0.46335


In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data

X_test = treated_test_df.drop(columns=['row_id'])

# Make predictions on the test data
test_predictions = rf_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

ValueError: Found input variables with inconsistent numbers of samples: [909616, 909529]

In [None]:
del treated_train_df
del treated_test_df

## Only New Features

In [None]:
new_features_train_df = pd.read_csv('data/train.csv')
new_features_test_df = pd.read_csv('data/test.csv')

In [None]:


start_time = time.time()


# Assuming 'target' is the column to predict and the rest are features
X = new_features_train_df.drop(columns=['target'])
y = new_features_train_df['target']

# Split the data into training and validation sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Create and train the Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the validation set
y_pred = rf_model.predict(X_val)

end_time = time.time()
minutes = (end_time - start_time) // 60
seconds = (end_time - start_time) % 60
print(f'Time elapsed: {minutes:.0f}m {seconds:.1f}s')
print('--------------------------------------')
# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro:.5f}')



NameError: name 'new_features_train_df' is not defined

In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data

X_test = new_features_test_df.drop(columns=['row_id'])

# Make predictions on the test data
test_predictions = rf_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df[:len(test_predictions)], test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df[:len(test_predictions)], test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

In [None]:
del new_features_train_df   
del new_features_test_df

# xGBoost

## Crude

In [None]:
train_df = pd.read_csv(train_datapath)
test_df = pd.read_csv(test_datapath)

In [None]:

start_time = time.time()

# Assuming 'target' is the column to predict and the rest are features
X = train_df.drop(columns=['target'])
y = train_df['target']

# Split the data into training and validation sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Create and train the XGBoost model
xgb_model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

# Predict on the validation set
y_pred = xgb_model.predict(X_val)

end_time = time.time()
minutes = (end_time - start_time) // 60
seconds = (end_time - start_time) % 60
print(f'Time elapsed: {minutes:.0f}m {seconds:.1f}s')
print('--------------------------------------')
# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro:.5f}')

Parameters: { "use_label_encoder" } are not used.



Time elapsed: 0m 12.9s
--------------------------------------
Validation Accuracy: 0.52729
Validation F1 Macro Score: 0.44318


In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data

X_test = test_df.drop(columns=['row_id'])

# Make predictions on the test data
test_predictions = xgb_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.55726
Test F1 Macro Score: 0.44548


In [None]:
del train_df
del test_df

## New Features

In [None]:
treated_train_df = pd.read_csv('data/train.csv')
treated_test_df = pd.read_csv('data/test.csv')

In [None]:

start_time = time.time()

# Assuming 'target' is the column to predict and the rest are features
X = treated_train_df.drop(columns=['target'])
y = treated_train_df['target']

# Split the data into training and validation sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Create and train the XGBoost model
xgb_model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

# Predict on the validation set
y_pred = xgb_model.predict(X_val)

end_time = time.time()
minutes = (end_time - start_time) // 60
seconds = (end_time - start_time) % 60
print(f'Time elapsed: {minutes:.0f}m {seconds:.1f}s')
print('--------------------------------------')
# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro:.5f}')

Parameters: { "use_label_encoder" } are not used.



Time elapsed: 0m 19.5s
--------------------------------------
Validation Accuracy: 0.53091
Validation F1 Macro Score: 0.48096


In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data

X_test = treated_test_df.drop(columns=['row_id'])

# Make predictions on the test data
test_predictions = xgb_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.56668
Test F1 Macro Score: 0.44502


In [None]:
del treated_train_df
del treated_test_df

## Only New Features

In [None]:
new_features_train_df = pd.read_csv(train_datapath)
new_features_test_df = pd.read_csv(test_datapath)

In [None]:


start_time = time.time()


# Assuming 'target' is the column to predict and the rest are features
X = new_features_train_df.drop(columns=['target'])
y = new_features_train_df['target']

# Split the data into training and validation sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Create and train the XGBoost model
xgb_model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

# Predict on the validation set
y_pred = xgb_model.predict(X_val)

end_time = time.time()
minutes = (end_time - start_time) // 60
seconds = (end_time - start_time) % 60
print(f'Time elapsed: {minutes:.0f}m {seconds:.1f}s')
print('--------------------------------------')
# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro:.5f}')



Parameters: { "use_label_encoder" } are not used.



Time elapsed: 0m 9.6s
--------------------------------------
Validation Accuracy: 0.51326
Validation F1 Macro Score: 0.48311


In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data

X_test = new_features_test_df.drop(columns=['row_id'])

# Make predictions on the test data
test_predictions = xgb_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.46491
Test F1 Macro Score: 0.46179


In [None]:
del new_features_train_df
del new_features_test_df

# Explainble Boost Machine

## Crude

In [None]:
train_df = pd.read_csv(train_datapath)
test_df = pd.read_csv(test_datapath)

In [None]:
from interpret.glassbox import ExplainableBoostingClassifier

start_time = time.time()

# Assuming 'target' is the column to predict and the rest are features
X = train_df.drop(columns=['target'])
y = train_df['target']

# Split the data into training and validation sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Create and train the Explainable Boosting Classifier model
ebm_model = ExplainableBoostingClassifier(random_state=42)
ebm_model.fit(X_train, y_train)

# Predict on the validation set
y_pred = ebm_model.predict(X_val)
float_y_pred = y_pred.astype(float)

end_time = time.time()
minutes = (end_time - start_time) // 60
seconds = (end_time - start_time) % 60
print(f'Time elapsed: {minutes:.0f}m {seconds:.1f}s')
print('--------------------------------------')
# Calculate accuracy
accuracy = accuracy_score(y_val, float_y_pred)
print(f'Validation Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, float_y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro:.5f}')

Time elapsed: 17m 40.5s
--------------------------------------
Validation Accuracy: 0.52687
Validation F1 Macro Score: 0.42337


In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data

X_test = test_df.drop(columns=['row_id'])

# Make predictions on the test data
test_predictions = ebm_model.predict(X_test)
float_test_prediction = test_predictions.astype(float)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, float_test_prediction[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, float_test_prediction[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.55684
Test F1 Macro Score: 0.46666


In [None]:
from interpret import show


# Show the EBM explanation
ebm_global = ebm_model.explain_global()
show(ebm_global)
# Show local explanations
ebm_local = ebm_model.explain_local(X_val, y_val)
show(ebm_local)

NameError: name 'ebm_model' is not defined

In [None]:
del train_df
del test_df

## New Features

In [None]:
treated_train_df = pd.read_csv('data/train.csv')
treated_test_df = pd.read_csv('data/test.csv')

In [None]:

start_time = time.time()

# Assuming 'target' is the column to predict and the rest are features
X = treated_train_df.drop(columns=['target'])
y = treated_train_df['target']

# Split the data into training and validation sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Create and train the XGBoost model
ebm_model = ExplainableBoostingClassifier(random_state=42)
ebm_model.fit(X_train, y_train)

# Predict on the validation set
y_pred = ebm_model.predict(X_val)
float_test_prediction = test_predictions.astype(float)

end_time = time.time()
minutes = (end_time - start_time) // 60
seconds = (end_time - start_time) % 60
print(f'Time elapsed: {minutes:.0f}m {seconds:.1f}s')
print('--------------------------------------')
# Calculate accuracy
accuracy = accuracy_score(y_val, float_y_pred)
print(f'Validation Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, float_y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro:.5f}')

  warn(


Time elapsed: 15m 6.6s
--------------------------------------
Validation Accuracy: 0.52670
Validation F1 Macro Score: 0.42182


In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data

X_test = treated_test_df.drop(columns=['row_id'])

# Make predictions on the test data
test_predictions = ebm_model.predict(X_test)
float_test_prediction = test_predictions.astype(float)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, float_test_prediction[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, float_test_prediction[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.58042
Test F1 Macro Score: 0.39537


In [None]:
# Show the EBM explanation
ebm_global = ebm_model.explain_global()
show(ebm_global)
# Show local explanations
ebm_local = ebm_model.explain_local(X_val, y_val)
show(ebm_local)

In [None]:
del treated_train_df
del treated_test_df

## Only New Features

In [None]:
new_features_train_df = pd.read_csv('data/new_features_train.csv')
new_features_test_df = pd.read_csv('data/new_features_test.csv')

In [None]:
from interpret.glassbox import ExplainableBoostingClassifier

start_time = time.time()


# Assuming 'target' is the column to predict and the rest are features
X = new_features_train_df.drop(columns=['target'])
y = new_features_train_df['target']

# Split the data into training and validation sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Create and train the XGBoost model
ebm_model = ExplainableBoostingClassifier(random_state=42)
ebm_model.fit(X_train, y_train)

# Predict on the validation set
y_pred = ebm_model.predict(X_val)
float_test_prediction = test_predictions.astype(float)

end_time = time.time()
minutes = (end_time - start_time) // 60
seconds = (end_time - start_time) % 60
print(f'Time elapsed: {minutes:.0f}m {seconds:.1f}s')
print('--------------------------------------')
# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro:.5f}')



NameError: name 'test_predictions' is not defined

In [None]:
test_predictions = ebm_model.predict(X_val)
float_test_prediction = test_predictions.astype(float)

end_time = time.time()
minutes = (end_time - start_time) // 60
seconds = (end_time - start_time) % 60
print(f'Time elapsed: {minutes:.0f}m {seconds:.1f}s')
print('--------------------------------------')
# Calculate accuracy
accuracy = accuracy_score(y_val, float_test_prediction)
print(f'Validation Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, float_test_prediction, average='macro')
print(f'Validation F1 Macro Score: {f1_macro:.5f}')



Time elapsed: 13m 33.5s
--------------------------------------
Validation Accuracy: 0.52798
Validation F1 Macro Score: 0.47661


In [None]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data

X_test = new_features_test_df.drop(columns=['row_id'])

# Make predictions on the test data
test_predictions = ebm_model.predict(X_test)
float_test_prediction = test_predictions.astype(float)

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df[:len(float_test_prediction)], float_test_prediction[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df[:len(float_test_prediction)], float_test_prediction[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.53023
Test F1 Macro Score: 0.48283


In [None]:
# Show the EBM explanation
ebm_global = ebm_model.explain_global()
show(ebm_global)
# Show local explanations
ebm_local = ebm_model.explain_local(X_val, y_val)
show(ebm_local)

In [None]:
del new_features_train_df
del new_features_test_df