This notebook will execute the pre-processing steps by removing noises in text. The clean version will be saved for 1) lemmatization for TF-IDF, 2) tokenizing for FinBERT.

# Import packages

In [3]:
# General libraries
import os
import time
import multiprocessing
import warnings
import pickle
import re

# Data handling and visualization
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Natural Language Processing (NLP)
import spacy
from gensim.parsing.preprocessing import STOPWORDS, strip_punctuation, strip_multiple_whitespaces, remove_stopwords, strip_short
from gensim.corpora import Dictionary
from gensim.models import TfidfModel#, Word2Vec
#from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Scikit-learn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
#from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
#from sklearn.compose import ColumnTransformer
from sklearn.exceptions import FitFailedWarning
from sklearn.metrics import make_scorer
from tqdm import tqdm

# Load data

In [2]:
# Get all training data files
train_path = '../datasets/distilbert_data/train/'
train_files = [file for file in os.listdir(train_path) if file.endswith('train.csv')]
train_files

['RevenueFromContractWithCustomerExcludingAssessedTax_train.csv',
 'NetCashProvidedByUsedInInvestingActivities_train.csv',
 'NetCashProvidedByUsedInFinancingActivities_train.csv',
 'EBIT_train.csv',
 'NetIncomeLoss_train.csv',
 'EarningsPerShareDiluted_train.csv']

In [4]:
# Get all test data files
test_path = '../datasets/distilbert_data/test/'
test_files = [file for file in os.listdir(test_path) if file.endswith('test.csv')]
test_files

['EBIT_test.csv',
 'NetCashProvidedByUsedInFinancingActivities_test.csv',
 'RevenueFromContractWithCustomerExcludingAssessedTax_test.csv',
 'EarningsPerShareDiluted_test.csv',
 'NetIncomeLoss_test.csv',
 'NetCashProvidedByUsedInInvestingActivities_test.csv']

In [4]:
# Get all training data files
# fb = finbert
fb_train_path = '../datasets/finbert_data/train/'
fb_train_files = [file for file in os.listdir(fb_train_path) if file.endswith('train.csv')]
fb_train_files

['RevenueFromContractWithCustomerExcludingAssessedTax_train.csv',
 'NetCashProvidedByUsedInInvestingActivities_train.csv',
 'NetCashProvidedByUsedInFinancingActivities_train.csv',
 'EBIT_train.csv',
 'NetIncomeLoss_train.csv',
 'EarningsPerShareDiluted_train.csv']

In [5]:
# Get all test data files
fb_test_path = '../datasets/finbert_data/test/'
fb_test_files = [file for file in os.listdir(fb_test_path) if file.endswith('test.csv')]
fb_test_files

['EBIT_test.csv',
 'NetCashProvidedByUsedInFinancingActivities_test.csv',
 'RevenueFromContractWithCustomerExcludingAssessedTax_test.csv',
 'EarningsPerShareDiluted_test.csv',
 'NetIncomeLoss_test.csv',
 'NetCashProvidedByUsedInInvestingActivities_test.csv']

# Preprocess text

In [6]:
# Suppress warnings (i.e SettingWithCopyWarning)
warnings.filterwarnings('ignore')
    
# Function to preprocess and lemmatize text
def preprocess_text(df, target_column):
    
    # Remove punctuation
    df['text'] = df['text'].apply(lambda x: strip_punctuation(x))

    # Remove multiple whitespaces
    df['text'] = df['text'].apply(lambda x: strip_multiple_whitespaces(x))

    # Transform to lowercase
    df['text'] = df['text'].apply(lambda x: x.lower())

    # Remove stopwords
    df['text'] = df['text'].apply(lambda x: remove_stopwords(x))

    # Remove short words
    df['text'] = df['text'].apply(lambda x: strip_short(x))

    
    # Prepare data for training
    df = df[['text', target_column]]
    df.columns = ['text', 'target'] # Rename columns
    
    # Drop duplicates
    df.drop_duplicates(inplace=True)
    
    # Drop rows with missing values
    df.dropna(inplace=True)
    df.reset_index(drop=True, inplace=True)
    
    return df


In [6]:
# Preprocess all training and test data
for file in train_files:
    print(f'Preprocessing {file}...')
    df = pd.read_csv(os.path.join(train_path, file))
    df = preprocess_text(df, 'val')
    print(df.head())
    
    print("-----------------------------------------")

    # Save preprocessed data
    df.to_csv(train_path + file.split('.')[0] + "__text_clean.csv" , index=False)
    
print("-------------------------// DONE WITH TRAINING DATA //-------------------------")

for file in test_files:
    print(f'Preprocessing {file}...')
    df = pd.read_csv(os.path.join(test_path, file))
    df = preprocess_text(df, 'val')
    print(df.head())

    # Save preprocessed data
    df.to_csv(test_path + file.split('.')[0] + "__text_clean.csv" , index=False)

print("-------------------------// DONE WITH TEST DATA //-------------------------")

Preprocessing RevenueFromContractWithCustomerExcludingAssessedTax_train.csv...


                                                text       target
0  management considers factors assessing expecte...   5429000000
1  estimate fair values assets acquired acquisiti...   4147270000
2  actual future taxable income differs estimates...   7651319000
3  company expects subject incremental tax gilti ...   6820886000
4  accumulated unrepatriated indian earnings repa...  16783000000
-----------------------------------------
Preprocessing NetCashProvidedByUsedInInvestingActivities_train.csv...
                                                text        target
0  fourth quarter 2018 expanded restructuring pla...  3.100000e+07
1  given quality diversity underlying real estate... -7.340000e+08
2  cash flow amounts reduced realized gains inves... -6.240000e+08
3  plan expand network tesla stores galleries ser... -1.081085e+09
4  given quality diversity underlying real estate... -3.611000e+09
-----------------------------------------
Preprocessing NetCashProvidedByUsedInFinancingAc

In [8]:
# Preprocess all training and test data
for file in fb_train_files:
    print(f'Preprocessing {file}...')
    df = pd.read_csv(os.path.join(fb_train_path, file))
    df = preprocess_text(df, 'val')
    print(df.head())
    
    print("-----------------------------------------")

    # Save preprocessed data
    df.to_csv(fb_train_path + file.split('.')[0] + "__text_clean.csv" , index=False)
    
print("-------------------------// DONE WITH TRAINING DATA //-------------------------")

for file in fb_test_files:
    print(f'Preprocessing {file}...')
    df = pd.read_csv(os.path.join(fb_test_path, file))
    df = preprocess_text(df, 'val')
    print(df.head())

    # Save preprocessed data
    df.to_csv(fb_test_path + file.split('.')[0] + "__text_clean.csv" , index=False)

print("-------------------------// DONE WITH TEST DATA //-------------------------")

Preprocessing RevenueFromContractWithCustomerExcludingAssessedTax_train.csv...
                                                text       target
0  total net proceeds approximately 089 million i...   4147270000
1  regularly review recoverability deferred tax a...   2408200000
2  given period change deferred gain included net...   1360000000
3  income approach estimate projected future cash...  16783000000
4  income approach based projected future cash fl...    900465000
-----------------------------------------
Preprocessing NetCashProvidedByUsedInInvestingActivities_train.csv...
                                                text        target
0  account contracted condominium sales percentag... -8.379220e+08
1  cash flow amounts reduced realized gains inves... -6.240000e+08
2  cash flow amounts reduced realized gains inves... -5.580000e+08
3  company extensive non operations significant c... -1.612700e+09
4  cash flow amounts reduced realized gains inves... -4.560000e+08
-----------

# Train random forest regressors

['NetIncomeLoss_train_preprocessed.csv',
 'RevenueFromContractWithCustomerExcludingAssessedTax_train_preprocessed.csv',
 'NetCashProvidedByUsedInFinancingActivities_train_preprocessed.csv',
 'NetCashProvidedByUsedInInvestingActivities_train_preprocessed.csv',
 'EarningsPerShareDiluted_train_preprocessed.csv',
 'EBIT_train_preprocessed.csv']

--------------START TRAINING MODEL FOR NetIncomeLoss--------------


Grid Search Progress - TF-IDF:   0%|          | 0/243 [00:00<?, ?it/s]

Grid Search Progress - TF-IDF:   0%|          | 1/243 [12:36<50:51:46, 756.64s/it]


Best parameters for TF-IDF: {'rf__max_depth': None, 'rf__max_features': 'log2', 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 10, 'rf__n_estimators': 500}
Best negative RSE score for TF-IDF: -0.6334869727396837
Time taken for TF-IDF grid search: 756.6324696540833 seconds
--------------DONE TRAINING MODEL FOR NetIncomeLoss--------------


--------------START TRAINING MODEL FOR RevenueFromContractWithCustomerExcludingAssessedTax--------------


Grid Search Progress - TF-IDF:   0%|          | 1/243 [08:03<32:28:28, 483.09s/it]


Best parameters for TF-IDF: {'rf__max_depth': None, 'rf__max_features': 'log2', 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 2, 'rf__n_estimators': 500}
Best negative RSE score for TF-IDF: -0.4792336024566242
Time taken for TF-IDF grid search: 483.0873520374298 seconds
--------------DONE TRAINING MODEL FOR RevenueFromContractWithCustomerExcludingAssessedTax--------------


--------------START TRAINING MODEL FOR NetCashProvidedByUsedInFinancingActivities--------------


Grid Search Progress - TF-IDF:   0%|          | 1/243 [00:15<1:04:04, 15.89s/it]


Best parameters for TF-IDF: {'rf__max_depth': None, 'rf__max_features': 1.0, 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 2, 'rf__n_estimators': 100}
Best negative RSE score for TF-IDF: -inf
Time taken for TF-IDF grid search: 15.88652753829956 seconds
--------------DONE TRAINING MODEL FOR NetCashProvidedByUsedInFinancingActivities--------------


--------------START TRAINING MODEL FOR NetCashProvidedByUsedInInvestingActivities--------------


Grid Search Progress - TF-IDF:   0%|          | 1/243 [00:16<1:07:45, 16.80s/it]


Best parameters for TF-IDF: {'rf__max_depth': 8, 'rf__max_features': 'sqrt', 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 2, 'rf__n_estimators': 100}
Best negative RSE score for TF-IDF: -1.375560567298534
Time taken for TF-IDF grid search: 16.797523975372314 seconds
--------------DONE TRAINING MODEL FOR NetCashProvidedByUsedInInvestingActivities--------------


--------------START TRAINING MODEL FOR EarningsPerShareDiluted--------------


Grid Search Progress - TF-IDF:   0%|          | 1/243 [00:33<2:14:14, 33.28s/it]


Best parameters for TF-IDF: {'rf__max_depth': None, 'rf__max_features': 'log2', 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 10, 'rf__n_estimators': 200}
Best negative RSE score for TF-IDF: -0.8818554951887793
Time taken for TF-IDF grid search: 33.27862811088562 seconds
--------------DONE TRAINING MODEL FOR EarningsPerShareDiluted--------------


--------------START TRAINING MODEL FOR EBIT--------------


Grid Search Progress - TF-IDF:   0%|          | 1/243 [00:22<1:29:02, 22.08s/it]

Best parameters for TF-IDF: {'rf__max_depth': None, 'rf__max_features': 'sqrt', 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 2, 'rf__n_estimators': 200}
Best negative RSE score for TF-IDF: -0.7814153522870717
Time taken for TF-IDF grid search: 22.075989484786987 seconds
--------------DONE TRAINING MODEL FOR EBIT--------------







# Obmitted

In [None]:
 # Define the RSE scoring function
def rse_scorer(y_true, y_pred):
    true_mean = np.mean(y_true)
    squared_error_num = np.sum(np.square(y_true - y_pred))
    squared_error_den = np.sum(np.square(y_true - true_mean))
    rse_loss = squared_error_num / squared_error_den
    return -rse_loss  # Note the negative sign since GridSearchCV maximizes the score


# Custom transformer for Doc2Vec
class Doc2VecTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, vector_size=100, window=5, min_count=None, workers=None):
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.workers = workers if workers is not None else multiprocessing.cpu_count()

    def fit(self, X, y=None):
        self.documents = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(X)]
        
        # Set min_count to 5% of the document count if not explicitly provided
        if self.min_count is None:
            self.min_count = round(len(self.documents) * 0.05)
            
        self.model = Doc2Vec(vector_size=self.vector_size, window=self.window, min_count=self.min_count, workers=self.workers)
        self.model.build_vocab(self.documents)
        self.model.train(self.documents, total_examples=self.model.corpus_count, epochs=self.model.epochs)
        return self

    def transform(self, X):
        return np.array([self.model.infer_vector(doc.split()) for doc in X])



# Function to train a random forest regressor on the preprocessed data
def train_random_forest_regressor(df, target_column, random_state=42):
    # Split the data into training and testing sets
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=random_state)

    # Define the pipeline
    pipeline_tfidf = Pipeline([
        ('tfidf', TfidfVectorizer(max_df=0.95, min_df=round(len(train_df) * 0.05))),
        ('rf', RandomForestRegressor(random_state=random_state))
    ])

    pipeline_doc2vec = Pipeline([
        ('doc2vec', Doc2VecTransformer()),
        ('rf', RandomForestRegressor(random_state=random_state))
    ])

    # Define the parameter grid for hyperparameter tuning
    param_grid = {
        'rf__n_estimators': [100, 200, 500],
        'rf__max_features': [1.0, 'sqrt', 'log2'],
        'rf__max_depth': [None, 4, 8],
        'rf__min_samples_split': [2, 10, 20],
        'rf__min_samples_leaf': [1, 5, 10]
    }
    
    # Define the vector sizes to try for Doc2Vec
    doc2vec_vector_sizes = [50, 100, 200]
    
    # Add timer to record time
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=FitFailedWarning)
        warnings.filterwarnings("ignore", category=UserWarning, message="One or more of the test scores are non-finite")

        # Perform grid search for TF-IDF
        with tqdm(total=len(param_grid['rf__n_estimators']) * len(param_grid['rf__max_features']) *
                  len(param_grid['rf__max_depth']) * len(param_grid['rf__min_samples_split']) *
                  len(param_grid['rf__min_samples_leaf']), desc='Grid Search Progress - TF-IDF') as pbar:

            start_time_tfidf = time.time()  # Record the start time

            grid_search_tfidf = GridSearchCV(pipeline_tfidf, param_grid, cv=5, scoring=make_scorer(rse_scorer), n_jobs=-1)
            grid_search_tfidf.fit(train_df['text'], train_df[target_column])

            end_time_tfidf = time.time()  # Record the end time
            elapsed_time_tfidf = end_time_tfidf - start_time_tfidf  # Calculate the elapsed time

            pbar.update(1)  # Update the progress bar

        # Access the best parameters and performance metrics for TF-IDF
        print(f"Best parameters for TF-IDF: {grid_search_tfidf.best_params_}")
        print(f"Best negative RSE score for TF-IDF: {grid_search_tfidf.best_score_}")
        print(f"Time taken for TF-IDF grid search: {elapsed_time_tfidf} seconds")
        print("------------------------------------------------------------")

        # Perform grid search for Doc2Vec with different vector sizes
        best_doc2vec_vector_size = None
        best_doc2vec_score = float('-inf')  # Initialize with negative infinity

        start_time_doc2vec = time.time()  # Record the start time
        for vector_size in tqdm(doc2vec_vector_sizes, desc='Vector Size Progress - Doc2Vec'):
            pipeline_doc2vec.set_params(doc2vec__vector_size=vector_size)
            grid_search_doc2vec = GridSearchCV(pipeline_doc2vec, param_grid, cv=5, scoring=make_scorer(rse_scorer), n_jobs=-1)
            grid_search_doc2vec.fit(train_df['text'], train_df[target_column])

            # Check if the current vector size has a better score than the previous best
            if grid_search_doc2vec.best_score_ > best_doc2vec_score:
                best_doc2vec_score = grid_search_doc2vec.best_score_
                best_doc2vec_vector_size = vector_size
                best_doc2vec_params = grid_search_doc2vec.best_params_
                best_doc2vec_model = grid_search_doc2vec
                
        end_time_doc2vec = time.time()
        elapsed_time_doc2vec = end_time_doc2vec - start_time_doc2vec
        
        
        print(f"Best parameters for Doc2Vec (vector_size={best_doc2vec_vector_size}): {best_doc2vec_params}")
        print(f"Best negative RSE score for Doc2Vec: {best_doc2vec_score} (vector_size={best_doc2vec_vector_size})")
        print(f"Time taken for Doc2Vec grid search: {elapsed_time_doc2vec} seconds")
        print("------------------------------------------------------------")


    # Evaluate the models on the test set
    y_pred_tfidf = grid_search_tfidf.predict(test_df['text'])
    y_pred_d2v = grid_search_doc2vec.predict(test_df['text'])
    # Function to calculate the relative squared error
    # Pros of RSE:
    # - It is scale independent --> can be used to compare models with different scales
    # - It is symmetric
    # - It is easy to interpret --> below 1 means that the model is better than the baseline, above 1 means that the model is worse than the baseline
    def rse(y_true, y_pred):
        return -rse_scorer(y_true, y_pred)  # Reverse the sign of the RSE score to get the RSE
    
    # Train RSE
    train_tfidf_score = -1 * grid_search_tfidf.best_score_
    train_doc2vec_score = -1 * best_doc2vec_score
    # Test RSE
    test_tfidf_score = rse(test_df[target_column], y_pred_tfidf)
    test_doc2vec_score = rse(test_df[target_column], y_pred_d2v)
    
    # Create a dataframe with the results
    results = pd.DataFrame({'Model': ['TFIDF', 'Doc2Vec'], 
                            'Train RSE': [train_tfidf_score, train_doc2vec_score],
                            'Test RSE': [test_tfidf_score, test_doc2vec_score]
                            })
    print(results)
    return grid_search_tfidf, best_doc2vec_model
        

## Evaluate with test data

In [None]:
# Evaluate the models on the test set
    y_pred_tfidf = grid_search_tfidf.predict(test_df['text'])
    y_pred_d2v = grid_search_doc2vec.predict(test_df['text'])
    # Function to calculate the relative squared error
    # Pros of RSE:
    # - It is scale independent --> can be used to compare models with different scales
    # - It is symmetric
    # - It is easy to interpret --> below 1 means that the model is better than the baseline, above 1 means that the model is worse than the baseline
    def rse(y_true, y_pred):
        return -rse_scorer(y_true, y_pred)  # Reverse the sign of the RSE score to get the RSE
    
    # Train RSE
    train_tfidf_score = -1 * grid_search_tfidf.best_score_
    train_doc2vec_score = -1 * best_doc2vec_score
    # Test RSE
    test_tfidf_score = rse(test_df[target_column], y_pred_tfidf)
    test_doc2vec_score = rse(test_df[target_column], y_pred_d2v)
    
    # Create a dataframe with the results
    results = pd.DataFrame({'Model': ['TFIDF', 'Doc2Vec'], 
                            'Train RSE': [train_tfidf_score, train_doc2vec_score],
                            'Test RSE': [test_tfidf_score, test_doc2vec_score]
                            })
    print(results)