This notebook executes the pre-processing steps that remove noise from the text. The cleaned version is saved for 1) lemmatization for TF-IDF and 2) tokenization for FinBERT.

# Import packages

In [3]:
import os
import warnings
import pandas as pd
import spacy
from gensim.parsing.preprocessing import STOPWORDS, strip_punctuation, strip_multiple_whitespaces, remove_stopwords, strip_short

# Load data

In [2]:
# Directory that holds the DistilBERT training CSVs
train_path = '../../data/02_processed/distilbert_data/train/'

# Keep only the directory entries whose name ends in 'train.csv'
train_files = [name for name in os.listdir(train_path) if name.endswith('train.csv')]
train_files

['RevenueFromContractWithCustomerExcludingAssessedTax_train.csv',
 'NetCashProvidedByUsedInInvestingActivities_train.csv',
 'NetCashProvidedByUsedInFinancingActivities_train.csv',
 'EBIT_train.csv',
 'NetIncomeLoss_train.csv',
 'EarningsPerShareDiluted_train.csv']

In [4]:
# Get all test data files
# The prefix is '../../' (two directory levels up), the same as train_path.
# A leading '.../' would name a directory literally called '...', not a parent.
test_path = '../../data/02_processed/distilbert_data/test/'
test_files = [file for file in os.listdir(test_path) if file.endswith('test.csv')]
test_files

['EBIT_test.csv',
 'NetCashProvidedByUsedInFinancingActivities_test.csv',
 'RevenueFromContractWithCustomerExcludingAssessedTax_test.csv',
 'EarningsPerShareDiluted_test.csv',
 'NetIncomeLoss_test.csv',
 'NetCashProvidedByUsedInInvestingActivities_test.csv']

In [4]:
# Directory that holds the FinBERT training CSVs (the "fb" prefix stands for finbert)
fb_train_path = '../../data/02_processed/finbert_data/train/'

# Keep only the directory entries whose name ends in 'train.csv'
fb_train_files = [name for name in os.listdir(fb_train_path) if name.endswith('train.csv')]
fb_train_files

['RevenueFromContractWithCustomerExcludingAssessedTax_train.csv',
 'NetCashProvidedByUsedInInvestingActivities_train.csv',
 'NetCashProvidedByUsedInFinancingActivities_train.csv',
 'EBIT_train.csv',
 'NetIncomeLoss_train.csv',
 'EarningsPerShareDiluted_train.csv']

In [5]:
# Directory that holds the FinBERT test CSVs
fb_test_path = '../../data/02_processed/finbert_data/test/'

# Keep only the directory entries whose name ends in 'test.csv'
fb_test_files = [name for name in os.listdir(fb_test_path) if name.endswith('test.csv')]
fb_test_files

['EBIT_test.csv',
 'NetCashProvidedByUsedInFinancingActivities_test.csv',
 'RevenueFromContractWithCustomerExcludingAssessedTax_test.csv',
 'EarningsPerShareDiluted_test.csv',
 'NetIncomeLoss_test.csv',
 'NetCashProvidedByUsedInInvestingActivities_test.csv']

# Preprocess text

In [6]:
# Suppress warnings (i.e SettingWithCopyWarning).
# The filter is left in place so notebook-wide warning behaviour is unchanged;
# preprocess_text below works on an explicit copy and no longer needs it.
warnings.filterwarnings('ignore')


def preprocess_text(df: pd.DataFrame, target_column: str) -> pd.DataFrame:
    """Clean the ``text`` column and return a two-column frame ready for training.

    Each text goes through these steps, in order: strip punctuation, collapse
    repeated whitespace, lowercase, remove stopwords, remove short words
    (gensim's ``strip_short`` with its default minimum size). No lemmatization
    is performed here.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame. Must contain a ``text`` column and ``target_column``.
        The frame is not modified.
    target_column : str
        Name of the column holding the target value.

    Returns
    -------
    pd.DataFrame
        A new frame with exactly two columns, ``text`` and ``target``, without
        rows that had a missing text or target, without duplicate rows, and
        with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If ``text`` or ``target_column`` is not a column of ``df``.
    """
    # Work on an independent copy of the two relevant columns so the caller's
    # frame is never mutated and no SettingWithCopyWarning can be triggered.
    cleaned = df[['text', target_column]].copy()
    cleaned.columns = ['text', 'target']  # Rename columns

    # Drop rows with missing values *before* cleaning: the gensim helpers expect
    # strings, so a NaN text would otherwise raise instead of being dropped.
    cleaned = cleaned.dropna()

    def _clean(text):
        """Apply the cleaning steps to a single string, in a fixed order."""
        text = strip_punctuation(text)
        text = strip_multiple_whitespaces(text)
        text = text.lower()  # lowercase before stopword removal
        text = remove_stopwords(text)
        return strip_short(text)

    cleaned['text'] = cleaned['text'].apply(_clean)

    # Texts that only differed in punctuation/case/stopwords may now be equal,
    # so duplicates are removed after cleaning.
    cleaned = cleaned.drop_duplicates().reset_index(drop=True)

    return cleaned


In [6]:
# Preprocess all training and test data.
# Each entry: (file names, directory, label used in the closing banner,
#              whether a separator line is printed after each file).
batches = [
    (train_files, train_path, "TRAINING", True),
    (test_files, test_path, "TEST", False),
]

for files, data_dir, label, print_separator in batches:
    for file in files:
        print(f'Preprocessing {file}...')
        df = pd.read_csv(os.path.join(data_dir, file))
        df = preprocess_text(df, 'val')
        print(df.head())

        if print_separator:
            print("-----------------------------------------")

        # Save preprocessed data next to its input file.
        # os.path.splitext strips only the final extension, so a file name that
        # contains additional dots keeps its full stem; os.path.join does not
        # depend on data_dir ending with a path separator.
        stem = os.path.splitext(file)[0]
        df.to_csv(os.path.join(data_dir, stem + "__text_clean.csv"), index=False)

    print(f"-------------------------// DONE WITH {label} DATA //-------------------------")

Preprocessing RevenueFromContractWithCustomerExcludingAssessedTax_train.csv...


                                                text       target
0  management considers factors assessing expecte...   5429000000
1  estimate fair values assets acquired acquisiti...   4147270000
2  actual future taxable income differs estimates...   7651319000
3  company expects subject incremental tax gilti ...   6820886000
4  accumulated unrepatriated indian earnings repa...  16783000000
-----------------------------------------
Preprocessing NetCashProvidedByUsedInInvestingActivities_train.csv...
                                                text        target
0  fourth quarter 2018 expanded restructuring pla...  3.100000e+07
1  given quality diversity underlying real estate... -7.340000e+08
2  cash flow amounts reduced realized gains inves... -6.240000e+08
3  plan expand network tesla stores galleries ser... -1.081085e+09
4  given quality diversity underlying real estate... -3.611000e+09
-----------------------------------------
Preprocessing NetCashProvidedByUsedInFinancingAc

In [8]:
# Preprocess all FinBERT training and test data.
# Each entry: (file names, directory, label used in the closing banner,
#              whether a separator line is printed after each file).
fb_batches = [
    (fb_train_files, fb_train_path, "TRAINING", True),
    (fb_test_files, fb_test_path, "TEST", False),
]

for files, data_dir, label, print_separator in fb_batches:
    for file in files:
        print(f'Preprocessing {file}...')
        df = pd.read_csv(os.path.join(data_dir, file))
        df = preprocess_text(df, 'val')
        print(df.head())

        if print_separator:
            print("-----------------------------------------")

        # Save preprocessed data next to its input file.
        # os.path.splitext strips only the final extension, so a file name that
        # contains additional dots keeps its full stem; os.path.join does not
        # depend on data_dir ending with a path separator.
        stem = os.path.splitext(file)[0]
        df.to_csv(os.path.join(data_dir, stem + "__text_clean.csv"), index=False)

    print(f"-------------------------// DONE WITH {label} DATA //-------------------------")

Preprocessing RevenueFromContractWithCustomerExcludingAssessedTax_train.csv...
                                                text       target
0  total net proceeds approximately 089 million i...   4147270000
1  regularly review recoverability deferred tax a...   2408200000
2  given period change deferred gain included net...   1360000000
3  income approach estimate projected future cash...  16783000000
4  income approach based projected future cash fl...    900465000
-----------------------------------------
Preprocessing NetCashProvidedByUsedInInvestingActivities_train.csv...
                                                text        target
0  account contracted condominium sales percentag... -8.379220e+08
1  cash flow amounts reduced realized gains inves... -6.240000e+08
2  cash flow amounts reduced realized gains inves... -5.580000e+08
3  company extensive non operations significant c... -1.612700e+09
4  cash flow amounts reduced realized gains inves... -4.560000e+08
-----------