# In [8]:
import pandas as pd
import random

# Load the data (tab-separated; quoting=3 is csv.QUOTE_NONE, i.e. quote
# characters are treated as ordinary text)
train_df = pd.read_csv('./train.csv', sep='\t', quoting=3)

# Read only the first column of the lexicon file. dtype=str together with
# keep_default_na=False keeps every entry as the literal string found in the
# file; with pandas' defaults, entries such as "NA", "null" or "nan" would be
# parsed as float NaN and could therefore never match a token.
lexical_items_df = pd.read_csv(
    './lexical_items.csv',
    sep='\t',
    quoting=3,
    usecols=[0],
    names=['lexical_item'],
    dtype=str,
    keep_default_na=False,
)

# Convert lexical items to a set for faster lookup
lexical_items = set(lexical_items_df['lexical_item'])

# Function to replace words with [MASK]
def replace_words(text, lexical_items, mask_all=True, mask_prob=0.3, mask_token='[MASK]'):
    """Replace tokens of ``text`` that appear in ``lexical_items`` with a mask.

    The text is split on whitespace and each token is compared to
    ``lexical_items`` by exact (case-sensitive) membership, so a token with
    attached punctuation such as ``"word,"`` does not match ``"word"``.
    The tokens are re-joined with single spaces, which means any run of
    whitespace in the input is collapsed to one space in the output.

    Args:
        text: The value to process. Anything that is not a ``str`` (for
            example ``NaN`` for a missing cell) is returned unchanged.
        lexical_items: Container of tokens to mask; a ``set`` gives
            constant-time lookups.
        mask_all: When true, every matching token is masked.
        mask_prob: When ``mask_all`` is false, each matching token is masked
            independently when ``random.random() < mask_prob``.
        mask_token: Replacement string written in place of a masked token.

    Returns:
        The masked string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        # Pass non-string values through instead of implicitly returning
        # None, so missing values keep their original representation.
        return text

    # random.random() is only evaluated for lexicon hits when mask_all is
    # false, thanks to short-circuiting of `and` / `or`.
    return ' '.join(
        mask_token
        if token in lexical_items and (mask_all or random.random() < mask_prob)
        else token
        for token in text.split()
    )

# Variant 1: every token of the 'tweet' column that is in the lexicon is masked
train_df_all_masked = train_df.assign(
    tweet=train_df['tweet'].apply(
        lambda tweet: replace_words(tweet, lexical_items=lexical_items, mask_all=True)
    )
)

# Variant 2: each lexicon hit is masked independently with probability 0.3
train_df_partial_masked = train_df.assign(
    tweet=train_df['tweet'].apply(
        lambda tweet: replace_words(
            tweet, lexical_items=lexical_items, mask_all=False, mask_prob=0.3
        )
    )
)

# Write both variants as tab-separated files, without the index column and
# with quoting disabled (quoting=3), mirroring how the input was read
train_df_all_masked.to_csv(
    './train_full_mask.csv', index=False, sep='\t', quoting=3
)
train_df_partial_masked.to_csv(
    './train_30_mask.csv', index=False, sep='\t', quoting=3
)