In [None]:
import os
import pandas as pd
import re
from tqdm import tqdm

def transform_hashtags(text):
    """Rewrite every ``#hashtag`` in ``text`` as lowercase, space-separated words.

    A hashtag is ``#`` followed by word characters. Its body is first split
    on underscores; each resulting segment is then split in front of every
    ASCII capital letter, unless the segment is entirely upper case, in which
    case it is kept whole (``#NASA`` -> ``nasa``).

    Examples: ``#MachineLearning`` -> ``machine learning``,
    ``#hello_World`` -> ``hello world``, ``#iPhone`` -> ``i phone``.

    Note that a mixed segment is split at *every* capital, so an acronym
    followed by more text (``#USAToday``) still becomes ``u s a today``.

    Args:
        text: The string to process.

    Returns:
        ``text`` with each hashtag (including its ``#``) replaced by its
        readable form. Text outside hashtags is returned unchanged.
    """
    def _split_segment(segment):
        # All-caps segments are treated as acronyms and left in one piece.
        if segment.isupper():
            return [segment]
        # The second alternative keeps a leading lowercase/digit run
        # ("i" in "iPhone", "2020" in "2020Election") that a capitals-only
        # pattern would silently drop. Empty segments fall back to themselves.
        return re.findall(r'[A-Z][^A-Z]*|[^A-Z]+', segment) or [segment]

    def _expand(match):
        pieces = []
        for segment in match.group(1).split('_'):
            pieces.extend(_split_segment(segment))
        return ' '.join(pieces).lower()

    # Substituting per match (instead of str.replace per hashtag) ensures a
    # short hashtag such as "#Love" cannot clobber the start of a longer one
    # such as "#LoveWins".
    return re.sub(r'#(\w+)', _expand, text)
    
# Patterns used by _clean_tweet, compiled once instead of on every row.
_LINK_RE = re.compile(r'http\S+|www\S+|https\S+|htt…')
_RETWEET_PREFIX_RE = re.compile(r'^RT\s+@\w+:\s+')
_MENTION_RE = re.compile(r'@\w+')
_GIVEAWAY_RES = [
    re.compile(r'follow & rt to enter!?\.?', flags=re.IGNORECASE),
    re.compile(r'rt & follow to enter!?\.?', flags=re.IGNORECASE),
]
_LEADING_COLON_RE = re.compile(r'^:\s*')
_WHITESPACE_RE = re.compile(r'\s+')


def _clean_tweet(tweet):
    """Return the cleaned, lowercased text of a single tweet."""
    text = str(tweet)
    text = _LINK_RE.sub('', text)            # remove links (also truncated "htt…")
    text = _RETWEET_PREFIX_RE.sub('', text)  # remove a leading "RT @name: "
    text = _MENTION_RE.sub('user', text)     # replace remaining @mentions
    text = transform_hashtags(text)          # "#SomeTag" -> "some tag"
    text = text.replace('#', '')             # remove leftover '#' characters
    text = text.lower()
    for giveaway_re in _GIVEAWAY_RES:        # remove "follow & rt to enter!" etc.
        text = giveaway_re.sub('', text)
    text = _LEADING_COLON_RE.sub('', text)   # remove a colon at the very start
    # Collapsing every whitespace run (newlines included) to one space makes
    # a separate newline replacement unnecessary.
    return _WHITESPACE_RE.sub(' ', text).strip()


def clean_and_process_tweets(input_folder, output_folder):
    """Clean every CSV file in ``input_folder`` and save it to ``output_folder``.

    For each ``*.csv`` file, fully duplicated rows are dropped. If the file
    has a ``Tweet`` column, rows with a duplicate ``Tweet`` value are dropped
    as well and each remaining tweet is cleaned: links, a leading
    ``RT @name:`` prefix and giveaway phrases are removed, ``@mentions``
    become ``user``, hashtags are expanded into words, the text is lowercased
    and whitespace is normalised. Missing tweets are left missing rather
    than being turned into the text ``nan``.

    Each result is written under the same file name, without the index.
    Files are processed in sorted name order.

    Args:
        input_folder: Directory containing the raw CSV files.
        output_folder: Directory for the cleaned files; created if needed.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Filter before wrapping in tqdm so the progress bar counts only the
    # files that are actually processed.
    csv_names = sorted(
        name for name in os.listdir(input_folder) if name.endswith('.csv')
    )

    for file_name in tqdm(csv_names, desc='Cleaning and processing tweets', unit='files'):
        df = pd.read_csv(os.path.join(input_folder, file_name))
        df = df.drop_duplicates()

        if 'Tweet' in df.columns:
            df = df.drop_duplicates(subset='Tweet')
            # na_action='ignore' keeps NaN as NaN instead of the string 'nan'.
            df['Tweet'] = df['Tweet'].map(_clean_tweet, na_action='ignore')

        df.to_csv(os.path.join(output_folder, file_name), index=False)


In [15]:
# Clean the raw training tweets and write the results to a separate folder.
clean_and_process_tweets(
    input_folder='challenge_data/train_tweets',
    output_folder='cleaned_data/train_data',
)

Cleaning and processing tweets: 100%|██████████| 16/16 [00:35<00:00,  2.25s/files]
