# In [53]:
import os
import pandas as pd
import re


# Matches one hashtag and captures its body (word characters after '#').
_HASHTAG_PATTERN = re.compile(r'#(\w+)')

# Splits an underscore-free hashtag piece into words. Alternatives, in order:
#   1. a run of capitals not followed by a lowercase letter (acronym: "NASA",
#      or the "AI" in "AIRocks"),
#   2. a capital followed by everything up to the next capital ("Rocks"),
#   3. any remaining run without capitals (leading "hello" in "helloWorld",
#      or digits after an acronym such as the "19" in "COVID19").
_HASHTAG_WORD_PATTERN = re.compile(r'[A-Z]+(?![a-z])|[A-Z][^A-Z]*|[^A-Z]+')


def _hashtag_to_words(match):
    """Return the lower-cased, space-separated words of one matched hashtag."""
    words = []
    for piece in match.group(1).split('_'):
        # findall returns [] for an empty piece (e.g. "a__b"), so consecutive
        # or trailing underscores do not produce stray spaces.
        words.extend(_HASHTAG_WORD_PATTERN.findall(piece))
    return ' '.join(words).lower()


def transform_hashtags(text):
    """Transforms hashtags into readable text.

    Every ``#hashtag`` in *text* is replaced by its words, lower-cased and
    separated by single spaces. Words are split on underscores and on
    CamelCase boundaries; runs of capitals are kept together as one word.
    Text outside hashtags is returned unchanged.

    Each hashtag is rewritten in place by a single regex substitution, so a
    short hashtag (``#AI``) can never corrupt a longer one sharing its prefix
    (``#AIRocks``), and characters before the first capital letter
    (``#helloWorld``, ``#100DaysOfCode``) are preserved rather than dropped.

    Args:
        text: The string to process.

    Returns:
        The string with every hashtag replaced by plain lower-case words.
    """
    return _HASHTAG_PATTERN.sub(_hashtag_to_words, text)
    
# Patterns used by _clean_tweet, compiled once instead of once per row.
# Link and RT patterns are case-insensitive because lower-casing now happens
# after hashtag splitting (see _clean_tweet).
_TWEET_LINK_RE = re.compile(r'http\S+|www\S+|https\S+|htt…', re.IGNORECASE)
_TWEET_RT_PREFIX_RE = re.compile(r'^rt\s+', re.IGNORECASE)
_TWEET_MENTION_RE = re.compile(r'@\w+')
_TWEET_PROMO_RES = tuple(
    re.compile(phrase, re.IGNORECASE)
    for phrase in (
        r'follow & rt to enter!?\.?',
        r'rt & follow to enter!?\.?',
    )
)
_TWEET_LEADING_COLON_RE = re.compile(r'^:\s*')


def _clean_tweet(text):
    """Return one tweet string with links, RT marker, mentions and noise removed."""
    text = _TWEET_LINK_RE.sub('', text)  # remove links (incl. truncated "htt…")
    text = _TWEET_RT_PREFIX_RE.sub('', text)  # remove "RT " at the very start
    text = _TWEET_MENTION_RE.sub('', text)  # remove @usernames
    # Hashtags must be split while the original casing is still present;
    # lower-casing first would erase the CamelCase word boundaries.
    text = transform_hashtags(text)
    text = text.replace('#', '').lower()  # drop leftover '#', then lower-case
    for promo_re in _TWEET_PROMO_RES:
        # delete "follow & rt to enter!" / "rt & follow to enter."
        text = promo_re.sub('', text)
    return _TWEET_LEADING_COLON_RE.sub('', text)  # drop a leading colon


def clean_and_process_tweets(input_folder, output_folder):
    """Clean every CSV file in *input_folder* and write it to *output_folder*.

    For each file whose name ends with ``.csv``:

    1. fully duplicated rows are dropped;
    2. if a ``Tweet`` column exists, rows with a duplicate tweet are dropped
       (first occurrence kept) and each tweet is cleaned: links, a leading
       "RT", @usernames, the "follow & rt to enter" phrases and a leading
       colon are removed, hashtags are expanded into words, and the text is
       lower-cased;
    3. the result is saved under the same file name in *output_folder*
       without the index column.

    Missing tweets are left missing instead of being turned into the literal
    text "nan". Files without a ``Tweet`` column are only de-duplicated.

    Args:
        input_folder: Directory containing the CSV files to clean.
        output_folder: Directory to write cleaned files to; created if absent.
    """
    os.makedirs(output_folder, exist_ok=True)

    for file_name in os.listdir(input_folder):
        if not file_name.endswith('.csv'):
            continue

        df = pd.read_csv(os.path.join(input_folder, file_name))
        df = df.drop_duplicates()  # 1. drop fully duplicated rows

        if 'Tweet' in df.columns:
            df = df.drop_duplicates(subset='Tweet')  # 2. drop repeated tweets
            # 3. clean each tweet; non-string values are stringified first,
            # missing values are passed through untouched.
            df['Tweet'] = df['Tweet'].apply(
                lambda value: value if pd.isna(value) else _clean_tweet(str(value))
            )

        df.to_csv(os.path.join(output_folder, file_name), index=False)

# Run the cleaning pipeline: read CSV files from the input folder and write
# the cleaned copies to the output folder.
input_folder, output_folder = 'input folder', 'cleaned folder'
clean_and_process_tweets(
    input_folder=input_folder,
    output_folder=output_folder,
)
