In [None]:
from textblob import TextBlob
import symspellpy
from symspellpy import SymSpell, Verbosity
import os
import pandas as pd
import re
from tqdm import tqdm
import emoji
from sentence_transformers import SentenceTransformer

# SymSpell index used by correct_spelling: suggestions up to edit distance 2, prefix length 7.
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = "dictionary_path" 
# load_dictionary signals a missing file by returning False instead of raising. Without this
# check the index stays empty and every later lookup comes back with no suggestions, so the
# spelling-correction step would silently do nothing.
if not sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1):
    raise FileNotFoundError(f"SymSpell dictionary not found: {dictionary_path!r}")

# Slang / acronym -> expansion table looked up by expand_slang.
# Keys are matched against whole whitespace-separated tokens, exactly as written here.
slang_dict = dict(
    ur="your",
    lol="laughing out loud",
    brb="be right back",
    idk="I don't know",
    smh="shaking my head",
    ht="half time",
    ft="full time",
)

def expand_slang(text):
    """Replace slang tokens in ``text`` with their expansions from ``slang_dict``.

    The text is split on whitespace; a token is replaced only when it is an
    exact key of ``slang_dict``, otherwise it is kept unchanged. Tokens are
    re-joined with single spaces, so runs of whitespace are collapsed.
    """
    tokens = []
    for token in text.split():
        tokens.append(slang_dict.get(token, token))
    return ' '.join(tokens)

def correct_spelling(text):
    """Spell-correct ``text`` token by token using the module-level ``sym_spell``.

    Each whitespace-separated token is looked up with ``Verbosity.CLOSEST``
    and a maximum edit distance of 2. The first suggestion replaces the token;
    a token with no suggestions is kept as is. Tokens are re-joined with
    single spaces.
    """
    def _best_match(token):
        candidates = sym_spell.lookup(token, Verbosity.CLOSEST, max_edit_distance=2)
        return candidates[0].term if candidates else token

    return ' '.join(_best_match(token) for token in text.split())

def remove_emojis(text):
    """Return ``text`` with every emoji stripped out."""
    # Substituting the empty string deletes each emoji without leaving a placeholder.
    without_emojis = emoji.replace_emoji(text, replace="")
    return without_emojis

def normalize_repeated_characters(text):
    """Collapse runs of three or more identical characters to a single one.

    Handles elongated words and excessive punctuation ("goooal!!!" -> "goal!").
    Runs of exactly two characters are left alone. Digits are deliberately
    excluded so numbers are not corrupted ("1000" must not become "10").

    Args:
        text (str): Text to normalize.

    Returns:
        str: The text with long character runs collapsed.
    """
    # [^\d\n] keeps the original "any character except newline" scope minus digits.
    return re.sub(r'([^\d\n])\1{2,}', r'\1', text)

def transform_hashtags(text):
    """Rewrite each ``#hashtag`` in ``text`` as lower-case, space-separated words.

    A hashtag body is split on underscores. Each part that is entirely upper
    case is kept whole; any other part is split at camel-case boundaries:

    * a leading lower-case/digit segment is kept ("#worldCup" -> "world cup"),
    * an acronym run stays together ("#FIFAWorldCup" -> "fifa world cup"),
    * a part with no upper-case letters is kept as is.

    Every hashtag is replaced in a single pass over its own match, so a short
    hashtag can never rewrite the prefix of a longer one ("#World" vs
    "#WorldCup").

    Args:
        text (str): Text possibly containing hashtags.

    Returns:
        str: The text with hashtags replaced by readable lower-case words.
    """
    # Alternatives, in priority order:
    #   1. run of capitals not followed by a lower-case letter (acronym),
    #   2. one capital followed by non-capitals (capitalised word),
    #   3. run of non-capitals (leading lower-case letters / digits).
    camel_piece = re.compile(r'[A-Z]+(?![a-z])|[A-Z][^A-Z]*|[^A-Z]+')

    def _readable(match):
        pieces = []
        for part in match.group(1).split('_'):
            if part.isupper():
                pieces.append(part)
            else:
                # findall returns [] only for an empty part; keep it as is then.
                pieces.extend(camel_piece.findall(part) or [part])
        return ' '.join(pieces).lower()

    return re.sub(r'#(\w+)', _readable, text)

def _clean_tweet(text, phrase_patterns):
    """Run the whole per-tweet cleaning pipeline on one raw tweet and return the cleaned string."""
    text = str(text)
    text = re.sub(r'http\S+|www\S+|https\S+|htt…', '', text)  # Remove links (and truncated "htt…")
    text = re.sub(r'^RT\s+@\w+:\s+', '', text)  # Remove the leading "RT @user: " marker
    text = re.sub(r'@\w+', 'user', text)  # Replace @usernames
    text = transform_hashtags(text)  # Transform hashtags
    text = text.replace('#', '')  # Remove leftover hash signs
    text = text.lower()  # Convert to lowercase
    text = remove_emojis(text)  # Remove emojis
    text = normalize_repeated_characters(text)  # Normalize repeated characters
    # Promo phrases are stripped BEFORE slang expansion and spelling correction: those steps
    # rewrite individual tokens (e.g. "&"), after which the phrase patterns may no longer match.
    for pattern in phrase_patterns:
        text = pattern.sub('', text)
    text = expand_slang(text)  # Expand slang
    text = correct_spelling(text)  # Correct spelling
    text = re.sub(r'^:\s*', '', text)  # Remove leading colons
    text = re.sub(r'\s+', ' ', text)  # Collapse all whitespace (including newlines)
    return text.strip()


def clean_and_process_tweets(input_folder, output_folder):
    """Preprocess tweets in all CSV files in the input folder.

    Every ``*.csv`` file in ``input_folder`` is read, exact duplicate rows are
    dropped, and the result is written under the same file name in
    ``output_folder`` (created if needed). When the file has a ``Tweet``
    column, rows with duplicate tweets are dropped as well and each tweet is
    cleaned: links, the leading retweet marker, mentions, hashtags, emojis,
    repeated characters, promo phrases, slang, spelling and whitespace.
    A missing tweet is treated as empty text instead of the string "nan".

    Args:
        input_folder (str): Folder containing the raw CSV files.
        output_folder (str): Folder receiving the cleaned CSV files.

    Returns:
        None. The cleaned files are written to ``output_folder``.
    """
    os.makedirs(output_folder, exist_ok=True)
    phrases_to_remove = [
        r'follow & rt to enter!?\.?',
        r'rt & follow to enter!?\.?'
    ]
    # Compile once instead of once per tweet.
    phrase_patterns = [re.compile(phrase, flags=re.IGNORECASE) for phrase in phrases_to_remove]

    for file_name in tqdm(os.listdir(input_folder), desc='Cleaning and processing tweets', unit='files'):
        if not file_name.endswith('.csv'):
            continue
        input_file_path = os.path.join(input_folder, file_name)
        output_file_path = os.path.join(output_folder, file_name)
        df = pd.read_csv(input_file_path)
        df = df.drop_duplicates()  # Drop duplicate rows

        if 'Tweet' in df.columns:
            df = df.drop_duplicates(subset='Tweet')  # Drop duplicate tweets
            # fillna('') keeps the row but stops str(NaN) from producing the text "nan".
            df['Tweet'] = df['Tweet'].fillna('').apply(lambda raw: _clean_tweet(raw, phrase_patterns))

        df.to_csv(output_file_path, index=False)  # Save preprocessed data

In [None]:
# Folder names for the raw inputs and the cleaned outputs of the train and eval splits.
train_input_folder = "input train"
eval_input_folder = "eval train"
train_output_folder = "train output"
eval_output_folder = "eval output"
print("Preprocessing train and eval data...")
# Clean every CSV of each input folder and write a same-named CSV into the matching output folder.
clean_and_process_tweets(train_input_folder, train_output_folder)
clean_and_process_tweets(eval_input_folder, eval_output_folder)

In [None]:
# Notebook-level sentence-embedding model, loaded by its 'all-MiniLM-L6-v2' identifier.
model = SentenceTransformer('all-MiniLM-L6-v2')  

def generate_embeddings(data, column_name="Tweet", encoder=None):
    """
    Generate embeddings for the specified column in a DataFrame.

    Args:
        data (pd.DataFrame): The input DataFrame. It is modified in place.
        column_name (str): The name of the column containing text data.
        encoder: Optional object exposing ``encode(texts, show_progress_bar=...)``.
            Defaults to the notebook-level ``model``.

    Returns:
        pd.DataFrame: The same DataFrame with a new 'Embedding' column holding
        one embedding per row.
    """
    if encoder is None:
        # The notebook defines its SentenceTransformer as `model`; the previous
        # reference to `bert_model` was never defined and raised NameError.
        encoder = model
    print(f"Generating embeddings for {column_name}...")
    # Empty tweets come back from CSV as NaN; encode them as empty strings so the
    # encoder only ever receives str values.
    texts = data[column_name].fillna("").astype(str).tolist()
    data['Embedding'] = list(encoder.encode(texts, show_progress_bar=True))
    return data

def load_csv_folder(folder_path):
    """
    Load and combine all CSV files from a given folder into a single DataFrame.

    Files are read in sorted name order so the row order of the result is the
    same on every run and every platform (``os.listdir`` order is arbitrary).

    Args:
        folder_path (str): Path to the folder containing CSV files.

    Returns:
        pd.DataFrame: Combined DataFrame with a fresh 0..n-1 index.

    Raises:
        ValueError: If the folder contains no ``.csv`` files.
    """
    csv_names = sorted(name for name in os.listdir(folder_path) if name.endswith(".csv"))
    if not csv_names:
        # pd.concat([]) would raise a ValueError that does not name the folder.
        raise ValueError(f"No CSV files found in {folder_path!r}")
    dataframes = [
        pd.read_csv(os.path.join(folder_path, name))
        for name in tqdm(csv_names, desc=f"Loading files from {folder_path}")
    ]
    return pd.concat(dataframes, ignore_index=True)

# Load every CSV of each split into one combined DataFrame.
# NOTE(review): these are hard-coded absolute local paths and they differ from the output
# folders written by the preprocessing cell — confirm they point at the cleaned tweets.
train_data = load_csv_folder("/Users/jamilya/Desktop/challenge_data_neq/train_bert")
eval_data = load_csv_folder("/Users/jamilya/Desktop/challenge_data_neq/eval_tweets_bert")

# Add an 'Embedding' column computed from the 'Tweet' text of each frame.
train_data = generate_embeddings(train_data, column_name="Tweet")
eval_data = generate_embeddings(eval_data, column_name="Tweet")

In [None]:
import numpy as np
def aggregate_embeddings(data, target_exists=True):
    """
    Collapse the per-tweet rows of ``data`` into one row per ``ID``.

    The 'Embedding' of each ID becomes the element-wise mean of its rows'
    embeddings. When ``target_exists`` is True the first 'EventType' seen for
    the ID is kept as well; otherwise 'EventType' is not aggregated.
    """
    def _mean_vector(vectors):
        # Stack the group's embeddings into a matrix and average over rows.
        return np.mean(np.vstack(vectors), axis=0)

    agg_spec = {'Embedding': _mean_vector}
    if target_exists:
        agg_spec['EventType'] = 'first'
    return data.groupby('ID').agg(agg_spec).reset_index()

print("Aggregating embeddings by ID...")
# Make sure every stored embedding is a NumPy array before the per-ID stacking below.
train_data['Embedding'] = train_data['Embedding'].apply(np.array)
eval_data['Embedding'] = eval_data['Embedding'].apply(np.array)

# Train rows carry an EventType label to keep; eval rows are aggregated without one.
train_aggregated = aggregate_embeddings(train_data, target_exists=True)
eval_aggregated = aggregate_embeddings(eval_data, target_exists=False)

In [None]:
# NOTE(review): train_test_split, LGBMClassifier and accuracy_score are not imported anywhere
# in the visible notebook, so this cell fails on a fresh kernel. Presumably they come from
# sklearn.model_selection, lightgbm and sklearn.metrics — add them to the import cell.

# Expand each aggregated embedding into one feature column per dimension.
X = pd.DataFrame(train_aggregated['Embedding'].tolist())
y = train_aggregated['EventType']
# Hold out 20% of the aggregated IDs for validation, stratified on the label.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_eval = pd.DataFrame(eval_aggregated['Embedding'].tolist())
print("Training LightGBM model...")
lgb_model = LGBMClassifier(random_state=42, n_estimators=100, learning_rate=0.1, max_depth=3)
lgb_model.fit(X_train, y_train)

# Score the model on the held-out validation split.
y_val_pred = lgb_model.predict(X_val)
validation_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {validation_accuracy:.4f}")

print("Predicting EventType for evaluation data...")
eval_aggregated['EventType'] = lgb_model.predict(X_eval)

# Write one (ID, EventType) prediction row per aggregated eval ID.
output_file = "predicted_event_types_lightgbm_old.csv"
eval_aggregated[['ID', 'EventType']].to_csv(output_file, index=False)
print(f"Predictions saved to {output_file}") 