In [None]:
import os
import pandas as pd
import numpy as np
import ast
import re
import gc
from pathlib import Path
from cleantext import clean

## Define the data directory and the merged files

In [None]:
# Root folder of the pipeline. Merged inputs are read from data_dir / 'merged'
# by the cleaning loop; cleaned outputs go to data_dir / 'cleaned'.
data_dir = Path('./output')

# Gzip-compressed merged CSV exports to clean, processed in this order.
# NOTE(review): the file names look like they encode the covered date range
# (YYYYMMDD-MMDD) - confirm against the merge step.
merged_files = [
    "20220227-0302_merged.csv.gz",
    "20220330-0405_merged.csv.gz",
    "20220518-0524_merged.csv.gz",
    "20220623-0701_merged.csv.gz",
    "20220930-1006_merged.csv.gz",
    "20221109-1115_merged.csv.gz",
    "20230301-0305_merged.csv.gz",
    "20230518-0524_merged.csv.gz",
]

# Destination for the cleaned files. parents=True creates data_dir as well when
# it is missing, so this cell no longer fails with FileNotFoundError on a fresh
# checkout; exist_ok=True keeps re-running the cell harmless.
clean_output_dir = data_dir / "cleaned"
clean_output_dir.mkdir(parents=True, exist_ok=True)

## Define the columns

In [None]:
# Canonical column set, in output order. load_and_unify_columns() adds any of
# these that a file lacks and returns the frame restricted to exactly this list.
all_columns = [
    # user* / account columns
    "userid",
    "username",
    "acctdesc",
    "location",
    "following",
    "followers",
    "totaltweets",
    "usercreatedts",
    # tweet columns
    "tweetid",
    "tweetcreatedts",
    "retweetcount",
    "text",
    "hashtags",
    "language",
    "coordinates",
    "favorite_count",
    # retweet flag and original_tweet_* columns
    "is_retweet",
    "original_tweet_id",
    "original_tweet_userid",
    "original_tweet_username",
    # in_reply_to_* columns
    "in_reply_to_status_id",
    "in_reply_to_user_id",
    "in_reply_to_screen_name",
    # quote flag and quoted_status_* columns
    "is_quote_status",
    "quoted_status_id",
    "quoted_status_userid",
    "quoted_status_username",
    # extraction timestamp
    "extractedts",
]

## Data loading and unification

In [None]:
def load_and_unify_columns(filepath, all_cols):
    """
    Read a gzip-compressed CSV and return it with exactly the requested columns.
    :param filepath: Path of the gzip-compressed, UTF-8 encoded CSV file.
    :param all_cols: Column names the result must contain, in the desired order.
    :return: A DataFrame restricted to ``all_cols``. Columns absent from the file
             are created: ``is_retweet`` / ``is_quote_status`` are filled with
             False, every other missing column with NaN.
    """
    flag_columns = ("is_retweet", "is_quote_status")

    frame = pd.read_csv(filepath, compression='gzip', encoding='utf-8', engine='python')

    for name in all_cols:
        if name in frame.columns:
            continue
        # Boolean flags default to False; anything else is marked as missing.
        frame[name] = False if name in flag_columns else np.nan

    # Selecting by list both drops unexpected columns and fixes the order.
    return frame[all_cols]

## Data cleaning and preprocessing

In [None]:
# Pre-compiled pattern capturing the value of every "text" key in the serialized
# hashtag entities. Key and value may be single- or double-quoted; the key match
# is case-insensitive.
HASHTAG_TEXT_RE = re.compile(r'["\']text["\']\s*:\s*["\']([^"\']+)["\']', re.IGNORECASE)

def parse_and_clean_hashtags_regex(hashtags_str):
    """
    Parse and clean hashtags from a given string.
    :param hashtags_str: The raw cell value, expected to be a string such as
                         "[{'text': 'Tag', 'indices': [0, 4]}, ...]". Missing
                         values (NaN/None) and any other non-string input are
                         treated as "no hashtags".
    :return: A list of unique, lowercase hashtags in order of first appearance.
             Empty list when the input is missing, blank, or has no match.
    """
    # A single isinstance check covers NaN, None and pd.NA from empty CSV cells
    # and avoids calling .strip() on values that do not have it.
    if not isinstance(hashtags_str, str) or not hashtags_str.strip():
        return []

    normalized = (tag.strip().lower() for tag in HASHTAG_TEXT_RE.findall(hashtags_str))

    # dict.fromkeys de-duplicates while keeping insertion order. A set would make
    # the order depend on string hash randomisation, so the saved CSV would
    # differ between runs.
    return list(dict.fromkeys(tag for tag in normalized if tag))

def clean_tweet_text(text):
    """
    Normalize a raw tweet with the clean-text library's ``clean`` function.
    :param text: The raw tweet text.
    :return: The cleaned text with runs of whitespace collapsed to one space and
             leading/trailing whitespace removed.
    """
    # Options handed to clean(); every replace_with_* token is the empty string,
    # so whatever clean() substitutes is effectively removed.
    cleaning_options = dict(
        fix_unicode=True,           # repair unicode errors
        to_ascii=True,              # convert to ASCII
        lower=True,                 # lowercase
        no_line_breaks=True,        # strip line breaks
        no_urls=True,               # drop URLs
        no_emails=True,             # drop email addresses
        no_phone_numbers=True,      # drop phone numbers
        no_numbers=False,           # keep numbers
        no_digits=False,            # keep digits
        no_currency_symbols=True,   # drop currency symbols
        no_punct=True,              # drop punctuation
        replace_with_url="",
        replace_with_email="",
        replace_with_phone_number="",
        replace_with_number="",
        replace_with_currency_symbol="",
        # NOTE(review): emoji removal is off here, but to_ascii=True may still
        # strip emojis - confirm against the clean-text behaviour.
        no_emoji=False,
        lang="en",                  # English-specific handling
    )
    normalized = clean(text, **cleaning_options)

    # Collapse any remaining whitespace runs, then trim both ends.
    single_spaced = re.sub(r'\s+', ' ', normalized)
    return single_spaced.strip()


# Clean every merged export in turn and write one cleaned CSV per input file.
# Only one frame is held at a time; it is released explicitly at the end of each
# iteration so peak memory stays bounded by a single file.
for f in merged_files:
    filepath = data_dir / 'merged' / f
    print(f"Processing file: {filepath.name}")

    # Read the export as-is: columns are NOT unified here, so each file keeps
    # whatever columns it ships with.
    df = pd.read_csv(filepath, compression='gzip', encoding='utf-8', engine='python')

    # Keep English tweets only, one row per tweet id (first occurrence wins).
    df = df[df['language'] == 'en']
    df = df.drop_duplicates(subset='tweetid', keep='first')

    # Parse timestamps; values that cannot be parsed become NaT instead of raising.
    df['tweetcreatedts'] = pd.to_datetime(df['tweetcreatedts'], errors='coerce')
    df['extractedts'] = pd.to_datetime(df['extractedts'], errors='coerce')

    # Drop rows without a usable creation time or without text. This must happen
    # before cleaning so clean_tweet_text never receives NaN.
    df = df.dropna(subset=['tweetcreatedts', 'text'])

    # Clean the text, then discard tweets that end up empty (e.g. a tweet that
    # consisted only of a URL). The filter is applied once; it was previously
    # executed twice back to back.
    df['text'] = df['text'].apply(clean_tweet_text)
    df = df[df['text'].str.strip() != '']

    # Extract and normalize hashtags (each cell becomes a list of lowercase tags).
    df['hashtags'] = df['hashtags'].apply(parse_and_clean_hashtags_regex)

    df = df.reset_index(drop=True)

    # Save next to the other cleaned files, swapping the '_merged' suffix.
    clean_file = clean_output_dir / f.replace('_merged.csv.gz', '_cleaned.csv.gz')
    df.to_csv(clean_file, index=False, compression='gzip')
    print(f"Saved cleaned data to: {clean_file}")

    # Release the frame before loading the next file.
    del df
    gc.collect()

print("Data cleaning and preprocessing completed for all files.")