<a href="https://colab.research.google.com/github/mariamamgad8/HumanAI_GSoC_Proposal_2025/blob/main/Task1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Retrieving and storing filtered social media posts**

In [1]:
import json
import os
import re
from datetime import datetime

import pandas as pd
import tweepy

# Twitter API credentials: the bearer token is read from the
# TWITTER_BEARER_TOKEN environment variable so the real secret never has to be
# typed into (and saved with) the notebook. The original placeholder is kept as
# the fallback, so the cell still runs when the variable is unset.
BEARER_TOKEN = os.environ.get("TWITTER_BEARER_TOKEN", "your API")

# Predefined keywords related to mental health. Entries may be single words or
# multi-word phrases; they are combined into one search query by search_tweets().
KEYWORDS = [
    "depressed", "depression", "anxious", "anxiety",
    "suicidal", "want to die", "addiction help",
    "overwhelmed", "mental health", "therapy",
    "self harm", "lonely", "hopeless", "panic attack",
    "stress"
]

# Clean text function
def clean_text(text):
    """Return ``text`` with tweet-specific noise stripped out.

    The passes run in this order: URLs are deleted, ``@user`` mentions and
    bare ``#`` characters are deleted (the hashtag word itself is kept), every
    non-ASCII character is dropped (this removes emojis, but also accented
    letters), and finally runs of whitespace are collapsed to single spaces
    with leading/trailing whitespace trimmed.
    """
    without_urls = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    without_mentions = re.sub(r'\@\w+|\#', '', without_urls)
    # Round-trip through ASCII with errors ignored to discard non-ASCII chars.
    ascii_only = without_mentions.encode('ascii', 'ignore').decode('ascii')
    return re.sub(r'\s+', ' ', ascii_only).strip()

# Initialize Twitter client
# Module-level tweepy client built from BEARER_TOKEN; search_tweets() below
# sends its recent-search request through this object.
client = tweepy.Client(bearer_token=BEARER_TOKEN)

def _build_query(keywords):
    """Build a recent-search query matching any keyword; "" if none are usable."""
    terms = []
    for keyword in keywords:
        term = keyword.strip()
        if not term:
            continue
        # Whitespace means AND in the query language, so a multi-word keyword
        # has to be quoted to be matched as an exact phrase.
        if len(term.split()) > 1 and not term.startswith('"'):
            term = f'"{term}"'
        terms.append(term)

    if not terms:
        return ""

    # AND binds tighter than OR: without the parentheses the retweet/language
    # filters would apply to the last keyword only.
    return "(" + " OR ".join(terms) + ") -is:retweet lang:en"


def search_tweets(keywords, max_results=100):
    """Search recent English, non-retweet tweets matching any of ``keywords``.

    Args:
        keywords: iterable of words or phrases; phrases are matched exactly.
        max_results: passed through to ``client.search_recent_tweets``.

    Returns:
        The list of tweets from the response, or ``[]`` when there are no
        usable keywords, no matches, or the request fails (the error is
        printed rather than raised, so the notebook keeps running).
    """
    query = _build_query(keywords)
    if not query:
        # Nothing to search for; avoid sending a filter-only (invalid) query.
        return []

    try:
        # Search recent tweets (free tier allows last 7 days only)
        response = client.search_recent_tweets(
            query=query,
            tweet_fields=["created_at", "public_metrics"],
            max_results=max_results
        )
        return response.data or []
    except Exception as e:
        print(f"Error searching tweets: {e}")
        return []

def process_tweets(tweets):
    """Flatten tweet objects into plain dict records.

    Each record holds the tweet id, its ``created_at`` as an ISO-8601 string,
    the raw text, the text after ``clean_text``, and the like / retweet /
    reply counts from ``public_metrics``. ``impressions`` is ``None`` when the
    metrics carry no ``impression_count``.
    """
    def _to_record(tweet):
        scrubbed = clean_text(tweet.text)
        stats = tweet.public_metrics
        return {
            "id": tweet.id,
            "timestamp": tweet.created_at.isoformat(),
            "content": tweet.text,
            "cleaned_content": scrubbed,
            "likes": stats["like_count"],
            "retweets": stats["retweet_count"],
            "replies": stats["reply_count"],
            "impressions": stats.get("impression_count"),
        }

    return [_to_record(tweet) for tweet in tweets]

def save_data(data, filename_prefix="mental_health_tweets"):
    """Write ``data`` to ``<filename_prefix>.json`` and ``<filename_prefix>.csv``.

    ``data`` is a list of record dicts. The JSON file is pretty-printed with a
    two-space indent; the CSV is produced through a DataFrame without the
    index column. A confirmation line naming both files is printed.
    """
    json_filename = f"{filename_prefix}.json"
    csv_filename = f"{filename_prefix}.csv"

    # JSON export
    with open(json_filename, 'w') as out:
        out.write(json.dumps(data, indent=2))

    # CSV export
    pd.DataFrame(data).to_csv(csv_filename, index=False)

    print(f"Data saved to {json_filename} and {csv_filename}")

def main():
    """Run the collection pipeline: search, process, then save.

    Searches with the module-level ``KEYWORDS``. When nothing is found a
    message is printed and nothing is written; otherwise the processed
    records are saved under a ``mental_health_tweets_<YYYYmmdd_HHMMSS>``
    prefix built from the current local time.
    """
    print("Searching for mental health related tweets...")
    found = search_tweets(KEYWORDS)

    if found:
        print(f"Found {len(found)} tweets. Processing...")
        records = process_tweets(found)

        # Timestamp the output so repeated runs do not overwrite each other.
        run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        save_data(records, "mental_health_tweets_" + run_stamp)

        print("Done!")
    else:
        print("No tweets found matching the criteria.")

# Run the pipeline only when this code is the entry point (as it is when the
# cell is executed directly), not when these definitions are imported.
if __name__ == "__main__":
    main()

Searching for mental health related tweets...
Found 100 tweets. Processing...
Data saved to mental_health_tweets_20250406_091537.json and mental_health_tweets_20250406_091537.csv
Done!


# **Cleaning the dataset to make it ready for NLP analysis**

In [5]:
import pandas as pd
import re
import string
import emoji
import nltk
from nltk.corpus import stopwords
from better_profanity import profanity

# === Setup ===
# Fetch the NLTK stopword corpus (reported as already up-to-date when it is
# present) and keep the English list as a set, which clean_text() uses for
# membership tests.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
# Load better_profanity's censor word list (presumably the library default,
# since no argument is passed) for profanity.contains_profanity().
profanity.load_censor_words()

# === Load dataset ===
# NOTE(review): save_data() in the collection cell is called with a
# timestamped prefix (mental_health_tweets_<YYYYmmdd_HHMMSS>), so this fixed
# path must be pointed at the actual export before running.
df = pd.read_csv('/content/mental_health_tweets.csv') #replace it with the correct location

# === Helper: reduce stretched words (e.g., "soooo" → "soo") ===
def reduce_stretch(word):
    """Shorten every run of three or more identical characters to two.

    Runs of one or two characters are left untouched, so ordinary double
    letters ("cool") survive while elongations ("coooool") are squeezed.
    """
    stretched_run = re.compile(r'(.)\1{2,}')
    return stretched_run.sub(r'\1\1', word)

# === Cleaning Function ===
def clean_text(text):
    """Normalise a raw tweet into a space-separated string of content words.

    Steps, in order: lowercase; drop URLs, @mentions and emojis; strip
    punctuation and digits; collapse whitespace; then, token by token, drop
    stopwords, squeeze stretched letters, and drop one-character leftovers
    and profane words.

    Apostrophes are kept until the stopword check has run. Stripping them
    first would turn contractions such as "don't" into "dont", which the
    stopword list does not contain, so they would leak into the output.

    NOTE: this definition shares its name with the ``clean_text`` in the
    tweet-collection cell and replaces it once this cell has been run.

    Args:
        text: the raw tweet text (a ``str``).

    Returns:
        The cleaned text; an empty string when nothing survives.
    """
    # Lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)

    # Remove @mentions so usernames do not survive as ordinary tokens
    text = re.sub(r'@\w+', '', text)

    # Remove emojis
    text = emoji.replace_emoji(text, replace='')

    # Map typographic apostrophes to ASCII so contractions match the stopwords
    text = text.replace('\u2019', "'").replace('\u2018', "'")

    # Remove punctuation, except apostrophes (handled per token below)
    punctuation_to_strip = string.punctuation.replace("'", "")
    text = re.sub(f"[{re.escape(punctuation_to_strip)}]", "", text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenize and filter
    cleaned_words = []

    for raw_word in text.split():
        # Contractions are only recognisable with the apostrophe intact
        if raw_word in stop_words:
            continue

        # Drop the apostrophes, then reduce stretched characters
        word = reduce_stretch(raw_word.replace("'", ""))

        # Skip stopwords and short gibberish
        if word in stop_words or len(word) < 2:
            continue

        # Remove profane or inappropriate words
        if profanity.contains_profanity(word):
            continue

        cleaned_words.append(word)

    return ' '.join(cleaned_words)

# === Apply cleaning ===
# Missing tweets are replaced with "" first: astype(str) on its own would turn
# NaN into the literal text "nan", which would survive cleaning as a token.
df['cleaned_text'] = df['content'].fillna('').astype(str).apply(clean_text)

# === Save cleaned version ===
# Writes every original column plus 'cleaned_text', without the index column.
df.to_csv("/content/cleaned_tweets.csv", index=False)

print(" Cleaned dataset saved as 'cleaned_tweets.csv'")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


✅ Cleaned dataset saved as 'cleaned_tweets.csv'
