In [None]:
# Install required libraries
!pip install pandas transformers geopy vaderSentiment



In [None]:
import tarfile
import json
import os
import pandas as pd
import re
from transformers import pipeline, AutoTokenizer
from google.colab import drive

In [None]:
# Attach Google Drive to the Colab filesystem; the data paths used by this
# notebook all live under this mount point.
drive_mount_point = "/content/drive"
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load sentiment analysis model & tokenizer.
# The tokenizer is loaded once and handed to the pipeline, so truncate_text()
# and the classifier are guaranteed to tokenize with the very same object
# (and the tokenizer files are not loaded twice).
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
classifier = pipeline("sentiment-analysis", model=model_name, tokenizer=tokenizer)

Device set to use cpu


In [None]:
# Define path to tweets folder in Drive
# Both locations sit under the /content/drive mount point, so Drive must be
# mounted before either path is read.
# tweets_tar_path   -> gzip-compressed tar archive, read by extract_tweets_from_tar()
# tweets_csv_folder -> directory of *.csv files, read by load_tweets_from_csv_folder()
# NOTE(review): the archive file name looks like a Gemma Keras model bundle rather
# than a tweet export -- confirm this is the intended tweets archive.
tweets_tar_path = "/content/drive/MyDrive/alaska_vaccine_project/tweets/gemma-keras-gemma_1.1_instruct_2b_en-v4.tar.gz"
tweets_csv_folder = "/content/drive/MyDrive/alaska_vaccine_project/tweets/csv_tweets"

In [None]:
# Updated Function to extract tweets from tar.gz
def extract_tweets_from_tar(file_path):
    """Read newline-delimited JSON records from every file in a .tar.gz archive.

    Each line of each archive member is decoded (UTF-8 first, latin-1 as a
    fallback when the bytes are not valid UTF-8) and parsed as JSON.

    Args:
        file_path: Path to a gzip-compressed tar archive.

    Returns:
        A list of the parsed records that are JSON objects (dicts), in archive
        order. Lines that are not valid JSON, and JSON values that are not
        objects (lists, numbers, strings, ...), are skipped.
    """
    tweets = []
    with tarfile.open(file_path, "r:gz") as tar:
        for member in tar.getmembers():
            member_file = tar.extractfile(member)
            if member_file is None:
                # Directories and other non-file members have no content.
                continue
            with member_file:  # make sure the extracted handle is closed
                for raw_line in member_file:
                    try:
                        record = json.loads(_decode_tweet_line(raw_line))
                    except json.JSONDecodeError:
                        continue  # not a JSON line; best-effort skip
                    # Only dictionary records are usable downstream (.get access).
                    if isinstance(record, dict):
                        tweets.append(record)
    return tweets


def _decode_tweet_line(raw_line):
    """Decode one raw archive line, preferring UTF-8 and falling back to latin-1."""
    try:
        # Strict decoding: errors='replace' would never fail and would silently
        # turn every non-UTF-8 byte into U+FFFD, making the fallback unreachable.
        return raw_line.decode("utf-8")
    except UnicodeDecodeError:
        # latin-1 maps every byte to a character, so this cannot fail.
        return raw_line.decode("latin-1")

In [None]:
# Function to safely load a CSV file with multiple encoding attempts
def load_csv_safely(file_path):
    """Read a CSV file into a DataFrame, trying several text encodings.

    UTF-8 is attempted first. latin-1 is the last resort because it maps every
    possible byte to a character and therefore never raises a decode error --
    if it were tried first, UTF-8 files would be read "successfully" as
    mojibake and UTF-8 would never be attempted.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The loaded DataFrame, or None when the file could not be read with any
        of the encodings (a message is printed for every failed attempt).
        Malformed rows are skipped (on_bad_lines="skip").
    """
    # Order matters: strict encodings first, the accept-anything one last.
    encodings = ["utf-8", "latin-1"]

    for encoding in encodings:
        try:
            return pd.read_csv(file_path, encoding=encoding, engine="python", on_bad_lines="skip")
        except Exception as e:
            # Deliberately broad: any failure just moves on to the next encoding.
            print(f"⚠️ Failed to read {file_path} with {encoding}: {e}")

    print(f"🚨 Skipping {file_path} - Unable to read with known encodings.")
    return None  # every attempt failed


In [None]:
# Function to load tweets from multiple CSV files in a folder
def load_tweets_from_csv_folder(folder_path):
    """Collect tweet records from every ``.csv`` file directly inside a folder.

    Files are visited in sorted name order so that the resulting record order
    (and anything saved from it) is the same on every run; os.listdir() alone
    returns entries in arbitrary order.

    Args:
        folder_path: Directory containing the CSV files.

    Returns:
        A list of dicts with the keys ``"text"`` (taken from the CSV's
        ``description`` column) and ``"user_location"`` (empty string for files
        that have no such column). Files that cannot be read, or that have no
        ``description`` column, are skipped with a printed message.
    """
    all_tweets = []

    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".csv"):
            continue

        file_path = os.path.join(folder_path, filename)
        df = load_csv_safely(file_path)  # tries several encodings
        if df is None:
            continue  # unreadable file; load_csv_safely already reported it

        # The tweet text is expected in the 'description' column.
        if "description" not in df.columns:
            print(f"Skipping {filename} - No 'description' column found. Available columns: {df.columns.tolist()}")
            continue

        # Files without location data still contribute rows, with a blank location.
        if "user_location" not in df.columns:
            df["user_location"] = ""

        # Keep only the needed columns and rename for consistency with the tar source.
        tweets = df[["description", "user_location"]].rename(columns={"description": "text"}).to_dict(orient="records")
        all_tweets.extend(tweets)

    return all_tweets

In [None]:
# Function to check if a location is in Alaska (Handles multiple variations)
def is_in_alaska(location):
    """Return True when a free-text location string mentions Alaska.

    The check is a case-insensitive regex search for the state name, the "AK"
    abbreviation as a standalone word, or one of a few city names. Anything
    that is not a string (None, NaN, numbers) is treated as "not Alaska".
    """
    if not isinstance(location, str):
        return False

    alaska_patterns = (
        r"\bAlaska\b",            # state name as a whole word
        r"\bAK\b",                # postal abbreviation as a whole word
        r"Anchorage",             # major cities
        r"Sitka",
        r"Juneau",
        r"Fairbanks",
        r"\bAK, USA\b",           # common "state, country" spellings
        r"\bAlaska, USA\b",
        r"\bAK,\s*\w{2,}\b",      # e.g. "AK, US" or "AK, Canada"
    )

    for candidate in alaska_patterns:
        if re.search(candidate, location, re.IGNORECASE) is not None:
            return True
    return False


In [None]:
# Function to classify as rural or urban based on location keywords
def classify_rural_urban(location):
    """Label a location string as "Rural", "Urban" or "Unknown".

    The label comes from a case-insensitive substring match against two small
    example lists of place names. Rural names are checked first, so a string
    mentioning both kinds is labelled "Rural". Non-string input is "Unknown".
    """
    if not isinstance(location, str):
        return "Unknown"

    rural_areas = {"Bethel", "Nome", "Barrow", "Kotzebue", "Wrangell"}  # example rural places
    urban_areas = {"Anchorage", "Fairbanks", "Juneau", "Sitka"}  # example urban places

    lowered = location.lower()

    def mentions_any(place_names):
        return any(place.lower() in lowered for place in place_names)

    if mentions_any(rural_areas):
        return "Rural"
    if mentions_any(urban_areas):
        return "Urban"
    return "Unknown"

In [None]:
# Function to truncate long tweets to 512 tokens
def truncate_text(text):
    """Shorten text so that it encodes to at most 512 token ids.

    The text is encoded with the module-level ``tokenizer`` (truncating at
    max_length=512) and decoded back to a string with special tokens removed.
    """
    token_ids = tokenizer.encode(text, max_length=512, truncation=True)
    shortened = tokenizer.decode(token_ids, skip_special_tokens=True)
    return shortened

In [None]:
# Function to process tweets
def process_tweets(file_path, data_source="tar"):
    """Load tweets, keep the ones located in Alaska, and score their sentiment.

    Args:
        file_path: Path to the tar.gz archive (data_source="tar") or to the
            folder of CSV files (data_source="csv").
        data_source: Either "tar" or "csv"; selects the loader.

    Returns:
        A DataFrame with the columns "text", "location", "rural_urban" and
        "sentiment" -- one row per Alaska tweet. The frame is empty (but still
        has these columns) when nothing was loaded or nothing matched.

    Raises:
        ValueError: If data_source is neither "tar" nor "csv".
    """
    if data_source == "tar":
        tweets = extract_tweets_from_tar(file_path)
    elif data_source == "csv":
        tweets = load_tweets_from_csv_folder(file_path)
    else:
        raise ValueError("Invalid data source. Use 'tar' or 'csv'.")

    # Fixed column set so that an empty result has the same shape as a full one.
    result_columns = ["text", "location", "rural_urban", "sentiment"]

    if not tweets:
        print("No valid tweets found. Exiting...")
        return pd.DataFrame(columns=result_columns)

    alaska_tweets = []

    for tweet in tweets:
        text = tweet.get("text", "")
        # NOTE(review): this expects a flat "user_location" key; raw Twitter API
        # JSON usually nests it under tweet["user"]["location"] -- confirm the
        # tar data really uses the flat key.
        location = tweet.get("user_location", "")

        # CSV rows without a description arrive as NaN (a float), which the
        # tokenizer cannot encode; blank text has no sentiment to measure.
        if not isinstance(text, str) or not text.strip():
            continue

        if not is_in_alaska(location):
            continue

        rural_urban = classify_rural_urban(location)

        # Truncate text to avoid a model error on over-long input.
        truncated_text = truncate_text(text)

        # Raw model label, e.g. "LABEL_0" / "LABEL_1" / "LABEL_2"
        # (presumably negative / neutral / positive per the model card -- confirm).
        sentiment = classifier(truncated_text)[0]["label"]

        alaska_tweets.append({
            "text": truncated_text,
            "location": location,
            "rural_urban": rural_urban,
            "sentiment": sentiment
        })

    return pd.DataFrame(alaska_tweets, columns=result_columns)

In [None]:
# ⚡ SWITCH DATASET HERE: "csv" reads the CSV folder, "tar" reads the tar.gz archive
data_source = "csv"
#data_source = "tar"

# Pick the input location that belongs to the chosen source
if data_source == "csv":
    file_path = tweets_csv_folder
else:
    file_path = tweets_tar_path

# Load, filter to Alaska, and score sentiment
df = process_tweets(file_path, data_source)

# Persist the results to Drive only when there is something to save
if df.empty:
    print("No valid tweets were found. No file was saved.")
else:
    output_path = f"/content/drive/MyDrive/alaska_vaccine_project/tweets_sentiment_{data_source}.csv"
    df.to_csv(output_path, index=False)
    print(f"Analysis complete. Results saved to: {output_path}")

# Preview the first rows as the cell's displayed output
df.head()

⚠️ Failed to read /content/drive/MyDrive/alaska_vaccine_project/tweets/csv_tweets/vaccine 05.csv with utf-8: 'utf-8' codec can't decode bytes in position 6875-6876: invalid continuation byte
⚠️ Failed to read /content/drive/MyDrive/alaska_vaccine_project/tweets/csv_tweets/vaccine 08.csv with utf-8: 'utf-8' codec can't decode byte 0xcc in position 4811: invalid continuation byte
⚠️ Failed to read /content/drive/MyDrive/alaska_vaccine_project/tweets/csv_tweets/vaccine 09.csv with utf-8: 'utf-8' codec can't decode byte 0xeb in position 5130: invalid continuation byte
⚠️ Failed to read /content/drive/MyDrive/alaska_vaccine_project/tweets/csv_tweets/vaccine 11.csv with utf-8: 'utf-8' codec can't decode byte 0xe2 in position 331: invalid continuation byte
⚠️ Failed to read /content/drive/MyDrive/alaska_vaccine_project/tweets/csv_tweets/vaccine 10.csv with utf-8: 'utf-8' codec can't decode bytes in position 1525-1526: invalid continuation byte
⚠️ Failed to read /content/drive/MyDrive/alaska_v

Unnamed: 0,text,location,rural_urban,sentiment
0,BREAKING REPORT: Lawmaker Introduces Bill to F...,"Alaska, USA",Unknown,LABEL_1
1,A bit of a rant. Sick of the division. politi...,Alaska,Unknown,LABEL_0
2,The COVID vaccine isn\'t the only shot out the...,Alaska,Unknown,LABEL_1
3,BREAKING: The U.S. administered nearly TEN MIL...,"Anchorage,Alaska",Urban,LABEL_2
4,I’d say 441M vaccine doses in US is “something...,Alaska,Unknown,LABEL_1
