* A good way to assess how accessible each post is for people with special needs is to rate it with a percentage that expresses its level of accessibility.

* This notebook therefore assigns weights to the different kinds of problems a description may have, so that we can tell how accessible each post is.

In [None]:
# Run the first 2 cells to import or install any necessary dependencies
!pip install emoji

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from transformers import BertForSequenceClassification, BertTokenizer
import torch
import pandas as pd
from collections import Counter
import ast
from transformers import pipeline
import requests
import regex as re
import unicodedata
import emoji

In [None]:
# Upload the CSV to Colab, then right-click the file, select "Copy path" and
# paste the copied path below

csv_url = "the CSV file"
df = pd.read_csv(csv_url)

# Create a folder named "LLMHashtag" and upload inside it the files for the
# parameters of the Hashtag LLM
# NOTE(review): "hashtag LLM" below looks like a placeholder for the path of
# that folder -- confirm and replace it before running.

model = BertForSequenceClassification.from_pretrained("hashtag LLM")
tokenizer = BertTokenizer.from_pretrained("hashtag LLM")
# Run the model on the GPU when CUDA is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Text-classification pipeline around the hashtag model; it is called once
# per hashtag by score_hashtags_from_list.
classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    # 0 when CUDA is available, -1 otherwise (GPU index vs. CPU in the
    # transformers pipeline API)
    device=0 if torch.cuda.is_available() else -1
)

# Label mapping: translates the label strings returned by the classifier into
# the names compared against in score_hashtags_from_list
label_map = {"LABEL_0": "Inaccessible", "LABEL_1": "Accessible"}



# Scoring Constants. These constants can give different importance on each topic
# about the posts being accessible.
# Each scoring function below returns one of these values, and the per-post
# total is the plain sum of the six scores, so with the values below the
# highest possible total is 4 + 5 + 5 + 3 + 1 + 1 = 19.

# Alt-text related (returned by score_alt_text)
SCORE_IF_ALT_TEXT_EMPTY = 0
SCORE_IF_ALT_TEXT_IS_AI_GENERATED = 1
SCORE_IF_ALT_TEXT_IS_CUSTOM_MADE = 4

# Hashtag related
# Per-hashtag values; score_hashtags_from_list averages them over a post's tags
SCORE_IF_HASHTAG_IS_ACCESSIBLE = 5
SCORE_IF_HASHTAG_IS_INACCESSIBLE = 0
SCORE_IF_THERE_IS_NO_HASHTAG = 0
# Returned by score_hashtags_last_line
SCORE_HASHTAGS_IN_LAST_LINE = 5
SCORE_HASHTAGS_NOT_IN_LAST_LINE = 0
# Returned by score_hashtag_count_limit
SCORE_HASHTAG_COUNT_OK = 3
SCORE_HASHTAG_COUNT_TOO_MANY = 0

# Font related (returned by is_fancy_font)
SCORE_IF_FONT_IS_SIMPLE = 1
SCORE_IF_FONT_IS_FANCY = 0

# Emoji related (returned by has_excessive_emojis)
SCORE_IF_NUMBER_OF_EMOJIS_IS_ACCEPTABLE = 1
SCORE_IF_NUMBER_OF_EMOJIS_IS_NOT_ACCEPTABLE = 0


# This function uses the LLM to check if each hashtag is accessible or not
def score_hashtags_from_list(row):
    """Return the average accessibility score of the hashtags in *row*.

    Args:
        row: String holding a Python list literal of hashtags,
            e.g. "['#BlackLivesMatter', '#blm']".

    Returns:
        SCORE_IF_THERE_IS_NO_HASHTAG when *row* cannot be parsed, is not a
        list, or is an empty list. Otherwise the mean of the per-hashtag
        scores (SCORE_IF_HASHTAG_IS_ACCESSIBLE or
        SCORE_IF_HASHTAG_IS_INACCESSIBLE), rounded to 2 decimals.

    Raises:
        KeyError: If the classifier returns a label missing from label_map.
    """
    try:
        hashtags = ast.literal_eval(row)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Only the errors literal_eval raises on malformed input are treated
        # as "no hashtags"; a bare except would also swallow
        # KeyboardInterrupt and SystemExit.
        return SCORE_IF_THERE_IS_NO_HASHTAG

    if not isinstance(hashtags, list) or not hashtags:
        return SCORE_IF_THERE_IS_NO_HASHTAG

    scores = []
    for tag in hashtags:
        # The pipeline returns a list with one result dict for a single input
        result = classifier(tag)[0]
        label = label_map[result["label"]]
        scores.append(
            SCORE_IF_HASHTAG_IS_ACCESSIBLE
            if label == "Accessible"
            else SCORE_IF_HASHTAG_IS_INACCESSIBLE
        )

    return round(sum(scores) / len(scores), 2)

# This function classifies an alt text as empty, AI-generated or custom-made
def score_alt_text(alt_text):
    """Return the alt-text score for a single post.

    Args:
        alt_text: The post's alt text; may be missing (NaN/None).

    Returns:
        SCORE_IF_ALT_TEXT_EMPTY when the value is missing or only whitespace,
        SCORE_IF_ALT_TEXT_IS_AI_GENERATED when it contains the phrase
        "may be" (case-insensitive), which this notebook treats as the marker
        of an automatically generated description, and
        SCORE_IF_ALT_TEXT_IS_CUSTOM_MADE otherwise.
    """
    if pd.isna(alt_text):
        return SCORE_IF_ALT_TEXT_EMPTY

    content = str(alt_text)
    if not content.strip():
        return SCORE_IF_ALT_TEXT_EMPTY

    if "may be" in content.lower():
        return SCORE_IF_ALT_TEXT_IS_AI_GENERATED
    return SCORE_IF_ALT_TEXT_IS_CUSTOM_MADE

# This function checks if the description contains characters from a font
# other than the default one
def is_fancy_font(text):
    """Return the font score of a description.

    A character counts as "fancy" when its Unicode name contains one of the
    style markers below (e.g. MATHEMATICAL BOLD CAPITAL A, FULLWIDTH LATIN
    SMALL LETTER A, CIRCLED LATIN CAPITAL LETTER A).

    Args:
        text: The post description. Anything that is not a str (including
            NaN/None) is scored as simple.

    Returns:
        SCORE_IF_FONT_IS_FANCY if at least one fancy character is found,
        SCORE_IF_FONT_IS_SIMPLE otherwise.
    """
    if not isinstance(text, str):
        return SCORE_IF_FONT_IS_SIMPLE

    # Markers looked for in the Unicode name of alphabetic characters
    letter_markers = (
        "MATHEMATICAL",
        "FULLWIDTH",
        "DOUBLE-STRUCK",
        "CIRCLED",
        "SQUARED",
        "MONOSPACE",
        "FRAKTUR",
        "SCRIPT",
    )
    # Circled and squared Latin letters are symbols (category So), so
    # str.isalpha() is False for them and the letter-only check never sees
    # them. They are matched separately and restricted to "... LATIN" names so
    # that circled/squared signs such as CIRCLED PLUS or SQUARED OK are not
    # flagged.
    # NOTE(review): a few of these code points (e.g. U+24C2, U+1F170) are
    # also used as emoji -- confirm they should count as fancy lettering.
    enclosed_letter_markers = ("CIRCLED LATIN", "SQUARED LATIN")

    for char in text:
        name = unicodedata.name(char, "")
        if not name:
            continue
        markers = letter_markers if char.isalpha() else enclosed_letter_markers
        if any(marker in name for marker in markers):
            return SCORE_IF_FONT_IS_FANCY

    return SCORE_IF_FONT_IS_SIMPLE

# \X (a feature of the `regex` module) matches one extended grapheme cluster,
# so an emoji made of several code points is normally seen as a single token.
emoji_pattern = re.compile(r'\X', re.UNICODE)

# This function checks if a description has too many emojis in a row
def has_excessive_emojis(text, max_run=3):
    """Return the emoji score of a description.

    Args:
        text: The post description. Anything that is not a str (including
            NaN/None) is scored as acceptable.
        max_run: Longest run of consecutive emojis that is still acceptable.
            Defaults to 3, the limit previously hard-coded here.

    Returns:
        SCORE_IF_NUMBER_OF_EMOJIS_IS_NOT_ACCEPTABLE as soon as more than
        *max_run* emojis appear directly after one another,
        SCORE_IF_NUMBER_OF_EMOJIS_IS_ACCEPTABLE otherwise. Any non-emoji
        token between emojis, including a space, restarts the run.
    """
    if not isinstance(text, str):
        return SCORE_IF_NUMBER_OF_EMOJIS_IS_ACCEPTABLE

    emoji_run = 0
    for token in emoji_pattern.findall(text):
        if not emoji.is_emoji(token):
            emoji_run = 0
            continue
        emoji_run += 1
        if emoji_run > max_run:
            return SCORE_IF_NUMBER_OF_EMOJIS_IS_NOT_ACCEPTABLE
    return SCORE_IF_NUMBER_OF_EMOJIS_IS_ACCEPTABLE

# This function checks whether every hashtag of a description sits on its
# last non-empty line
def score_hashtags_last_line(description: str) -> int:
    """Return the hashtag-placement score of a description.

    Args:
        description: The post description.

    Returns:
        SCORE_HASHTAGS_IN_LAST_LINE when the description is empty, not a
        string, contains no hashtag, or has as many hashtags on its last
        non-empty line as in the whole text;
        SCORE_HASHTAGS_NOT_IN_LAST_LINE otherwise.
    """
    if not description or not isinstance(description, str):
        return SCORE_HASHTAGS_IN_LAST_LINE

    # Keep only lines that still have content after trimming
    non_blank_lines = [
        candidate.strip()
        for candidate in description.strip().splitlines()
        if candidate.strip()
    ]
    total_hashtags = len(re.findall(r"#\w+", description))
    if not non_blank_lines or total_hashtags == 0:
        return SCORE_HASHTAGS_IN_LAST_LINE

    trailing_hashtags = len(re.findall(r"#\w+", non_blank_lines[-1]))
    if trailing_hashtags != total_hashtags:
        return SCORE_HASHTAGS_NOT_IN_LAST_LINE
    return SCORE_HASHTAGS_IN_LAST_LINE

# This function checks the number of hashtags that exist in a description
def score_hashtag_count_limit(description: str, max_hashtags: int = 5) -> int:
    """Return the hashtag-count score of a description.

    Args:
        description: The post description.
        max_hashtags: Largest number of hashtags that is still acceptable.
            Defaults to 5, the limit previously hard-coded here.

    Returns:
        SCORE_HASHTAG_COUNT_OK when the description is empty, not a string,
        or contains at most *max_hashtags* hashtags;
        SCORE_HASHTAG_COUNT_TOO_MANY otherwise.
    """
    if not description or not isinstance(description, str):
        return SCORE_HASHTAG_COUNT_OK

    hashtag_count = len(re.findall(r"#\w+", description))
    if hashtag_count <= max_hashtags:
        return SCORE_HASHTAG_COUNT_OK
    return SCORE_HASHTAG_COUNT_TOO_MANY


# Replace missing values with an empty placeholder before scoring
for column, placeholder in (("alt_text", ""), ("hashtags", "[]"), ("description", "")):
    df[column] = df[column].fillna(placeholder)

# Score column -> (source column, scoring function). The insertion order is
# the order in which the score columns are added to df.
scorers = {
    "alt_text_score": ("alt_text", score_alt_text),
    "hashtag_score": ("hashtags", score_hashtags_from_list),
    "font_style_score": ("description", is_fancy_font),
    "emoji_row_score": ("description", has_excessive_emojis),
    "hashtag_last_line_score": ("description", score_hashtags_last_line),
    "hashtag_count_score": ("description", score_hashtag_count_limit),
}
for score_name, (source_column, scorer) in scorers.items():
    df[score_name] = df[source_column].apply(scorer)

# Overview table: the author of each post followed by its six scores
results = df[["user_posted", *scorers]].copy()
results.head(50)

In [None]:
# Per-post final score: sum of the individual scores (no normalization)

# Columns that contribute to a post's total
score_columns = [
    "alt_text_score",
    "hashtag_score",
    "font_style_score",
    "emoji_row_score",
    "hashtag_last_line_score",
    "hashtag_count_score",
]

# Row-wise sum of the individual scores
df["raw_score_total"] = df[score_columns].sum(axis=1)

# The raw total is used directly as the final score. To rescale so that the
# best post gets 100 instead, divide raw_score_total by its maximum and
# multiply by 100.
df["final_score"] = df["raw_score_total"].round(2)

# Show the first 50 posts (in file order, not sorted by score)
display(df.loc[:, ["user_posted", "final_score"]].head(50))

# Save the per-post scores to CSV
export_columns = ["user_posted", *score_columns, "final_score"]
df[export_columns].to_csv("/content/final_scores_per_post.csv", index=False)


In [None]:
# Per-User Final Score Aggregation and Normalization
# Averages every score per user (only users with at least 10 posts), sums the
# averages and rescales the totals so the best user scores 100.


# Define score columns
score_columns = [
    'alt_text_score',
    'hashtag_score',
    'font_style_score',
    'emoji_row_score',
    'hashtag_last_line_score',
    'hashtag_count_score',
]

# Step 1: Count posts per user
post_counts = df["user_posted"].value_counts().reset_index()
# Name the columns explicitly instead of relying on the names reset_index picks
post_counts.columns = ["user_posted", "post_count"]

# Step 2: Keep only users with ≥10 posts
eligible_users = post_counts[post_counts["post_count"] >= 10]["user_posted"]
df_filtered = df[df["user_posted"].isin(eligible_users)]

# Step 3: Group and calculate per-user average scores
user_scores = df_filtered.groupby("user_posted")[score_columns].mean().reset_index()

# Step 4: Merge with post counts
user_scores = user_scores.merge(post_counts, on="user_posted")

# Step 5: Compute total raw score and normalize
user_scores["raw_score_total"] = user_scores[score_columns].sum(axis=1)
max_score = user_scores["raw_score_total"].max()
if max_score > 0:
    user_scores["final_score"] = (user_scores["raw_score_total"] / max_score * 100).round(2)
else:
    # max_score is 0 when every eligible user scored 0 and NaN when no user
    # is eligible (NaN > 0 is False). Dividing would yield NaN scores, so
    # report 0 instead.
    user_scores["final_score"] = 0.0

# Step 6: Show top results
print(user_scores[["user_posted", "post_count", "final_score"]].sort_values(by="final_score", ascending=False).head(50))

# Step 7: Save to CSV (the same table is written under two file names)
user_scores.to_csv("/content/user_normalized_final_scores.csv", index=False)

user_scores.to_csv("/content/final_scores_per_user.csv", index=False)

