# Estimating the Audience of a Website Using Artificial Intelligence

### By Mauricio Toro, PhD

### https://www.linkedin.com/in/mauricio-toro-phd/

### http://www.github.com/mauriciotoro

## Install the required libraries

In [27]:
!pip3 install --quiet boilerpy3 tabulate nltk transformers

## Import the libraries and define helper functions

In [24]:
from boilerpy3 import extractors
import requests

from transformers import pipeline
from transformers import BartTokenizer, BartForConditionalGeneration

import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer

from tabulate import tabulate
from IPython.display import display, HTML

In [15]:
# Download NLTK data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mauriciotoro/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mauriciotoro/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [16]:
# 1. Extract text from URL

def extract_text_from_url(url):
    # Send a request to the website
    response = requests.get(url)
    response.raise_for_status()  # Check that the request was successful
    
    # Extract main content using boilerpy3
    extractor = extractors.ArticleExtractor()
    content = extractor.get_content(response.text)
    
    return content

In [17]:
# 2. Text Summarization

# Load pre-trained BART model and tokenizer
model_name = 'facebook/bart-large-cnn'
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

def summarize_text(text, max_length):
    inputs = tokenizer.batch_encode_plus([text], max_length=max_length, truncation=True, return_tensors='pt')
    summary_ids = model.generate(inputs['input_ids'], num_beams=4, min_length=30, max_length=max_length, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Function to summarize long texts
def summarize_long_text(long_text, max_length = 130, chunk_size=512, long = False):
    text_chunks = [long_text[i:i+chunk_size] for i in range(0, len(long_text), chunk_size)]
    summaries = [summarize_text(chunk,max_length) for chunk in text_chunks]
    if long:
        return '. '.join(summaries)
    else:    
        return summarize_text('. '.join(summaries),max_length)

In [18]:
# 3. Emotion Analysis

# Load the pre-trained emotion detection model and pipeline
emotion_pipeline = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base")

def detect_emotion(text, chunk_size=512):
    # Split the text into chunks
    text_chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
    
    # Detect emotion for each chunk
    emotions = []
    for chunk in text_chunks:
        result = emotion_pipeline(chunk)
        # Get the emotion label with the highest score for each chunk
        emotion_label = max(result, key=lambda x: x['score'])['label']
        emotions.append(emotion_label)
    
    return emotions

def get_most_common_emotion(emotions):
    # Count the occurrences of each emotion
    emotion_counts = Counter(emotions)
    # Get the most common emotion
    most_common_emotion = emotion_counts.most_common(1)[0][0]
    return most_common_emotion

In [19]:
# 4. Zero Shot Classification
def get_top_zero_shot_prediction(text,category_list):
    zero_shot_pipeline = pipeline(model="facebook/bart-large-mnli")
    results = zero_shot_pipeline(plain_text,category_list, multi_label=False)
    return results['labels'][0]

In [20]:
# 5. Get keywords excluding verbs
def get_keywords_from_text(text):
    # Tokenize and POS tagging
    tokens = word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)
    
    # Filter out verbs
    non_verbs = [word for word, pos in pos_tags if pos not in ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')]
    
    # Join non-verbs into a cleaned text
    cleaned_text = ' '.join(non_verbs)
    
    # Vectorize and extract keywords
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform([cleaned_text])
    feature_names = vectorizer.get_feature_names_out()

    # Return the words with the highest TF-IDF scores for the document
    scores = X[0].T.todense().A1
    sorted_items = sorted(zip(scores, feature_names), reverse=True)
    return [word for score, word in sorted_items[:5]]


In [21]:
# 6. Get the audience analysis

# Define the types of audience we want to analyse
audience_types = ["gender", "age", "income", "location"]

def get_audience_analysis(text):
    # Define the possible values for each type of audience
    audience_category_lists = [ ["resonates with men","resonates with women"],\
                          ["resonates with young people","resonates with middle age people", "resonates with old people"],\
                          ["resonates with low income","resonates with medium income", "resonates with high income"],\
                          ["resonates with urban people", "resonates with rural people"],\
                        ]    
    audience_dict = {}
    for index, category_list in enumerate(audience_category_lists):
        # Use a zero shot classifier to obtain the audience for each category
        audience_dict[audience_types[index]] = get_top_zero_shot_prediction(plain_text, category_list)
    return audience_dict 

## Define the URLs of the websites to extract the text

In [22]:
urls = [
    "https://medium.com/@mauriciotorob/streamlining-transaction-categorisation-at-scale-part-2-6f00e8180418",
    "https://www.theguardian.com/technology/article/2024/jul/22/crowdstrike-says-significant-number-of-devices-back-online-after-global-outage",
    "https://www.estherperel.com/focus-on-categories/infidelity",
    "https://www.bbc.co.uk/news/articles/cn078lznklxo",
    "https://www.safecosmetics.org/resources/safe-cosmetics-tips/"
]

## Generate summary, emotions, keywords and audience analysis for all the websites in the list

In [28]:
# Initialize an empty list to store the data
data = []

# Process each URL
for url in urls:
    # Extract and process data
    plain_text = extract_text_from_url(url)
    summary = summarize_long_text(plain_text)
    emotions = detect_emotion(plain_text)
    most_common_emotion = get_most_common_emotion(emotions)
    keywords = get_keywords_from_text(plain_text)
    audience_dict = get_audience_analysis(plain_text)
    
    # Create a row of data
    row = {
        'url': url,
        'summary': summary,
        'emotions': ", ".join(emotions),
        'keywords': ", ".join(keywords),
        'most_common_emotion': most_common_emotion
    }
    
    # Add audience predictions to the row
    for audience_type in audience_types:
        row[audience_type] = audience_dict.get(audience_type, '')
    
    # Append the row to the data list
    data.append(row)

# Create a dataframe from the data
df = pd.DataFrame(data)

# Save the dataframe to a CSV file (optional)
df.to_csv('output.csv', index=False)


## Display a table with the results

In [29]:
# Display the dataframe as a table
table_html = tabulate(df, headers='keys', tablefmt='html', showindex=False)

# Use IPython.display to render the table
display(HTML(table_html))

url,summary,emotions,keywords,most_common_emotion,gender,age,income,location
https://medium.com/@mauriciotorob/streamlining-transaction-categorisation-at-scale-part-2-6f00e8180418,"In the first part of our blog, we talked about the user-centric approach Cheddar adopted to enhance its Personal Finance Manager (PFM) This involved mapping transaction types to spending categories, ensuring the system was intuitive and useful for users. As we move into the second part, we will explore the processes of data cleaning and mappings.","neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral, neutral","merchant, data, banks, transaction, scientists",neutral,resonates with men,resonates with old people,resonates with medium income,resonates with urban people
https://www.theguardian.com/technology/article/2024/jul/22/crowdstrike-says-significant-number-of-devices-back-online-after-global-outage,"CrowdStrike says significant number of devices back online after global outage. But experts says full recovery from Friday's IT failure could take weeks. Thousands of flights were cancelled, broadcasters were forced off air and millions of PCs failed to start after a CrowdStrike software update.","neutral, sadness, neutral, sadness, sadness, sadness, sadness","crowdstrike, number, friday, devices, company",sadness,resonates with women,resonates with young people,resonates with medium income,resonates with urban people
https://www.estherperel.com/focus-on-categories/infidelity,Infidelity is often the first time couples broach conversations they've avoided for years. I prefer to use infidelity as a portal into the complex nature of love. Get clarity on your most intimate romantic relationships.,"neutral, surprise","relationships, infidelity, conversations, sex, life",neutral,resonates with women,resonates with young people,resonates with medium income,resonates with rural people
https://www.bbc.co.uk/news/articles/cn078lznklxo,"Village has 'desperate need' for new medical centre near Norwich. Almost 4,000 more homes have been proposed for Rackheath. Construction work will begin on 5 August and should be complete in spring.","fear, joy, joy, neutral","centre, medical, village, rackheath, 000",joy,resonates with men,resonates with middle age people,resonates with medium income,resonates with rural people
https://www.safecosmetics.org/resources/safe-cosmetics-tips/,Choose products with simpler ingredient lists and hidden ‘fragrance’ ingredients. Avoid products with ‘parfum’ on the label which can hide any number of chemicals. Use Tools like Clearya and Think Dirty to find out whether your go-to products are safe.,"neutral, neutral, neutral, neutral, neutral","products, ingredients, safe, toxic, care",neutral,resonates with women,resonates with young people,resonates with medium income,resonates with urban people
