In [2]:
# Example of Urls
# url = 'https://medium.com/@getgoing.ca/the-ultimate-guide-to-choosing-the-perfect-car-19747eec95e'
url = 'https://datascience.codata.org/'
url ='https://www.vogue.com/fashion'

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import spacy
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load Spacy model
nlp = spacy.load("en_core_web_sm")

# Function to scrape text data from a given URL
def scrape(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    scraped_data = []
    html_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']
    for tag in html_tags:
        elements = soup.find_all(tag)
        texts = [element.text.strip() for element in elements if len(element.text.strip()) > 30]
        scraped_data.extend(texts)
    scraped_data = pd.Series(scraped_data)
    return scraped_data

# Function to clean the scraped text data
def text_cleaner(text):
    serie_joined = ' '.join(text)
    doc = nlp(serie_joined)
    clean_words = []
    for each in doc:
        if each.is_digit or each.like_url or each.like_email or each.is_punct:
            continue
        else:
            clean_words.append(each)
    return clean_words

# Wrapper function to scrape and clean text data from a given URL
def scraper(url):
    scr = scrape(url)
    clt = text_cleaner(scr)
    words = [each.text for each in clt]
    return ' '.join(words)

# Function to process multiple URLs
def scraper_results(url_inputs):
    results = []
    for url in url_inputs:
        result = scraper(url)
        results.append(result)
    return results

# Load sentence transformer model
model = SentenceTransformer('all-mpnet-base-v2')

# Function to embed text using the model
def embed(text):
    embedding = model.encode(text)
    return embedding

# List of target interests
targets = [
    "Fashion", "Pets", "Cooking", "Fitness", "Movies", "Gaming", "Travel",
    "Cars and automobiles", "Outdoor activities", "Books", "Finance and investments",
    "Business and entrepreneurship", "Photography", "Art", "Social causes and activism",
    "Health and wellness", "Gardening", "Technology", "Education and learning",
    "Sports", "Nature", "History", "Parenting and family", "Music",
    "Food and dining", "DIY and crafts", "Beauty", "Science", "Politics"
]

# Create DataFrame of interests and their embeddings
interests = pd.DataFrame({"interest": targets})
interests["embedding"] = interests.interest.apply(embed)

# Scrape and embed text data from the given URL
text_embedding = embed(scraper(url))

# Calculate cosine similarity between the text embedding and interest embeddings
similarities = interests['embedding'].apply(lambda x: cosine_similarity([x], [text_embedding])[0][0])

# Find the best fit interest
best_fit_index = similarities.idxmax()
best_fit_interest = interests.loc[best_fit_index, 'interest']

# Print the best fit interest
print(f"The best fit interest is: {best_fit_interest}")

# Load additional datasets
interests_df = pd.read_csv('/Users/poloniki/code/marta/example_project/data_v2.csv')
social_media_data =  ds/SocialMediaUsersDataset.csv')
clusters = pd.read_csv('/Users/poloniki/code/marta/example_project/data_v2.csv')

# Filter data by best fit interest
filter_by_interest = clusters.loc[clusters.Interests.str.contains(best_fit_interest)]

# Find the cluster with the most occurrences of the best fit interest
best_cluster = filter_by_interest.cluster.value_counts().index[0]

# Calculate the average age of the best cluster
avg_age_of_cluster = clusters.loc[clusters.cluster == best_cluster].Age.mean()

# Get data of the best cluster
best_cluster_df = clusters.loc[clusters.cluster == best_cluster]

# Evaluate the interests in the best cluster
best_cluster_df['Interests_eval'] = best_cluster_df.Interests.apply(lambda x: list(eval(x) if ',' in x else [x]))

# Find the top 5 other interests in the best cluster
top_5_other_interests = best_cluster_df.explode('Interests_eval').Interests_eval.value_counts()[1:5]


In [4]:
# Output the best fit interest, average age, and top 5 other interests
print(best_fit_interest, int(avg_age_of_cluster), top_5_other_interests)


Fashion 55 Interests_eval
Outdoor activities    1907
Movies                1906
DIY and crafts        1884
Fitness               1874
Name: count, dtype: int64


# Code Explanation

## Data Scraping and Cleaning

1. **Scrape Data from URL:**
   - The `scrape(url)` function collects text data from the given URL. It looks for text inside specific HTML tags like `h1`, `h2`, `p`, etc.
   
2. **Clean the Scraped Text:**
   - The `text_cleaner(text)` function cleans the collected text by removing numbers, URLs, emails, and punctuation marks. This helps in keeping only the meaningful words.

3. **Combine Scraping and Cleaning:**
   - The `scraper(url)` function uses both the `scrape(url)` and `text_cleaner(text)` functions to get clean text data from the URL.
   - The `scraper_results(url_inputs)` function processes multiple URLs by applying the `scraper(url)` function to each one.

## Embedding and Similarity Calculation

### What is an Embedding?
- An embedding is a way to convert text into numbers so that we can use it with machine learning models. Here, we use the `SentenceTransformer` model to create embeddings for our text.

1. **Load Sentence Transformer Model:**
   - The `SentenceTransformer` model is used to convert the text data into embeddings (numerical representations).

2. **Create Embeddings:**
   - The `embed(text)` function generates an embedding for the given text using the model.

3. **List of Target Interests:**
   - We have a list of interests (like Fashion, Pets, Cooking, etc.) and we create embeddings for each interest.

4. **Calculate Similarity:**
   - We calculate the similarity between the text embedding (from the URL) and the interest embeddings. This helps us find the interest that best matches the text data.

## Cluster Analysis

1. **Load Additional Data:**
   - We load datasets that Wei created, which have user profiles and their retrospective clusters.

2. **Filter Data by Best Fit Interest:**
   - We filter the dataset to find users who have an interest that matches our best fit interest.

3. **Identify the Best Cluster:**
   - We find the cluster with the most occurrences of the best fit interest.

4. **Calculate Average Age:**
   - We calculate the average age of users in the identified cluster.

5. **Find Top 5 Other Interests:**
   - We look at the top 5 other interests that users in the best cluster have.

## Output

- The code outputs the best fit interest, the average age of users in the best cluster, and the top 5 other interests in that cluster.
