In [None]:
import string
from functools import lru_cache

import matplotlib.pyplot as plt
import pandas as pd
import requests
import xmltodict
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

from affinity.load_data.scraper import scrape, scraper

## Import data: scraping

In [None]:
#sitemap urls extracted from Conde Nast inventory
# Hand-collected sample of weekly sitemap URLs (one `year`/`month`/`week`
# query per entry). NOTE: the next cell rebinds `sitemap_urls` to a generated
# list, so this literal only matters if that cell is skipped.

sitemap_urls = [
    "https://www.houseandgarden.co.uk/sitemap.xml?year=2024&month=6&week=1",
    "https://www.vanityfair.com/sitemap.xml?year=2024&month=6&week=1",
    "https://www.epicurious.com/sitemap.xml?year=2024&month=6&week=1",
    "https://www.epicurious.com/sitemap.xml?year=2024&month=5&week=3",
    "https://www.cntraveller.com/sitemap.xml?year=2024&month=5&week=5",
    "https://www.voguebusiness.com/sitemap.xml?year=2024&month=5&week=5",
    "https://www.voguebusiness.com/sitemap.xml?year=2024&month=5&week=3",
    "https://pitchfork.com/sitemap.xml?year=2024&month=5&week=4",
    "https://www.self.com/sitemap.xml?year=2024&month=6&week=1",
    "https://www.self.com/sitemap.xml?year=2024&month=4&week=3",
    "https://www.worldofinteriors.com/sitemap.xml?year=2024&month=2&week=3",
    "https://www.worldofinteriors.com/sitemap.xml?year=2024&month=5&week=4",
    "https://www.worldofinteriors.com/sitemap.xml?year=2023&month=6&week=2",
    "https://www.architecturaldigest.com/sitemap.xml?year=2024&month=6&week=1",
    "https://www.newyorker.com/sitemap.xml?year=2024&month=6&week=1",
    "https://www.tatler.com/sitemap.xml?year=2024&month=6&week=1"
]

In [None]:
# Sitemap URL templates with placeholders for year, month, and week.
# Each site is listed exactly once so that no sitemap URL is generated
# (and later fetched) more than once.
base_sitemap_urls = [
    "https://www.houseandgarden.co.uk/sitemap.xml?year={year}&month={month}&week={week}",
    "https://www.vanityfair.com/sitemap.xml?year={year}&month={month}&week={week}",
    "https://www.epicurious.com/sitemap.xml?year={year}&month={month}&week={week}",
    "https://www.cntraveller.com/sitemap.xml?year={year}&month={month}&week={week}",
    "https://www.voguebusiness.com/sitemap.xml?year={year}&month={month}&week={week}",
    "https://pitchfork.com/sitemap.xml?year={year}&month={month}&week={week}",
    "https://www.self.com/sitemap.xml?year={year}&month={month}&week={week}",
    "https://www.worldofinteriors.com/sitemap.xml?year={year}&month={month}&week={week}",
    "https://www.architecturaldigest.com/sitemap.xml?year={year}&month={month}&week={week}",
    "https://www.newyorker.com/sitemap.xml?year={year}&month={month}&week={week}",
    "https://www.tatler.com/sitemap.xml?year={year}&month={month}&week={week}"
]

# Ranges the templates are expanded over. Weeks run 1-5 because real
# sitemaps with `week=5` exist (see the hand-collected list), so stopping
# at week 4 would silently miss them.
years = range(2020, 2025)
months = range(1, 13)
weeks = range(1, 6)

# Generate every (year, month, week, site) combination, in that nesting order.
sitemap_urls = [
    base_url.format(year=year, month=month, week=week)
    for year in years
    for month in months
    for week in weeks
    for base_url in base_sitemap_urls
]

# Preview a handful of generated URLs instead of printing thousands of lines.
sitemap_urls[:5]

In [None]:
# Number of candidate sitemap URLs currently held in `sitemap_urls`.
len(sitemap_urls)

In [None]:
#define function for fetching urls for each sitemap

def fetch_sitemap_urls(sitemap_url, timeout=30):
    """Download one sitemap and return the page URLs it lists.

    This is a best-effort helper: any failure (network error, non-200
    response, unparseable XML, unexpected document shape) is reported on
    stdout and results in an empty list rather than an exception, so a
    single bad sitemap never aborts a bulk crawl.

    Args:
        sitemap_url: URL of the sitemap XML document to fetch.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the caller indefinitely.

    Returns:
        A list with the `<loc>` value of every `<url>` entry in the sitemap,
        or an empty list if nothing could be fetched or parsed.
    """
    try:
        response = requests.get(sitemap_url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to fetch {sitemap_url}: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch {sitemap_url}: Status code {response.status_code}")
        return []

    try:
        dict_data = xmltodict.parse(response.content)
    except Exception as exc:  # malformed XML: keep the crawl going
        print(f"Failed to parse {sitemap_url}: {exc}")
        return []

    urlset = dict_data.get('urlset')
    if not isinstance(urlset, dict):
        # Empty <urlset/> or a different root element (e.g. a sitemap index).
        return []

    entries = urlset.get('url') or []
    # xmltodict yields a single dict (not a list) when there is exactly one
    # <url> element; normalise so the comprehension below handles both cases.
    if isinstance(entries, dict):
        entries = [entries]

    return [
        entry['loc']
        for entry in entries
        if isinstance(entry, dict) and 'loc' in entry
    ]

In [None]:
#fetch the urls of every sitemap in the list concurrently (thread pool)

from tqdm import tqdm
import concurrent.futures


def process_sitemap(sitemap_url):
    """Worker executed by the pool: return the URLs listed in one sitemap."""
    return fetch_sitemap_urls(sitemap_url)


with concurrent.futures.ThreadPoolExecutor() as executor:
    # executor.map yields results in input order; tqdm draws the progress bar
    fetched = executor.map(process_sitemap, sitemap_urls)
    results = list(tqdm(fetched, total=len(sitemap_urls)))

# Flatten the per-sitemap lists into a single list of URLs
all_urls = [url for result in results for url in result]

In [None]:
#persist the collected urls as a single-column csv

inventory_urls = pd.DataFrame({"urls": all_urls})
inventory_urls.to_csv(
    "/Users/martafillolbruguera/code/affinity_at_scale/data/inventory_urls.csv",
    index=False,
)

## Clean data

In [None]:
#import inventory_urls to scrape

# Work on a 10% sample of the saved URLs. The fixed random_state makes the
# sample identical after a kernel restart, so downstream results are
# reproducible.
scraped_inventory = pd.read_csv(
    "/Users/martafillolbruguera/code/affinity_at_scale/data/inventory_urls.csv"
).sample(frac=0.1, random_state=42)

# `scraper` (from affinity.load_data.scraper) is applied to each URL and its
# result is stored in the "texts" column.
scraped_inventory["texts"] = scraped_inventory["urls"].apply(scraper)
scraped_inventory

In [None]:
# Keep only the rows whose scraped text is non-empty. `.copy()` makes this an
# independent frame, so adding columns to it later does not raise pandas'
# SettingWithCopyWarning or depend on copy-vs-view behaviour.
clean_texts = scraped_inventory.loc[scraped_inventory.texts != ""].copy()

In [None]:
#clean, tokenize, lemmatize

@lru_cache(maxsize=1)
def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def clean(text):
    """Normalise raw text into a space-joined string of lemmatized tokens.

    Steps, in order: replace ASCII punctuation with spaces, lower-case,
    tokenize, keep alphabetic tokens only, drop English stop words,
    lemmatize, and join the remaining tokens with single spaces.

    Args:
        text: The raw text to clean. Anything that is not a non-empty string
            (e.g. None or NaN from a failed scrape) is treated as empty.

    Returns:
        The cleaned text, or an empty string when there is nothing to clean.
    """
    if not isinstance(text, str) or not text:
        return ''

    # Map every ASCII punctuation character to a space in a single pass.
    punctuation_to_space = str.maketrans(
        string.punctuation, ' ' * len(string.punctuation)
    )
    lowercased = text.translate(punctuation_to_space).lower()

    tokenized = word_tokenize(lowercased)
    words_only = [word for word in tokenized if word.isalpha()]  # drops numbers

    # Cached helper: avoids re-reading the stop-word corpus for every document.
    stop_words = _english_stop_words()
    without_stopwords = [word for word in words_only if word not in stop_words]

    lemma = WordNetLemmatizer()
    lemmatized = [lemma.lemmatize(word) for word in without_stopwords]
    return ' '.join(lemmatized)

# Apply to all texts
# `assign` returns a new frame instead of writing a column into what may be
# a filtered slice of another frame, which avoids SettingWithCopyWarning and
# keeps this cell safe to re-run.
clean_texts = clean_texts.assign(clean_text=clean_texts.texts.apply(clean))

clean_texts.head()

## LDA model

In [None]:
#vectorize
# Fit a CountVectorizer on the cleaned texts and build the document-term
# count matrix (`data_vectorized`) that the LDA cell is fitted on.

vectorizer = CountVectorizer()
data_vectorized = vectorizer.fit_transform(clean_texts['clean_text'])

In [None]:
#kmeans to find the k number of clusters to use for the LDA
import matplotlib.pyplot as plt


# Elbow method: fit KMeans for k = 1..9 on the document-term matrix and
# record the inertia of each fit. A fixed random_state (and explicit n_init)
# makes the curve reproducible between runs.
inertias = []
ks = range(1, 10)
for k in ks:
    km_test = KMeans(n_clusters=k, n_init=10, random_state=42).fit(data_vectorized)
    inertias.append(km_test.inertia_)

plt.plot(ks, inertias)
plt.xlabel('k cluster number')
plt.ylabel('inertia')
plt.title('KMeans elbow curve');

In [None]:
#apply LDA and fit to vectorized data using K means to determine the n of components
# TODO: replace 10 with the k read off the KMeans elbow plot; 10 is only
# scikit-learn's own default for `n_components`, used here as a placeholder.
n_topics = 10
# Fixed random_state so the learned topics are reproducible across runs.
lda_model = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda_vectors = lda_model.fit_transform(data_vectorized)

In [None]:
#print topics

def print_topics(model, vectorizer, n_top_words=10):
    """Print the highest-weighted vocabulary terms of every topic.

    Args:
        model: Fitted topic model exposing `components_`, one row of
            per-term weights per topic.
        vectorizer: Fitted vectorizer exposing `get_feature_names_out()`,
            whose positions line up with the columns of `components_`.
        n_top_words: How many terms to print per topic (default 10).

    Returns:
        None. For each topic, prints a "Topic <idx>:" header followed by a
        list of (term, weight) tuples ordered from heaviest to lightest.
    """
    # Look the vocabulary up once instead of once per printed term.
    feature_names = vectorizer.get_feature_names_out()
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending, so walk it backwards to get the heaviest first.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print([(feature_names[i], topic[i]) for i in top_indices])
# Show the top terms of every topic in the fitted LDA model.
print_topics(lda_model, vectorizer)

In [None]:
#apply the model on new data to test it

new_data = ["Tiaras are the fashion accessory for high society brides,but how to wear them correctly? While elegant brides across the nation plan their upcoming nuptials, Tatler answers your questions on exactly how best to sport the regal accessory."]

new_data_vectorized = vectorizer.transform(new_data) #vectorize first
# NOTE: this rebinds `lda_vectors` (previously the topic matrix of the
# training corpus) to the topic distribution of `new_data`.
lda_vectors = lda_model.transform(new_data_vectorized) #transform using lda model fitted with our inventory_urls

# Print the weight of every topic for the single new document, however many
# topics the model was fitted with, instead of hard-coding topics 0 and 1.
for topic_idx, topic_weight in enumerate(lda_vectors[0]):
    print(f"topic {topic_idx} :", topic_weight)

## Supervised learning with llama

In [None]:
#extract labels from the sitemap topics and use them as targets for supervised learning
#this will give us a prediction probability for each topic, not just the closest topic

## Pretrained Sentence transformer

In [None]:
#Load pre trained model
# Instantiates the 'all-mpnet-base-v2' SentenceTransformer checkpoint.
# NOTE(review): presumably downloaded on first use if not cached — confirm.
model = SentenceTransformer('all-mpnet-base-v2')

In [None]:


# Candidate interest labels. Labels carry no leading/trailing whitespace so
# that they compare equal to the same label typed elsewhere (e.g. in joins
# or look-ups).
targets = [
    "Fashion",
    "Pets",
    "Cooking",
    "Fitness",
    "Movies",
    "Gaming",
    "Travel",
    "Cars and automobiles",
    "Outdoor activities",
    "Books",
    "Finance and investments",
    "Business and entrepreneurship",
    "Photography",
    "Art",
    "Social causes and activism",
    "Health and wellness",
    "Gardening",
    "Technology",
    "Education and learning",
    "Sports",
    "Nature",
    "History",
    "Parenting and family",
    "Music",
    "Food and dining",
    "DIY and crafts",
    "Beauty",
    "Science",
    "Politics",
]

# One row per interest label, in the order listed above.
interests = pd.DataFrame({"interest": targets})
