In [5]:
from sentence_transformers import SentenceTransformer
import pandas as pd
from nltk.corpus import stopwords
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from affinity.load_data.scraper import scrape, scraper


In [7]:
# We need the labels that we want to predict (Wei has the list)
# convert these categories into vectors so we can compare them to the outcome from the scraped text

categories = ["sports","travel","fashion"]

## import & clean data

In [7]:
sample

Unnamed: 0,Site / App ID,Unnamed: 3
176,exchangeandmart.co.uk,amazon.co.uk
522,tatler.com,gardenersworld.com
580,washingtonpost.com,motherandbaby.co.uk
154,banburycake.co.uk,newsweek.com
711,,accringtonobserver.co.uk
593,whattoexpect.com,barcablaugranes.com
579,wired.com,edinburghlive.co.uk
253,foreverwestham.com,skysports.com
442,new-magazine.co.uk,encyclopedia.com
371,techhive.com,viamichelin.com


In [10]:
#import data (scraping 100 urls) > text



data = pd.read_csv("/Users/martafillolbruguera/code/affinity_at_scale/data/urls.csv", sep=",")
data = data.drop(columns=["Unnamed: 1","Unnamed: 2"])
sample = data.sample(frac=0.05)

def scrape_with_exc(url):
    url = "https://" + url
    try:
        text = scraper(url)
    except:
        text = ""
    return text

sample["Texts"] = sample["Unnamed: 3"].apply(scrape_with_exc)
sample

  scraped_data = pd.Series(scraped_data)


Unnamed: 0,Site / App ID,Unnamed: 3,Texts
462,sportsmole.co.uk,thisismoney.co.uk,Could a future government really be tempted to...
548,uncut.co.uk,closeronline.co.uk,
50,football365.com,90min.com,key learnings from England 's first Euro warm ...
249,deliaonline.com,talksport.com,Live Radio Breaking Sports News Opinion talkSP...
556,olivemagazine.com,cosmopolitan.co.uk,
331,lse.co.uk,englishclub.com,
169,diydata.com,gamefaqs.com,We 've detected unusual traffic from your curr...
264,fleetnews.co.uk,hamhigh.co.uk,Uniqlo to open new multi storey flagship store...
469,thisislancashire.co.uk,countryliving.com,Yes Cottagecore is Still Very Much a Thing The...
267,barkinganddagenhampost.co.uk,whoscored.com,


In [13]:
clean_texts = sample.loc[sample.Texts != ""]

In [15]:
#cleaning

def clean (text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, ' ') # Remove Punctuation
    lowercased = text.lower() # Lower Case
    tokenized = word_tokenize(lowercased) # Tokenize
    words_only = [word for word in tokenized if word.isalpha()] # Remove numbers
    stop_words = set(stopwords.words('english')) # Make stopword list
    without_stopwords = [word for word in words_only if not word in stop_words] # Remove Stop Words
    lemma=WordNetLemmatizer() # Initiate Lemmatizer
    lemmatized = [lemma.lemmatize(word) for word in without_stopwords] # Lemmatize
    cleaned = ' '.join(lemmatized) # Join back to a string
    return cleaned

# Apply to all texts
clean_texts['clean_text'] = clean_texts.Texts.apply(clean)

clean_texts.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_texts['clean_text'] = clean_texts.Texts.apply(clean)


Unnamed: 0,Site / App ID,Unnamed: 3,Texts,clean_text
462,sportsmole.co.uk,thisismoney.co.uk,Could a future government really be tempted to...,could future government really tempted cap isa...
50,football365.com,90min.com,key learnings from England 's first Euro warm ...,key learning england first euro warm win kylia...
249,deliaonline.com,talksport.com,Live Radio Breaking Sports News Opinion talkSP...,live radio breaking sport news opinion talkspo...
169,diydata.com,gamefaqs.com,We 've detected unusual traffic from your curr...,detected unusual traffic current system blocke...
264,fleetnews.co.uk,hamhigh.co.uk,Uniqlo to open new multi storey flagship store...,uniqlo open new multi storey flagship store ki...


## LDA model

In [19]:
#LDA MODEL

vectorizer = CountVectorizer()
data_vectorized = vectorizer.fit_transform(clean_texts['clean_text'])
lda_model = LatentDirichletAllocation(n_components=5)
lda_vectors = lda_model.fit_transform(data_vectorized)

In [20]:
def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names_out()[i], topic[i])
                        for i in topic.argsort()[:-10 - 1:-1]])
print_topics(lda_model, vectorizer)

Topic 0:
[('england', 29.20079229152426), ('real', 26.200822289782156), ('madrid', 26.19999978584117), ('league', 21.200194604970367), ('mining', 21.19999957402673), ('euro', 20.200183438148088), ('man', 19.200601581102354), ('chelsea', 18.20019735426378), ('champion', 17.200367212496655), ('new', 16.200250197401584)]
Topic 1:
[('best', 24.202491395862335), ('gift', 14.200656426202922), ('day', 10.201074026228389), ('idea', 9.201166964522123), ('summer', 8.20096089338542), ('beach', 7.199999768034039), ('quote', 7.199999759694797), ('get', 6.200343291223468), ('hiv', 6.1999996979944605), ('medicinenet', 6.1999996979944605)]
Topic 2:
[('year', 43.200543849837615), ('new', 27.199927439275292), ('time', 25.20052587190709), ('future', 24.20032927234312), ('product', 24.200221665474036), ('money', 23.20014853938362), ('life', 21.200271727165024), ('goodtoknow', 21.199999831740687), ('expert', 20.200138145561244), ('site', 19.20090162165813)]
Topic 3:
[('new', 18.20023135731435), ('local', 1

In [22]:
#apply the model on new data


new_data = ["This text is about woman in England"]

new_data_vectorized = vectorizer.transform(new_data)
lda_vectors = lda_model.transform(new_data_vectorized)

print("topic 0 :", lda_vectors[0][0])
print("topic 1 :", lda_vectors[0][1])

# We have the base lda - unsupervised
# Kmeans to determine number of clusters in our data
# Update LDA number of clusters to the relevant one


topic 0 : 0.7292821490918172
topic 1 : 0.06685238726749793


## pre trained model

In [1]:
#Load pre trained model
model = SentenceTransformer('all-mpnet-base-v2')

  from tqdm.autonotebook import tqdm, trange


(768,)


In [5]:
#embedding categories we want to vectorize to be able to compare to our audience

sports = model.encode("sports for young men and children")
fashion = model.encode("fashion")

#content from scraping urls

embeddings_football = model.encode("I want to start playing football")
embeddings_fashion = model.encode("I need a new outfit for the party")

In [8]:
#compare the categories (interests from audience vs topics detected from text)
fashion @ embeddings_fashion

0.33317417

In [12]:
topic1 = model.encode('edu game team line hockey')
topic1 @ sports

0.34371024

In [14]:
#creates dataframe with text and embedding. ¿¿Why do we need this??


df = pd.DataFrame({"texts": ["New text", "We need more sport"]})
def embed(text):
    embedding = model.encode(text)
    return embedding

df["embedding"] = df.texts.apply(embed)

In [15]:
df

Unnamed: 0,texts,embedding
0,New text,"[0.030395228, -0.011221863, 0.017383024, 0.001..."
1,We need more sport,"[-0.011357048, 0.11302859, -0.029605003, 0.040..."
