# Profile-Based Retrieval System

## Model Loading

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import pickle
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import wordpunct_tokenize
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

tfidf = pickle.load(open("model-tuning/tfidf_kaggledataset_scikitlearn-experiment.model", 'rb'))
classifier = pickle.load(open("model-tuning/log_classifier_scikitlearn-experiment.model", 'rb'))

## Users Dataset

In [2]:
from random import randint
import random

labels = ['POLITICS', 'ENTERTAINMENT', 'HEALTH', 'TRAVEL', 'BUSINESS', 'SPORTS', 'SCIENCE']

def generate_user_dataset(n_users, max_preferences, labels):
    users = [] # id, list of preferred categories
    if max_preferences > len(labels):
        max_preferences = len(labels)
    
    for i in range(0, n_users):
        prefs = random.sample(range(0, 7), randint(1, max_preferences))
        prefs_mapped = [labels[k] for k in prefs]
        
        users.append(tuple((i,prefs_mapped)))
    
    return users
    
def extract_users_by_category(users_df, labels):
    labels_users = {label:[] for label in labels}
    
    for user, prefs in users_df:
        for label in prefs:
            labels_users[label]+=[user]
    
    return labels_users

In [3]:
users = generate_user_dataset(20, 5, labels)
users

[(0, ['ENTERTAINMENT', 'HEALTH', 'SPORTS', 'SCIENCE', 'POLITICS']),
 (1, ['BUSINESS']),
 (2, ['POLITICS', 'SPORTS', 'ENTERTAINMENT', 'HEALTH', 'BUSINESS']),
 (3, ['HEALTH', 'BUSINESS', 'ENTERTAINMENT']),
 (4, ['ENTERTAINMENT', 'POLITICS', 'SCIENCE', 'TRAVEL', 'SPORTS']),
 (5, ['BUSINESS']),
 (6, ['ENTERTAINMENT', 'SPORTS', 'SCIENCE']),
 (7, ['SPORTS', 'ENTERTAINMENT', 'HEALTH', 'BUSINESS']),
 (8, ['ENTERTAINMENT', 'TRAVEL', 'SCIENCE']),
 (9, ['SPORTS', 'BUSINESS', 'POLITICS', 'ENTERTAINMENT', 'HEALTH']),
 (10, ['TRAVEL', 'POLITICS', 'BUSINESS', 'SPORTS', 'ENTERTAINMENT']),
 (11, ['ENTERTAINMENT', 'SCIENCE', 'SPORTS', 'POLITICS', 'TRAVEL']),
 (12, ['HEALTH', 'SPORTS', 'BUSINESS']),
 (13, ['TRAVEL', 'HEALTH', 'SCIENCE', 'BUSINESS']),
 (14, ['SPORTS', 'POLITICS', 'TRAVEL', 'HEALTH', 'BUSINESS']),
 (15, ['BUSINESS', 'POLITICS', 'SPORTS']),
 (16, ['POLITICS', 'SPORTS', 'TRAVEL', 'HEALTH', 'SCIENCE']),
 (17, ['SPORTS']),
 (18, ['BUSINESS', 'SCIENCE', 'HEALTH', 'SPORTS']),
 (19, ['SCIENCE', '

In [4]:
labels_users = extract_users_by_category(users, labels)
labels_users

{'POLITICS': [0, 2, 4, 9, 10, 11, 14, 15, 16],
 'ENTERTAINMENT': [0, 2, 3, 4, 6, 7, 8, 9, 10, 11, 19],
 'HEALTH': [0, 2, 3, 7, 9, 12, 13, 14, 16, 18, 19],
 'TRAVEL': [4, 8, 10, 11, 13, 14, 16, 19],
 'BUSINESS': [1, 2, 3, 5, 7, 9, 10, 12, 13, 14, 15, 18],
 'SPORTS': [0, 2, 4, 6, 7, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19],
 'SCIENCE': [0, 4, 6, 8, 11, 13, 16, 18, 19]}

## News Retrieval

In [5]:
import urllib3 as urllib
urllib.disable_warnings()
import xmltodict

In [6]:
def nyt_retrieve(nyt_rss_url, label): # retrieve articles from New York Times
    articles = []
    
    # code dependent on the nytimes structure of RSS feed
    http = urllib.PoolManager()
    r = http.request('GET', nyt_rss_url)

    data = xmltodict.parse(r.data)
    data = data["rss"]
    data = data["channel"]
    data = data["item"]

    for key in data:
        article = key
        title, descr, extra_descr = "", "", ""
        if "title" in article and article["title"] is not None:
            title = article["title"] + ". "
        if "media:description" in article and article["media:description"] is not None:
            descr = article["media:description"]
        if "description" in article and article["description"] is not None:
            extra_descr = article["description"]

        corpus = str(title) + str(descr) + str(extra_descr)
        articles.append(tuple((corpus, label)))
        
    return articles

### Preprocessing

In [7]:
def preprocess_document(doc, join_words):
    stopset = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    tokens = wordpunct_tokenize(doc)
    clean = [token.lower() for token in tokens if token.lower() not in stopset and len(token) > 2]
    final = [stemmer.stem(word) for word in clean]
    
    if join_words == True:
        return " ".join(final)
    return final

## Use Case #1 - Profile-Based Newsletter

News are gathered from the New York Times website and delivered to interested users

In [8]:
nyt = []
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Politics.xml', 'POLITICS')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Movies.xml', 'ENTERTAINMENT')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Health.xml', 'HEALTH')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Travel.xml', 'TRAVEL')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Business.xml', 'BUSINESS')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Sports.xml', 'SPORTS')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Science.xml', 'SCIENCE')

p_nyt = []
for corpus, label in nyt:
    p_nyt.append(tuple((preprocess_document(corpus, True),label))) 
    # label is known only for testing purposes, in real world scenario this information can be unavailable


In [9]:
def deliver_articles(classifier, tfidf, articles_df, original_articles, labels_users_df): # takes articles as input and label-users they like that label
    f_articles = tfidf.transform(articles_df).toarray()
    labeled_articles = classifier.predict(f_articles)
    
    i = 0
    for label in labeled_articles:
        print(original_articles[i],"-> DELIVERED TO ->",labels_users_df[label], "FANS OF ",label,"\n")
        i+=1

deliver_articles(classifier, tfidf, [article[0] for article in p_nyt], nyt, labels_users)
    

('Guantánamo Bay as Nursing Home: Military Envisions Hospice Care as Terrorism Suspects Age. An older detainee inside a communal cellblock at the Camp 6 prison in the detention zone at Guantánamo Bay, Cuba.With no sign that the prison will close, the Pentagon has begun planning for detainees to grow old and die at Guantánamo Bay.', 'POLITICS') -> DELIVERED TO -> [0, 2, 4, 9, 10, 11, 14, 15, 16] FANS OF  POLITICS 

('Trump Pulls Out of Arms Treaty During Speech at N.R.A. Convention. The N.R.A. is dealing with inner turmoil, lawsuits and a newly empowered Democratic House. The president’s visit is being thought of as a needed pep talk.', 'POLITICS') -> DELIVERED TO -> [0, 2, 4, 9, 10, 11, 14, 15, 16] FANS OF  POLITICS 

('F.B.I. Warns of Russian Interference in 2020 Race and Boosts Counterintelligence Operations. Intelligence officials have said Russia has kept up its election interference operations under the direction of President Vladimir V. Putin and that they are likely to intensify

### Performance Evaluation

In [10]:
X_unseen, y_unseen = [], []
for corpus, label in p_nyt:
    X_unseen.append(corpus)
    y_unseen.append(label)

X_unseen = tfidf.transform(X_unseen).toarray()

y_pred = classifier.predict(X_unseen)

In [11]:
print(confusion_matrix(y_unseen,y_pred))  
print(classification_report(y_unseen,y_pred))  
print(accuracy_score(y_unseen, y_pred))  

[[29  2  2  6  3  2  4]
 [ 4 24  1  1  0  3  2]
 [ 4  2 19  4  7  1  1]
 [ 2  1  0 19  0  0  0]
 [ 3  0  6  3 11  2  2]
 [ 1  2  0  0  2 13  1]
 [ 1  0  1  1  1  1 15]]
               precision    recall  f1-score   support

     BUSINESS       0.66      0.60      0.63        48
ENTERTAINMENT       0.77      0.69      0.73        35
       HEALTH       0.66      0.50      0.57        38
     POLITICS       0.56      0.86      0.68        22
      SCIENCE       0.46      0.41      0.43        27
       SPORTS       0.59      0.68      0.63        19
       TRAVEL       0.60      0.75      0.67        20

    micro avg       0.62      0.62      0.62       209
    macro avg       0.61      0.64      0.62       209
 weighted avg       0.63      0.62      0.62       209

0.6220095693779905


## Use Case #2 - Profile-Driven Search Engine for News Articles

In [None]:
# retrieve documents from newspaper
# preprocess them as showed by the examples of the teacher
# create tfidf model and dictionary from all those articles
# classify the articles in the corpus
# compute query of the user, how?
    # who is searching? take profile of the user that is writing the query
    # process the query
    # compute relevance score combining classical method + profile of the user
    # rank the results

In [12]:
nyt = []
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Politics.xml', 'POLITICS')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Movies.xml', 'ENTERTAINMENT')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Health.xml', 'HEALTH')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Travel.xml', 'TRAVEL')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Business.xml', 'BUSINESS')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Sports.xml', 'SPORTS')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Science.xml', 'SCIENCE')

# experiment
nyt.append(tuple(('Trump announces new Economic Measure to address the high rate of unemployment in the US', 'BUSINESS')))

p_nyt = []
for corpus, label in nyt:
    p_nyt.append(tuple((preprocess_document(corpus, False),label))) 
    # label is known only for testing purposes, in real world scenario this information can be unavailable

In [13]:
from gensim import corpora, models, similarities
from operator import itemgetter

def create_labeling_archive(classifier, tfidf, docs): # the tfidf mentioned here is the one obtained during model building
    p_docs = [preprocess_document(corpus, True) for corpus, label in docs]
    X = tfidf.transform(p_docs).toarray()
    y = classifier.predict(X)
    return y

def create_archive(docs, pred_labels):
    p_docs = [preprocess_document(corpus, False) for corpus, label in docs]
    # create dictionary
    dictionary = corpora.Dictionary(p_docs)
    dictionary.save('vsm.dict') # save dictionary
    # docs2bows
    vectors = [dictionary.doc2bow(doc) for doc in p_docs]
    corpora.MmCorpus.serialize('vsm_docs.mm', vectors) # save bows
    # tfidf scores of the archive
    tfidf = models.TfidfModel(vectors)
    tfidf.save('tfidf_archive.scores')
    
    return tfidf, dictionary, vectors

def query_archive(tfidf_archive, dictionary, vectors, docs, pred_labels, user_prefs, q):
    pq = preprocess_document(q, False)
    vq = dictionary.doc2bow(pq)
    
    qtfidf = tfidf_archive[vq]
    
    index = similarities.MatrixSimilarity(vectors, num_features=len(dictionary))
    sim = index[qtfidf]
    sim = enumerate(sim)

    fix_factor = 1.5 # multiplier if the label is part of user preferences
    penalty_factor = 0.5
    fix_sim = []
    for doc, score in sim:
        if pred_labels[doc] in user_prefs:
            fix_sim.append(score * fix_factor)
        else:
            fix_sim.append(score * penalty_factor)

    ranking = sorted(enumerate(fix_sim), key=itemgetter(1), reverse=True)

    for doc, score in ranking:
        print("[ Score = " + "%.3f" % round(score,3) + "] " + docs[doc][0])




In [14]:
pred_labels = create_labeling_archive(classifier, tfidf, nyt)

# making sure that the added article is predicted as BUSINESS article, for the sake of the experiment
print(nyt[-1])
pred_labels[-1] = "BUSINESS"
print(pred_labels[-1])

('Trump announces new Economic Measure to address the high rate of unemployment in the US', 'BUSINESS')
BUSINESS


In [15]:
tfidf_archive, dictionary, vectors = create_archive(nyt, pred_labels)

**User 1, interested in BUSINESS, searches for "Trump" related articles**

In [16]:
query_archive(tfidf_archive, dictionary, vectors, nyt, pred_labels, ["BUSINESS"], "Trump")

[ Score = 0.500] Trump announces new Economic Measure to address the high rate of unemployment in the US
[ Score = 0.265] To Trump, ‘Leakers Are Traitors and Cowards,’ and He Wants to Find Them. President Trump and his son Barron last Sunday at Andrews Air Force Base near Washington. The White House has instituted a new system in order to crack down on unsanctioned “leaks” of Mr. Trump’s private daily schedule.First obsessed with members of the “deep state,” President Trump is now fixated on former officials who talked to investigators.
[ Score = 0.229] Overcoming Doubts, U.S. Economy Finds a Way Forward. Consumer spending, a bedrock of the recovery, was weak in the first quarter. But retail sales picked up in March, and economists expect stronger spending in the second quarter.A 3.2% growth rate in the first quarter beats the forecasts and offers Trump a political lift.
[ Score = 0.202] White House Memo: Believing Him: For Trump, Sticking With Men Like Stephen Moore Is Nothing New. Pr

**User 2, interested in POLITICS, searches for "Trump" related articles**

In [17]:
query_archive(tfidf_archive, dictionary, vectors, nyt, pred_labels, ["POLITICS"], "Trump")

[ Score = 0.795] To Trump, ‘Leakers Are Traitors and Cowards,’ and He Wants to Find Them. President Trump and his son Barron last Sunday at Andrews Air Force Base near Washington. The White House has instituted a new system in order to crack down on unsanctioned “leaks” of Mr. Trump’s private daily schedule.First obsessed with members of the “deep state,” President Trump is now fixated on former officials who talked to investigators.
[ Score = 0.607] White House Memo: Believing Him: For Trump, Sticking With Men Like Stephen Moore Is Nothing New. President Trump and Melania, the first lady, boarding Air Force One on Wednesday. Mr. Trump is often drawn to men who share his indiscretions.A president with his own troubled history has shown disdain for accusations of harassment, assault or just plain sexism against men who proclaim their innocence.
[ Score = 0.588] Trump Declares Commitment to Ending Opioid Crisis ‘Once and for All’. Many leading authorities on the opioid crisis have been c

As showed in the results, the ranking of search results is biased depending on the user profile by a factor of 1.5 if the article category is of interest of the user, otherwise the ranking score of the article gets a penalization by a factor of 0.5