# Profile-Based Retrieval System

## Model Loading

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import pickle
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import wordpunct_tokenize
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

tfidf = pickle.load(open("model_building/tfidf_kaggledataset.model", 'rb'))
classifier = pickle.load(open("model_building/classifier.model", 'rb'))

## Users Dataset

In [3]:
from random import randint
import random

labels = ['POLITICS', 'ENTERTAINMENT', 'HEALTH', 'TRAVEL', 'BUSINESS', 'SPORTS', 'SCIENCE']

def generate_user_dataset(n_users, max_preferences, labels):
    users = [] # id, list of preferred categories
    if max_preferences > len(labels):
        max_preferences = len(labels)
    
    for i in range(0, n_users):
        prefs = random.sample(range(0, 7), randint(1, max_preferences))
        prefs_mapped = [labels[k] for k in prefs]
        
        users.append(tuple((i,prefs_mapped)))
    
    return users
    
def extract_users_by_category(users_df, labels):
    labels_users = {label:[] for label in labels}
    
    for user, prefs in users_df:
        for label in prefs:
            labels_users[label]+=[user]
    
    return labels_users

In [4]:
users = generate_user_dataset(20, 5, labels)
users

[(0, ['ENTERTAINMENT', 'TRAVEL', 'HEALTH', 'BUSINESS', 'SPORTS']),
 (1, ['SPORTS', 'HEALTH']),
 (2, ['TRAVEL', 'HEALTH', 'SCIENCE', 'BUSINESS']),
 (3, ['HEALTH', 'SPORTS', 'POLITICS', 'TRAVEL', 'ENTERTAINMENT']),
 (4, ['ENTERTAINMENT', 'SPORTS']),
 (5, ['SCIENCE', 'HEALTH', 'POLITICS']),
 (6, ['TRAVEL', 'SPORTS', 'ENTERTAINMENT']),
 (7, ['POLITICS']),
 (8, ['SCIENCE', 'BUSINESS', 'POLITICS']),
 (9, ['ENTERTAINMENT', 'HEALTH', 'SPORTS']),
 (10, ['BUSINESS', 'ENTERTAINMENT']),
 (11, ['SPORTS']),
 (12, ['SCIENCE', 'TRAVEL']),
 (13, ['HEALTH', 'ENTERTAINMENT', 'SPORTS', 'SCIENCE']),
 (14, ['BUSINESS', 'HEALTH']),
 (15, ['ENTERTAINMENT', 'HEALTH', 'SPORTS', 'TRAVEL', 'SCIENCE']),
 (16, ['TRAVEL', 'POLITICS', 'SCIENCE']),
 (17, ['HEALTH']),
 (18, ['BUSINESS', 'SCIENCE']),
 (19, ['ENTERTAINMENT', 'SPORTS', 'BUSINESS', 'POLITICS', 'TRAVEL'])]

In [5]:
labels_users = extract_users_by_category(users, labels)
labels_users

{'POLITICS': [3, 5, 7, 8, 16, 19],
 'ENTERTAINMENT': [0, 3, 4, 6, 9, 10, 13, 15, 19],
 'HEALTH': [0, 1, 2, 3, 5, 9, 13, 14, 15, 17],
 'TRAVEL': [0, 2, 3, 6, 12, 15, 16, 19],
 'BUSINESS': [0, 2, 8, 10, 14, 18, 19],
 'SPORTS': [0, 1, 3, 4, 6, 9, 11, 13, 15, 19],
 'SCIENCE': [2, 5, 8, 12, 13, 15, 16, 18]}

## News Retrieval

In [6]:
import urllib3 as urllib
urllib.disable_warnings()
import xmltodict

In [7]:
def nyt_retrieve(nyt_rss_url, label): # retrieve articles from New York Times
    articles = []
    
    # code dependent on the nytimes structure of RSS feed
    http = urllib.PoolManager()
    r = http.request('GET', nyt_rss_url)

    data = xmltodict.parse(r.data)
    data = data["rss"]
    data = data["channel"]
    data = data["item"]

    for key in data:
        article = key
        title, descr, extra_descr = "", "", ""
        if "title" in article and article["title"] is not None:
            title = article["title"] + ". "
        if "media:description" in article and article["media:description"] is not None:
            descr = article["media:description"]
        if "description" in article and article["description"] is not None:
            extra_descr = article["description"]

        corpus = str(title) + str(descr) + str(extra_descr)
        articles.append(tuple((corpus, label)))
        
    return articles

### Preprocessing

In [8]:
def preprocess_document(doc, join_words):
    stopset = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    tokens = wordpunct_tokenize(doc)
    clean = [token.lower() for token in tokens if token.lower() not in stopset and len(token) > 2]
    final = [stemmer.stem(word) for word in clean]
    
    if join_words == True:
        return " ".join(final)
    return final

## Use Case #1 - Profile-Based Newsletter

News are gathered from the New York Times website and delivered to interested users

In [9]:
nyt = []
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Politics.xml', 'POLITICS')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Movies.xml', 'ENTERTAINMENT')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Health.xml', 'HEALTH')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Travel.xml', 'TRAVEL')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Business.xml', 'BUSINESS')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Sports.xml', 'SPORTS')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Science.xml', 'SCIENCE')

p_nyt = []
for corpus, label in nyt:
    p_nyt.append(tuple((preprocess_document(corpus, True),label))) 
    # label is known only for testing purposes, in real world scenario this information can be unavailable


In [10]:
def deliver_articles(classifier, tfidf, articles_df, original_articles, labels_users_df): # takes articles as input and label-users they like that label
    f_articles = tfidf.transform(articles_df).toarray()
    labeled_articles = classifier.predict(f_articles)
    
    i = 0
    for label in labeled_articles:
        print(original_articles[i],"-> DELIVERED TO ->",labels_users_df[label], "FANS OF ",label,"\n")
        i+=1

deliver_articles(classifier, tfidf, [article[0] for article in p_nyt], nyt, labels_users)
    

('The Long Run: Biden and Obama’s ‘Odd Couple’ Relationship Aged Into Family Ties. President-elect Barack Obama and Vice President-elect Joseph R. Biden Jr. in 2008 in Chicago after winning that year’s presidential election.Four years ago, Barack Obama gently discouraged him from running for president, but Joseph R. Biden Jr. is back and presenting himself as Mr. Obama’s natural heir.', 'POLITICS') -> DELIVERED TO -> [3, 5, 7, 8, 16, 19] FANS OF  POLITICS 

('In Pennsylvania, Joe Biden Finds Support Where He Most Needs It. From left, Kevin Frantz, Ciarra Walker, Nasya Jenkins and Sarah Tannenbaum make up a coalition of voters that Joseph R. Biden Jr. will need to win the Democratic nomination for president.Despite doubts about his candidacy, Joe Biden begins the race with substantial support from three key constituencies in his native state, a place the Democrats can’t afford to lose again.', 'POLITICS') -> DELIVERED TO -> [3, 5, 7, 8, 16, 19] FANS OF  POLITICS 

('Joe Biden Called Hea

### Performance Evaluation

In [11]:
X_unseen, y_unseen = [], []
for corpus, label in p_nyt:
    X_unseen.append(corpus)
    y_unseen.append(label)

X_unseen = tfidf.transform(X_unseen).toarray()

y_pred = classifier.predict(X_unseen)

In [12]:
print(confusion_matrix(y_unseen,y_pred))  
print(classification_report(y_unseen,y_pred))  
print(accuracy_score(y_unseen, y_pred))  

[[33  1  2  4  3  2  3]
 [ 3 23  2  1  0  5  3]
 [ 5  0  6  2  0  0  0]
 [ 0  1  0 16  0  1  2]
 [ 5  0  5  6  8  1  2]
 [ 0  0  0  0  0 19  1]
 [ 0  0  1  1  0  0 18]]
               precision    recall  f1-score   support

     BUSINESS       0.72      0.69      0.70        48
ENTERTAINMENT       0.92      0.62      0.74        37
       HEALTH       0.38      0.46      0.41        13
     POLITICS       0.53      0.80      0.64        20
      SCIENCE       0.73      0.30      0.42        27
       SPORTS       0.68      0.95      0.79        20
       TRAVEL       0.62      0.90      0.73        20

    micro avg       0.66      0.66      0.66       185
    macro avg       0.65      0.67      0.64       185
 weighted avg       0.70      0.66      0.66       185

0.6648648648648648


## Use Case #2 - Profile-Driven Search Engine for News Articles

In [None]:
# retrieve documents from newspaper
# preprocess them as showed by the examples of the teacher
# create tfidf model and dictionary from all those articles
# classify the articles in the corpus
# compute query of the user, how?
    # who is searching? take profile of the user that is writing the query
    # process the query
    # compute relevance score combining classical method + profile of the user
    # rank the results

In [13]:
nyt = []
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Politics.xml', 'POLITICS')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Movies.xml', 'ENTERTAINMENT')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Health.xml', 'HEALTH')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Travel.xml', 'TRAVEL')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Business.xml', 'BUSINESS')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Sports.xml', 'SPORTS')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Science.xml', 'SCIENCE')

# experiment
nyt.append(tuple(('Trump announces new Economic Measure to address the high rate of unemployment in the US', 'BUSINESS')))

p_nyt = []
for corpus, label in nyt:
    p_nyt.append(tuple((preprocess_document(corpus, False),label))) 
    # label is known only for testing purposes, in real world scenario this information can be unavailable

In [14]:
from gensim import corpora, models, similarities
from operator import itemgetter

def create_labeling_archive(classifier, tfidf, docs): # the tfidf mentioned here is the one obtained during model building
    p_docs = [preprocess_document(corpus, True) for corpus, label in docs]
    X = tfidf.transform(p_docs).toarray()
    y = classifier.predict(X)
    return y

def create_archive(docs, pred_labels):
    p_docs = [preprocess_document(corpus, False) for corpus, label in docs]
    # create dictionary
    dictionary = corpora.Dictionary(p_docs)
    dictionary.save('vsm.dict') # save dictionary
    # docs2bows
    vectors = [dictionary.doc2bow(doc) for doc in p_docs]
    corpora.MmCorpus.serialize('vsm_docs.mm', vectors) # save bows
    # tfidf scores of the archive
    tfidf = models.TfidfModel(vectors)
    tfidf.save('tfidf_archive.scores')
    
    return tfidf, dictionary, vectors

def query_archive(tfidf_archive, dictionary, vectors, docs, pred_labels, user_prefs, q):
    pq = preprocess_document(q, False)
    vq = dictionary.doc2bow(pq)
    
    qtfidf = tfidf_archive[vq]
    
    index = similarities.MatrixSimilarity(vectors, num_features=len(dictionary))
    sim = index[qtfidf]
    sim = enumerate(sim)

    fix_factor = 1.5 # multiplier if the label is part of user preferences
    penalty_factor = 0.5
    fix_sim = []
    for doc, score in sim:
        if pred_labels[doc] in user_prefs:
            fix_sim.append(score * fix_factor)
        else:
            fix_sim.append(score * penalty_factor)

    ranking = sorted(enumerate(fix_sim), key=itemgetter(1), reverse=True)

    for doc, score in ranking:
        print("[ Score = " + "%.3f" % round(score,3) + "] " + docs[doc][0])




In [15]:
pred_labels = create_labeling_archive(classifier, tfidf, nyt)

# making sure that the added article is predicted as BUSINESS article, for the sake of the experiment
print(nyt[-1])
pred_labels[-1] = "BUSINESS"
print(pred_labels[-1])

('Trump announces new Economic Measure to address the high rate of unemployment in the US', 'BUSINESS')
BUSINESS


In [16]:
tfidf_archive, dictionary, vectors = create_archive(nyt, pred_labels)

**User 1, interested in BUSINESS, searches for "Trump" related articles**

In [17]:
query_archive(tfidf_archive, dictionary, vectors, nyt, pred_labels, ["BUSINESS"], "Trump")

[ Score = 0.500] Trump announces new Economic Measure to address the high rate of unemployment in the US
[ Score = 0.265] To Trump, ‘Leakers Are Traitors and Cowards,’ and He Wants to Find Them. President Trump and his son Barron last Sunday at Andrews Air Force Base near Washington. The White House has instituted a new system in order to crack down on unsanctioned “leaks” of Mr. Trump’s private daily schedule.First obsessed with members of the “deep state,” President Trump is now fixated on former officials who talked to investigators.
[ Score = 0.229] Overcoming Doubts, U.S. Economy Finds a Way Forward. Consumer spending, a bedrock of the recovery, was weak in the first quarter. But retail sales picked up in March, and economists expect stronger spending in the second quarter.A 3.2% growth rate in the first quarter beats the forecasts and offers Trump a political lift.
[ Score = 0.195] Even Away From Correspondents’ Dinner, Trump Makes Sure to Have His Say. President Trump on Saturda

**User 2, interested in POLITICS, searches for "Trump" related articles**

In [18]:
query_archive(tfidf_archive, dictionary, vectors, nyt, pred_labels, ["POLITICS"], "Trump")

[ Score = 0.795] To Trump, ‘Leakers Are Traitors and Cowards,’ and He Wants to Find Them. President Trump and his son Barron last Sunday at Andrews Air Force Base near Washington. The White House has instituted a new system in order to crack down on unsanctioned “leaks” of Mr. Trump’s private daily schedule.First obsessed with members of the “deep state,” President Trump is now fixated on former officials who talked to investigators.
[ Score = 0.586] Even Away From Correspondents’ Dinner, Trump Makes Sure to Have His Say. President Trump on Saturday at a rally in Green Bay, Wis. “Now you finally have a president that is loyal to you,” Mr. Trump said, mostly reserving mockery for his prospective opponents.Thinly veiled rebukes of the White House Correspondents’ Dinner was a running theme of a campaign rally in Wisconsin, where the president also zeroed in on policy.
[ Score = 0.576] Trump Accuses Saudis of Giving U.S. a Bad Deal. Is That True?. President Trump at a rally in Green Bay, W

As showed in the results, the ranking of search results is biased depending on the user profile by a factor of 1.5 if the article category is of interest of the user, otherwise the ranking score of the article gets a penalization by a factor of 0.5