# Profile-Based Retrieval System

## Model Loading

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import pickle
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import wordpunct_tokenize
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# NOTE(review): pickle.load can execute arbitrary code stored in the file, so these
# model files must come from a trusted source.
# The context managers close the file handles as soon as each model is loaded.
with open("model-tuning/tfidf_kaggledataset_scikitlearn-experiment.model", 'rb') as tfidf_file:
    tfidf = pickle.load(tfidf_file)
with open("model-tuning/log_classifier_scikitlearn-experiment.model", 'rb') as classifier_file:
    classifier = pickle.load(classifier_file)

## Users Dataset

In [2]:
from random import randint
import random

# News categories used throughout the notebook: users' preferences are drawn from this
# list and the same names are attached to the retrieved articles.
# NOTE(review): presumably these are also the classes the loaded classifier predicts - confirm.
labels = ['POLITICS', 'ENTERTAINMENT', 'HEALTH', 'TRAVEL', 'BUSINESS', 'SPORTS', 'SCIENCE']

def generate_user_dataset(n_users, max_preferences, labels):
    """Generate a synthetic list of users with random category preferences.

    Args:
        n_users: number of users to create; ids run from 0 to n_users - 1.
        max_preferences: upper bound on the number of categories per user;
            values larger than len(labels) are clamped to len(labels).
        labels: sequence of category names the preferences are drawn from.

    Returns:
        A list of (user_id, preferences) tuples, where preferences is a list of
        between 1 and max_preferences distinct entries of labels.

    Raises:
        ValueError: if users are requested but labels is empty or
            max_preferences is smaller than 1.
    """
    # A user cannot like more categories than exist.
    max_preferences = min(max_preferences, len(labels))
    if n_users > 0 and max_preferences < 1:
        raise ValueError("labels must be non-empty and max_preferences must be at least 1")

    users = [] # id, list of preferred categories
    for user_id in range(n_users):
        n_prefs = randint(1, max_preferences)
        # Sample from the labels that were passed in rather than from a fixed index
        # range, so the function works for any number of categories.
        users.append((user_id, random.sample(list(labels), n_prefs)))

    return users
    
def extract_users_by_category(users_df, labels):
    """Invert the user list into a category -> interested users mapping.

    users_df is an iterable of (user_id, preferred_labels) pairs. Every entry of
    labels becomes a key of the result, mapped to the ids of the users that list
    it, in the order the users are visited. A preference that is not in labels
    raises KeyError.
    """
    subscribers = {}
    for category in labels:
        subscribers[category] = []

    for user_id, liked in users_df:
        for category in liked:
            subscribers[category].append(user_id)

    return subscribers

In [3]:
# 20 synthetic users, each with at most 5 preferred categories.
# No random seed is set in the cells above, so the generated profiles differ between runs.
users = generate_user_dataset(20, 5, labels)
users

[(0, ['POLITICS', 'ENTERTAINMENT', 'SCIENCE', 'SPORTS']),
 (1, ['SCIENCE']),
 (2, ['SPORTS']),
 (3, ['BUSINESS']),
 (4, ['POLITICS', 'HEALTH', 'TRAVEL', 'SCIENCE']),
 (5, ['BUSINESS']),
 (6, ['POLITICS', 'SPORTS']),
 (7, ['SPORTS', 'TRAVEL', 'HEALTH', 'ENTERTAINMENT']),
 (8, ['TRAVEL', 'BUSINESS', 'SCIENCE', 'SPORTS', 'POLITICS']),
 (9, ['SCIENCE', 'BUSINESS', 'SPORTS', 'TRAVEL']),
 (10, ['SCIENCE', 'TRAVEL', 'POLITICS', 'SPORTS']),
 (11, ['ENTERTAINMENT', 'SPORTS', 'BUSINESS', 'POLITICS', 'SCIENCE']),
 (12, ['POLITICS', 'BUSINESS']),
 (13, ['SCIENCE', 'ENTERTAINMENT', 'SPORTS', 'BUSINESS', 'TRAVEL']),
 (14, ['HEALTH', 'POLITICS', 'SCIENCE', 'TRAVEL', 'SPORTS']),
 (15, ['BUSINESS', 'SCIENCE']),
 (16, ['ENTERTAINMENT', 'TRAVEL']),
 (17, ['BUSINESS', 'POLITICS', 'TRAVEL']),
 (18, ['SPORTS', 'POLITICS']),
 (19, ['POLITICS', 'ENTERTAINMENT', 'TRAVEL'])]

In [4]:
# Invert the user profiles: for each category, the ids of the users who listed it.
labels_users = extract_users_by_category(users, labels)
labels_users

{'POLITICS': [0, 4, 6, 8, 10, 11, 12, 14, 17, 18, 19],
 'ENTERTAINMENT': [0, 7, 11, 13, 16, 19],
 'HEALTH': [4, 7, 14],
 'TRAVEL': [4, 7, 8, 9, 10, 13, 14, 16, 17, 19],
 'BUSINESS': [3, 5, 8, 9, 11, 12, 13, 15, 17],
 'SPORTS': [0, 2, 6, 7, 8, 9, 10, 11, 13, 14, 18],
 'SCIENCE': [0, 1, 4, 8, 9, 10, 11, 13, 14, 15]}

## News Retrieval

In [5]:
import urllib3 as urllib
urllib.disable_warnings()
import xmltodict

In [6]:
def nyt_retrieve(nyt_rss_url, label):
    """Download a New York Times RSS feed and return its articles as labelled text.

    Args:
        nyt_rss_url: URL of the RSS feed to download.
        label: category attached to every article of the feed.

    Returns:
        A list of (corpus, label) tuples, one per feed item. corpus is the item
        title followed by ". ", then the "media:description" and "description"
        fields (whichever are present and non-empty) separated by a space.
        An empty list is returned when the feed has no items.

    Raises:
        RuntimeError: if the server does not answer with HTTP status 200.
    """
    # code dependent on the nytimes structure of RSS feed
    # (`urllib` is the alias under which this notebook imports urllib3)
    http = urllib.PoolManager()
    r = http.request('GET', nyt_rss_url)
    if r.status != 200:
        raise RuntimeError("Could not retrieve %s (HTTP status %s)" % (nyt_rss_url, r.status))

    channel = xmltodict.parse(r.data)["rss"]["channel"]
    items = channel.get("item") or []
    if isinstance(items, dict):
        # xmltodict yields a single mapping instead of a list when the feed holds
        # exactly one <item>; wrap it so the loop below always sees articles.
        items = [items]

    articles = []
    for article in items:
        title = ""
        if article.get("title") is not None:
            title = str(article["title"]) + ". "

        # Keep the two description fields apart so their sentences do not run together.
        descriptions = [
            str(article[field])
            for field in ("media:description", "description")
            if article.get(field)
        ]

        corpus = title + " ".join(descriptions)
        articles.append((corpus, label))

    return articles

### Preprocessing

In [7]:
def preprocess_document(doc, join_words):
    """Normalise a raw text into stemmed, stopword-free tokens.

    The text is split with wordpunct_tokenize. Tokens of one or two characters
    and English stopwords (compared in lower case) are dropped; the remaining
    tokens are lower-cased and Porter-stemmed.

    Returns a single space-joined string when join_words equals True, otherwise
    the list of stems.
    """
    english_stopwords = set(stopwords.words('english'))
    porter = PorterStemmer()

    stems = []
    for raw_token in wordpunct_tokenize(doc):
        lowered = raw_token.lower()
        if len(raw_token) <= 2 or lowered in english_stopwords:
            continue
        stems.append(porter.stem(lowered))

    return " ".join(stems) if join_words == True else stems

## Use Case #1 - Profile-Based Newsletter

News articles are gathered from the New York Times RSS feeds and delivered to the users interested in their category.

In [8]:
# Download every section feed and tag its articles with the matching category.
nyt = []
for feed_url, feed_label in (
    ('http://rss.nytimes.com/services/xml/rss/nyt/Politics.xml', 'POLITICS'),
    ('http://rss.nytimes.com/services/xml/rss/nyt/Movies.xml', 'ENTERTAINMENT'),
    ('http://rss.nytimes.com/services/xml/rss/nyt/Health.xml', 'HEALTH'),
    ('http://rss.nytimes.com/services/xml/rss/nyt/Travel.xml', 'TRAVEL'),
    ('http://rss.nytimes.com/services/xml/rss/nyt/Business.xml', 'BUSINESS'),
    ('http://rss.nytimes.com/services/xml/rss/nyt/Sports.xml', 'SPORTS'),
    ('http://rss.nytimes.com/services/xml/rss/nyt/Science.xml', 'SCIENCE'),
):
    nyt.extend(nyt_retrieve(feed_url, feed_label))

# The label is kept only for testing purposes; in a real-world scenario this
# information may be unavailable.
p_nyt = [(preprocess_document(corpus, True), label) for corpus, label in nyt]


In [20]:
def deliver_articles(classifier, tfidf, articles_df, original_articles, labels_users_df):
    """Classify preprocessed articles and print which users each one is delivered to.

    Args:
        classifier: fitted model whose predict() returns one label per feature row.
        tfidf: fitted vectorizer whose transform() turns the texts into features.
        articles_df: preprocessed article texts to classify.
        original_articles: the unprocessed articles, aligned index by index with
            articles_df; these are what gets printed.
        labels_users_df: mapping from label to the list of user ids interested in it.

    Returns:
        None. One line is printed per article.

    A predicted label that has no entry in labels_users_df is reported with an
    empty recipient list instead of raising KeyError.
    """
    f_articles = tfidf.transform(articles_df).toarray()
    labeled_articles = classifier.predict(f_articles)

    for i, label in enumerate(labeled_articles):
        # The classifier may know categories nobody subscribed to.
        recipients = labels_users_df.get(label, [])
        print(original_articles[i],"-> DELIVERED TO ->",recipients, "FANS OF ",label,"\n")

# Classify the preprocessed texts, but pass the raw (text, label) tuples from `nyt`
# as the articles to print, so the output shows the readable originals.
deliver_articles(classifier, tfidf, [article[0] for article in p_nyt], nyt, labels_users)
    

('Joe Biden Expresses Regret to Anita Hill, but She Says ‘I’m Sorry’ Is Not Enough. Anita Hill’s name was trending on Twitter on Thursday as many women said they supported her — and, by clear implication, not Joseph R. Biden Jr.Former Vice President Joseph R. Biden Jr. called Anita Hill this month to express his regret over “what she endured” testifying against Justice Clarence Thomas in 1991.', 'POLITICS') -> DELIVERED TO -> [0, 4, 6, 8, 10, 11, 12, 14, 17, 18, 19] FANS OF  POLITICS 

('Joe Biden Announces 2020 Run for President, After Months of Hesitation. A logo for Mr. Biden’s 2020 presidential run, left, and a campaign button from 2008.The former vice president has stayed on the sidelines while his record has been scrutinized. His entry is sure to reshape the Democratic primary contest.', 'POLITICS') -> DELIVERED TO -> [0, 4, 6, 8, 10, 11, 12, 14, 17, 18, 19] FANS OF  POLITICS 

('Strong Support Here Helped Trump Win Pennsylvania in 2016. 2020 Could Be Different.. President Trump 

### Performance Evaluation

In [18]:
# Split the preprocessed feed articles into their known labels and their texts,
# vectorise the texts with the loaded TF-IDF model and classify them.
y_unseen = [known_label for _text, known_label in p_nyt]
X_unseen = tfidf.transform([text for text, _known_label in p_nyt]).toarray()

y_pred = classifier.predict(X_unseen)

In [19]:
# Compare the feed labels (y_unseen) with the predictions (y_pred):
# confusion matrix, per-class precision/recall/F1 report, then overall accuracy.
# NOTE(review): the matrix rows/columns presumably follow the same alphabetical class
# order as the report below - confirm against the scikit-learn documentation.
print(confusion_matrix(y_unseen,y_pred))  
print(classification_report(y_unseen,y_pred))  
print(accuracy_score(y_unseen, y_pred))  

[[30  3  2  4  3  2  4]
 [ 2 23  2  0  0  0  2]
 [ 2  2 17  3  6  1  2]
 [ 1  0  0 19  0  0  0]
 [ 4  0  5  3 11  2  1]
 [ 0  1  0  0  0 17  2]
 [ 1  0  1  1  2  1 14]]
               precision    recall  f1-score   support

     BUSINESS       0.75      0.62      0.68        48
ENTERTAINMENT       0.79      0.79      0.79        29
       HEALTH       0.63      0.52      0.57        33
     POLITICS       0.63      0.95      0.76        20
      SCIENCE       0.50      0.42      0.46        26
       SPORTS       0.74      0.85      0.79        20
       TRAVEL       0.56      0.70      0.62        20

    micro avg       0.67      0.67      0.67       196
    macro avg       0.66      0.69      0.67       196
 weighted avg       0.67      0.67      0.66       196

0.6683673469387755


## Use Case #2 - Profile-Driven Search Engine for News Articles

In [None]:
# retrieve documents from newspaper
# preprocess them as shown in the teacher's examples
# create tfidf model and dictionary from all those articles
# classify the articles in the corpus
# compute query of the user, how?
    # who is searching? take profile of the user that is writing the query
    # process the query
    # compute relevance score combining classical method + profile of the user
    # rank the results