# Profile Based Retrieval System

In [1]:
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from gensim import corpora, models, similarities
from operator import itemgetter
import nltk
import urllib3 as urllib
urllib.disable_warnings()
import xmltodict



In [2]:
def preprocess_document(doc):
    """Turn raw article text into a list of lower-cased, stemmed tokens.

    The text is split with ``wordpunct_tokenize``. Tokens of one or two
    characters, and tokens whose lower-cased form is an English stop word,
    are dropped; the remaining tokens are lower-cased and Porter-stemmed.
    """
    english_stopwords = set(stopwords.words('english'))
    porter = PorterStemmer()

    stems = []
    for raw_token in wordpunct_tokenize(doc):
        lowered = raw_token.lower()
        # Length is measured on the token as tokenized, before lower-casing.
        if len(raw_token) <= 2 or lowered in english_stopwords:
            continue
        stems.append(porter.stem(lowered))
    return stems

In [3]:
def count_category_samples(articles, category):
    """Return how many ``(corpus, label)`` pairs in *articles* have *category* as label."""
    return sum(1 for _corpus, label in articles if category == label)

In [4]:
def split_samples(samples, division,
                  categories=('Politics', 'Business', 'Tech', 'Science',
                              'Health', 'Sports', 'Arts')): # consistent split
    """Split labelled samples into train and test sets, category by category.

    Every category is split separately so that the same fraction of each
    category ends up in the training set. Within a category the incoming
    order is kept: the first ``int(division * n)`` samples go to the training
    set and the rest to the test set.

    Args:
        samples: Iterable of ``(corpus, label)`` pairs.
        division: Fraction (0..1) of each category assigned to the training set.
        categories: Labels to split, in the order their samples appear in the
            output. Samples whose label is not listed are left out of both sets.

    Returns:
        A ``(trainset, testset)`` tuple of lists of ``(corpus, label)`` tuples,
        grouped by category in the order given by *categories*.
    """
    # Materialize once so that iterators/generators can be scanned per category.
    samples = list(samples)

    trainset, testset = [], []
    for category in categories:
        group = [(corpus, label) for (corpus, label) in samples if label == category]
        cut = int(division * len(group))  # truncates, so small groups favour the test set
        trainset.extend(group[:cut])
        testset.extend(group[cut:])

    return trainset, testset

## Information retrieval

In [5]:
# Module-level accumulator; the retrieval cells extend it with (corpus, category) tuples.
articles = [] # container for all the retrieved articles in the form corpus-category

In [6]:
def nyt_retrieve(nyt_rss_url, label): # retrieve articles from New York Times
    """Fetch a New York Times RSS feed and return its entries as ``(corpus, label)`` tuples.

    Args:
        nyt_rss_url: URL of the RSS feed to download.
        label: Category attached to every article taken from this feed.

    Returns:
        A list with one ``(corpus, label)`` tuple per feed item. ``corpus`` is
        the item's title (followed by ". "), its ``media:description`` and its
        ``description`` concatenated; fields that are missing or empty
        contribute an empty string. A feed without items yields ``[]``.

    Raises:
        KeyError: If the parsed document has no ``rss`` or ``channel`` element.
    """
    articles = []

    # code dependent on the nytimes structure of RSS feed
    http = urllib.PoolManager()
    r = http.request('GET', nyt_rss_url)

    channel = xmltodict.parse(r.data)["rss"]["channel"]
    # A channel without any <item> has no "item" key; treat it as an empty feed.
    items = channel.get("item") or []
    # xmltodict represents a lone <item> as a single mapping rather than a
    # one-element list; iterating that mapping would walk its keys, so wrap it.
    if isinstance(items, dict):
        items = [items]

    for article in items:
        title, descr, extra_descr = "", "", ""
        if "title" in article and article["title"] is not None:
            title = article["title"] + ". "
        if "media:description" in article and article["media:description"] is not None:
            descr = article["media:description"]
        if "description" in article and article["description"] is not None:
            extra_descr = article["description"]

        corpus = str(title) + str(descr) + str(extra_descr)
        articles.append((corpus, label))

    return articles

In [7]:
def bbc_retrieve(rss_url, label): # retrieve from BBC
    """Fetch a BBC RSS feed and return its entries as ``(corpus, label)`` tuples.

    Args:
        rss_url: URL of the RSS feed to download.
        label: Category attached to every article taken from this feed.

    Returns:
        A list with one ``(corpus, label)`` tuple per feed item. ``corpus`` is
        the item's title (followed by '. ') and its ``description``
        concatenated; fields that are missing or empty contribute an empty
        string. A feed without items yields ``[]``.

    Raises:
        KeyError: If the parsed document has no ``rss`` or ``channel`` element.
    """
    articles = []

    http = urllib.PoolManager()
    r = http.request('GET', rss_url)

    channel = xmltodict.parse(r.data)["rss"]["channel"]
    # A channel without any <item> has no "item" key; treat it as an empty feed.
    items = channel.get("item") or []
    # xmltodict represents a lone <item> as a single mapping rather than a
    # one-element list; iterating that mapping would walk its keys, so wrap it.
    if isinstance(items, dict):
        items = [items]

    for article in items:
        title, descr = '', ''
        if 'title' in article and article['title'] is not None:
            title = article['title']+'. '
        if 'description' in article and article['description'] is not None:
            descr = article['description']

        corpus = str(title) + str(descr)
        articles.append((corpus, label))

    return articles

In [8]:
def theguardian_retrieve(rss_url, label): # retrieve from TheGuardian
    """Fetch a Guardian RSS feed and return its entries as ``(corpus, label)`` tuples.

    Only the item title is used as the article text for this source.

    Args:
        rss_url: URL of the RSS feed to download.
        label: Category attached to every article taken from this feed.

    Returns:
        A list with one ``(corpus, label)`` tuple per feed item, where
        ``corpus`` is the item's title, or an empty string when the title is
        missing or empty. A feed without items yields ``[]``.

    Raises:
        KeyError: If the parsed document has no ``rss`` or ``channel`` element.
    """
    articles = []

    http = urllib.PoolManager()
    r = http.request('GET', rss_url)

    channel = xmltodict.parse(r.data)["rss"]["channel"]
    # A channel without any <item> has no "item" key; treat it as an empty feed.
    items = channel.get("item") or []
    # xmltodict represents a lone <item> as a single mapping rather than a
    # one-element list; iterating that mapping would walk its keys, so wrap it.
    if isinstance(items, dict):
        items = [items]

    for article in items:
        title = ''
        if 'title' in article and article['title'] is not None:
            title = article['title']

        corpus = str(title)
        articles.append((corpus, label))

    return articles

In [9]:
# The Guardian: one RSS feed per category, downloaded in a fixed order.
for _feed_url, _category in (
    ('https://www.theguardian.com/politics/rss', 'Politics'),
    ('https://www.theguardian.com/uk/business/rss', 'Business'),
    ('https://www.theguardian.com/uk/technology/rss', 'Tech'),
    ('https://www.theguardian.com/science/rss', 'Science'),
    ('https://www.theguardian.com/lifeandstyle/health-and-wellbeing/rss', 'Health'),
    ('https://www.theguardian.com/uk/sport/rss', 'Sports'),
    ('https://www.theguardian.com/uk/culture/rss', 'Arts'),
):
    articles += theguardian_retrieve(_feed_url, _category)

In [10]:
# BBC: one RSS feed per category, downloaded in a fixed order.
for _feed_url, _category in (
    ('http://feeds.bbci.co.uk/news/politics/rss.xml', 'Politics'),
    ('http://feeds.bbci.co.uk/news/business/rss.xml', 'Business'),
    ('http://feeds.bbci.co.uk/news/technology/rss.xml', 'Tech'),
    ('http://feeds.bbci.co.uk/news/science_and_environment/rss.xml', 'Science'),
    ('http://feeds.bbci.co.uk/news/health/rss.xml', 'Health'),
    ('http://feeds.bbci.co.uk/sport/rss.xml', 'Sports'),
    ('http://feeds.bbci.co.uk/news/entertainment_and_arts/rss.xml', 'Arts'),
):
    articles += bbc_retrieve(_feed_url, _category)

In [11]:
# New York Times: one RSS feed per category, downloaded in a fixed order.
for _feed_url, _category in (
    ('http://rss.nytimes.com/services/xml/rss/nyt/Politics.xml', 'Politics'),
    ('http://rss.nytimes.com/services/xml/rss/nyt/Business.xml', 'Business'),
    ('http://rss.nytimes.com/services/xml/rss/nyt/Technology.xml', 'Tech'),
    ('http://rss.nytimes.com/services/xml/rss/nyt/Science.xml', 'Science'),
    ('http://rss.nytimes.com/services/xml/rss/nyt/Health.xml', 'Health'),
    ('http://rss.nytimes.com/services/xml/rss/nyt/Sports.xml', 'Sports'),
    ('http://rss.nytimes.com/services/xml/rss/nyt/Arts.xml', 'Arts'),
):
    articles += nyt_retrieve(_feed_url, _category)

In [12]:
# Number of (corpus, label) pairs collected so far.
print(len(articles))

866


In [13]:
import pandas as pd
# Tabulate the collected (corpus, label) pairs, one article per row.
df = pd.DataFrame(articles, columns=['corpus', 'label'])

# Preview the first rows.
print(df.head())

# Write a snapshot of the collected articles to a CSV file.
df.to_csv('articles20190424.csv')

                                              corpus     label
0  Labour says Theresa May unwilling to offer key...  Politics
1  Brexit: Theresa May's approval ratings with To...  Politics
2  Ann Widdecombe stands for Farage's Brexit part...  Politics
3  Greta Thunberg condemns UK's climate stance in...  Politics
4  Trump baby blimp back and could be even bigger...  Politics


In [14]:
#df.isna().sum()
# Count the rows whose label is the empty string.
(df['label'].values == '').sum()

0

## Preprocessing
The text of each article is cleaned by tokenizing it, removing stop words and very short tokens, and stemming the remaining tokens with the Porter stemmer.

In [15]:
# Run every article through preprocess_document, keeping its category label:
# (raw text, label) -> (list of stemmed tokens, label).
p_articles = [(preprocess_document(corpus), label) for corpus, label in articles]

print(p_articles[1:3])
print(len(p_articles))

[(['brexit', 'theresa', 'may', 'approv', 'rate', 'tori', 'member', 'hit', 'record', 'low', 'survey', 'suggest', 'live', 'news'], 'Politics'), (['ann', 'widdecomb', 'stand', 'farag', 'brexit', 'parti', 'european', 'elect'], 'Politics')]
866


## Feature extraction

### Create dictionary for Multi-Variate Bernoulli, Multinomial and Normalized Multinomial feature sets

In [16]:
from nltk import FreqDist

# Flatten the token lists of all articles into one list. extend() appends in
# place; rebuilding the list with `+` on every iteration would copy everything
# accumulated so far each time.
dict_from_articles = [] # list of all tokens contained in the whole set of articles
for tokens, label in p_articles:
    dict_from_articles.extend(tokens)

print(len(dict_from_articles))
fdist = FreqDist(dict_from_articles) # compute frequency distribution

print(fdist)
topK = fdist.most_common(1000)

# Feature vocabulary: the words of the (up to) 1000 most common tokens, in the
# order most_common() returns them.
dictionary = [word for word, count in topK]

print(len(dictionary))
print(dictionary[1:5])  # the slice starts at index 1, so entry 0 is not shown

15306
<FreqDist with 4344 samples and 15306 outcomes>
1000
['say', 'brexit', 'year', 'time']


### Feature extraction functions (Multi-Variate Bernoulli, Multinomial, Normalized Multinomial)

In [17]:
def extract_MVB_features(tokens, vocabulary=None):
    """Build a Multi-Variate Bernoulli (presence/absence) feature vector.

    Args:
        tokens: Iterable of hashable tokens belonging to one document.
        vocabulary: Words to emit a feature for. When omitted, the
            module-level ``dictionary`` is used.

    Returns:
        A dict mapping every vocabulary word to 1 if it occurs in *tokens*
        and to 0 otherwise.
    """
    if vocabulary is None:
        vocabulary = dictionary
    # Build the lookup set once: each membership test is then a hash lookup
    # instead of a scan over the whole token list.
    present = set(tokens)
    return {word: 1 if word in present else 0 for word in vocabulary}

def extract_M_features(tokens, vocabulary=None):
    """Build a Multinomial (raw term count) feature vector.

    Args:
        tokens: Iterable of hashable tokens belonging to one document.
        vocabulary: Words to emit a feature for. When omitted, the
            module-level ``dictionary`` is used.

    Returns:
        A dict mapping every vocabulary word to the number of times it occurs
        in *tokens* (0 when it does not occur).
    """
    from collections import Counter

    if vocabulary is None:
        vocabulary = dictionary
    freqs = Counter(tokens)
    # Looking up a missing key in a Counter yields 0, so no explicit branch is needed.
    return {word: freqs[word] for word in vocabulary}

def extract_MNorm_features(tokens, vocabulary=None):
    """Build a Normalized Multinomial (relative term frequency) feature vector.

    Args:
        tokens: Sequence of hashable tokens belonging to one document.
        vocabulary: Words to emit a feature for. When omitted, the
            module-level ``dictionary`` is used.

    Returns:
        A dict mapping every vocabulary word to its count in *tokens* divided
        by ``len(tokens)``, rounded to two decimals. A document without tokens
        gets 0.0 for every word.
    """
    from collections import Counter

    if vocabulary is None:
        vocabulary = dictionary

    total = len(tokens)
    if total == 0:
        # A document can lose all its tokens during preprocessing; avoid
        # dividing by zero and report every frequency as 0.0.
        return {word: 0.0 for word in vocabulary}

    freqs = Counter(tokens)
    return {word: round(freqs[word] / total, 2) for word in vocabulary}



From this data we create a training set (80%) and a test set (20%).

In [18]:
import random
random.shuffle(p_articles)

# Print how many articles each category contributes, one count per line.
for _category in ('Politics', 'Business', 'Tech', 'Science', 'Health', 'Sports', 'Arts'):
    print(count_category_samples(p_articles, _category))

# Derive the three feature representations from the same shuffled articles.
featuresets_mvb = [(extract_MVB_features(tokens), category) for tokens, category in p_articles]
featuresets_m = [(extract_M_features(tokens), category) for tokens, category in p_articles]
featuresets_mn = [(extract_MNorm_features(tokens), category) for tokens, category in p_articles]

137
163
151
106
83
92
134


In [19]:
# Split each feature representation 80/20 per category. The three feature sets
# were built from the same ordering of p_articles and split_samples slices in
# order, so each article lands on the same side of the split in all three.
trainset_mvb, testset_mvb = split_samples(featuresets_mvb, 0.8)
trainset_m, testset_m = split_samples(featuresets_m, 0.8)
trainset_mn, testset_mn = split_samples(featuresets_mn, 0.8)

## Naive Bayes Classifier

### Multi-Variate Bernoulli Feature Set

In [20]:
# Train NLTK's Naive Bayes classifier on the Multi-Variate Bernoulli training
# split, print its accuracy on the matching test split, then list the five
# most informative features.
bayes_classifier_mvb = nltk.NaiveBayesClassifier.train(trainset_mvb)
print("MVB: "+str(nltk.classify.accuracy(bayes_classifier_mvb, testset_mvb)))
bayes_classifier_mvb.show_most_informative_features(5)

MVB: 0.536723163841808
Most Informative Features
                  health = 1              Health : Busine =     15.0 : 1.0
                  brexit = 1              Politi : Busine =     14.8 : 1.0
                     may = 1              Politi : Busine =     14.7 : 1.0
                  review = 1                Arts : Busine =     14.2 : 1.0
                     win = 1              Sports : Politi =     13.4 : 1.0


### Multinomial Feature Set

In [21]:
# Train NLTK's Naive Bayes classifier on the Multinomial (raw count) training
# split, print its accuracy on the matching test split, then list the five
# most informative features.
bayes_classifier_m = nltk.NaiveBayesClassifier.train(trainset_m)
print("M: "+str(nltk.classify.accuracy(bayes_classifier_m, testset_m)))
bayes_classifier_m.show_most_informative_features(5)

M: 0.4858757062146893
Most Informative Features
                  review = 1                Arts : Busine =     13.3 : 1.0
                     may = 1              Politi : Busine =     13.1 : 1.0
                     win = 1              Sports : Politi =     12.4 : 1.0
                  health = 1              Health : Tech   =     11.4 : 1.0
                    citi = 1              Sports : Busine =     11.2 : 1.0


### Normalized Multinomial Feature Set

In [22]:
# Train NLTK's Naive Bayes classifier on the Normalized Multinomial training
# split, print its accuracy on the matching test split, then list the five
# most informative features.
bayes_classifier_mn = nltk.NaiveBayesClassifier.train(trainset_mn)
print("MNorm: "+str(nltk.classify.accuracy(bayes_classifier_mn, testset_mn)))
bayes_classifier_mn.show_most_informative_features(5)

MNorm: 0.3107344632768362
Most Informative Features
                     may = 0.06           Health : Busine =      5.7 : 1.0
                   first = 0.03           Sports : Busine =      5.2 : 1.0
                   three = 0.03           Sports : Tech   =      4.8 : 1.0
                 product = 0.05           Health : Busine =      4.5 : 1.0
                   could = 0.06           Health : Busine =      4.4 : 1.0


## Maximum Entropy Classification

### Multi-Variate Bernoulli Feature Set

In [None]:
# Train NLTK's maximum-entropy classifier on the Multi-Variate Bernoulli
# training split with the 'IIS' algorithm, limited to 5 iterations
# (max_iter=5); then print test-split accuracy and the five most informative features.
# NOTE(review): trace=0 presumably silences training progress output — confirm against the NLTK docs.
maxent_classifier_mvb = nltk.classify.MaxentClassifier.train(trainset_mvb, 'IIS', trace=0, max_iter=5)
print("MVB: "+str(nltk.classify.accuracy(maxent_classifier_mvb, testset_mvb)))
maxent_classifier_mvb.show_most_informative_features(5)

### Multinomial Feature Set

In [None]:
# Train NLTK's maximum-entropy classifier on the Multinomial training split
# with the 'IIS' algorithm, limited to 5 iterations (max_iter=5); then print
# test-split accuracy and the five most informative features.
# NOTE(review): trace=0 presumably silences training progress output — confirm against the NLTK docs.
maxent_classifier_m = nltk.classify.MaxentClassifier.train(trainset_m, 'IIS', trace=0, max_iter=5)
print("M: "+str(nltk.classify.accuracy(maxent_classifier_m, testset_m)))
maxent_classifier_m.show_most_informative_features(5)

### Normalized Multinomial Feature Set

In [None]:
# Train NLTK's maximum-entropy classifier on the Normalized Multinomial
# training split with the 'IIS' algorithm, limited to 5 iterations
# (max_iter=5); then print test-split accuracy and the five most informative features.
# NOTE(review): trace=0 presumably silences training progress output — confirm against the NLTK docs.
maxent_classifier_mn = nltk.classify.MaxentClassifier.train(trainset_mn, 'IIS', trace=0, max_iter=5)
print("MNorm: "+str(nltk.classify.accuracy(maxent_classifier_mn, testset_mn)))
maxent_classifier_mn.show_most_informative_features(5)

## Decision Tree Classifier (NLTK `DecisionTreeClassifier`, a single tree rather than a random forest)

### Multi-Variate Bernoulli Feature Set

In [None]:
# NOTE: despite the `randforest_` prefix, this trains a single NLTK
# DecisionTreeClassifier on the Multi-Variate Bernoulli training split and
# prints its accuracy on the matching test split.
# NOTE(review): entropy_cutoff=0 and support_cutoff=0 presumably disable the
# early-stopping thresholds for tree refinement — confirm against the NLTK docs.
randforest_classifier_mvb = nltk.classify.DecisionTreeClassifier.train(trainset_mvb, entropy_cutoff=0, support_cutoff=0)
print("MVB: "+str(nltk.classify.accuracy(randforest_classifier_mvb, testset_mvb)))

### Multinomial Feature Set

In [None]:
# NOTE: despite the `randforest_` prefix, this trains a single NLTK
# DecisionTreeClassifier on the Multinomial training split and prints its
# accuracy on the matching test split.
# NOTE(review): entropy_cutoff=0 and support_cutoff=0 presumably disable the
# early-stopping thresholds for tree refinement — confirm against the NLTK docs.
randforest_classifier_m = nltk.classify.DecisionTreeClassifier.train(trainset_m, entropy_cutoff=0, support_cutoff=0)
print("M: "+str(nltk.classify.accuracy(randforest_classifier_m, testset_m)))

### Normalized Multinomial Feature Set

In [None]:
# NOTE: despite the `randforest_` prefix, this trains a single NLTK
# DecisionTreeClassifier on the Normalized Multinomial training split and
# prints its accuracy on the matching test split.
# NOTE(review): entropy_cutoff=0 and support_cutoff=0 presumably disable the
# early-stopping thresholds for tree refinement — confirm against the NLTK docs.
randforest_classifier_mn = nltk.classify.DecisionTreeClassifier.train(trainset_mn, entropy_cutoff=0, support_cutoff=0)
print("MNorm: "+str(nltk.classify.accuracy(randforest_classifier_mn, testset_mn)))

## Model Deployment
Since it performed best among the models I tried, I chose the Naive Bayes classifier trained on the Multi-Variate Bernoulli feature set for use in the profile-based retrieval system.

In [None]:
import pickle
# Persist the trained Multi-Variate Bernoulli Naive Bayes classifier to disk.
# The with-block closes the file even if pickle.dump raises.
with open('nbayes-mvb.pickle', 'wb') as f:
    pickle.dump(bayes_classifier_mvb, f)

In [None]:
######### How to load the saved model
# import pickle
# with open('nbayes-mvb.pickle', 'rb') as f:
#     classifier = pickle.load(f)
# NOTE: only unpickle files you created yourself; pickle.load can execute arbitrary code.