# Profile Based Retrieval System

In [1]:
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from gensim import corpora, models, similarities
from operator import itemgetter
import nltk
import urllib3 as urllib
urllib.disable_warnings()
import xmltodict
import pandas as pd



In [2]:
def preprocess_document(doc):
    """Turn a raw document string into a list of stemmed tokens.

    The text is split with ``wordpunct_tokenize``; tokens of length two or
    less and tokens whose lower-cased form is an English stop word are
    dropped; every surviving token is lower-cased and Porter-stemmed.

    Args:
        doc: the document text.

    Returns:
        The stemmed tokens, in their original order.
    """
    english_stopwords = set(stopwords.words('english'))
    porter = PorterStemmer()

    stems = []
    for raw in wordpunct_tokenize(doc):
        lowered = raw.lower()
        if len(raw) > 2 and lowered not in english_stopwords:
            stems.append(porter.stem(lowered))
    return stems

In [3]:
def count_category_samples(articles, category):
    """Count how many samples carry each category label.

    Args:
        articles: iterable of two-item ``(corpus, label)`` pairs.
        category: iterable of the label names to count; each starts at zero.

    Returns:
        A dict mapping every name in ``category`` to its number of samples.

    Raises:
        KeyError: if a sample's label is not one of the names in ``category``.
    """
    tally = dict.fromkeys(category, 0)
    for _, tag in articles:
        tally[tag] += 1
    return tally

In [4]:
def split_samples(samples, division): # consistent split
    """Split labelled samples into train and test sets, category by category.

    For each of the seven known categories (in the fixed order below) the
    matching samples are taken in input order; the first
    ``int(division * count)`` go to the training set and the rest to the
    test set. Samples whose label is not one of these categories end up in
    neither set.

    Args:
        samples: sequence of ``(corpus, label)`` pairs.
        division: fraction of each category assigned to the training set.

    Returns:
        ``(trainset, testset)``, two lists of ``(corpus, label)`` tuples
        grouped by category in the fixed order.
    """
    label_order = ('Politics', 'Business', 'Tech', 'Science', 'Health', 'Sports', 'Arts')

    trainset, testset = [], []
    for wanted in label_order:
        bucket = [(corpus, label) for (corpus, label) in samples if label == wanted]
        cut = int(division * len(bucket))
        trainset.extend(bucket[:cut])
        testset.extend(bucket[cut:])

    return trainset, testset

## Information retrieval

This time retrieve from saved dataset

In [20]:
articles = [] # accumulates every retrieved article as a (corpus, label) tuple

In [21]:
def nyt_retrieve(nyt_rss_url, label): # retrieve articles from New York Times
    """Download a New York Times RSS feed and return its items as labelled texts.

    Args:
        nyt_rss_url: URL of the RSS feed to fetch.
        label: category label attached to every article of this feed.

    Returns:
        A list of ``(corpus, label)`` tuples. ``corpus`` is the item title
        followed by ". ", then its ``media:description`` and ``description``
        texts; any part that is missing or empty is left out. The list is
        empty when the feed has no items.
    """
    # code dependent on the nytimes structure of RSS feed
    http = urllib.PoolManager()
    r = http.request('GET', nyt_rss_url)

    channel = xmltodict.parse(r.data)["rss"]["channel"]
    items = channel.get("item") or []
    # xmltodict only builds a list for repeated elements: a feed with exactly
    # one <item> yields a single mapping, which must be wrapped so the loop
    # below sees one article instead of iterating over its keys.
    if isinstance(items, dict):
        items = [items]

    articles = []
    for article in items:
        title, descr, extra_descr = "", "", ""
        if article.get("title") is not None:
            title = article["title"] + ". "
        if article.get("media:description") is not None:
            descr = article["media:description"]
        if article.get("description") is not None:
            extra_descr = article["description"]

        corpus = str(title) + str(descr) + str(extra_descr)
        articles.append((corpus, label))

    return articles

In [22]:
def bbc_retrieve(rss_url, label): # retrieve from BBC
    """Download a BBC RSS feed and return its items as labelled texts.

    Args:
        rss_url: URL of the RSS feed to fetch.
        label: category label attached to every article of this feed.

    Returns:
        A list of ``(corpus, label)`` tuples. ``corpus`` is the item title
        followed by ". " and then its ``description``; a part that is missing
        or empty is left out. The list is empty when the feed has no items.
    """
    http = urllib.PoolManager()
    r = http.request('GET', rss_url)

    channel = xmltodict.parse(r.data)["rss"]["channel"]
    items = channel.get("item") or []
    # xmltodict only builds a list for repeated elements: a feed with exactly
    # one <item> yields a single mapping, which must be wrapped so the loop
    # below sees one article instead of iterating over its keys.
    if isinstance(items, dict):
        items = [items]

    articles = []
    for article in items:
        title, descr = '', ''
        if article.get('title') is not None:
            title = article['title'] + '. '
        if article.get('description') is not None:
            descr = article['description']

        corpus = str(title) + str(descr)
        articles.append((corpus, label))

    return articles

In [23]:
def theguardian_retrieve(rss_url, label): # retrieve from TheGuardian
    """Download a Guardian RSS feed and return its item titles as labelled texts.

    Only the title of each item is used as the corpus.

    Args:
        rss_url: URL of the RSS feed to fetch.
        label: category label attached to every article of this feed.

    Returns:
        A list of ``(corpus, label)`` tuples, where ``corpus`` is the item
        title (empty string when missing). The list is empty when the feed
        has no items.
    """
    http = urllib.PoolManager()
    r = http.request('GET', rss_url)

    channel = xmltodict.parse(r.data)["rss"]["channel"]
    items = channel.get("item") or []
    # xmltodict only builds a list for repeated elements: a feed with exactly
    # one <item> yields a single mapping, which must be wrapped so the loop
    # below sees one article instead of iterating over its keys.
    if isinstance(items, dict):
        items = [items]

    articles = []
    for article in items:
        title = ''
        if article.get('title') is not None:
            title = article['title']

        corpus = str(title)
        articles.append((corpus, label))

    return articles

In [24]:
# Guardian feeds: one (url, label) pair per category, fetched in this order.
for feed_url, feed_label in [
    ('https://www.theguardian.com/politics/rss', 'Politics'),
    ('https://www.theguardian.com/uk/business/rss', 'Business'),
    ('https://www.theguardian.com/uk/technology/rss', 'Tech'),
    ('https://www.theguardian.com/science/rss', 'Science'),
    ('https://www.theguardian.com/lifeandstyle/health-and-wellbeing/rss', 'Health'),
    ('https://www.theguardian.com/uk/sport/rss', 'Sports'),
    ('https://www.theguardian.com/uk/culture/rss', 'Arts'),
]:
    articles += theguardian_retrieve(feed_url, feed_label)

In [25]:
# BBC feeds: one (url, label) pair per category, fetched in this order.
for feed_url, feed_label in [
    ('http://feeds.bbci.co.uk/news/politics/rss.xml', 'Politics'),
    ('http://feeds.bbci.co.uk/news/business/rss.xml', 'Business'),
    ('http://feeds.bbci.co.uk/news/technology/rss.xml', 'Tech'),
    ('http://feeds.bbci.co.uk/news/science_and_environment/rss.xml', 'Science'),
    ('http://feeds.bbci.co.uk/news/health/rss.xml', 'Health'),
    ('http://feeds.bbci.co.uk/sport/rss.xml', 'Sports'),
    ('http://feeds.bbci.co.uk/news/entertainment_and_arts/rss.xml', 'Arts'),
]:
    articles += bbc_retrieve(feed_url, feed_label)

In [26]:
# New York Times feeds: one (url, label) pair per category, fetched in this order.
for feed_url, feed_label in [
    ('http://rss.nytimes.com/services/xml/rss/nyt/Politics.xml', 'Politics'),
    ('http://rss.nytimes.com/services/xml/rss/nyt/Business.xml', 'Business'),
    ('http://rss.nytimes.com/services/xml/rss/nyt/Technology.xml', 'Tech'),
    ('http://rss.nytimes.com/services/xml/rss/nyt/Science.xml', 'Science'),
    ('http://rss.nytimes.com/services/xml/rss/nyt/Health.xml', 'Health'),
    ('http://rss.nytimes.com/services/xml/rss/nyt/Sports.xml', 'Sports'),
    ('http://rss.nytimes.com/services/xml/rss/nyt/Arts.xml', 'Arts'),
]:
    articles += nyt_retrieve(feed_url, feed_label)

In [27]:
len(articles)

879

## Preprocessing
The text of each article is cleaned by tokenization, stop-word removal, and Porter stemming.

In [28]:
# Run each article's text through preprocess_document, keeping its category label.
p_articles = [(preprocess_document(text), label) for text, label in articles]

print(p_articles[1:3])
print(len(p_articles))

[(['farag', 'brexit', 'parti', 'use', 'poll', 'oust', 'remain', 'parliament'], 'Politics'), (['chang', 'elect', 'candid', 'step', 'offens', 'tweet'], 'Politics')]
879


## Feature extraction

### Create dictionary for Tf-Idf scores

In [29]:
#corpus_list = [corpus for (corpus, label) in p_articles]

#dictionary = corpora.Dictionary(corpus_list)

#print(len(dictionary))

# adjust no_below, no_above
#dictionary.filter_extremes(no_below = 5, no_above = 0.9, keep_n=2000)

#print(len(dictionary))

#dictionary.save('articles-dict.dict')
#dictionary.save_as_text('dict_preview.txt', sort_by_word=False)

In [30]:
from sklearn.feature_extraction.text import CountVectorizer  
from sklearn.feature_extraction.text import TfidfTransformer  

# Build two parallel lists: space-joined token strings and their labels.
corpus_list = []
y = []
for tokens, label in p_articles:
    corpus_list.append(" ".join(tokens))
    y.append(label)

# integrity check: the first joined document and label must match the first preprocessed article
print(p_articles[0])
print(corpus_list[0], y[0])

(['sturgeon', 'outlin', 'new', 'scottish', 'independ', 'referendum', 'plan'], 'Politics')
sturgeon outlin new scottish independ referendum plan Politics


In [41]:
#vectorizer = CountVectorizer(max_features=2000, min_df=0, max_df=1, stop_words=stopwords.words('english'))  
#X = vectorizer.fit_transform(corpus_list).toarray()  

#tfidfconverter = TfidfTransformer()  
#X = tfidfconverter.fit_transform(X).toarray()  

from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF features with sublinear term-frequency scaling and L2 normalisation.
# NOTE(review): corpus_list was already stop-word-filtered and stemmed by preprocess_document,
# so stop_words='english' is a second filter applied to stemmed tokens — confirm it is intended.
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=1, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')

# NOTE(review): the vectorizer is fitted on the whole corpus before the train/test split,
# so the vocabulary and IDF statistics also see the test articles — confirm this is acceptable.
X = tfidf.fit_transform(corpus_list).toarray()

# presumably (number of documents, number of features)
X.shape

(879, 15169)

In [42]:
from sklearn.model_selection import train_test_split  
# Hold out 20% of the samples for testing; random_state=0 fixes the shuffle.
# NOTE(review): no `stratify` argument is passed, so per-category proportions
# are not forced to match between the two sets — compare the counts printed below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)  

# Per-category sample counts for the training set, then for the test set.
print(count_category_samples(zip(X_train, y_train), ['Politics','Business','Tech','Health','Science','Sports','Arts']))
print(count_category_samples(zip(X_test, y_test), ['Politics','Business','Tech','Health','Science','Sports','Arts']))

{'Politics': 105, 'Business': 128, 'Tech': 131, 'Health': 66, 'Science': 89, 'Sports': 75, 'Arts': 109}
{'Politics': 33, 'Business': 40, 'Tech': 25, 'Health': 14, 'Science': 16, 'Sports': 16, 'Arts': 32}


In [43]:
#print(X_train[1], y_train[1])

In [44]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
# Alternative model, currently disabled:
#classifier = RandomForestClassifier(n_estimators=1000, random_state=0)  
# Logistic regression fitted on the TF-IDF training vectors and their labels.
classifier = LogisticRegression(solver='lbfgs', multi_class = 'auto')
classifier.fit(X_train, y_train)  

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='auto',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [45]:
# Predicted category for every held-out test vector.
y_pred = classifier.predict(X_test) 

In [46]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Compare the true test labels with the predictions: confusion matrix,
# per-class precision/recall/F1 report, then overall accuracy.
print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  
print(accuracy_score(y_test, y_pred))  

[[18  5  0  0  0  0  9]
 [ 0 23  0  3  2  0 12]
 [ 0 10  0  0  0  0  4]
 [ 1  8  0 20  0  0  4]
 [ 1  6  1  2  2  0  4]
 [ 0  8  0  0  0  3  5]
 [ 1 10  0  1  0  0 13]]
              precision    recall  f1-score   support

        Arts       0.86      0.56      0.68        32
    Business       0.33      0.57      0.42        40
      Health       0.00      0.00      0.00        14
    Politics       0.77      0.61      0.68        33
     Science       0.50      0.12      0.20        16
      Sports       1.00      0.19      0.32        16
        Tech       0.25      0.52      0.34        25

   micro avg       0.45      0.45      0.45       176
   macro avg       0.53      0.37      0.38       176
weighted avg       0.55      0.45      0.44       176

0.44886363636363635


In [None]:
# Number of training samples labelled 'Politics'.
# The labels live in y_train (X_train holds only feature rows), so count them
# there; count_category_samples expects (corpus, label) pairs and a list of
# category names, not raw feature rows and a bare string.
sum(1 for label in y_train if label == 'Politics')

### Docs2Bows

In [None]:
# Convert every preprocessed article (token list) to a bag-of-words vector and
# write the collection to disk in Matrix Market format.
# NOTE(review): `dictionary` is only created in a commented-out cell earlier in
# this notebook — confirm it is defined before running this cell.
vectors = [dictionary.doc2bow(corpus) for (corpus,label) in p_articles]
corpora.MmCorpus.serialize('articles-vsm.mm', vectors)

### TF-IDF Scores

In [None]:
# NOTE(review): this rebinds `tfidf`, which earlier held the sklearn TfidfVectorizer.
tfidf = models.TfidfModel(vectors)

import random

# Shuffle the articles in place (unseeded) before building the feature dicts.
random.shuffle(p_articles)

# One feature dict per article: every dictionary id starts at 0 and the ids
# present in the article are overwritten with their TF-IDF weight.
vocab_size = len(dictionary)
featureset = []
for corpus, label in p_articles:
    weights = dict.fromkeys(range(vocab_size), 0)
    weights.update(tfidf[dictionary.doc2bow(corpus)])
    featureset.append((weights, label))
    

In [None]:
len(featureset)

In [None]:
featureset

From this data we create a training set (80%) and a test set (20%).

In [None]:


# Per-category split: the first 80% of each category goes to trainset, the rest to testset.
trainset, testset = split_samples(featureset, 0.8)

## Naive Bayes Classifier

### TF-IDF Feature Set

In [None]:
# Train NLTK's Naive Bayes on the TF-IDF feature dicts, report its accuracy on
# the held-out split, then list the five most informative features.
bayes_classifier = nltk.NaiveBayesClassifier.train(trainset)
tfidf_accuracy = nltk.classify.accuracy(bayes_classifier, testset)
print("TF-IDF: " + str(tfidf_accuracy))
bayes_classifier.show_most_informative_features(5)

### Multinomial Feature Set

In [None]:
# Naive Bayes trained and scored on the multinomial feature set.
# NOTE(review): `trainset_m` / `testset_m` are not defined in the visible cells —
# confirm they are created before running this cell.
bayes_classifier_m = nltk.NaiveBayesClassifier.train(trainset_m)
print("M: "+str(nltk.classify.accuracy(bayes_classifier_m, testset_m)))
bayes_classifier_m.show_most_informative_features(5)

### Normalized Multinomial Feature Set

In [None]:
# Naive Bayes trained and scored on the normalized multinomial feature set.
# NOTE(review): `trainset_mn` / `testset_mn` are not defined in the visible cells —
# confirm they are created before running this cell.
bayes_classifier_mn = nltk.NaiveBayesClassifier.train(trainset_mn)
print("MNorm: "+str(nltk.classify.accuracy(bayes_classifier_mn, testset_mn)))
bayes_classifier_mn.show_most_informative_features(5)

## Maximum Entropy Classification

### Multi-Variate Bernoulli Feature Set

In [None]:
# Maximum-entropy classifier trained with the 'IIS' algorithm, capped at 5
# iterations, on the multi-variate Bernoulli feature set.
# NOTE(review): `trainset_mvb` / `testset_mvb` are not defined in the visible cells —
# confirm they are created before running this cell.
maxent_classifier_mvb = nltk.classify.MaxentClassifier.train(trainset_mvb, 'IIS', trace=0, max_iter=5)
print("MVB: "+str(nltk.classify.accuracy(maxent_classifier_mvb, testset_mvb)))
maxent_classifier_mvb.show_most_informative_features(5)

### Multinomial Feature Set

In [None]:
# Maximum-entropy classifier trained with the 'IIS' algorithm, capped at 5
# iterations, on the multinomial feature set.
# NOTE(review): `trainset_m` / `testset_m` are not defined in the visible cells —
# confirm they are created before running this cell.
maxent_classifier_m = nltk.classify.MaxentClassifier.train(trainset_m, 'IIS', trace=0, max_iter=5)
print("M: "+str(nltk.classify.accuracy(maxent_classifier_m, testset_m)))
maxent_classifier_m.show_most_informative_features(5)

### Normalized Multinomial Feature Set

In [None]:
# Maximum-entropy classifier trained with the 'IIS' algorithm, capped at 5
# iterations, on the normalized multinomial feature set.
# NOTE(review): `trainset_mn` / `testset_mn` are not defined in the visible cells —
# confirm they are created before running this cell.
maxent_classifier_mn = nltk.classify.MaxentClassifier.train(trainset_mn, 'IIS', trace=0, max_iter=5)
print("MNorm: "+str(nltk.classify.accuracy(maxent_classifier_mn, testset_mn)))
maxent_classifier_mn.show_most_informative_features(5)

## Random Forest Classifier

### Multi-Variate Bernoulli Feature Set

In [None]:
# NOTE(review): despite the "random forest" naming, the model trained here is
# NLTK's DecisionTreeClassifier — confirm which model is intended.
# NOTE(review): `trainset_mvb` / `testset_mvb` are not defined in the visible cells —
# confirm they are created before running this cell.
randforest_classifier_mvb = nltk.classify.DecisionTreeClassifier.train(trainset_mvb, entropy_cutoff=0, support_cutoff=0)
print("MVB: "+str(nltk.classify.accuracy(randforest_classifier_mvb, testset_mvb)))

### Multinomial Feature Set

In [None]:
# NOTE(review): despite the "random forest" naming, the model trained here is
# NLTK's DecisionTreeClassifier — confirm which model is intended.
# NOTE(review): `trainset_m` / `testset_m` are not defined in the visible cells —
# confirm they are created before running this cell.
randforest_classifier_m = nltk.classify.DecisionTreeClassifier.train(trainset_m, entropy_cutoff=0, support_cutoff=0)
print("M: "+str(nltk.classify.accuracy(randforest_classifier_m, testset_m)))

### Normalized Multinomial Feature Set

In [None]:
# NOTE(review): despite the "random forest" naming, the model trained here is
# NLTK's DecisionTreeClassifier — confirm which model is intended.
# NOTE(review): `trainset_mn` / `testset_mn` are not defined in the visible cells —
# confirm they are created before running this cell.
randforest_classifier_mn = nltk.classify.DecisionTreeClassifier.train(trainset_mn, entropy_cutoff=0, support_cutoff=0)
print("MNorm: "+str(nltk.classify.accuracy(randforest_classifier_mn, testset_mn)))

## Model Deployment
Since it is the best-performing model among those I tried, I chose the Naive Bayes classifier trained on the Multi-Variate Bernoulli feature set for use in the profile-based retrieval system.

In [None]:
import pickle

# Persist the chosen classifier to disk.
# NOTE(review): `bayes_classifier_mvb` is not defined in the visible cells —
# confirm it is created before running this cell.
# The context manager closes the file even if pickling raises.
with open('nbayes-mvb.pickle', 'wb') as f:
    pickle.dump(bayes_classifier_mvb, f)

In [None]:
######### how to load a saved model? 
# import pickle
# f = open('my_classifier.pickle', 'rb')
# classifier = pickle.load(f)
# f.close()