In [1]:
import json

with open('../../material/data/news_dataset.json') as json_file:  
    data = json.load(json_file)

In [2]:
data = data['articles']

In [3]:
from datetime import datetime


#allowed_categories = ['POLITICS', 'TRAVEL', 'SPORTS', 'RELIGION', 'SCIENCE', 'TECH', 'ARTS']
allowed_categories = {'POLITICS': 0, 'ENTERTAINMENT': 0, 'HEALTHY LIVING': 0, 'TRAVEL': 0, 'BUSINESS': 0, 'SPORTS':0, 'SCIENCE': 0}

#max_per_cat = 1000

filtered_data = []
filter_date = datetime.strptime('2017-08-01', "%Y-%m-%d")

for dct in data:
    if dct['category'] in allowed_categories:
        datetime_object = datetime.strptime(dct['date'], '%Y-%m-%d')
        if dct['category'] in ['POLITICS', 'ENTERTAINMENT']:
            if datetime_object >= filter_date:
                filtered_data.append(dct)
                allowed_categories[dct['category']]+=1
        else:
            filtered_data.append(dct)
            allowed_categories[dct['category']]+=1

In [4]:
allowed_categories

{'POLITICS': 7049,
 'ENTERTAINMENT': 3058,
 'HEALTHY LIVING': 6694,
 'TRAVEL': 9887,
 'BUSINESS': 5937,
 'SPORTS': 4884,
 'SCIENCE': 2178}

In [5]:
import random

politics = []
entertainment = []
health = []
travel = []
business = []
sports = []
science = []

for dct in filtered_data:
    if dct['category'] == "POLITICS":
        politics.append(tuple((str(dct['headline']) + '. ' + str(dct['short_description']), "POLITICS")))
    if dct['category'] == "ENTERTAINMENT":
        entertainment.append(tuple((str(dct['headline']) + '. ' + str(dct['short_description']), "ENTERTAINMENT")))
    if dct['category'] == "HEALTHY LIVING":
        health.append(tuple((str(dct['headline']) + '. ' + str(dct['short_description']), "HEALTH")))
    if dct['category'] == "TRAVEL":
        travel.append(tuple((str(dct['headline']) + '. ' + str(dct['short_description']), "TRAVEL")))
    if dct['category'] == "BUSINESS":
        business.append(tuple((str(dct['headline']) + '. ' + str(dct['short_description']), "BUSINESS")))
    if dct['category'] == "SPORTS":
        sports.append(tuple((str(dct['headline']) + '. ' + str(dct['short_description']), "SPORTS")))
    if dct['category'] == "SCIENCE":
        science.append(tuple((str(dct['headline']) + '. ' + str(dct['short_description']), "SCIENCE")))
        
max_samples = 2000
politics = random.sample(politics, max_samples)
entertainment = random.sample(entertainment, max_samples)
health = random.sample(health, max_samples)
travel = random.sample(travel, max_samples)
business = random.sample(business, max_samples)
sports = random.sample(sports, max_samples)
science = random.sample(science, max_samples)

articles = politics + entertainment + health + travel + business + sports + science

In [6]:
def count_samples_per_cat(samples, cats):
    counts = {}
    for name in cats:
        counts[name] = 0
    
    for corpus, label in samples:
        counts[label]+=1
        
    return counts

In [7]:
print(count_samples_per_cat(articles, ['POLITICS', 'ENTERTAINMENT', 'HEALTH', 'TRAVEL', 'BUSINESS', 'SPORTS', 'SCIENCE']))

{'POLITICS': 2000, 'ENTERTAINMENT': 2000, 'HEALTH': 2000, 'TRAVEL': 2000, 'BUSINESS': 2000, 'SPORTS': 2000, 'SCIENCE': 2000}


### Preprocessing

In [8]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import wordpunct_tokenize

In [9]:
def preprocess_document(doc):
    stopset = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    tokens = wordpunct_tokenize(doc)
    clean = [token.lower() for token in tokens if token.lower() not in stopset and len(token) > 2]
    final = [stemmer.stem(word) for word in clean]
    return " ".join(final)

In [10]:
p_articles = []
for corpus, label in articles:
    p_corpus = preprocess_document(corpus)
    p_articles.append(tuple((p_corpus, label)))

### Feature Extraction

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from numpy import array

tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')

X = array([corpus for corpus, label in articles])
y = [label for corpus, label in articles]

### Train Test Split

In [60]:
from sklearn.model_selection import train_test_split  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)  

In [61]:
print(count_samples_per_cat(zip(X_train, y_train), ['POLITICS', 'ENTERTAINMENT', 'HEALTH', 'TRAVEL', 'BUSINESS', 'SPORTS', 'SCIENCE']))
print(count_samples_per_cat(zip(X_test, y_test), ['POLITICS', 'ENTERTAINMENT', 'HEALTH', 'TRAVEL', 'BUSINESS', 'SPORTS', 'SCIENCE']))

{'POLITICS': 1599, 'ENTERTAINMENT': 1620, 'HEALTH': 1612, 'TRAVEL': 1596, 'BUSINESS': 1605, 'SPORTS': 1579, 'SCIENCE': 1589}
{'POLITICS': 401, 'ENTERTAINMENT': 380, 'HEALTH': 388, 'TRAVEL': 404, 'BUSINESS': 395, 'SPORTS': 421, 'SCIENCE': 411}


### Model Building - Multinomial Naive Bayes

This is the final model chosen to be integrated with the application

In [62]:
from sklearn.naive_bayes import MultinomialNB

mnb_classifier = MultinomialNB()

X_train = tfidf.fit_transform(X_train).toarray()

mnb_classifier.fit(X_train, y_train)  

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [63]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

X_test = tfidf.transform(X_test).toarray()

y_pred = mnb_classifier.predict(X_test)

print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  
print(accuracy_score(y_test, y_pred))

[[288   9  47  23  11   5  12]
 [  2 300  12  31   7  15  13]
 [ 23  13 305   8  13   4  22]
 [ 25  10  12 340   6   6   2]
 [ 13   8  54   5 306   9  16]
 [  3  27  14   7   5 361   4]
 [ 11  10  18   4   8   4 349]]
               precision    recall  f1-score   support

     BUSINESS       0.79      0.73      0.76       395
ENTERTAINMENT       0.80      0.79      0.79       380
       HEALTH       0.66      0.79      0.72       388
     POLITICS       0.81      0.85      0.83       401
      SCIENCE       0.86      0.74      0.80       411
       SPORTS       0.89      0.86      0.88       421
       TRAVEL       0.83      0.86      0.85       404

    micro avg       0.80      0.80      0.80      2800
    macro avg       0.81      0.80      0.80      2800
 weighted avg       0.81      0.80      0.80      2800

0.8032142857142858


## Model Deployment

In [64]:
import pickle
pickle.dump(mnb_classifier, open("classifier.model", 'wb'))
pickle.dump(tfidf, open("tfidf_kaggledataset.model",'wb'))

## Unseen Data

In [65]:
import urllib3 as urllib
urllib.disable_warnings()
import xmltodict

In [66]:
print("Classes used: ",'POLITICS', 'ENTERTAINMENT', 'HEALTH', 'TRAVEL', 'BUSINESS', 'SPORTS', 'SCIENCE')

Classes used:  POLITICS ENTERTAINMENT HEALTH TRAVEL BUSINESS SPORTS SCIENCE


In [67]:
def nyt_retrieve(nyt_rss_url, label): # retrieve articles from New York Times
    articles = []
    
    # code dependent on the nytimes structure of RSS feed
    http = urllib.PoolManager()
    r = http.request('GET', nyt_rss_url)

    data = xmltodict.parse(r.data)
    data = data["rss"]
    data = data["channel"]
    data = data["item"]

    for key in data:
        article = key
        title, descr, extra_descr = "", "", ""
        if "title" in article and article["title"] is not None:
            title = article["title"] + ". "
        if "media:description" in article and article["media:description"] is not None:
            descr = article["media:description"]
        if "description" in article and article["description"] is not None:
            extra_descr = article["description"]

        corpus = str(title) + str(descr) + str(extra_descr)
        articles.append(tuple((corpus, label)))
        
    return articles

In [68]:
nyt = []
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Politics.xml', 'POLITICS')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Movies.xml', 'ENTERTAINMENT')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Health.xml', 'HEALTH')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Travel.xml', 'TRAVEL')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Business.xml', 'BUSINESS')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Sports.xml', 'SPORTS')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Science.xml', 'SCIENCE')

In [69]:
p_nyt = []
for corpus, label in nyt:
    p_nyt.append(tuple((preprocess_document(corpus),label)))

In [71]:
X_unseen, y_unseen = [], []
for corpus, label in p_nyt:
    X_unseen.append(corpus)
    y_unseen.append(label)

X_unseen = tfidf.transform(X_unseen).toarray()

y_pred = mnb_classifier.predict(X_unseen)

In [72]:
print(confusion_matrix(y_unseen,y_pred))  
print(classification_report(y_unseen,y_pred))  
print(accuracy_score(y_unseen, y_pred))  

[[33  1  2  4  3  2  3]
 [ 3 23  2  1  0  5  3]
 [ 5  0  6  2  0  0  0]
 [ 0  1  0 16  0  1  2]
 [ 5  0  5  6  8  1  2]
 [ 0  0  0  0  0 19  1]
 [ 0  0  1  1  0  0 18]]
               precision    recall  f1-score   support

     BUSINESS       0.72      0.69      0.70        48
ENTERTAINMENT       0.92      0.62      0.74        37
       HEALTH       0.38      0.46      0.41        13
     POLITICS       0.53      0.80      0.64        20
      SCIENCE       0.73      0.30      0.42        27
       SPORTS       0.68      0.95      0.79        20
       TRAVEL       0.62      0.90      0.73        20

    micro avg       0.66      0.66      0.66       185
    macro avg       0.65      0.67      0.64       185
 weighted avg       0.70      0.66      0.66       185

0.6648648648648648
