In [1]:
import json

with open('../../material/data/news_dataset.json') as json_file:  
    data = json.load(json_file)

In [2]:
data = data['articles']

In [3]:
from datetime import datetime


#allowed_categories = ['POLITICS', 'TRAVEL', 'SPORTS', 'RELIGION', 'SCIENCE', 'TECH', 'ARTS']
allowed_categories = {'POLITICS': 0, 'ENTERTAINMENT': 0, 'HEALTHY LIVING': 0, 'TRAVEL': 0, 'BUSINESS': 0, 'SPORTS':0, 'SCIENCE': 0}

#max_per_cat = 1000

filtered_data = []
filter_date = datetime.strptime('2017-08-01', "%Y-%m-%d")

for dct in data:
    if dct['category'] in allowed_categories:
        datetime_object = datetime.strptime(dct['date'], '%Y-%m-%d')
        if dct['category'] in ['POLITICS', 'ENTERTAINMENT']:
            if datetime_object >= filter_date:
                filtered_data.append(dct)
                allowed_categories[dct['category']]+=1
        else:
            filtered_data.append(dct)
            allowed_categories[dct['category']]+=1

In [4]:
allowed_categories

{'POLITICS': 7049,
 'ENTERTAINMENT': 3058,
 'HEALTHY LIVING': 6694,
 'TRAVEL': 9887,
 'BUSINESS': 5937,
 'SPORTS': 4884,
 'SCIENCE': 2178}

In [5]:
import random

politics = []
entertainment = []
health = []
travel = []
business = []
sports = []
science = []

for dct in filtered_data:
    if dct['category'] == "POLITICS":
        politics.append(tuple((str(dct['headline']) + '. ' + str(dct['short_description']), "POLITICS")))
    if dct['category'] == "ENTERTAINMENT":
        entertainment.append(tuple((str(dct['headline']) + '. ' + str(dct['short_description']), "ENTERTAINMENT")))
    if dct['category'] == "HEALTHY LIVING":
        health.append(tuple((str(dct['headline']) + '. ' + str(dct['short_description']), "HEALTH")))
    if dct['category'] == "TRAVEL":
        travel.append(tuple((str(dct['headline']) + '. ' + str(dct['short_description']), "TRAVEL")))
    if dct['category'] == "BUSINESS":
        business.append(tuple((str(dct['headline']) + '. ' + str(dct['short_description']), "BUSINESS")))
    if dct['category'] == "SPORTS":
        sports.append(tuple((str(dct['headline']) + '. ' + str(dct['short_description']), "SPORTS")))
    if dct['category'] == "SCIENCE":
        science.append(tuple((str(dct['headline']) + '. ' + str(dct['short_description']), "SCIENCE")))
        
max_samples = 2000
politics = random.sample(politics, max_samples)
entertainment = random.sample(entertainment, max_samples)
health = random.sample(health, max_samples)
travel = random.sample(travel, max_samples)
business = random.sample(business, max_samples)
sports = random.sample(sports, max_samples)
science = random.sample(science, max_samples)

articles = politics + entertainment + health + travel + business + sports + science

In [6]:
def count_samples_per_cat(samples, cats):
    counts = {}
    for name in cats:
        counts[name] = 0
    
    for corpus, label in samples:
        counts[label]+=1
        
    return counts

In [7]:
print(count_samples_per_cat(articles, ['POLITICS', 'ENTERTAINMENT', 'HEALTH', 'TRAVEL', 'BUSINESS', 'SPORTS', 'SCIENCE']))

{'POLITICS': 2000, 'ENTERTAINMENT': 2000, 'HEALTH': 2000, 'TRAVEL': 2000, 'BUSINESS': 2000, 'SPORTS': 2000, 'SCIENCE': 2000}


### Preprocessing

In [8]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import wordpunct_tokenize

In [9]:
def preprocess_document(doc):
    stopset = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    tokens = wordpunct_tokenize(doc)
    clean = [token.lower() for token in tokens if token.lower() not in stopset and len(token) > 2]
    final = [stemmer.stem(word) for word in clean]
    return " ".join(final)

In [10]:
p_articles = []
for corpus, label in articles:
    p_corpus = preprocess_document(corpus)
    p_articles.append(tuple((p_corpus, label)))

### Feature Extraction

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')

X = [corpus for corpus, label in articles]
y = [label for corpus, label in articles]

X = tfidf.fit_transform(X).toarray()
X.shape

(14000, 8603)

### Train Test splitting

In [15]:
from sklearn.model_selection import train_test_split  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)  

In [16]:
print(count_samples_per_cat(zip(X_train, y_train), ['POLITICS', 'ENTERTAINMENT', 'HEALTH', 'TRAVEL', 'BUSINESS', 'SPORTS', 'SCIENCE']))
print(count_samples_per_cat(zip(X_test, y_test), ['POLITICS', 'ENTERTAINMENT', 'HEALTH', 'TRAVEL', 'BUSINESS', 'SPORTS', 'SCIENCE']))

{'POLITICS': 1619, 'ENTERTAINMENT': 1628, 'HEALTH': 1606, 'TRAVEL': 1580, 'BUSINESS': 1591, 'SPORTS': 1578, 'SCIENCE': 1598}
{'POLITICS': 381, 'ENTERTAINMENT': 372, 'HEALTH': 394, 'TRAVEL': 420, 'BUSINESS': 409, 'SPORTS': 422, 'SCIENCE': 402}


### Model Building - Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression

log_classifier = LogisticRegression(solver='lbfgs', multi_class = 'auto')
log_classifier.fit(X_train, y_train)  



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='auto',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [18]:
y_pred = log_classifier.predict(X_test) 

In [19]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  
print(accuracy_score(y_test, y_pred))  

[[292   7  56  26   7   8  13]
 [  7 313   8  27   6   6   5]
 [ 29  14 293  16  21   9  12]
 [ 17   9  12 327   1   7   8]
 [ 15  12  57   1 304   7   6]
 [ 11  23  14   8   6 354   6]
 [ 17  10  17   3  20   7 346]]
               precision    recall  f1-score   support

     BUSINESS       0.75      0.71      0.73       409
ENTERTAINMENT       0.81      0.84      0.82       372
       HEALTH       0.64      0.74      0.69       394
     POLITICS       0.80      0.86      0.83       381
      SCIENCE       0.83      0.76      0.79       402
       SPORTS       0.89      0.84      0.86       422
       TRAVEL       0.87      0.82      0.85       420

    micro avg       0.80      0.80      0.80      2800
    macro avg       0.80      0.80      0.80      2800
 weighted avg       0.80      0.80      0.80      2800

0.7960714285714285


Training with 6-Fold Validation

In [20]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import metrics

scores = cross_val_score(log_classifier, X, y, cv=6)



In [21]:
print(scores)

[0.80410607 0.8075278  0.7953668  0.8013728  0.7992278  0.79493779]


### Model Building - Bernoulli Naive Bayes

In [80]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

bnb_classifier = BernoulliNB()
bnb_classifier.fit(X_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [81]:
y_pred = bnb_classifier.predict(X_test) 

In [82]:
print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  
print(accuracy_score(y_test, y_pred))

[[266  14  73  22   6  14  14]
 [  1 326   8  19   7   8   3]
 [ 18  14 315  16  17   5   9]
 [ 19  11  13 323   0   9   6]
 [ 10   9  65   1 300   8   9]
 [  7  28  15   9   5 354   4]
 [ 15   9  25   3  19   8 341]]
               precision    recall  f1-score   support

     BUSINESS       0.79      0.65      0.71       409
ENTERTAINMENT       0.79      0.88      0.83       372
       HEALTH       0.61      0.80      0.69       394
     POLITICS       0.82      0.85      0.83       381
      SCIENCE       0.85      0.75      0.79       402
       SPORTS       0.87      0.84      0.86       422
       TRAVEL       0.88      0.81      0.85       420

    micro avg       0.79      0.79      0.79      2800
    macro avg       0.80      0.80      0.80      2800
 weighted avg       0.80      0.79      0.80      2800

0.7946428571428571


In [84]:
scores = cross_val_score(bnb_classifier, X, y, cv=6)

In [85]:
scores

array([0.79897348, 0.80239521, 0.78549979, 0.7953668 , 0.79279279,
       0.7953668 ])

### Model Building - Support Vector Machine

In [None]:
from sklearn import svm

svm_classifier = svm.SVC(gamma='scale', max_iter = 10)
svm_classifier.fit(X_train, y_train)

In [None]:
y_pred = svm_classifier.predict(X_test)

In [None]:
print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  
print(accuracy_score(y_test, y_pred))  

### Model Building - Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

randforest_classifier = RandomForestClassifier(n_estimators=10, random_state=0)  
randforest_classifier.fit(X_train, y_train)

In [None]:
y_pred = randforest_classifier.predict(X_test)

In [None]:
print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  
print(accuracy_score(y_test, y_pred))  

## Model Deployment

In [36]:
import pickle
pickle.dump(log_classifier, open("log_classifier.model", 'wb'))
pickle.dump(tfidf, open("tfidf_kaggledataset.model",'wb'))

## Unseen Data

In [1]:
import urllib3 as urllib
urllib.disable_warnings()
import xmltodict

In [2]:
print("Classes used: ",'POLITICS', 'ENTERTAINMENT', 'HEALTH', 'TRAVEL', 'BUSINESS', 'SPORTS', 'SCIENCE')

Classes used:  POLITICS ENTERTAINMENT HEALTH TRAVEL BUSINESS SPORTS SCIENCE


In [24]:
def nyt_retrieve(nyt_rss_url, label): # retrieve articles from New York Times
    articles = []
    
    # code dependent on the nytimes structure of RSS feed
    http = urllib.PoolManager()
    r = http.request('GET', nyt_rss_url)

    data = xmltodict.parse(r.data)
    data = data["rss"]
    data = data["channel"]
    data = data["item"]

    for key in data:
        article = key
        title, descr, extra_descr = "", "", ""
        if "title" in article and article["title"] is not None:
            title = article["title"] + ". "
        if "media:description" in article and article["media:description"] is not None:
            descr = article["media:description"]
        if "description" in article and article["description"] is not None:
            extra_descr = article["description"]

        corpus = str(title) + str(descr) + str(extra_descr)
        articles.append(tuple((corpus, label)))
        
    return articles

In [25]:
nyt = []
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Politics.xml', 'POLITICS')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Movies.xml', 'ENTERTAINMENT')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Health.xml', 'HEALTH')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Travel.xml', 'TRAVEL')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Business.xml', 'BUSINESS')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Sports.xml', 'SPORTS')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Science.xml', 'SCIENCE')

In [26]:
p_nyt = []
for corpus, label in nyt:
    p_nyt.append(tuple((preprocess_document(corpus),label)))

In [86]:
X_unseen, y_unseen = [], []
for corpus, label in p_nyt:
    X_unseen.append(corpus)
    y_unseen.append(label)

X_unseen = tfidf.transform(X_unseen).toarray()

y_pred = bnb_classifier.predict(X_unseen)

In [87]:
print(confusion_matrix(y_unseen,y_pred))  
print(classification_report(y_unseen,y_pred))  
print(accuracy_score(y_unseen, y_pred))  

[[34  2  1  5  2  1  3]
 [ 1 27  2  0  2  0  1]
 [ 4  1 13  7  5  2  1]
 [ 1  0  0 18  0  0  1]
 [ 4  0  5  7  8  0  2]
 [ 1  0  0  0  0 18  1]
 [ 2  0  0  1  2  0 18]]
               precision    recall  f1-score   support

     BUSINESS       0.72      0.71      0.72        48
ENTERTAINMENT       0.90      0.82      0.86        33
       HEALTH       0.62      0.39      0.48        33
     POLITICS       0.47      0.90      0.62        20
      SCIENCE       0.42      0.31      0.36        26
       SPORTS       0.86      0.90      0.88        20
       TRAVEL       0.67      0.78      0.72        23

    micro avg       0.67      0.67      0.67       203
    macro avg       0.67      0.69      0.66       203
 weighted avg       0.68      0.67      0.66       203

0.6699507389162561
