In [1]:
import json

with open('../../material/data/news_dataset.json') as json_file:  
    data = json.load(json_file)

In [2]:
data = data['articles']

In [3]:
from datetime import datetime


#allowed_categories = ['POLITICS', 'TRAVEL', 'SPORTS', 'RELIGION', 'SCIENCE', 'TECH', 'ARTS']
allowed_categories = {'POLITICS': 0, 'ENTERTAINMENT': 0, 'HEALTHY LIVING': 0, 'TRAVEL': 0, 'BUSINESS': 0, 'SPORTS':0, 'SCIENCE': 0}

#max_per_cat = 1000

filtered_data = []
filter_date = datetime.strptime('2017-08-01', "%Y-%m-%d")

for dct in data:
    if dct['category'] in allowed_categories:
        datetime_object = datetime.strptime(dct['date'], '%Y-%m-%d')
        if dct['category'] in ['POLITICS', 'ENTERTAINMENT']:
            if datetime_object >= filter_date:
                filtered_data.append(dct)
                allowed_categories[dct['category']]+=1
        else:
            filtered_data.append(dct)
            allowed_categories[dct['category']]+=1

In [4]:
allowed_categories

{'POLITICS': 7049,
 'ENTERTAINMENT': 3058,
 'HEALTHY LIVING': 6694,
 'TRAVEL': 9887,
 'BUSINESS': 5937,
 'SPORTS': 4884,
 'SCIENCE': 2178}

In [5]:
import random

politics = []
entertainment = []
health = []
travel = []
business = []
sports = []
science = []

for dct in filtered_data:
    if dct['category'] == "POLITICS":
        politics.append(tuple((str(dct['headline']) + '. ' + str(dct['short_description']), "POLITICS")))
    if dct['category'] == "ENTERTAINMENT":
        entertainment.append(tuple((str(dct['headline']) + '. ' + str(dct['short_description']), "ENTERTAINMENT")))
    if dct['category'] == "HEALTHY LIVING":
        health.append(tuple((str(dct['headline']) + '. ' + str(dct['short_description']), "HEALTH")))
    if dct['category'] == "TRAVEL":
        travel.append(tuple((str(dct['headline']) + '. ' + str(dct['short_description']), "TRAVEL")))
    if dct['category'] == "BUSINESS":
        business.append(tuple((str(dct['headline']) + '. ' + str(dct['short_description']), "BUSINESS")))
    if dct['category'] == "SPORTS":
        sports.append(tuple((str(dct['headline']) + '. ' + str(dct['short_description']), "SPORTS")))
    if dct['category'] == "SCIENCE":
        science.append(tuple((str(dct['headline']) + '. ' + str(dct['short_description']), "SCIENCE")))
        
max_samples = 2000
politics = random.sample(politics, max_samples)
entertainment = random.sample(entertainment, max_samples)
health = random.sample(health, max_samples)
travel = random.sample(travel, max_samples)
business = random.sample(business, max_samples)
sports = random.sample(sports, max_samples)
science = random.sample(science, max_samples)

articles = politics + entertainment + health + travel + business + sports + science

In [6]:
def count_samples_per_cat(samples, cats):
    counts = {}
    for name in cats:
        counts[name] = 0
    
    for corpus, label in samples:
        counts[label]+=1
        
    return counts

In [7]:
print(count_samples_per_cat(articles, ['POLITICS', 'ENTERTAINMENT', 'HEALTH', 'TRAVEL', 'BUSINESS', 'SPORTS', 'SCIENCE']))

{'POLITICS': 2000, 'ENTERTAINMENT': 2000, 'HEALTH': 2000, 'TRAVEL': 2000, 'BUSINESS': 2000, 'SPORTS': 2000, 'SCIENCE': 2000}


### Preprocessing

In [8]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import wordpunct_tokenize

In [9]:
def preprocess_document(doc):
    stopset = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    tokens = wordpunct_tokenize(doc)
    clean = [token.lower() for token in tokens if token.lower() not in stopset and len(token) > 2]
    final = [stemmer.stem(word) for word in clean]
    return final

In [10]:
def count_samples_per_cat(samples, cats):
    counts = {}
    for name in cats:
        counts[name] = 0
    
    for corpus, label in samples:
        counts[label]+=1
        
    return counts

In [11]:
X, y = [], []
for corpus, label in articles:
    p_corpus = preprocess_document(corpus)
    if len(p_corpus) > 0:
        X.append(p_corpus)
        y.append(label)

In [12]:
print(count_samples_per_cat(zip(X,y), ['POLITICS', 'ENTERTAINMENT', 'HEALTH', 'TRAVEL', 'BUSINESS', 'SPORTS', 'SCIENCE']))

{'POLITICS': 2000, 'ENTERTAINMENT': 2000, 'HEALTH': 1999, 'TRAVEL': 2000, 'BUSINESS': 2000, 'SPORTS': 2000, 'SCIENCE': 2000}


### Feature Extraction

In [14]:
from nltk import FreqDist

dict_from_articles = [] # list of all tokens contained in the whole train set of articles
for tokens in X: 
    dict_from_articles = dict_from_articles + tokens

print(len(dict_from_articles))
fdist = FreqDist(dict_from_articles) # compute frequency distribution

print(fdist)
topK = fdist.most_common(3000)

dictionary = []
for word, count in topK:
    dictionary.append(word)
    
print(len(dictionary))
print(dictionary[1:5])

233351
<FreqDist with 18365 samples and 233351 outcomes>
3000
['trump', 'one', 'year', 'time']


In [15]:
def extract_MVB_features(tokens):
    feature_vec = {}
    
    for word in dictionary:
        if word in tokens:
            feature_vec[word] = 1
        else:
            feature_vec[word] = 0
            
    return feature_vec

def extract_M_features(tokens):
    feature_vec = {}
    freqs = FreqDist(tokens)
        
    for word in dictionary:
        if word in freqs: # if word appears in the phrase
            feature_vec[word] = freqs[word]
        else:
            feature_vec[word] = 0
            
    return feature_vec

def extract_MNorm_features(tokens):
    feature_vec = {}
    freqs = FreqDist(tokens)
    div = len(tokens)
    
    for word in dictionary:
        if word in freqs: # if word appears in the phrase
            feature_vec[word] = freqs[word]
        else:
            feature_vec[word] = 0
        feature_vec[word] = round(feature_vec[word]/div,2)
            
    return feature_vec

In [None]:
X_mvb = [extract_MVB_features(corpus) for corpus in X]
X_m = [extract_M_features(corpus) for corpus in X]
X_mn = [extract_MNorm_features(corpus) for corpus in X]

In [20]:
y_mvb = y.copy()
y_m = y.copy()
y_mn = y.copy()

### Train Test splitting

In [21]:
from sklearn.model_selection import train_test_split  
X_train_mvb, X_test_mvb, y_train_mvb, y_test_mvb = train_test_split(X_mvb, y_mvb, test_size=0.2, random_state=0)  
X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(X_m, y_m, test_size=0.2, random_state=0)
X_train_mn, X_test_mn, y_train_mn, y_test_mn = train_test_split(X_mn, y_mn, test_size=0.2, random_state=0)

In [22]:
print(count_samples_per_cat(zip(X_train, y_train), ['POLITICS', 'ENTERTAINMENT', 'HEALTH', 'TRAVEL', 'BUSINESS', 'SPORTS', 'SCIENCE']))
print(count_samples_per_cat(zip(X_test, y_test), ['POLITICS', 'ENTERTAINMENT', 'HEALTH', 'TRAVEL', 'BUSINESS', 'SPORTS', 'SCIENCE']))

{'POLITICS': 1619, 'ENTERTAINMENT': 1628, 'HEALTH': 1605, 'TRAVEL': 1568, 'BUSINESS': 1593, 'SPORTS': 1589, 'SCIENCE': 1597}
{'POLITICS': 381, 'ENTERTAINMENT': 372, 'HEALTH': 394, 'TRAVEL': 432, 'BUSINESS': 407, 'SPORTS': 411, 'SCIENCE': 403}


## Model Building - Naive Bayes Classifier

### Multi-Variate Bernoulli Feature Set

In [24]:
import nltk

bayes_classifier_mvb = nltk.NaiveBayesClassifier.train(zip(X_train_mvb, y_train_mvb))
print("MVB: "+str(nltk.classify.accuracy(bayes_classifier_mvb, zip(X_test_mvb, y_test_mvb))))
bayes_classifier_mvb.show_most_informative_features(5)

MVB: 0
Most Informative Features
                  travel = 1              TRAVEL : ENTERT =    233.6 : 1.0
                   trump = 1              POLITI : TRAVEL =    195.8 : 1.0
               scientist = 1              SCIENC : SPORTS =    135.0 : 1.0
                   studi = 1              SCIENC : ENTERT =    103.0 : 1.0
                research = 1              SCIENC : ENTERT =     98.9 : 1.0


### Model Building - Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

log_classifier = LogisticRegression(solver='lbfgs', multi_class = 'auto')
log_classifier.fit(X_train, y_train)  

In [None]:
y_pred = log_classifier.predict(X_test) 

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  
print(accuracy_score(y_test, y_pred))  

Training with 6-Fold Validation

In [None]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import metrics

scores = cross_val_score(log_classifier, X, y, cv=6)

In [None]:
print(scores)

### Model Building - Bernoulli Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

bnb_classifier = BernoulliNB()
bnb_classifier.fit(X_train, y_train)

In [None]:
y_pred = bnb_classifier.predict(X_test) 

In [None]:
print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  
print(accuracy_score(y_test, y_pred))

In [None]:
scores = cross_val_score(bnb_classifier, X, y, cv=6)

In [None]:
scores

### Model Building - Support Vector Machine

In [None]:
from sklearn import svm

svm_classifier = svm.SVC(gamma='scale', max_iter = 10)
svm_classifier.fit(X_train, y_train)

In [None]:
y_pred = svm_classifier.predict(X_test)

In [None]:
print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  
print(accuracy_score(y_test, y_pred))  

### Model Building - Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

randforest_classifier = RandomForestClassifier(n_estimators=10, random_state=0)  
randforest_classifier.fit(X_train, y_train)

In [None]:
y_pred = randforest_classifier.predict(X_test)

In [None]:
print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  
print(accuracy_score(y_test, y_pred))  

## Model Deployment

In [None]:
import pickle
pickle.dump(log_classifier, open("log_classifier.model", 'wb'))
pickle.dump(tfidf, open("tfidf_kaggledataset.model",'wb'))

## Unseen Data

In [None]:
import urllib3 as urllib
urllib.disable_warnings()
import xmltodict

In [None]:
print("Classes used: ",'POLITICS', 'ENTERTAINMENT', 'HEALTH', 'TRAVEL', 'BUSINESS', 'SPORTS', 'SCIENCE')

In [None]:
def nyt_retrieve(nyt_rss_url, label): # retrieve articles from New York Times
    articles = []
    
    # code dependent on the nytimes structure of RSS feed
    http = urllib.PoolManager()
    r = http.request('GET', nyt_rss_url)

    data = xmltodict.parse(r.data)
    data = data["rss"]
    data = data["channel"]
    data = data["item"]

    for key in data:
        article = key
        title, descr, extra_descr = "", "", ""
        if "title" in article and article["title"] is not None:
            title = article["title"] + ". "
        if "media:description" in article and article["media:description"] is not None:
            descr = article["media:description"]
        if "description" in article and article["description"] is not None:
            extra_descr = article["description"]

        corpus = str(title) + str(descr) + str(extra_descr)
        articles.append(tuple((corpus, label)))
        
    return articles

In [None]:
nyt = []
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Politics.xml', 'POLITICS')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Movies.xml', 'ENTERTAINMENT')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Health.xml', 'HEALTH')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Travel.xml', 'TRAVEL')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Business.xml', 'BUSINESS')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Sports.xml', 'SPORTS')
nyt += nyt_retrieve('http://rss.nytimes.com/services/xml/rss/nyt/Science.xml', 'SCIENCE')

In [None]:
p_nyt = []
for corpus, label in nyt:
    p_nyt.append(tuple((preprocess_document(corpus),label)))

In [None]:
X_unseen, y_unseen = [], []
for corpus, label in nyt:
    X_unseen.append(corpus)
    y_unseen.append(label)

X_unseen = tfidf.transform(X_unseen).toarray()

y_pred = bnb_classifier.predict(X_unseen)

In [None]:
print(confusion_matrix(y_unseen,y_pred))  
print(classification_report(y_unseen,y_pred))  
print(accuracy_score(y_unseen, y_pred))  