In [1]:
import json

# Load the raw news dump. The encoding is pinned to UTF-8 (the JSON standard
# encoding) so the read does not depend on the platform's default locale.
with open('../../material/data/news_dataset.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

In [2]:
# Unwrap the article records from the top-level JSON object.
# The guard keeps this cell re-runnable: once `data` has already been replaced
# by the article collection, indexing it with 'articles' again would fail.
if isinstance(data, dict):
    data = data['articles']

In [3]:
from datetime import datetime


# Categories kept for classification. The dict doubles as a running counter of
# how many articles of each category pass the filter below.
allowed_categories = dict.fromkeys(
    ['POLITICS', 'ENTERTAINMENT', 'HEALTHY LIVING', 'TRAVEL', 'BUSINESS', 'SPORTS', 'SCIENCE'],
    0,
)

filtered_data = []
filter_date = datetime.strptime('2017-08-01', "%Y-%m-%d")
# Only these categories are restricted to articles dated on/after filter_date.
date_limited = ('POLITICS', 'ENTERTAINMENT')

for article in data:
    category = article['category']
    if category not in allowed_categories:
        continue

    # The date is parsed for every allowed article, exactly as before, so a
    # malformed date still raises regardless of category.
    published = datetime.strptime(article['date'], '%Y-%m-%d')
    if category in date_limited and published < filter_date:
        continue

    filtered_data.append(article)
    allowed_categories[category] += 1

In [4]:
# Display the number of articles per category that passed the filter above.
allowed_categories

{'POLITICS': 7049,
 'ENTERTAINMENT': 3058,
 'HEALTHY LIVING': 6694,
 'TRAVEL': 9887,
 'BUSINESS': 5937,
 'SPORTS': 4884,
 'SCIENCE': 2178}

In [5]:
import random

# Raw dataset category -> label used for classification.
# Note that 'HEALTHY LIVING' is relabelled to 'HEALTH'.
label_by_category = {
    'POLITICS': 'POLITICS',
    'ENTERTAINMENT': 'ENTERTAINMENT',
    'HEALTHY LIVING': 'HEALTH',
    'TRAVEL': 'TRAVEL',
    'BUSINESS': 'BUSINESS',
    'SPORTS': 'SPORTS',
    'SCIENCE': 'SCIENCE',
}

# Collect (text, label) pairs per label; the text is "headline. short_description".
samples_by_label = {label: [] for label in label_by_category.values()}
for dct in filtered_data:
    label = label_by_category.get(dct['category'])
    if label is None:
        continue
    text = str(dct['headline']) + '. ' + str(dct['short_description'])
    samples_by_label[label].append((text, label))

max_samples = 2000

# Dedicated, seeded generator so the same articles are drawn on every run
# (the module-level random.sample gave a different subset each time).
# sample() raises ValueError if a category holds fewer than max_samples items.
sampler = random.Random(1)

politics = sampler.sample(samples_by_label['POLITICS'], max_samples)
entertainment = sampler.sample(samples_by_label['ENTERTAINMENT'], max_samples)
health = sampler.sample(samples_by_label['HEALTH'], max_samples)
travel = sampler.sample(samples_by_label['TRAVEL'], max_samples)
business = sampler.sample(samples_by_label['BUSINESS'], max_samples)
sports = sampler.sample(samples_by_label['SPORTS'], max_samples)
science = sampler.sample(samples_by_label['SCIENCE'], max_samples)

# Balanced corpus: max_samples articles from each of the seven categories.
articles = politics + entertainment + health + travel + business + sports + science

In [6]:
def count_samples_per_cat(samples, cats):
    """Count how many samples carry each label.

    Args:
        samples: iterable of (text, label) pairs.
        cats: iterable of label names; each one starts with a count of 0.

    Returns:
        dict mapping every name in ``cats`` to its number of occurrences.

    Raises:
        KeyError: if a sample's label is not listed in ``cats``.
    """
    tally = dict.fromkeys(cats, 0)
    for _, category in samples:
        tally[category] += 1
    return tally

In [7]:
# Sanity check: every label should appear the same number of times after sampling.
sampled_labels = ['POLITICS', 'ENTERTAINMENT', 'HEALTH', 'TRAVEL', 'BUSINESS', 'SPORTS', 'SCIENCE']
print(count_samples_per_cat(articles, sampled_labels))

{'POLITICS': 2000, 'ENTERTAINMENT': 2000, 'HEALTH': 2000, 'TRAVEL': 2000, 'BUSINESS': 2000, 'SPORTS': 2000, 'SCIENCE': 2000}


### Preprocessing

In [8]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import wordpunct_tokenize

In [9]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _preprocessing_resources():
    """Build the English stop-word set and the stemmer once and reuse them."""
    return frozenset(stopwords.words('english')), PorterStemmer()


def preprocess_document(doc):
    """Normalise a document into a space-separated string of stemmed tokens.

    The text is tokenised with ``wordpunct_tokenize``; tokens of length <= 2 and
    English stop words are dropped; the remaining tokens are lower-cased and
    stemmed with the Porter stemmer.

    Args:
        doc: the raw document text.

    Returns:
        The stemmed tokens joined by single spaces ("" if nothing survives).
    """
    # Loading the stop-word list on every call is wasted work when this
    # function is applied to thousands of documents, so it is cached.
    stopset, stemmer = _preprocessing_resources()

    clean = []
    for token in wordpunct_tokenize(doc):
        lowered = token.lower()
        # The length check uses the original token, as in the original code.
        if len(token) > 2 and lowered not in stopset:
            clean.append(lowered)

    return " ".join(stemmer.stem(word) for word in clean)

In [10]:
# Preprocess every article text while keeping its label attached.
p_articles = [
    (preprocess_document(text), category)
    for text, category in articles
]

### Feature Extraction

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from numpy import array

# Count-based feature extractor over 1- and 2-grams.
wordfreq = CountVectorizer(
    ngram_range=(1, 2),
    token_pattern=r'\b\w+\b',
    min_df=5,
    stop_words='english',
)

# NOTE(review): the features come from the raw `articles`, not from the
# preprocessed `p_articles` built earlier — confirm this is intentional.
X = array([text for text, _ in articles])
y = array([category for _, category in articles])

In [12]:
from sklearn.utils import shuffle

# Shuffle texts and labels together. The fixed seed keeps the resulting order,
# and therefore the index-based KFold splits used later, identical across runs.
X, y = shuffle(X, y, random_state=1)

### Model Checking - Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [34]:
# Logistic regression model evaluated in the cross-validation cell below.
log_classifier = LogisticRegression(
    solver='lbfgs',
    multi_class='auto',
    verbose=2,
    max_iter=400,
)

Training with 8-fold cross-validation

In [36]:
# 8 shuffled folds with random seed 1. Keyword arguments are used because newer
# scikit-learn releases no longer accept shuffle/random_state positionally.
kfold = KFold(n_splits=8, shuffle=True, random_state=1)

for train, test in kfold.split(X):

    # The vocabulary is fitted on the training fold only and then reused,
    # unchanged, to encode the held-out fold.
    # NOTE(review): .toarray() densifies the count matrix; the estimator likely
    # accepts the sparse matrix directly — confirm before dropping it.
    X_train = wordfreq.fit_transform(X[train]).toarray()
    y_train = y[train]

    log_classifier.fit(X_train, y_train)

    X_test = wordfreq.transform(X[test]).toarray()
    y_test = y[test]

    y_pred = log_classifier.predict(X_test)

    # Per-fold statistics
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print(accuracy_score(y_test, y_pred))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   41.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   41.8s finished


[[169   5  30  10  12   8   8]
 [  6 191   9  19   5   9   6]
 [ 17   3 188   5  15  12  11]
 [ 19  12   6 216   4   8   4]
 [ 11   2  35   6 166   5  10]
 [  6  12  11   4   2 204   7]
 [ 10   3  10   4  10   3 222]]
               precision    recall  f1-score   support

     BUSINESS       0.71      0.70      0.70       242
ENTERTAINMENT       0.84      0.78      0.81       245
       HEALTH       0.65      0.75      0.70       251
     POLITICS       0.82      0.80      0.81       269
      SCIENCE       0.78      0.71      0.74       235
       SPORTS       0.82      0.83      0.82       246
       TRAVEL       0.83      0.85      0.84       262

    micro avg       0.77      0.77      0.77      1750
    macro avg       0.78      0.77      0.77      1750
 weighted avg       0.78      0.77      0.78      1750

0.7748571428571429


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   39.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   39.7s finished


[[194  11  25  18  13   6   5]
 [  8 219   8  14   3  10   3]
 [ 26   5 166   4  18   7   6]
 [ 13  10   8 207   3   6   5]
 [  9   5  24   1 189   6   6]
 [  5  14   9   5   7 187   2]
 [ 12   4  18   3  14   4 205]]
               precision    recall  f1-score   support

     BUSINESS       0.73      0.71      0.72       272
ENTERTAINMENT       0.82      0.83      0.82       265
       HEALTH       0.64      0.72      0.68       232
     POLITICS       0.82      0.82      0.82       252
      SCIENCE       0.77      0.79      0.78       240
       SPORTS       0.83      0.82      0.82       229
       TRAVEL       0.88      0.79      0.83       260

    micro avg       0.78      0.78      0.78      1750
    macro avg       0.78      0.78      0.78      1750
 weighted avg       0.78      0.78      0.78      1750

0.7811428571428571


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   40.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   40.5s finished


[[173   9  23  20  13   9   9]
 [  2 180  11   8  11   9   4]
 [ 19  10 205  10  13   3  10]
 [ 18  11   4 204   5   8   5]
 [  8   2  28   6 175   6  13]
 [  7  13  17   6   9 207   6]
 [  9   6  18   4   7   8 189]]
               precision    recall  f1-score   support

     BUSINESS       0.73      0.68      0.70       256
ENTERTAINMENT       0.78      0.80      0.79       225
       HEALTH       0.67      0.76      0.71       270
     POLITICS       0.79      0.80      0.80       255
      SCIENCE       0.75      0.74      0.74       238
       SPORTS       0.83      0.78      0.80       265
       TRAVEL       0.80      0.78      0.79       241

    micro avg       0.76      0.76      0.76      1750
    macro avg       0.76      0.76      0.76      1750
 weighted avg       0.76      0.76      0.76      1750

0.7617142857142857


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   43.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   43.2s finished


[[169   4  29  16   9   9  14]
 [  6 192   8  12   5  11   3]
 [ 20   5 197   9  20   3  13]
 [ 17  15  11 193   7   3   6]
 [  9   5  30   2 193   7  11]
 [  3  21   7   3   8 213   2]
 [  5   1  17   1   8   6 192]]
               precision    recall  f1-score   support

     BUSINESS       0.74      0.68      0.71       250
ENTERTAINMENT       0.79      0.81      0.80       237
       HEALTH       0.66      0.74      0.70       267
     POLITICS       0.82      0.77      0.79       252
      SCIENCE       0.77      0.75      0.76       257
       SPORTS       0.85      0.83      0.84       257
       TRAVEL       0.80      0.83      0.82       230

    micro avg       0.77      0.77      0.77      1750
    macro avg       0.77      0.77      0.77      1750
 weighted avg       0.77      0.77      0.77      1750

0.7708571428571429


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   44.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   44.5s finished


[[162   5  24  14   5   9   8]
 [ 13 228   3  13   5  15   7]
 [ 16   4 183   2  23   5  11]
 [ 15  10   7 200   3   6   4]
 [ 10   4  38   2 181   6   7]
 [  5  12  11   6   8 214   1]
 [ 12   6  12   4  14   7 190]]
               precision    recall  f1-score   support

     BUSINESS       0.70      0.71      0.70       227
ENTERTAINMENT       0.85      0.80      0.82       284
       HEALTH       0.66      0.75      0.70       244
     POLITICS       0.83      0.82      0.82       245
      SCIENCE       0.76      0.73      0.74       248
       SPORTS       0.82      0.83      0.82       257
       TRAVEL       0.83      0.78      0.80       245

    micro avg       0.78      0.78      0.78      1750
    macro avg       0.78      0.77      0.77      1750
 weighted avg       0.78      0.78      0.78      1750

0.776


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   46.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   46.8s finished


[[176   7  40   8  14   6  12]
 [  8 190  12  15   5  16   5]
 [ 18  10 166   6  27   4  13]
 [ 19   9  11 179   3   6   4]
 [  6   8  20   2 221  10   7]
 [  7  18   4   2   6 201   3]
 [ 15   8  13   2  17   6 185]]
               precision    recall  f1-score   support

     BUSINESS       0.71      0.67      0.69       263
ENTERTAINMENT       0.76      0.76      0.76       251
       HEALTH       0.62      0.68      0.65       244
     POLITICS       0.84      0.77      0.80       231
      SCIENCE       0.75      0.81      0.78       274
       SPORTS       0.81      0.83      0.82       241
       TRAVEL       0.81      0.75      0.78       246

    micro avg       0.75      0.75      0.75      1750
    macro avg       0.76      0.75      0.75      1750
 weighted avg       0.76      0.75      0.75      1750

0.7531428571428571


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   44.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   44.8s finished


[[165   6  24  26   7   4  11]
 [  2 206   9  12   8  10   2]
 [  9   3 172  10  22  11   9]
 [ 21   6  16 191   7   8   3]
 [ 11   3  29   1 197   6  10]
 [  8  12  18   3   1 213   5]
 [  9   3  16   2   9   4 210]]
               precision    recall  f1-score   support

     BUSINESS       0.73      0.68      0.71       243
ENTERTAINMENT       0.86      0.83      0.84       249
       HEALTH       0.61      0.73      0.66       236
     POLITICS       0.78      0.76      0.77       252
      SCIENCE       0.78      0.77      0.78       257
       SPORTS       0.83      0.82      0.83       260
       TRAVEL       0.84      0.83      0.83       253

    micro avg       0.77      0.77      0.77      1750
    macro avg       0.78      0.77      0.77      1750
 weighted avg       0.78      0.77      0.78      1750

0.7737142857142857


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   51.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   51.9s finished


[[180   6  30  15   7   4   5]
 [  6 194   7   9   3  18   7]
 [ 23   6 187  10  20   2   8]
 [ 19  12  12 190   1   6   4]
 [  9   7  30   5 182  11   7]
 [  8   6   4   9   5 207   6]
 [  9  14  17   3   9   2 209]]
               precision    recall  f1-score   support

     BUSINESS       0.71      0.73      0.72       247
ENTERTAINMENT       0.79      0.80      0.79       244
       HEALTH       0.65      0.73      0.69       256
     POLITICS       0.79      0.78      0.78       244
      SCIENCE       0.80      0.73      0.76       251
       SPORTS       0.83      0.84      0.84       245
       TRAVEL       0.85      0.79      0.82       263

    micro avg       0.77      0.77      0.77      1750
    macro avg       0.77      0.77      0.77      1750
 weighted avg       0.77      0.77      0.77      1750

0.7708571428571429


### Model Checking - Bernoulli Naive Bayes

In [37]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli Naive Bayes model evaluated in the cross-validation cell below.
bnb_classifier = BernoulliNB(fit_prior=True)

In [38]:
# 8 shuffled folds with random seed 1. Keyword arguments are used because newer
# scikit-learn releases no longer accept shuffle/random_state positionally.
kfold = KFold(n_splits=8, shuffle=True, random_state=1)

for train, test in kfold.split(X):

    # The vocabulary is fitted on the training fold only and then reused,
    # unchanged, to encode the held-out fold.
    # NOTE(review): .toarray() densifies the count matrix; the estimator likely
    # accepts the sparse matrix directly — confirm before dropping it.
    X_train = wordfreq.fit_transform(X[train]).toarray()
    y_train = y[train]

    bnb_classifier.fit(X_train, y_train)

    X_test = wordfreq.transform(X[test]).toarray()
    y_test = y[test]

    y_pred = bnb_classifier.predict(X_test)

    # Per-fold statistics
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print(accuracy_score(y_test, y_pred))

[[153   9  42  16   8   5   9]
 [  3 202   6  15   4  11   4]
 [ 11   4 204   4  12   7   9]
 [ 19  10   9 218   3   7   3]
 [  8   6  40   2 165   7   7]
 [  4  10  11   6   0 212   3]
 [ 12   8  12   1  11   1 217]]
               precision    recall  f1-score   support

     BUSINESS       0.73      0.63      0.68       242
ENTERTAINMENT       0.81      0.82      0.82       245
       HEALTH       0.63      0.81      0.71       251
     POLITICS       0.83      0.81      0.82       269
      SCIENCE       0.81      0.70      0.75       235
       SPORTS       0.85      0.86      0.85       246
       TRAVEL       0.86      0.83      0.84       262

    micro avg       0.78      0.78      0.78      1750
    macro avg       0.79      0.78      0.78      1750
 weighted avg       0.79      0.78      0.78      1750

0.7834285714285715
[[186   8  36  19   6  12   5]
 [  3 216   6  14   6  14   6]
 [ 13   6 186   3  13   8   3]
 [ 10   8   6 215   5   5   3]
 [  7   9  41   1 175   4   3]


### Model Checking - Support Vector Machine

In [42]:
from sklearn import svm

# Linear support vector classifier evaluated in the cross-validation cell below.
svm_classifier = svm.LinearSVC(max_iter=1000)

In [43]:
# 8 shuffled folds with random seed 1. Keyword arguments are used because newer
# scikit-learn releases no longer accept shuffle/random_state positionally.
kfold = KFold(n_splits=8, shuffle=True, random_state=1)

for train, test in kfold.split(X):

    # The vocabulary is fitted on the training fold only and then reused,
    # unchanged, to encode the held-out fold.
    # NOTE(review): .toarray() densifies the count matrix; the estimator likely
    # accepts the sparse matrix directly — confirm before dropping it.
    X_train = wordfreq.fit_transform(X[train]).toarray()
    y_train = y[train]

    svm_classifier.fit(X_train, y_train)

    X_test = wordfreq.transform(X[test]).toarray()
    y_test = y[test]

    y_pred = svm_classifier.predict(X_test)

    # Per-fold statistics
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print(accuracy_score(y_test, y_pred))

[[160   9  27  15  15   7   9]
 [  4 182  12  22   7  13   5]
 [ 21  10 172   6  18  13  11]
 [ 24  12   7 208   5   9   4]
 [ 15   7  32   5 157   6  13]
 [  7  10  12   7   4 200   6]
 [ 10   5  10   5  10   7 215]]
               precision    recall  f1-score   support

     BUSINESS       0.66      0.66      0.66       242
ENTERTAINMENT       0.77      0.74      0.76       245
       HEALTH       0.63      0.69      0.66       251
     POLITICS       0.78      0.77      0.77       269
      SCIENCE       0.73      0.67      0.70       235
       SPORTS       0.78      0.81      0.80       246
       TRAVEL       0.82      0.82      0.82       262

    micro avg       0.74      0.74      0.74      1750
    macro avg       0.74      0.74      0.74      1750
 weighted avg       0.74      0.74      0.74      1750

0.7394285714285714
[[177  13  30  24  14  10   4]
 [  9 204  12  13   6  15   6]
 [ 28   8 160   5  19   7   5]
 [ 15  11  10 193   5  11   7]
 [  9   6  26   2 183   5   9]


### Model Checking - Multinomial Naive Bayes

In [14]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes model (constructor defaults) evaluated in the cross-validation cell below.
mnb_classifier = MultinomialNB()

In [15]:
# 8 shuffled folds with random seed 1. Keyword arguments are used because newer
# scikit-learn releases no longer accept shuffle/random_state positionally.
kfold = KFold(n_splits=8, shuffle=True, random_state=1)

for train, test in kfold.split(X):

    # The vocabulary is fitted on the training fold only and then reused,
    # unchanged, to encode the held-out fold.
    # NOTE(review): .toarray() densifies the count matrix; the estimator likely
    # accepts the sparse matrix directly — confirm before dropping it.
    X_train = wordfreq.fit_transform(X[train]).toarray()
    y_train = y[train]

    mnb_classifier.fit(X_train, y_train)

    X_test = wordfreq.transform(X[test]).toarray()
    y_test = y[test]

    y_pred = mnb_classifier.predict(X_test)

    # Per-fold statistics
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print(accuracy_score(y_test, y_pred))

[[179   4  32  21   3   2  12]
 [  2 232   8  13   6  12   9]
 [ 15   8 165   4  20   7  11]
 [ 13  16   4 206   3   7   4]
 [  6   0  35   2 182  13   8]
 [  3   7   7  14   2 209   4]
 [  6   5   7   3   9   3 207]]
               precision    recall  f1-score   support

     BUSINESS       0.80      0.71      0.75       253
ENTERTAINMENT       0.85      0.82      0.84       282
       HEALTH       0.64      0.72      0.68       230
     POLITICS       0.78      0.81      0.80       253
      SCIENCE       0.81      0.74      0.77       246
       SPORTS       0.83      0.85      0.84       246
       TRAVEL       0.81      0.86      0.84       240

    micro avg       0.79      0.79      0.79      1750
    macro avg       0.79      0.79      0.79      1750
 weighted avg       0.79      0.79      0.79      1750

0.7885714285714286
[[176   6  37  21   7   3  10]
 [  3 211   1   9   7  13   5]
 [ 20   5 195  11   8   2  11]
 [ 14   4   7 197   3   4   4]
 [ 12   5  31   3 175   6   8]
