In [1]:
import json

with open('../../material/data/news_dataset.json') as json_file:  
    data = json.load(json_file)

In [2]:
data = data['articles']

In [3]:
from datetime import datetime


#allowed_categories = ['POLITICS', 'TRAVEL', 'SPORTS', 'RELIGION', 'SCIENCE', 'TECH', 'ARTS']
allowed_categories = {'POLITICS': 0, 'ENTERTAINMENT': 0, 'HEALTHY LIVING': 0, 'TRAVEL': 0, 'BUSINESS': 0, 'SPORTS':0, 'SCIENCE': 0}

#max_per_cat = 1000

filtered_data = []
filter_date = datetime.strptime('2017-08-01', "%Y-%m-%d")

for dct in data:
    if dct['category'] in allowed_categories:
        datetime_object = datetime.strptime(dct['date'], '%Y-%m-%d')
        if dct['category'] in ['POLITICS', 'ENTERTAINMENT']:
            if datetime_object >= filter_date:
                filtered_data.append(dct)
                allowed_categories[dct['category']]+=1
        else:
            filtered_data.append(dct)
            allowed_categories[dct['category']]+=1

In [4]:
allowed_categories

{'POLITICS': 7049,
 'ENTERTAINMENT': 3058,
 'HEALTHY LIVING': 6694,
 'TRAVEL': 9887,
 'BUSINESS': 5937,
 'SPORTS': 4884,
 'SCIENCE': 2178}

In [5]:
import random

politics = []
entertainment = []
health = []
travel = []
business = []
sports = []
science = []

for dct in filtered_data:
    if dct['category'] == "POLITICS":
        politics.append(tuple((str(dct['headline']) + '. ' + str(dct['short_description']), "POLITICS")))
    if dct['category'] == "ENTERTAINMENT":
        entertainment.append(tuple((str(dct['headline']) + '. ' + str(dct['short_description']), "ENTERTAINMENT")))
    if dct['category'] == "HEALTHY LIVING":
        health.append(tuple((str(dct['headline']) + '. ' + str(dct['short_description']), "HEALTH")))
    if dct['category'] == "TRAVEL":
        travel.append(tuple((str(dct['headline']) + '. ' + str(dct['short_description']), "TRAVEL")))
    if dct['category'] == "BUSINESS":
        business.append(tuple((str(dct['headline']) + '. ' + str(dct['short_description']), "BUSINESS")))
    if dct['category'] == "SPORTS":
        sports.append(tuple((str(dct['headline']) + '. ' + str(dct['short_description']), "SPORTS")))
    if dct['category'] == "SCIENCE":
        science.append(tuple((str(dct['headline']) + '. ' + str(dct['short_description']), "SCIENCE")))
        
max_samples = 2000
politics = random.sample(politics, max_samples)
entertainment = random.sample(entertainment, max_samples)
health = random.sample(health, max_samples)
travel = random.sample(travel, max_samples)
business = random.sample(business, max_samples)
sports = random.sample(sports, max_samples)
science = random.sample(science, max_samples)

articles = politics + entertainment + health + travel + business + sports + science

In [6]:
def count_samples_per_cat(samples, cats):
    counts = {}
    for name in cats:
        counts[name] = 0
    
    for corpus, label in samples:
        counts[label]+=1
        
    return counts

In [7]:
print(count_samples_per_cat(articles, ['POLITICS', 'ENTERTAINMENT', 'HEALTH', 'TRAVEL', 'BUSINESS', 'SPORTS', 'SCIENCE']))

{'POLITICS': 2000, 'ENTERTAINMENT': 2000, 'HEALTH': 2000, 'TRAVEL': 2000, 'BUSINESS': 2000, 'SPORTS': 2000, 'SCIENCE': 2000}


### Preprocessing

In [8]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import wordpunct_tokenize

In [9]:
def preprocess_document(doc):
    stopset = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    tokens = wordpunct_tokenize(doc)
    clean = [token.lower() for token in tokens if token.lower() not in stopset and len(token) > 2]
    final = [stemmer.stem(word) for word in clean]
    return " ".join(final)

In [10]:
p_articles = []
for corpus, label in articles:
    p_corpus = preprocess_document(corpus)
    p_articles.append(tuple((p_corpus, label)))

### Feature Extraction

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from numpy import array

tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')

X = array([corpus for corpus, label in articles])
y = array([label for corpus, label in articles])

In [13]:
from sklearn.utils import shuffle

X, y = shuffle(X, y)

### Model Checking - Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [15]:
log_classifier = LogisticRegression(solver='lbfgs', multi_class = 'auto', verbose=2, max_iter = 400)

Training with 8-Fold Validation

In [16]:
kfold = KFold(8, True, 1) # random seed = 1

for train, test in kfold.split(X):

    X_train = tfidf.fit_transform(X[train]).toarray()
    y_train = y[train]
    
    log_classifier.fit(X_train, y_train)  
    
    X_test = tfidf.transform(X[test]).toarray()
    y_test = y[test]
    
    y_pred = log_classifier.predict(X_test)
    
    # get statistics
    print(confusion_matrix(y_test,y_pred))  
    print(classification_report(y_test,y_pred))  
    print(accuracy_score(y_test, y_pred))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   35.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   35.0s finished


[[187   7  29  15   2   7  10]
 [  6 209   6  16   7  10   3]
 [ 22   7 193   5  11   9   9]
 [ 12   9  12 200   1   5   6]
 [  9   5  29   2 193   8  10]
 [  5  13   7   8   7 192   2]
 [ 11   9  13   2   6   5 199]]
               precision    recall  f1-score   support

     BUSINESS       0.74      0.73      0.73       257
ENTERTAINMENT       0.81      0.81      0.81       257
       HEALTH       0.67      0.75      0.71       256
     POLITICS       0.81      0.82      0.81       245
      SCIENCE       0.85      0.75      0.80       256
       SPORTS       0.81      0.82      0.82       234
       TRAVEL       0.83      0.81      0.82       245

    micro avg       0.78      0.78      0.78      1750
    macro avg       0.79      0.79      0.79      1750
 weighted avg       0.79      0.78      0.79      1750

0.7845714285714286


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   32.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   32.4s finished


[[186   4  29  12   5   4  12]
 [  8 189   6  12   3   4   4]
 [ 18  10 174  10  20   3  11]
 [ 16   4  11 216   5   5   3]
 [ 10   2  27   2 203   6   7]
 [  2  17   2   2   4 211   7]
 [ 16   6  12   2   4   4 220]]
               precision    recall  f1-score   support

     BUSINESS       0.73      0.74      0.73       252
ENTERTAINMENT       0.81      0.84      0.83       226
       HEALTH       0.67      0.71      0.69       246
     POLITICS       0.84      0.83      0.84       260
      SCIENCE       0.83      0.79      0.81       257
       SPORTS       0.89      0.86      0.88       245
       TRAVEL       0.83      0.83      0.83       264

    micro avg       0.80      0.80      0.80      1750
    macro avg       0.80      0.80      0.80      1750
 weighted avg       0.80      0.80      0.80      1750

0.7994285714285714


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   30.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   30.9s finished


[[186   8  24  15   6   1   5]
 [  5 221   7  17   6  12   6]
 [ 17   8 193   7  13   3  11]
 [ 16   9   7 202   3   4   4]
 [ 12   7  26   4 183   5   9]
 [  7  16   6   6   3 203   5]
 [ 11   5  10   1  10   3 202]]
               precision    recall  f1-score   support

     BUSINESS       0.73      0.76      0.75       245
ENTERTAINMENT       0.81      0.81      0.81       274
       HEALTH       0.71      0.77      0.74       252
     POLITICS       0.80      0.82      0.81       245
      SCIENCE       0.82      0.74      0.78       246
       SPORTS       0.88      0.83      0.85       246
       TRAVEL       0.83      0.83      0.83       242

    micro avg       0.79      0.79      0.79      1750
    macro avg       0.80      0.79      0.79      1750
 weighted avg       0.80      0.79      0.79      1750

0.7942857142857143


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   33.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   33.1s finished


[[195   5  30  20   7   6   7]
 [  8 202   9  13   3   6   6]
 [ 15   6 169   3  12   7   7]
 [ 16  12  12 216   2   5   4]
 [  9   4  23   2 194   2   5]
 [  4  13   9   5   4 219   6]
 [  8   2   6   0   9   4 219]]
               precision    recall  f1-score   support

     BUSINESS       0.76      0.72      0.74       270
ENTERTAINMENT       0.83      0.82      0.82       247
       HEALTH       0.66      0.77      0.71       219
     POLITICS       0.83      0.81      0.82       267
      SCIENCE       0.84      0.81      0.83       239
       SPORTS       0.88      0.84      0.86       260
       TRAVEL       0.86      0.88      0.87       248

    micro avg       0.81      0.81      0.81      1750
    macro avg       0.81      0.81      0.81      1750
 weighted avg       0.81      0.81      0.81      1750

0.808


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   37.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   37.9s finished


[[179  11  22  19   6   3  10]
 [  5 209   4  13   2  12   8]
 [ 19   4 175   9  11   3   7]
 [ 15  10  15 208   1   6   3]
 [ 10   3  25   2 195   2  10]
 [  8  15   8   4   1 219   1]
 [ 12   7   7   4   6   5 217]]
               precision    recall  f1-score   support

     BUSINESS       0.72      0.72      0.72       250
ENTERTAINMENT       0.81      0.83      0.82       253
       HEALTH       0.68      0.77      0.72       228
     POLITICS       0.80      0.81      0.80       258
      SCIENCE       0.88      0.79      0.83       247
       SPORTS       0.88      0.86      0.87       256
       TRAVEL       0.85      0.84      0.84       258

    micro avg       0.80      0.80      0.80      1750
    macro avg       0.80      0.80      0.80      1750
 weighted avg       0.80      0.80      0.80      1750

0.8011428571428572


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   39.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   39.0s finished


[[173   6  30  14   4   1  10]
 [  7 202   7   9   5  15   2]
 [ 19  12 202   7  18   3   8]
 [ 19   8   5 190   0   5   4]
 [ 13  10  21   4 193   8   9]
 [  4  11   8   3   2 224   6]
 [ 13   7  10   1   8   5 205]]
               precision    recall  f1-score   support

     BUSINESS       0.70      0.73      0.71       238
ENTERTAINMENT       0.79      0.82      0.80       247
       HEALTH       0.71      0.75      0.73       269
     POLITICS       0.83      0.82      0.83       231
      SCIENCE       0.84      0.75      0.79       258
       SPORTS       0.86      0.87      0.86       258
       TRAVEL       0.84      0.82      0.83       249

    micro avg       0.79      0.79      0.79      1750
    macro avg       0.80      0.79      0.79      1750
 weighted avg       0.80      0.79      0.79      1750

0.7937142857142857


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   34.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   34.3s finished


[[173   2  28  11   7   5  14]
 [  6 207   8  13   5  11   5]
 [ 19   8 199   5  18   4  10]
 [ 16   6   5 198   2   4   4]
 [ 12   8  31   3 187   6   9]
 [  3   5   7   6   4 212   5]
 [  8   7  11   1  18   4 210]]
               precision    recall  f1-score   support

     BUSINESS       0.73      0.72      0.73       240
ENTERTAINMENT       0.85      0.81      0.83       255
       HEALTH       0.69      0.76      0.72       263
     POLITICS       0.84      0.84      0.84       235
      SCIENCE       0.78      0.73      0.75       256
       SPORTS       0.86      0.88      0.87       242
       TRAVEL       0.82      0.81      0.81       259

    micro avg       0.79      0.79      0.79      1750
    macro avg       0.79      0.79      0.79      1750
 weighted avg       0.79      0.79      0.79      1750

0.792


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   37.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   37.0s finished


[[170   9  37  11   9   6   6]
 [  7 195   7  13   6   8   5]
 [ 20   7 191   8  24   7  10]
 [ 17  11   5 210   5   9   2]
 [  3   6  26   2 189   4  11]
 [  7  20   8   8   7 204   5]
 [ 11   5   6   1   5   5 202]]
               precision    recall  f1-score   support

     BUSINESS       0.72      0.69      0.70       248
ENTERTAINMENT       0.77      0.81      0.79       241
       HEALTH       0.68      0.72      0.70       267
     POLITICS       0.83      0.81      0.82       259
      SCIENCE       0.77      0.78      0.78       241
       SPORTS       0.84      0.79      0.81       259
       TRAVEL       0.84      0.86      0.85       235

    micro avg       0.78      0.78      0.78      1750
    macro avg       0.78      0.78      0.78      1750
 weighted avg       0.78      0.78      0.78      1750

0.7777142857142857


### Model Checking - Bernoulli Naive Bayes

In [17]:
from sklearn.naive_bayes import BernoulliNB

bnb_classifier = BernoulliNB(fit_prior=True)

In [18]:
kfold = KFold(8, True, 1) # random seed = 1

for train, test in kfold.split(X):

    X_train = tfidf.fit_transform(X[train]).toarray()
    y_train = y[train]
    
    bnb_classifier.fit(X_train, y_train)  
    
    X_test = tfidf.transform(X[test]).toarray()
    y_test = y[test]
    
    y_pred = bnb_classifier.predict(X_test)
    
    # get statistics
    print(confusion_matrix(y_test,y_pred))  
    print(classification_report(y_test,y_pred))  
    print(accuracy_score(y_test, y_pred))

[[164  10  44  16   2   7  14]
 [  3 210  12  12   5  13   2]
 [ 13   6 207   1  12   9   8]
 [  9  13  16 196   2   6   3]
 [  5   4  47   2 182  12   4]
 [  6  21  12   8   4 179   4]
 [  7   9  18   2   7   7 195]]
               precision    recall  f1-score   support

     BUSINESS       0.79      0.64      0.71       257
ENTERTAINMENT       0.77      0.82      0.79       257
       HEALTH       0.58      0.81      0.68       256
     POLITICS       0.83      0.80      0.81       245
      SCIENCE       0.85      0.71      0.77       256
       SPORTS       0.77      0.76      0.77       234
       TRAVEL       0.85      0.80      0.82       245

    micro avg       0.76      0.76      0.76      1750
    macro avg       0.78      0.76      0.76      1750
 weighted avg       0.78      0.76      0.76      1750

0.7617142857142857
[[168   4  43  15   6   9   7]
 [  4 187  11  11   3   6   4]
 [  6   8 193   9  12   5  13]
 [ 15  10  14 212   3   4   2]
 [ 10   4  38   2 193   6   4]


### Model Checking - Support Vector Machine

In [19]:
from sklearn import svm

svm_classifier = svm.LinearSVC(max_iter = 1000)

In [20]:
kfold = KFold(8, True, 1) # random seed = 1

for train, test in kfold.split(X):

    X_train = tfidf.fit_transform(X[train]).toarray()
    y_train = y[train]
    
    svm_classifier.fit(X_train, y_train)  
    
    X_test = tfidf.transform(X[test]).toarray()
    y_test = y[test]
    
    y_pred = svm_classifier.predict(X_test)
    
    # get statistics
    print(confusion_matrix(y_test,y_pred))  
    print(classification_report(y_test,y_pred))  
    print(accuracy_score(y_test, y_pred))

[[179   9  29  19   3   3  15]
 [  6 206   7  15   8   9   6]
 [ 26  10 177   5  18  13   7]
 [ 13   9   9 199   2   8   5]
 [ 11   5  32   3 187   5  13]
 [  8  11   8   8  11 186   2]
 [ 12   6  13   4   9   7 194]]
               precision    recall  f1-score   support

     BUSINESS       0.70      0.70      0.70       257
ENTERTAINMENT       0.80      0.80      0.80       257
       HEALTH       0.64      0.69      0.67       256
     POLITICS       0.79      0.81      0.80       245
      SCIENCE       0.79      0.73      0.76       256
       SPORTS       0.81      0.79      0.80       234
       TRAVEL       0.80      0.79      0.80       245

    micro avg       0.76      0.76      0.76      1750
    macro avg       0.76      0.76      0.76      1750
 weighted avg       0.76      0.76      0.76      1750

0.7588571428571429
[[176   6  28  15   8   3  16]
 [ 10 183   7  15   3   3   5]
 [ 19  12 165  11  24   4  11]
 [ 15   4  13 213   4   6   5]
 [ 11   2  23   3 201   8   9]


### Model Checking - Multinomial Naive Bayes

In [21]:
from sklearn.naive_bayes import MultinomialNB

mnb_classifier = MultinomialNB()

In [22]:
kfold = KFold(8, True, 1) # random seed = 1

for train, test in kfold.split(X):

    X_train = tfidf.fit_transform(X[train]).toarray()
    y_train = y[train]
    
    mnb_classifier.fit(X_train, y_train)  
    
    X_test = tfidf.transform(X[test]).toarray()
    y_test = y[test]
    
    y_pred = mnb_classifier.predict(X_test)
    
    # get statistics
    print(confusion_matrix(y_test,y_pred))  
    print(classification_report(y_test,y_pred))  
    print(accuracy_score(y_test, y_pred))

[[178   9  30  19   1   3  17]
 [  4 209   7  16   6  13   2]
 [ 15   5 197   5  13   8  13]
 [  9  10  11 202   2   7   4]
 [  7   2  39   4 186  10   8]
 [  7  14   7  15   3 184   4]
 [  5   7   9   2   6   4 212]]
               precision    recall  f1-score   support

     BUSINESS       0.79      0.69      0.74       257
ENTERTAINMENT       0.82      0.81      0.81       257
       HEALTH       0.66      0.77      0.71       256
     POLITICS       0.77      0.82      0.80       245
      SCIENCE       0.86      0.73      0.79       256
       SPORTS       0.80      0.79      0.79       234
       TRAVEL       0.82      0.87      0.84       245

    micro avg       0.78      0.78      0.78      1750
    macro avg       0.79      0.78      0.78      1750
 weighted avg       0.79      0.78      0.78      1750

0.7817142857142857
[[176   3  35  19   2   4  13]
 [  6 187   6  14   4   2   7]
 [ 10   9 180  11  13   3  20]
 [ 16   5  12 219   3   3   2]
 [  7   3  36   3 198   5   5]
