In [1]:
# Install ktrain
try:
  import ktrain
except:
  !pip install ktrain
  os.kill(os.getpid(), 9)

In [2]:
import pandas as pd
import numpy as np
import requests

In [29]:
# Check that a GPU is available 
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Sun Mar 19 03:46:10 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   64C    P0    28W /  70W |  14277MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
# Download the raw news-category training data.
ARTICLES_URL = 'http://128.138.93.164/news_category_trainingdata.json'
# A timeout keeps the cell from hanging forever if the host is unreachable.
response = requests.get(ARTICLES_URL, timeout=60)
# Fail here with a clear HTTP error rather than with a confusing JSON decode
# error when the server answers with an error page.
response.raise_for_status()
articles_json = response.json()

In [5]:
# Load the downloaded JSON into a DataFrame and print a summary of its
# columns, non-null counts and dtypes.
articles = pd.DataFrame(articles_json)
articles.info()

<class 'pandas.core.frame.DataFrame'>
Index: 200853 entries, 0 to 200852
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   category           200853 non-null  object
 1   headline           200853 non-null  object
 2   authors            200853 non-null  object
 3   link               200853 non-null  object
 4   short_description  200853 non-null  object
 5   date               200853 non-null  object
dtypes: object(6)
memory usage: 10.7+ MB


The dataset contains 200,853 articles (rows) and 6 columns: each article's category, headline, authors, link, short description, and date. None of the columns contain null values.

In [6]:
# Number of articles per category, most frequent first
category_counts = articles['category'].value_counts()
category_counts

POLITICS          32739
WELLNESS          17827
ENTERTAINMENT     16058
TRAVEL             9887
STYLE & BEAUTY     9649
PARENTING          8677
HEALTHY LIVING     6694
QUEER VOICES       6314
FOOD & DRINK       6226
BUSINESS           5937
COMEDY             5175
SPORTS             4884
BLACK VOICES       4528
HOME & LIVING      4195
PARENTS            3955
THE WORLDPOST      3664
WEDDINGS           3651
WOMEN              3490
IMPACT             3459
DIVORCE            3426
CRIME              3405
MEDIA              2815
WEIRD NEWS         2670
GREEN              2622
WORLDPOST          2579
RELIGION           2556
STYLE              2254
SCIENCE            2178
WORLD NEWS         2177
TASTE              2096
TECH               2082
MONEY              1707
ARTS               1509
FIFTY              1401
GOOD NEWS          1398
ARTS & CULTURE     1339
ENVIRONMENT        1323
COLLEGE            1144
LATINO VOICES      1129
CULTURE & ARTS     1030
EDUCATION          1004
Name: category, 

Of the total 200,853 rows, only 24,521 belong to the WELLNESS (17,827) or HEALTHY LIVING (6,694) categories.

In [7]:
# Boolean target column: True when the article's category is one of the two
# health/wellness labels, False otherwise.
wellness_labels = ['WELLNESS', 'HEALTHY LIVING']
articles['HealthAndWellness'] = articles['category'].isin(wellness_labels)

In [8]:
# Model input text: the headline followed by a single space and the short
# description.
articles['Text'] = articles['headline'].add(' ').add(articles['short_description'])

In [16]:
# Given the class imbalance, downsample both classes to the size of the
# smaller one rather than training on everything.
# The per-class size is derived from the data instead of being hard-coded, so
# it stays correct if the dataset or the label definition changes, and the
# draws are seeded so a fresh run reproduces the same training set.
SAMPLE_SEED = 42
n_per_class = int(articles['HealthAndWellness'].value_counts().min())
final_training = (
    articles
    .groupby(by='HealthAndWellness')
    .sample(n=n_per_class, random_state=SAMPLE_SEED)[['Text', 'HealthAndWellness']]
)
# Preview a few rows of the balanced set (seeded so the preview is stable too).
final_training.sample(10, random_state=SAMPLE_SEED)

Unnamed: 0,Text,HealthAndWellness
10067,Congress Passes Stopgap Spending Bill To Avert...,False
112004,Stella Abrera: Ballerina on a Double Mission B...,False
35443,Neil Gorsuch May Miss Chance To Decide The Sup...,False
134574,11 Things You Must Know If You Made A New Year...,True
154391,Women With Heart Disease May Face Inflammation...,True
25096,Neil deGrasse Tyson Burns Donald Trump Over Po...,False
100006,Autism Without Fear: Examining the Legitimacy ...,True
96767,"His NBA Dream Lost, Isaiah Austin Winning in N...",True
162565,"Rethinking 'Always On' Being ""always on"" is th...",True
110739,Depression Is My Profession: A Psychiatrist's ...,True


In [17]:
# Create the training, validation datasets along with the text preprocessor
train, val, preprocess = ktrain.text.texts_from_df(
                                  final_training,
                                  "Text",
                                  label_columns=["HealthAndWellness"],
                                  val_df=None,
                                  max_features=20000,
                                  maxlen=512,
                                  val_pct=0.2,
                                  ngram_range=1,
                                  preprocess_mode="distilbert",
                                  verbose=1
                              )

['not_HealthAndWellness', 'HealthAndWellness']
        not_HealthAndWellness  HealthAndWellness
180689                    0.0                1.0
131686                    0.0                1.0
80269                     0.0                1.0
100866                    1.0                0.0
195642                    1.0                0.0
['not_HealthAndWellness', 'HealthAndWellness']
        not_HealthAndWellness  HealthAndWellness
33018                     1.0                0.0
51563                     1.0                0.0
167159                    1.0                0.0
170459                    0.0                1.0
109319                    0.0                1.0
preprocessing train...
language: en
train sequence lengths:
	mean : 32
	95percentile : 58
	99percentile : 68


Is Multi-Label? False
preprocessing test...
language: en
test sequence lengths:
	mean : 32
	95percentile : 58
	99percentile : 67


In [18]:
# Build the classifier from the preprocessor, then wrap it together with the
# training and validation data in a ktrain learner.
model = preprocess.get_classifier()
learner = ktrain.get_learner(
    model,
    train_data=train,
    val_data=val,
    batch_size=16,
)

In [20]:
# Train for up to 10 epochs with a maximum learning rate of 1e-4, saving
# checkpoints to the 'checkpoint' folder.
# early_stopping is a patience expressed in epochs, not an on/off flag; the
# previous value True was being treated as a patience of 1, which is now
# spelled out explicitly (same behavior, clearer intent).
history = learner.autofit(
    1e-4,
    checkpoint_folder='checkpoint',
    epochs=10,
    early_stopping=1,
)



begin training using triangular learning rate policy with max lr of 0.0001...
Epoch 1/10
Epoch 2/10
Epoch 2: early stopping
Weights from best epoch have been loaded into model.


In [21]:
# Pair the trained model with its preprocessor in a predictor object.
trained_model = learner.model
predictor = ktrain.get_predictor(trained_model, preproc=preprocess)

In [22]:
# Evaluate on the held-out validation set and print the classification report.
validation = learner.validate(
    val_data=val,
    print_report=True,
)

              precision    recall  f1-score   support

           0       0.93      0.91      0.92      5020
           1       0.91      0.93      0.92      4789

    accuracy                           0.92      9809
   macro avg       0.92      0.92      0.92      9809
weighted avg       0.92      0.92      0.92      9809

