# PROJECT 1: Categorizing news articles

### Your task
* Given a bunch of Reuters news service articles, develop a set of labels for categorizing them
* Labels should be a single word or short phrase. Some articles might fit more than one label, and some might not fit any.
* Aim for about 10–15 labels, give or take
* Use methods from labs so far (keyword analysis, terminology extraction, topic models)
* No specific ‘correct’ answer; the process you use to develop the list is more important than the solution.

### Deliverables
* List of labels
* For each label, the number of articles from the dataset that fit that label
* The number of articles that don't fit any of the labels (ideally this won't be a big number)
* Annotated notebook showing your process

In [1]:
import pandas as pd
import numpy as np
from cytoolz import *
import re
from tqdm.auto import tqdm

tqdm.pandas()

In [6]:
# Read in the data from the S3 bucket
df = pd.read_parquet('s3://ling583/project1.parquet', storage_options={'anon':True})
len(df)

50085

In [7]:
# Remove non-english articles
import pycld2

def guess_lang(text):
    """Guess the language of *text* with pycld2.

    Returns the name of the top-ranked language reported by
    ``pycld2.detect`` (the notebook later compares it to ``'ENGLISH'``) when
    the detector marks its guess as reliable.  Returns ``np.nan`` when the
    guess is not reliable or when pycld2 raises ``pycld2.error``.
    """
    try:
        reliable, _, langs = pycld2.detect(
            text, isPlainText=True, hintLanguage='en')
    except pycld2.error:
        # Best effort: text the detector cannot process counts as "unknown".
        return np.nan
    if reliable:
        return langs[0][0]
    # np.NaN was removed in NumPy 2.0; np.nan is valid on every version.
    return np.nan


df['lang'] = df['text'].progress_apply(guess_lang)
df = df[df['lang'] == 'ENGLISH'].reset_index(drop=True)
len(df)

  0%|          | 0/50085 [00:00<?, ?it/s]

49844

In [155]:
df.head()

Unnamed: 0,headline,text,byline,dateline,date,lang,tokens
0,Planet Hollywood launches credit card.,If dining at Planet Hollywood made you feel li...,,LOS ANGELES,1996-08-20,ENGLISH,"[if, dining, at, planet, hollywood, made, you,..."
1,Sprint to offer consumer Internet access service.,Sprint Corp. Tuesday announced plans to offer ...,Susan Nadeau,CHICAGO,1996-08-20,ENGLISH,"[sprint, corp., tuesday, announced, plans, to,..."
2,Chains may raise prices after minimum wage hike.,The higher minimum wage signed into law Tuesda...,Patricia Commins,CHICAGO,1996-08-20,ENGLISH,"[the, higher, minimum, wage, signed, into, law..."
3,Sprint to offer consumer Internet access service.,Sprint Corp. Tuesday announced plans to offer ...,,"KANSAS CITY, Mo.",1996-08-20,ENGLISH,"[sprint, corp., tuesday, announced, plans, to,..."
4,Sprint to offer consumer Internet access service.,Sprint Corp. Tuesday announced plans to offer ...,,"KANSAS CITY, Mo.",1996-08-20,ENGLISH,"[sprint, corp., tuesday, announced, plans, to,..."


## Find multi-word expressions (MWEs) to help with the labeling

In [9]:
import spacy
from spacy.matcher import Matcher

# Load the small English pipeline without the components listed in `exclude`;
# the matcher pattern below only inspects each token's TAG.
nlp = spacy.load('en_core_web_sm', exclude=[
                 'parser', 'ner', 'lemmatizer', 'attribute_ruler'])

# A candidate term ('Term') is a token sequence that
#   - starts with a token tagged JJ, NN or NNP,
#   - continues with zero or more tokens tagged JJ, NN, IN, HYPH or NNP,
#   - ends with a token tagged NN or NNP,
# so every match is at least two tokens long.
# (Penn Treebank tags: JJ adjective, NN singular noun, NNP singular proper
# noun, IN preposition, HYPH hyphen.)
matcher = Matcher(nlp.vocab)
matcher.add('Term', [[{'TAG': {'IN': ['JJ', 'NN', 'NNP']}},
                      {'TAG': {'IN': ['JJ', 'NN', 'IN',
                                      'HYPH', 'NNP']}, 'OP': '*'},
                      {'TAG': {'IN': ['NN', 'NNP']}}]])


def get_candidates(text):
    """Return every candidate term found in *text*.

    The text is run through the module-level ``nlp`` pipeline and ``matcher``;
    each matched span is returned as a tuple of its tokens' ``norm_`` strings.
    """
    parsed = nlp(text)
    found = []
    for match in matcher(parsed, as_spans=True):
        found.append(tuple(token.norm_ for token in match))
    return found

In [10]:
from dask.distributed import Client

client = Client("tcp://127.0.0.1:41587")
client

0,1
Client  Scheduler: tcp://127.0.0.1:41587  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 16.62 GB


In [12]:
import dask.bag as db
import dask.dataframe as dd

# Shuffle the article texts (sample every row with a fixed seed), split them
# into 50 partitions and convert the result to a dask bag.
# NOTE(review): the shuffle is presumably meant to even out the work per
# partition — confirm.
texts = dd.from_pandas(df['text'].sample(
    len(df), random_state=19), npartitions=50).to_bag()

# Task graph: extract candidate terms from each text, flatten the per-text
# lists into one stream and count how often each candidate occurs.
graph = texts.map(get_candidates).flatten().frequencies()

In [13]:
%%time

candidates = graph.compute()

CPU times: user 5.35 s, sys: 781 ms, total: 6.14 s
Wall time: 3min 50s


In [14]:
from nltk import ngrams


def get_subterms(term):
    """Yield the contiguous sub-sequences of *term*, longest first.

    Sub-sequence lengths run from ``len(term) - 1`` down to 2, so neither the
    full term nor single tokens are produced.
    """
    for size in reversed(range(2, len(term))):
        for gram in ngrams(term, size):
            yield gram

In [15]:
from collections import Counter, defaultdict
from math import log2

# Group the candidate frequencies by term length: freqs[n][term] is the total
# frequency of the n-token candidate `term`.
freqs = defaultdict(Counter)
for c, f in candidates:
    freqs[len(c)][c] += f


def c_value(F, theta):
    """Score candidate terms and keep those scoring above *theta*.

    Parameters
    ----------
    F : mapping of int -> mapping of tuple -> number
        Candidate frequencies grouped by term length: ``F[k][term]`` is the
        frequency of the ``k``-token candidate ``term``.  ``F`` is not
        modified.
    theta : number
        Score threshold; only terms whose score is strictly greater than
        ``theta`` are returned.

    Returns
    -------
    collections.Counter
        Maps each accepted term to its score
        ``log2(len(term)) * (frequency - discount)``, where ``discount`` is
        the mean frequency of the accepted longer terms that contain the term
        (0 when there are none).
    """
    termhood = Counter()
    # For each sub-term: frequencies of the accepted longer terms containing it.
    longer = defaultdict(list)

    # Longest terms first, so that every term's discount is complete before
    # the term itself is scored.
    for k in sorted(F, reverse=True):
        for term, freq in F[k].items():
            containing = longer.get(term)
            discount = sum(containing) / len(containing) if containing else 0
            c = log2(k) * (freq - discount)
            if c > theta:
                termhood[term] = c
                for subterm in get_subterms(term):
                    # .get() rather than F[...]: indexing a defaultdict would
                    # insert empty entries into the caller's mapping, and a
                    # plain dict would raise KeyError for an absent length.
                    if subterm in F.get(len(subterm), ()):
                        longer[subterm].append(freq)
    return termhood

In [16]:
terms = c_value(freqs, theta=200)

In [17]:
for t, c in terms.most_common(20):
    print(f'{c:8.2f} {freqs[len(t)][t]:5d} {" ".join(t)}')

 7157.00  7494 new york
 6833.00  7311 hong kong
 5999.67  6325 last year
 5095.36  5879 air cargo
 4348.00  4348 united states
 3760.17  2545 long - distance
 3327.00  3327 percent stake
 3226.00  3226 general cargo
 3166.76  1998 london newsroom +44
 2796.00  1830 air cargo newsroom tel+44
 2539.50  3158 chief executive
 2477.00  2577 co ltd
 2471.00  2694 net income
 2466.00  2466 last week
 2464.00  2464 joint venture
 2393.61  2268 air cargo newsroom
 2383.00  2793 stock exchange
 2299.78  2215 cargo newsroom tel+44
 2228.46  1406 long - term
 2217.00  2217 first quarter


In [18]:
for t, c in tail(20, terms.most_common()):
    print(f'{c:8.2f} {freqs[len(t)][t]:5d} {" ".join(t)}')

  204.00   204 oil 		 
  204.00   204 industrial action
  204.00   204 plc     
  203.00   203 group inc.
  202.88   128 percent last year
  202.88   128 john f. kennedy
  202.88   128 browns group plc
  202.00   101 nz it 	     london
  202.00   101 year - ago quarter
  202.00   101 sao paulo newsroom +55
  202.00   326 trans world
  202.00   202 vice chairman
  202.00   202 ashok leyland
  202.00   202 cargo official
  201.29   127 long - range
  201.29   127 countrywide - pp
  201.00   201 strategic alliance
  201.00   481 paulo newsroom
  201.00   201 conversion rate
  201.00   201 german airline


In [19]:
# Save the terms for later use
with open('article-terms.txt', 'w') as f:
    for t in terms:
        print(' '.join(t), file=f)

In [23]:
# Tokenize the terms
from tokenizer import MWETokenizer

tokenizer = MWETokenizer(open('article-terms.txt'))

In [24]:
# Add a new column to the dataframe, 'tokens' that is the tokenized version of the 'text' column
df['tokens'] = pd.Series(df['text'].progress_apply(tokenizer.tokenize))

  0%|          | 0/49844 [00:00<?, ?it/s]

# If you already have article-topics.bin, skip ahead to the next bold heading; otherwise, run the code below

### This is just to save time on subsequent runs for refining processes
Running these blocks takes 13 minutes on my machine

In [157]:
import tomotopy as tp
import time
# Hyperparameters for the LDA topic model trained below.
k = 200                  # number of topics
min_df = 20              # minimum number of articles that a term has to occur in to be included in the model
rm_top = 75              # number of most frequent terms to remove from the model
tw = tp.TermWeight.ONE   # term weighting strategy
alpha = 0.1              # Dirichlet prior for the document-topic distributions
eta = 0.01               # Dirichlet prior for the topic-word distributions
tol = 1e-3               # convergence tolerance: minimum per-word log-likelihood gain required to keep training

In [29]:
%%time

mdl = tp.LDAModel(k=k, min_df=min_df, rm_top=rm_top, tw=tw, alpha=alpha, eta=eta)

# NOTE(review): empty token lists are skipped, so positions in mdl.docs can
# drift from the row positions in df if any article has no tokens — confirm
# before indexing mdl.docs with df's index.
for doc in df['tokens']:
    if doc:
        mdl.add_doc(doc)

# Train in bursts of 50 iterations (at most 5000 in total) and stop as soon as
# the per-word log-likelihood improves by less than `tol` between bursts.
last = -np.inf  # np.NINF was removed in NumPy 2.0
for i in range(0, 5000, 50):
    mdl.train(50)
    ll = mdl.ll_per_word
    print(f'{i:5d} LL = {ll:7.4f}', flush=True)
    if ll - last < tol:
        break
    last = ll

print('Done!')

    0 LL = -8.1788
   50 LL = -7.9913
  100 LL = -7.9270
  150 LL = -7.8978
  200 LL = -7.8778
  250 LL = -7.8658
  300 LL = -7.8587
  350 LL = -7.8515
  400 LL = -7.8477
  450 LL = -7.8419
  500 LL = -7.8384
  550 LL = -7.8353
  600 LL = -7.8329
  650 LL = -7.8314
  700 LL = -7.8314
Done!
CPU times: user 37min 2s, sys: 19.8 s, total: 37min 22s
Wall time: 12min 58s


In [None]:
# print out the results of the above processes
# NOTE: This is 200 lines long
# Use a dedicated loop variable: reusing `k` here would overwrite the
# topic-count hyperparameter `k`, so re-running the training cell afterwards
# would silently build a model with a different number of topics.
for topic_id in range(mdl.k):
    print(f'{topic_id:3d} ', ', '.join(word for word, _ in mdl.get_topic_words(topic_id)))

In [86]:
mdl.save('article-topics.bin')

# If you have article-topics.bin, skip the above 3 blocks and run the code below.
### If you had to run the above code, there is no need to run the next line

In [None]:
mdl = tp.LDAModel.load('article-topics.bin')

# Continue

In [78]:
df.head()

Unnamed: 0,headline,text,byline,dateline,date,lang,tokens
0,Planet Hollywood launches credit card.,If dining at Planet Hollywood made you feel li...,,LOS ANGELES,1996-08-20,ENGLISH,"[if, dining, at, planet, hollywood, made, you,..."
1,Sprint to offer consumer Internet access service.,Sprint Corp. Tuesday announced plans to offer ...,Susan Nadeau,CHICAGO,1996-08-20,ENGLISH,"[sprint, corp., tuesday, announced, plans, to,..."
2,Chains may raise prices after minimum wage hike.,The higher minimum wage signed into law Tuesda...,Patricia Commins,CHICAGO,1996-08-20,ENGLISH,"[the, higher, minimum, wage, signed, into, law..."
3,Sprint to offer consumer Internet access service.,Sprint Corp. Tuesday announced plans to offer ...,,"KANSAS CITY, Mo.",1996-08-20,ENGLISH,"[sprint, corp., tuesday, announced, plans, to,..."
4,Sprint to offer consumer Internet access service.,Sprint Corp. Tuesday announced plans to offer ...,,"KANSAS CITY, Mo.",1996-08-20,ENGLISH,"[sprint, corp., tuesday, announced, plans, to,..."


## Create a dataframe of the topics for labeling

In [32]:
# Create a dataframe of the tokens that make up each topic, only has the tokens column for now
topics = pd.DataFrame({'tokens': [' '.join(map(first, mdl.get_topic_words(k))) for k in range(mdl.k)]})

In [33]:
topics.head()

Unnamed: 0,tokens
0,pesos philippine philippines corp pldt inc fax...
1,stet italian italy lire telecom_italia state i...
2,miami county american latin_america florida ca...
3,top survey according average fund among study ...
4,customers phone service calls call telephone n...


### Create and manually update the .csv file

In [34]:
# Output the dataframe to a CSV for further manipulation
topics.to_csv('topics.csv', index=False)

#### Edit the resulting CSV: manually add a column "label" and label as many rows as possible. Labels are chosen by the user; save the file when done.
Aim for 10-15 labels

In [65]:
# After adding labels, read the CSV back into the dataframe
topics = pd.read_csv('topics.csv')

In [66]:
# The dataframe now has 2 columns, the original 'tokens' and the added 'label'
# Empty labels are returned as NaN, we will fix this next.
topics

Unnamed: 0,tokens,label
0,pesos philippine philippines corp pldt inc fax...,asia
1,stet italian italy lire telecom_italia state i...,telecom
2,miami county american latin_america florida ca...,america
3,top survey according average fund among study ...,
4,customers phone service calls call telephone n...,telecom
...,...,...
195,tax state city bill airport authority federal ...,
196,newspaper comment report reported declined spo...,
197,europe asia countries world international regi...,
198,shareholders meeting board shareholder vote di...,business


In [147]:
# Create a list of the labels, removing duplicates and empty values
list_of_labels = list(set(topics['label'].dropna()))

### This is just a review of what we are working with so far, using 1 article as an example

In [174]:
print('Here is the original text of the article:\n')
print(df['text'].iloc[166])

Here is the original text of the article:

French Riviera train conductors ended a 36-hour strike on Tuesday against growing crime which has badly disrupted traffic at the peak summer holiday period. A trade union spokesman said the management of state railway company SNCF had responded to the protest by promising to increase staff and equip conductors with portable telephones to tighten security on routes regarded as dangerous. Unionists say train conductors in southeast France are increasingly at risk from fare dodgers, pickpockets and thugs. The strike forced SNCF to cancel five long distance trains between Marseille and Nice and 75 percent of regional traffic on Monday. High-speed trains ran normally.


In [175]:
print('\n Here is the topics structure for that same article:\n')
print(mdl.docs[166])


 Here is the topics structure for that same article:

<tomotopy.Document with words="french riviera train conductors ended a 36 hour_strike on tuesday against growing crime which has badly disrupted traffic at the peak summer holiday period a trade_union spokesman said the management of state railway company sncf had responded to the protest by promising to increase staff and equip conductors with portable telephones to tighten security on routes regarded as dangerous unionists say train conductors in southeast france are increasingly at risk from fare dodgers pickpockets and thugs the strike forced sncf to cancel five long_distance trains between marseille and nice and 75 percent of regional traffic on monday high-speed trains ran normally">


In [184]:
print("\nHere is the list of topics and their pairs that correlates to the rows in 'topics':\n")
print(mdl.docs[166].get_topics())


Here is the list of topics and their pairs that correlates to the rows in 'topics':

[(190, 0.34260496497154236), (175, 0.20863720774650574), (39, 0.10455581545829773), (48, 0.07458535581827164), (27, 0.05965302884578705), (52, 0.04533909633755684), (197, 0.030088625848293304), (125, 0.029965976253151894), (24, 0.029858462512493134), (49, 0.015399456024169922)]


In [183]:
print('\nHere is that same list, filtered to exclude any topic with a weight less than 0.01\n')
print([t for t,w in mdl.docs[166].get_topics() if w>0.01])


Here is that same list, filtered to exclude any topic with a weight less than 0.01

[190, 175, 39, 48, 27, 52, 197, 125, 24, 49]


In [182]:
print('\nFollowed by the label that corresponds to those topics:\nNote that some are nan, which means there is no label associated with that topic.\n')
print([topics['label'].loc[t] for t,w in mdl.docs[166].get_topics() if w>0.01])


Followed by the label that corresponds to those topics:
Note that some are nan, which means there is no label associated with that topic.

['politics', nan, nan, nan, nan, nan, nan, 'telecom', 'money', 'money']


In [185]:
print('\nFinally, here is that same list of labels for topics after having nan values and duplicates removed:\n')
print(set(filter(pd.notna, [topics['label'].loc[t] for t,w in mdl.docs[166].get_topics() if w>0.01])))


Finally, here is that same list of labels for topics after having nan values and duplicates removed:

{'telecom', 'money', 'politics'}


## Process the labels for each article in the dataframe

In [144]:
def get_labels(index, min_weight=0.01):
    """Return the distinct labels assigned to one document.

    Parameters
    ----------
    index : int
        Position of the document in ``mdl.docs``.
    min_weight : float, optional
        Topics whose weight for the document is not strictly greater than
        this value are ignored.  Defaults to 0.01.

    Returns
    -------
    list
        Labels (from the ``label`` column of ``topics``) of the topics that
        ``get_topics()`` reports for the document, de-duplicated and with
        missing (NaN) labels dropped.  The order is unspecified because the
        labels are collected in a ``set``.
    """
    label_column = topics['label']
    labels = set()
    for topic_id, weight in mdl.docs[index].get_topics():
        if weight > min_weight:
            label = label_column.loc[topic_id]
            # Topics that were left unlabeled in the CSV come back as NaN.
            if pd.notna(label):
                labels.add(label)
    return list(labels)

In [153]:
def get_counts(df):
    """Count how many articles carry each label.

    Parameters
    ----------
    df : pandas.DataFrame
        Articles to count.  Only the index is used: each index value is
        passed to ``get_labels``.

    Returns
    -------
    dict
        Maps each label to the number of articles it was assigned to, plus a
        ``'no label'`` entry holding the number of articles that received no
        label.  An article with several labels is counted once under each.
    """
    from collections import Counter

    label_counts = Counter()
    no_label = 0

    # Iterate over the index directly: the row contents are never used, and
    # iterrows() would build a Series for every row.
    # NOTE(review): get_labels looks the value up in mdl.docs, so df's index
    # must line up with document positions in the model — confirm.
    for index in df.index:
        labels = get_labels(index)
        if labels:
            label_counts.update(labels)
        else:
            no_label += 1

    label_dist = dict(label_counts)

    # Add in the count of articles with no label
    label_dist['no label'] = no_label
    return label_dist

In [154]:
label_frequencies = get_counts(df)

## Deliverables
* List of labels

In [149]:
# This is just a list of the labels that I added in the 'manually update .csv file' section
print(list_of_labels)

['business', 'australia', 'europe', 'shipping', 'telecom', 'oil', 'technology', 'airline', 'america', 'asia', 'money', 'politics']


* For each label, the number of articles from the dataset that fit that label
* The number of articles that don't fit any of the labels (ideally this won't be a big number)

In [189]:
# How many articles are classified using each label
# Articles may have zero or more labels.
# If an article has more than one label, it will be counted under each of its labels
# If an article has zero labels, it will be counted in 'no label'
label_frequencies

{'business': 21839,
 'telecom': 15000,
 'money': 28973,
 'airline': 20443,
 'politics': 5564,
 'asia': 6142,
 'technology': 4258,
 'shipping': 9107,
 'europe': 1264,
 'australia': 1337,
 'america': 808,
 'oil': 1319,
 'no label': 3659}

* Annotated notebook showing your process