In [1]:
# https://www.kaggle.com/shivamkushwaha/bbc-full-text-document-classification
!wget -nc https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv

--2024-06-11 19:26:55--  https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv
Resolving lazyprogrammer.me (lazyprogrammer.me)... 104.21.23.210, 172.67.213.166, 2606:4700:3031::6815:17d2, ...
Connecting to lazyprogrammer.me (lazyprogrammer.me)|104.21.23.210|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5085081 (4.8M) [text/csv]
Saving to: ‘bbc_text_cls.csv’


2024-06-11 19:26:55 (42.9 MB/s) - ‘bbc_text_cls.csv’ saved [5085081/5085081]



In [2]:
import numpy as np
import pandas as pd
import textwrap
import nltk
from nltk import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

In [3]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
df = pd.read_csv("bbc_text_cls.csv")

In [6]:
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [7]:
labels = set(df["labels"])
labels

{'business', 'entertainment', 'politics', 'sport', 'tech'}

In [10]:
# Pick a label to train
label = "entertainment"

In [12]:
texts = df[df["labels"] == label]["text"]
texts.head()

510    Gallery unveils interactive tree\n\nA Christma...
511    Jarre joins fairytale celebration\n\nFrench mu...
512    Musical treatment for Capra film\n\nThe classi...
513    Richard and Judy choose top books\n\nThe 10 au...
514    Poppins musical gets flying start\n\nThe stage...
Name: text, dtype: object

In [14]:
texts[510]

'Gallery unveils interactive tree\n\nA Christmas tree that can receive text messages has been unveiled at London\'s Tate Britain art gallery.\n\nThe spruce has an antenna which can receive Bluetooth texts sent by visitors to the Tate. The messages will be "unwrapped" by sculptor Richard Wentworth, who is responsible for decorating the tree with broken plates and light bulbs. It is the 17th year that the gallery has invited an artist to dress their Christmas tree. Artists who have decorated the Tate tree in previous years include Tracey Emin in 2002.\n\nThe plain green Norway spruce is displayed in the gallery\'s foyer. Its light bulb adornments are dimmed, ordinary domestic ones joined together with string. The plates decorating the branches will be auctioned off for the children\'s charity ArtWorks. Wentworth worked as an assistant to sculptor Henry Moore in the late 1960s. His reputation as a sculptor grew in the 1980s, while he has been one of the most influential teachers during th

In [15]:
# collect counts
probs = {} # key: (w(t-1), w(t+1)), value: {w(t): count(w(t))}

for doc in texts:
  lines = doc.split("\n")
  for line in lines:
    tokens = word_tokenize(line)
    for i in range(len(tokens) - 2):
      t_0 = tokens[i]
      t_1 = tokens[i+1]
      t_2 = tokens[i+2]
      key = (t_0, t_2)
      if key not in probs:
        probs[key] = {}

      # add count for middle token
      if t_1 not in probs[key]:
        probs[key][t_1] = 1
      else:
        probs[key][t_1] += 1

In [16]:
probs

{('Gallery', 'interactive'): {'unveils': 1},
 ('unveils', 'tree'): {'interactive': 1},
 ('A', 'tree'): {'Christmas': 1},
 ('Christmas', 'that'): {'tree': 1},
 ('tree', 'can'): {'that': 1},
 ('that', 'receive'): {'can': 1},
 ('can', 'text'): {'receive': 1},
 ('receive', 'messages'): {'text': 1},
 ('text', 'has'): {'messages': 1},
 ('messages', 'been'): {'has': 1},
 ('has', 'unveiled'): {'been': 1},
 ('been', 'at'): {'unveiled': 1,
  'shown': 1,
  'estimated': 1,
  'difficult': 1,
  'surprised': 1,
  'announced': 1,
  'honoured': 2},
 ('unveiled', 'London'): {'at': 1},
 ('at', "'s"): {'London': 11,
  'Versace': 1,
  'Sotheby': 1,
  'Sunday': 4,
  'Hollywood': 2,
  'Grauman': 1,
  'Israel': 2,
  'Glasgow': 1,
  'Scotland': 2,
  'Dublin': 1,
  'Guernsey': 1,
  'MTV': 1,
  'Oxfam': 1,
  'France': 1,
  'Wednesday': 1,
  '1985': 1,
  'Sadler': 2,
  'Toronto': 1,
  'Bristol': 1,
  'June': 1},
 ('London', 'Tate'): {"'s": 1},
 ("'s", 'Britain'): {'Tate': 1},
 ('Tate', 'art'): {'Britain': 1},
 ('

In [17]:
# normalize probabilities
for key, d in probs.items():
  # d should represent a distribution
  total = sum(d.values())
  for k, v in d.items():
    d[k] = v / total

In [18]:
probs

{('Gallery', 'interactive'): {'unveils': 1.0},
 ('unveils', 'tree'): {'interactive': 1.0},
 ('A', 'tree'): {'Christmas': 1.0},
 ('Christmas', 'that'): {'tree': 1.0},
 ('tree', 'can'): {'that': 1.0},
 ('that', 'receive'): {'can': 1.0},
 ('can', 'text'): {'receive': 1.0},
 ('receive', 'messages'): {'text': 1.0},
 ('text', 'has'): {'messages': 1.0},
 ('messages', 'been'): {'has': 1.0},
 ('has', 'unveiled'): {'been': 1.0},
 ('been', 'at'): {'unveiled': 0.125,
  'shown': 0.125,
  'estimated': 0.125,
  'difficult': 0.125,
  'surprised': 0.125,
  'announced': 0.125,
  'honoured': 0.25},
 ('unveiled', 'London'): {'at': 1.0},
 ('at', "'s"): {'London': 0.2972972972972973,
  'Versace': 0.02702702702702703,
  'Sotheby': 0.02702702702702703,
  'Sunday': 0.10810810810810811,
  'Hollywood': 0.05405405405405406,
  'Grauman': 0.02702702702702703,
  'Israel': 0.05405405405405406,
  'Glasgow': 0.02702702702702703,
  'Scotland': 0.05405405405405406,
  'Dublin': 0.02702702702702703,
  'Guernsey': 0.0270270

In [19]:
texts.iloc[0].split("\n")

['Gallery unveils interactive tree',
 '',
 "A Christmas tree that can receive text messages has been unveiled at London's Tate Britain art gallery.",
 '',
 'The spruce has an antenna which can receive Bluetooth texts sent by visitors to the Tate. The messages will be "unwrapped" by sculptor Richard Wentworth, who is responsible for decorating the tree with broken plates and light bulbs. It is the 17th year that the gallery has invited an artist to dress their Christmas tree. Artists who have decorated the Tate tree in previous years include Tracey Emin in 2002.',
 '',
 "The plain green Norway spruce is displayed in the gallery's foyer. Its light bulb adornments are dimmed, ordinary domestic ones joined together with string. The plates decorating the branches will be auctioned off for the children's charity ArtWorks. Wentworth worked as an assistant to sculptor Henry Moore in the late 1960s. His reputation as a sculptor grew in the 1980s, while he has been one of the most influential te

In [20]:
def spin_document(doc):
  # split the document into lines (paragraphs)
  lines = doc.split("\n")
  output = []
  for line in lines:
    if line:
      new_line = spin_line(line)
    else:
      new_line = line

    output.append(new_line)
  return "\n".join(output)

In [21]:
detokenizer = TreebankWordDetokenizer()

In [22]:
texts.iloc[0].split("\n")[2]

"A Christmas tree that can receive text messages has been unveiled at London's Tate Britain art gallery."

In [23]:
detokenizer.detokenize(word_tokenize(texts.iloc[0].split("\n")[2]))

"A Christmas tree that can receive text messages has been unveiled at London's Tate Britain art gallery."

In [24]:
def sample_word(d):
  p0 = np.random.random()
  cumulative = 0
  for t, p in d.items():
    cumulative += p
    if p0 < cumulative:
      return t
  assert(False) # should never get there

In [25]:
def spin_line(line):
  tokens = word_tokenize(line)
  i = 0
  output = [tokens[0]]
  while i < (len(tokens) - 2):
    t_0 = tokens[i]
    t_1 = tokens[i + 1]
    t_2 = tokens[i + 2]
    key = (t_0, t_2)
    p_dist = probs[key]
    if len(p_dist) > 1 and np.random.random() < 0.3:
      # let's replace the middle word
      middle = sample_word(p_dist)
      output.append(t_1)
      output.append("<" + middle + ">")
      output.append(t_2)


      # we won't replace the 3rd token since the middle
      # token was dependent on it
      # instead, skip ahead 2 steps
      i += 2
    else:
      # we won't replace this middle word
      output.append(t_1)
      i += 1
  # append the final token - only if there was no replacement
  if i == len(tokens) - 2:
    output.append(tokens[-1])
  return detokenizer.detokenize(output)

In [26]:
np.random.seed(1234)

In [27]:
i = np.random.choice(texts.shape[0])
doc = texts.iloc[i]
new_doc = spin_document(doc)

In [28]:
print(textwrap.fill(new_doc, replace_whitespace=False, fix_sentence_endings=True))

Fantasy book wins Hollywood deal

A British author has had the film
rights to her children <father>'s bestseller snapped up for a seven-
figure sum, with Ridley Scott set <set> to direct <apologise>.
Michelle Paver's Wolf Brother, a fantasy set 6,000 years ago, is <and>
the first in <year> a planned <planned> series of six books . Film
<The> studio Fox has bought the rights for around $4m (£2.13m) for
Scott's company Scott Free to develop <8-1>. The director said he was
"thrilled" with <of> the project . "Wolf Brother is an enchanting
book," he said <said>. Paver, who lives in London <Hindi> and
previously <also> worked as a lawyer, began writing the book
<Children> in 1982 <1982> while studying biochemistry at Oxford
University.

She was an established author of love stories when she
turned <accepted> the work-in-progress into a children's novel
<career>. It was published in 2004, with Paver earning an advance of
$5m <305m> (£2.8m <£293,000>) - the highest sum ever paid for a debut
ch