[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mansaluke/newsai/blob/master/notebooks/nlp_advanced.ipynb)


In [1]:
!pip -q install transformers
!pip -q install git+https://github.com/mansaluke/newsai.git

[K     |████████████████████████████████| 665kB 6.3MB/s 
[K     |████████████████████████████████| 3.8MB 37.2MB/s 
[K     |████████████████████████████████| 1.1MB 42.8MB/s 
[K     |████████████████████████████████| 890kB 44.7MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 1.2MB 5.7MB/s 
[K     |████████████████████████████████| 256kB 24.0MB/s 
[K     |████████████████████████████████| 153kB 38.3MB/s 
[?25h  Building wheel for newsai (setup.py) ... [?25l[?25hdone
  Building wheel for idna-ssl (setup.py) ... [?25l[?25hdone


In [0]:
%matplotlib inline

In [3]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from newsai import dfconvert
from newsai.utils.nlp import *
import transformers
from tqdm import tqdm
from IPython.display import display, HTML
from transformers import pipeline

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
import torch
torch.cuda.is_available()

True

In [4]:
from newsai import _DATA_PATH
if not os.path.exists(_DATA_PATH):
  import requests
  URL = 'https://raw.githubusercontent.com/mansaluke/newsai/master/data/sample_historicals.csv'
  _DATA_PATH = dfconvert.mkdir_p(os.path.expanduser("~/.newsai"))
  response = requests.get(URL, stream=True)
  response.raise_for_status()
  file_size = int(response.headers["Content-Length"])

  with tqdm(total=file_size) as pbar:
    with open(os.path.join(_DATA_PATH, os.path.basename(URL)), 'wb') as handle:
        for block in response.iter_content(1024):
            handle.write(block)
            pbar.update(1024)

2728960it [00:00, 17327277.83it/s]                             


In [6]:
fpath = os.path.join(_DATA_PATH, 'sample_historicals.csv')
df = pd.read_csv(fpath, encoding = "UTF-8", parse_dates=['date'])
df = remove_null_rows(df, ["H1", "H2"])

2020-05-29 16:18:28,579 - newsai.utils.nlp - INFO - Removing 1010 rows with nulls


In [7]:
from transformers import AutoModelWithLMHead, AutoTokenizer

model = AutoModelWithLMHead.from_pretrained("bart-large-cnn")
tokenizer = AutoTokenizer.from_pretrained("bart-large-cnn")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1300.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1625270765.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




In [0]:
def dataframe_text_summarizer(txt, var=0.1):
    txt = str(txt)
    txt = tokenizer.encode("summarize: " + txt, return_tensors="pt", max_length=512)
    min_length = max(round(len(txt[0]) * (var)), 1)
    max_length = max(round(len(txt[0]) * (1-var)), 1)
    return tokenizer.decode(
        model.generate(txt, max_length=max_length, min_length=min_length, length_penalty=2.0, num_beams=4, early_stopping=True
                       ).tolist()[0]
    )

In [0]:
df = df[:5]

In [0]:
# p = %prun -r df['H1_summary'] = df.H1.apply(lambda x: dataframe_text_summarizer(x))

In [0]:
# p.sort_stats('time', 'cumtime').print_stats()

In [12]:
%time df['H1_summary'] = df.H1.apply(lambda x: dataframe_text_summarizer(x))

CPU times: user 25.5 s, sys: 162 ms, total: 25.7 s
Wall time: 25.9 s


In [13]:
nlp_sentence_classif = pipeline('sentiment-analysis')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=629.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267844284.0, style=ProgressStyle(descri…




In [14]:
%time df['H1_sentiment'] = df.H1.apply(lambda x: nlp_sentence_classif(x))

CPU times: user 407 ms, sys: 306 µs, total: 407 ms
Wall time: 414 ms


In [15]:
display(HTML(df[['H1', 'H1_summary', 'H1_sentiment']].head(10).to_html()))

Unnamed: 0,H1,H1_summary,H1_sentiment
0,"Attacks have been traditionally underreported, but fighting the scourge in hate crimes begins with better data, experts say.","</s><s>Fighting the scourge in hate crimes begins with better data, experts say.","[{'label': 'NEGATIVE', 'score': 0.992476224899292}]"
1,"He deftly manipulated electronic images to produce otherworldly, sometimes jarring visions. He also founded, with his wife, a landmark performance space.","</s><s>He deftly manipulated electronic images to produce otherworldly, sometimes jarring visions. He also founded, with his wife, a landmark performance","[{'label': 'POSITIVE', 'score': 0.9991649389266968}]"
2,"The past definitely wasn’t past for directors like Quentin Tarantino, Martin Scorsese and Taika Waititi. Each found a way to rewrite history, though some versions were less palatable than others.","</s><s>Quentin Tarantino, Martin Scorsese and Taika Waititi are among the directors who have written about the past.","[{'label': 'NEGATIVE', 'score': 0.9796326756477356}]"
3,Forget fad diets and fitness gimmicks. Just stick to the basics.,</s><s>Forget fad diets and fitness gimmicks. Just stick to the,"[{'label': 'NEGATIVE', 'score': 0.9918994903564453}]"
4,"Linda Tracy and Peter Tracy had been married 37 years, before divorcing in 2015. When their sons began planning their own weddings, Mom and Dad rediscovered love.","</s><s>Linda Tracy and Peter Tracy had been married 37 years, before divorcing in 2015. When their sons began planning their own weddings, Mom and Dad","[{'label': 'POSITIVE', 'score': 0.9993278980255127}]"


In [16]:
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch

model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

label_list = [
    "O",       # Outside of a named entity
    "B-MISC",  # Beginning of a miscellaneous entity right after another miscellaneous entity
    "I-MISC",  # Miscellaneous entity
    "B-PER",   # Beginning of a person's name right after another person's name
    "I-PER",   # Person's name
    "B-ORG",   # Beginning of an organisation right after another organisation
    "I-ORG",   # Organisation
    "B-LOC",   # Beginning of a location right after another location
    "I-LOC"    # Location
]

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=998.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1334448817.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




In [0]:
def nlp_sentence_ner(txt):
  # Bit of a hack to get the tokens with the special tokens
  tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(txt)))
  inputs = tokenizer.encode(txt, return_tensors="pt")

  outputs = model(inputs)[0]
  predictions = torch.argmax(outputs, dim=2)

  return [(token, label_list[prediction]) for token, prediction in zip(tokens, predictions[0].tolist())]

In [18]:
%time df['H1_ner'] = df.H1.apply(nlp_sentence_ner)

CPU times: user 2.67 s, sys: 32.7 ms, total: 2.71 s
Wall time: 2.73 s


In [19]:
HTML(df[['H1', 'H1_ner']].to_html())

Unnamed: 0,H1,H1_ner
0,"Attacks have been traditionally underreported, but fighting the scourge in hate crimes begins with better data, experts say.","[([CLS], O), (Attack, O), (##s, O), (have, O), (been, O), (traditionally, O), (under, O), (##re, O), (##port, O), (##ed, O), (,, O), (but, O), (fighting, O), (the, O), (s, O), (##co, O), (##urge, O), (in, O), (hate, O), (crimes, O), (begins, O), (with, O), (better, O), (data, O), (,, O), (experts, O), (say, O), (., O), ([SEP], O)]"
1,"He deftly manipulated electronic images to produce otherworldly, sometimes jarring visions. He also founded, with his wife, a landmark performance space.","[([CLS], O), (He, O), (def, O), (##tly, O), (manipulated, O), (electronic, O), (images, O), (to, O), (produce, O), (other, O), (##world, O), (##ly, O), (,, O), (sometimes, O), (jar, O), (##ring, O), (visions, O), (., O), (He, O), (also, O), (founded, O), (,, O), (with, O), (his, O), (wife, O), (,, O), (a, O), (landmark, O), (performance, O), (space, O), (., O), ([SEP], O)]"
2,"The past definitely wasn’t past for directors like Quentin Tarantino, Martin Scorsese and Taika Waititi. Each found a way to rewrite history, though some versions were less palatable than others.","[([CLS], O), (The, O), (past, O), (definitely, O), (wasn, O), (’, O), (t, O), (past, O), (for, O), (directors, O), (like, O), (Quentin, I-PER), (Tara, I-PER), (##ntino, I-PER), (,, O), (Martin, I-PER), (Sc, I-PER), (##orse, I-PER), (##se, I-PER), (and, O), (Tai, I-PER), (##ka, I-PER), (Wait, I-PER), (##iti, I-PER), (., O), (Each, O), (found, O), (a, O), (way, O), (to, O), (re, O), (##w, O), (##rite, O), (history, O), (,, O), (though, O), (some, O), (versions, O), (were, O), (less, O), (p, O), (##ala, O), (##table, O), (than, O), (others, O), (., O), ([SEP], O)]"
3,Forget fad diets and fitness gimmicks. Just stick to the basics.,"[([CLS], O), (Forget, O), (f, O), (##ad, O), (diet, O), (##s, O), (and, O), (fitness, O), (g, O), (##im, O), (##mic, O), (##ks, O), (., O), (Just, O), (stick, O), (to, O), (the, O), (basic, O), (##s, O), (., O), ([SEP], O)]"
4,"Linda Tracy and Peter Tracy had been married 37 years, before divorcing in 2015. When their sons began planning their own weddings, Mom and Dad rediscovered love.","[([CLS], O), (Linda, I-PER), (Tracy, I-PER), (and, O), (Peter, I-PER), (Tracy, I-PER), (had, O), (been, O), (married, O), (37, O), (years, O), (,, O), (before, O), (di, O), (##vor, O), (##cing, O), (in, O), (2015, O), (., O), (When, O), (their, O), (sons, O), (began, O), (planning, O), (their, O), (own, O), (weddings, O), (,, O), (Mom, I-PER), (and, O), (Dad, I-PER), (red, O), (##iscovered, O), (love, O), (., O), ([SEP], O)]"


In [0]:
# use h1 to train h2?