In [6]:
import warnings
# NOTE(review): with no category/module filter this silences every warning for
# the rest of the session, including library deprecation warnings -- consider
# narrowing it to the specific warnings that are known to be harmless.
warnings.filterwarnings("ignore")

In [7]:
## for data
import json
import collections, re
import pandas as pd
import numpy as np
## for plotting
import matplotlib.pyplot as plt
## for bag-of-words, tf-idf
from sklearn import feature_extraction 
# model_selection, naive_bayes, pipeline, manifold, preprocessing
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

### NLTK 
**NLTK** is a leading platform for building Python programs to work with human language data.
https://www.nltk.org/
http://www.nltk.org/book/

In [8]:
## for NLP
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
# word_tokenize needs the Punkt tokenizer models. NLTK >= 3.8.2 looks them up
# under the 'punkt_tab' resource, older releases under 'punkt'; request both so
# the notebook runs on either. On an NLTK whose index lacks 'punkt_tab' the
# first call is expected to just report a failed download (TODO confirm).
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/max-
[nltk_data]     omelchenko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
## read the dataset: the file holds one JSON object per line
# encoding is given explicitly so decoding does not depend on the platform
# default; errors='ignore' is kept from the original and silently drops any
# byte sequence that is not valid UTF-8.
with open('data/News_Category_Dataset_v2.json', mode='r', encoding='utf-8', errors='ignore') as json_file:
    # skip blank lines (e.g. a trailing newline), which json.loads would reject
    lst_dics = [json.loads(line) for line in json_file if line.strip()]
## show the first two records
lst_dics[:2]

[{'category': 'CRIME',
  'headline': 'There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV',
  'authors': 'Melissa Jeltsen',
  'link': 'https://www.huffingtonpost.com/entry/texas-amanda-painter-mass-shooting_us_5b081ab4e4b0802d69caad89',
  'short_description': 'She left her husband. He killed their children. Just another day in America.',
  'date': '2018-05-26'},
 {'category': 'ENTERTAINMENT',
  'headline': "Will Smith Joins Diplo And Nicky Jam For The 2018 World Cup's Official Song",
  'authors': 'Andy McDonald',
  'link': 'https://www.huffingtonpost.com/entry/will-smith-joins-diplo-and-nicky-jam-for-the-official-2018-world-cup-song_us_5b09726fe4b0fdb2aa541201',
  'short_description': 'Of course it has a song.',
  'date': '2018-05-26'}]

In [10]:
## create dtf: keep three categories and only the columns used below
dtf = (
    pd.DataFrame(lst_dics)
    # one .loc call selects rows and columns together (no chained indexing)
    .loc[lambda frame: frame["category"].isin(['ENTERTAINMENT','POLITICS','TECH']),
         ["category", "headline", "short_description"]]
    ## rename columns ("short_description" keeps its name, so it is not listed)
    .rename(columns={"category": "y", "headline": "text"})
)
## print 5 random rows; the fixed seed makes the preview identical on every run
dtf.sample(5, random_state=0)

Unnamed: 0,y,text,short_description
60786,POLITICS,Trump Terrifies World Leaders,President Barack Obama is trying but failing t...
4038,POLITICS,1 Typo Makes Sean Spicer's Rex Tillerson Tweet...,He came to praise the former secretary of stat...
46137,POLITICS,HUFFPOST HILL: 'These Aren't The Gaffes You're...,Like what you read below? Sign up for HUFFPOST...
87896,POLITICS,"War, Murder and the American Way","Today, our national numbness is wrapped in a C..."
45171,POLITICS,Donald Trump Faces Payback In The Desert,"If the presidential race in Arizona is close, ..."


In [11]:
# Pick one description by its index label to use as the running example.
c = dtf.at[139122, 'short_description']

In [12]:
# Display the selected description.
c

'It was day one of trying on her new brown locks as Anastasia Steele in "Fifty Shades of Grey," a role that could come to'

Try `word_tokenize` from `nltk`

In [13]:
# Tokenize the description with NLTK's word_tokenize and print the tokens.
print(word_tokenize(c))

['It', 'was', 'day', 'one', 'of', 'trying', 'on', 'her', 'new', 'brown', 'locks', 'as', 'Anastasia', 'Steele', 'in', '``', 'Fifty', 'Shades', 'of', 'Grey', ',', "''", 'a', 'role', 'that', 'could', 'come', 'to']


Try `split` for tokenizing

In [14]:
# Tokenize on whitespace only, for comparison with word_tokenize.
print(c.split())

['It', 'was', 'day', 'one', 'of', 'trying', 'on', 'her', 'new', 'brown', 'locks', 'as', 'Anastasia', 'Steele', 'in', '"Fifty', 'Shades', 'of', 'Grey,"', 'a', 'role', 'that', 'could', 'come', 'to']


In [15]:
# Keep the whitespace-split tokens; the vocabulary and one-hot cells use them.
token_sequence = c.split()

Create a vocabulary for `token_sequence`

In [16]:
# The vocabulary is the set of distinct tokens, in sorted order.
vocab = list(set(token_sequence))
vocab.sort()
', '.join(vocab)

'"Fifty, Anastasia, Grey,", It, Shades, Steele, a, as, brown, come, could, day, her, in, locks, new, of, on, one, role, that, to, trying, was'

In [17]:
# Dimensions of the one-hot matrix: rows = tokens, columns = vocabulary words.
num_tokens, vocab_size = len(token_sequence), len(vocab)

Create one-hot vectors for token_sequence

In [18]:
# One row per token, one column per vocabulary word; a single 1 marks the word.
onehot_vectors = np.zeros((num_tokens, vocab_size), int)
# Map each word to its column once; vocab.index(word) would rescan the whole
# vocabulary list for every token.
column_of = {word: col for col, word in enumerate(vocab)}
for row, word in enumerate(token_sequence):
    onehot_vectors[row, column_of[word]] = 1

In [19]:
# Display the one-hot matrix (one row per token of token_sequence).
onehot_vectors

array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0

Create a DataFrame whose columns are the vocabulary words and whose rows are the one-hot vectors of the tokens in `token_sequence`

In [20]:
# Same matrix as a DataFrame: columns are labelled with the vocabulary words,
# rows keep the default integer index (the token's position in the sequence).
pd.DataFrame(onehot_vectors, columns=vocab)

Unnamed: 0,"""Fifty",Anastasia,"Grey,""",It,Shades,Steele,a,as,brown,come,...,locks,new,of,on,one,role,that,to,trying,was
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


Create a list with the first 20 'POLITICS' short descriptions from `dtf`

In [21]:
# First 20 short descriptions among the rows labelled POLITICS.
corpus = dtf.loc[dtf['y'] == 'POLITICS', 'short_description'].head(20).tolist()

In [22]:
# Display the selected descriptions.
corpus

['Last month a Health and Human Services official revealed the government was unable to locate nearly 1,500 children who had been released from its custody.',
 'The wiretaps feature conversations between Alexander Torshin and Alexander Romanov, a convicted Russian money launderer.',
 "But don't count on Robert Mueller to nail him, the NSA whistleblower warns.",
 'Just a peeping minute.',
 'Irish women will no longer have to travel to the United Kingdom to end their pregnancies.',
 'The interior secretary attempts damage control with hunting and fishing groups that didn’t like his fossil fuel focus.',
 'And there are four times as many male as female executives.',
 'A new law to fight sex trafficking targets some of the people it ostensibly aims to protect.',
 'For Curry and others, being Christian means rejecting white nationalism and misogyny, while protecting immigrants, refugees and the poor.',
 'The Chinese Exclusion Act barely gets mentioned in U.S. history classes. A new PBS docu

### "Stop words" processing

In [23]:
# Fetch NLTK's stop-word corpus, then print its English word list.
nltk.download('stopwords')
print(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /Users/max-
[nltk_data]     omelchenko/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [24]:
# sample sentence containing a lot of stop words
sample_text = corpus[4]
text_tokens = word_tokenize(sample_text)

# Build the stop-word set once: calling stopwords.words() inside the
# comprehension re-evaluates it for every token and tests membership on a list.
english_stopwords = set(stopwords.words('english'))
# The NLTK list is lower-case, so compare on the lower-cased token; otherwise a
# capitalised stop word (e.g. a sentence-initial "The") would be kept.
tokens_without_sw = [word for word in text_tokens if word.lower() not in english_stopwords]

print(text_tokens)
print(tokens_without_sw)

['Irish', 'women', 'will', 'no', 'longer', 'have', 'to', 'travel', 'to', 'the', 'United', 'Kingdom', 'to', 'end', 'their', 'pregnancies', '.']
['Irish', 'women', 'longer', 'travel', 'United', 'Kingdom', 'end', 'pregnancies', '.']


### Create bigrams

In [25]:
# Pair every token with its successor (n-grams of size 2).
[pair for pair in ngrams(text_tokens, 2)]

[('Irish', 'women'),
 ('women', 'will'),
 ('will', 'no'),
 ('no', 'longer'),
 ('longer', 'have'),
 ('have', 'to'),
 ('to', 'travel'),
 ('travel', 'to'),
 ('to', 'the'),
 ('the', 'United'),
 ('United', 'Kingdom'),
 ('Kingdom', 'to'),
 ('to', 'end'),
 ('end', 'their'),
 ('their', 'pregnancies'),
 ('pregnancies', '.')]

### Create Bag of Words by using `CountVectorizer()` from `sklearn`

In [26]:
import pprint

In [27]:
# Fit a bag-of-words model on the corpus; X is a sparse document-term matrix.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it and returns an ndarray, hence .tolist().
word_list = vectorizer.get_feature_names_out().tolist()
# Total count of each word over all documents, summed on the sparse matrix
# directly instead of densifying it first.
count_list = np.asarray(X.sum(axis=0)).ravel()

pp = pprint.PrettyPrinter(indent=4)
# .tolist() turns numpy integers into plain ints so the dict prints cleanly
pp.pprint(dict(zip(word_list, count_list.tolist())))

{   '12': 1,
    '500': 1,
    'about': 1,
    'act': 1,
    'actions': 1,
    'activist': 1,
    'affect': 1,
    'aims': 1,
    'alexander': 2,
    'an': 1,
    'and': 10,
    'anger': 1,
    'are': 2,
    'as': 2,
    'ask': 1,
    'assange': 1,
    'assault': 1,
    'at': 1,
    'attempts': 1,
    'away': 1,
    'barely': 1,
    'based': 1,
    'been': 2,
    'being': 3,
    'between': 1,
    'book': 1,
    'bots': 1,
    'burns': 1,
    'but': 2,
    'calling': 1,
    'catholic': 1,
    'change': 1,
    'children': 1,
    'chinese': 1,
    'christian': 1,
    'cited': 1,
    'classes': 1,
    'concerned': 1,
    'conscience': 1,
    'contact': 1,
    'control': 2,
    'conversations': 1,
    'convicted': 1,
    'could': 1,
    'couldn': 1,
    'count': 1,
    'curry': 1,
    'custody': 1,
    'damage': 1,
    'day': 1,
    'democracy': 1,
    'denounced': 1,
    'didn': 1,
    'directors': 1,
    'documentary': 1,
    'don': 1,
    'doors': 1,
    'each': 1,
    'effect': 1,
    '

### Use `Counter()` from `collections` for Bag of Words creation

In [28]:
# One Counter per document, counting the \w+ runs found in its text.
bagofwords = []
for txt in corpus:
    words = re.findall(r'\w+', txt)
    bagofwords.append(collections.Counter(words))

In [29]:
# Word counts of the first document.
bagofwords[0]

Counter({'Last': 1,
         'month': 1,
         'a': 1,
         'Health': 1,
         'and': 1,
         'Human': 1,
         'Services': 1,
         'official': 1,
         'revealed': 1,
         'the': 1,
         'government': 1,
         'was': 1,
         'unable': 1,
         'to': 1,
         'locate': 1,
         'nearly': 1,
         '1': 1,
         '500': 1,
         'children': 1,
         'who': 1,
         'had': 1,
         'been': 1,
         'released': 1,
         'from': 1,
         'its': 1,
         'custody': 1})

In [30]:
# Word counts of the second document.
bagofwords[1]

Counter({'The': 1,
         'wiretaps': 1,
         'feature': 1,
         'conversations': 1,
         'between': 1,
         'Alexander': 2,
         'Torshin': 1,
         'and': 1,
         'Romanov': 1,
         'a': 1,
         'convicted': 1,
         'Russian': 1,
         'money': 1,
         'launderer': 1})

In [31]:
# Word counts of the third document.
bagofwords[2]

Counter({'But': 1,
         'don': 1,
         't': 1,
         'count': 1,
         'on': 1,
         'Robert': 1,
         'Mueller': 1,
         'to': 1,
         'nail': 1,
         'him': 1,
         'the': 1,
         'NSA': 1,
         'whistleblower': 1,
         'warns': 1})

In [32]:
### Example with tf-idf 

In [33]:
# Three corpus entries used as the documents of the TF-IDF example.
s0, s1, s2 = corpus[1], corpus[8], corpus[16]

sent = [s0, s1, s2]
print(sent)

['The wiretaps feature conversations between Alexander Torshin and Alexander Romanov, a convicted Russian money launderer.', 'For Curry and others, being Christian means rejecting white nationalism and misogyny, while protecting immigrants, refugees and the poor.', "Unions denounced the president's actions an “assault on democracy.”"]


In [34]:
# Fit TF-IDF on the three sentences; response is a sparse 3 x n_features matrix.
vectorizer = TfidfVectorizer()
response = vectorizer.fit_transform([s0, s1, s2])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it and returns an ndarray, hence .tolist()
# to keep `features` a plain list of strings.
features = vectorizer.get_feature_names_out().tolist()
# Dense copy of the weights: one row per sentence, one column per feature.
x = response.toarray()

In [35]:
# Print the feature names (the learned vocabulary) of the fitted vectorizer.
print(features)

['actions', 'alexander', 'an', 'and', 'assault', 'being', 'between', 'christian', 'conversations', 'convicted', 'curry', 'democracy', 'denounced', 'feature', 'for', 'immigrants', 'launderer', 'means', 'misogyny', 'money', 'nationalism', 'on', 'others', 'poor', 'president', 'protecting', 'refugees', 'rejecting', 'romanov', 'russian', 'the', 'torshin', 'unions', 'while', 'white', 'wiretaps']


In [36]:
# TF-IDF weights as a table: one row per sentence, one column per feature.
pd.DataFrame(data=x, columns=features)

Unnamed: 0,actions,alexander,an,and,assault,being,between,christian,conversations,convicted,...,refugees,rejecting,romanov,russian,the,torshin,unions,while,white,wiretaps
0,0.0,0.517655,0.0,0.196845,0.0,0.0,0.258828,0.0,0.258828,0.258828,...,0.0,0.0,0.258828,0.258828,0.152868,0.258828,0.0,0.0,0.0,0.258828
1,0.0,0.0,0.0,0.503249,0.0,0.22057,0.0,0.22057,0.0,0.0,...,0.22057,0.22057,0.0,0.0,0.130272,0.0,0.0,0.22057,0.22057,0.0
2,0.346089,0.0,0.346089,0.0,0.346089,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.204405,0.0,0.346089,0.0,0.0,0.0


In [37]:
# Weight of feature 0 in the third sentence (row 2 of the sparse matrix).
print('s2:', features[0], '-', response[2, 0])

s2: actions - 0.3460885720028406


In [38]:
# Weight of feature 1 in the first sentence (row 0 of the sparse matrix).
print('s0:', features[1], '-', response[0, 1])

s0: alexander - 0.5176550192273711


In [39]:
# Weight of feature 7 in the second sentence (row 1 of the sparse matrix).
print('s1:', features[7], '-', response[1, 7])

s1: christian - 0.22057046589571722


In [40]:
# Dense array of the TF-IDF weights.
# NOTE(review): the cell that fits the TfidfVectorizer already assigns
# x = response.toarray(); this line rebuilds the same array and looks redundant.
x=response.toarray()

In [41]:
# TF-IDF vector of the first sentence (s0).
x[0]

array([0.        , 0.51765502, 0.        , 0.19684499, 0.        ,
       0.        , 0.25882751, 0.        , 0.25882751, 0.25882751,
       0.        , 0.        , 0.        , 0.25882751, 0.        ,
       0.        , 0.25882751, 0.        , 0.        , 0.25882751,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.25882751, 0.25882751,
       0.1528677 , 0.25882751, 0.        , 0.        , 0.        ,
       0.25882751])

In [42]:
# TF-IDF vector of the second sentence (s1).
x[1]

array([0.        , 0.        , 0.        , 0.50324857, 0.        ,
       0.22057047, 0.        , 0.22057047, 0.        , 0.        ,
       0.22057047, 0.        , 0.        , 0.        , 0.22057047,
       0.22057047, 0.        , 0.22057047, 0.22057047, 0.        ,
       0.22057047, 0.        , 0.22057047, 0.22057047, 0.        ,
       0.22057047, 0.22057047, 0.22057047, 0.        , 0.        ,
       0.13027247, 0.        , 0.        , 0.22057047, 0.22057047,
       0.        ])

In [43]:
# TF-IDF vector of the third sentence (s2).
x[2]

array([0.34608857, 0.        , 0.34608857, 0.        , 0.34608857,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.34608857, 0.34608857, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.34608857, 0.        , 0.        , 0.34608857,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.20440549, 0.        , 0.34608857, 0.        , 0.        ,
       0.        ])

### Calculate cosine similarity for sentences

In [44]:
from numpy.linalg import norm


def cosine_sim(u, v):
    """Return the cosine similarity of two 1-D vectors.

    Args:
        u: first vector (array-like).
        v: second vector (array-like), same length as ``u``.

    Returns:
        ``dot(u, v) / (norm(u) * norm(v))``, or ``0.0`` when either vector has
        zero norm (the plain division would produce nan in that case).
    """
    denom = norm(u) * norm(v)
    if denom == 0:
        return 0.0
    return np.dot(u, v) / denom


# Similarity between the first and the second sentence; cosine_sim can be
# reused for any other pair of rows of x.
cos_sim = cosine_sim(x[0], x[1])

In [45]:
# Cosine similarity between x[0] and x[1].
cos_sim

0.11897641420763513

In [46]:
# Cosine similarity between the first and third sentence vectors.
cos_sim2 = x[0].dot(x[2]) / (norm(x[0]) * norm(x[2]))
cos_sim2

0.031246995803410012

In [47]:
# Cosine similarity between the second and third sentence vectors.
cos_sim3 = x[1].dot(x[2]) / (norm(x[1]) * norm(x[2]))
cos_sim3

0.02662840759271147