In [1]:
import numpy as np
import pandas as pd

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))


/kaggle/input/million-headlines/abcnews-date-text.csv


In [2]:
# Load the headlines CSV listed above and preview the first rows.
df = pd.read_csv("/kaggle/input/million-headlines/abcnews-date-text.csv")
df.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [3]:
# Column dtypes, non-null counts and memory footprint of the full dataset.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1244184 entries, 0 to 1244183
Data columns (total 2 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   publish_date   1244184 non-null  int64 
 1   headline_text  1244184 non-null  object
dtypes: int64(1), object(1)
memory usage: 19.0+ MB


In [4]:
# Show one headline in full; the head() preview truncates long text with "...".
df.loc[0, 'headline_text']

'aba decides against community broadcasting licence'

Take a small random sample of the data so we can iterate on approaches quickly

In [5]:
# Draw a reproducible 1% sample. The fixed seed makes a kernel restart select
# the same rows, so the vocabulary sizes reported in later cells stay comparable
# between runs.
sample_df = df.sample(frac=0.01, random_state=42)
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12442 entries, 555280 to 423367
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   publish_date   12442 non-null  int64 
 1   headline_text  12442 non-null  object
dtypes: int64(1), object(1)
memory usage: 291.6+ KB


## Bag of Words Model

First, we apply a basic count vectorizer to build the vocabulary and a bag-of-words (term-count) representation of our data

In [6]:
from sklearn.feature_extraction.text import CountVectorizer


def count_vectorizer_fn(texts=None, **kwargs):
    """Fit a ``CountVectorizer`` and return the resulting document-term matrix.

    Parameters
    ----------
    texts : iterable of str, optional
        Documents to vectorize. When omitted, the ``headline_text`` column of
        the notebook-level ``sample_df`` is used, which is what the existing
        keyword-only calls in this notebook rely on.
    **kwargs
        Forwarded unchanged to ``CountVectorizer`` (for example ``stop_words``,
        ``min_df``, ``max_df`` or ``ngram_range``).

    Returns
    -------
    The matrix returned by ``CountVectorizer.fit_transform`` (one row per
    document, one column per vocabulary term).
    """
    if texts is None:
        # Default to the sampled headlines so callers that pass only
        # vectorizer options keep working exactly as before.
        texts = sample_df['headline_text']
    return CountVectorizer(**kwargs).fit_transform(texts)

In [7]:
# Baseline: default settings, no stop-word removal and no frequency cut-offs.
count_vectorizer_fn()

<12442x14416 sparse matrix of type '<class 'numpy.int64'>'
	with 81039 stored elements in Compressed Sparse Row format>

12k rows and a vocabulary of 14k terms for the sample. This does not scale well: on the full dataset the vocabulary would grow much larger. So far we have not preprocessed the text at all, so let's try to reduce the feature dimension

## Reducing Feature Dimension

### Stopwords

In [8]:
from spacy.lang.en.stop_words import STOP_WORDS as stopwords


# Number of entries in spaCy's English stop-word set.
print(len(stopwords))

326


In [9]:
# Same vectorizer, now ignoring the spaCy stop words (the set is passed as a list).
count_vectorizer_fn(stop_words=list(stopwords))



<12442x14198 sparse matrix of type '<class 'numpy.int64'>'
	with 65248 stored elements in Compressed Sparse Row format>

This didn't reduce our feature space much: we still have a vocabulary of about 14k terms

### Minimum Document Frequency

In [10]:
# Integer min_df: keep only terms that occur in at least 2 headlines.
count_vectorizer_fn(stop_words=list(stopwords), min_df=2)

<12442x6889 sparse matrix of type '<class 'numpy.int64'>'
	with 57939 stored elements in Compressed Sparse Row format>

In [11]:
# Float min_df is a fraction of the documents: 0.0001 * 12442 ≈ 1.24, so a term
# must appear in at least 2 headlines -- the same cut-off as min_df=2.
count_vectorizer_fn(stop_words=list(stopwords), min_df=0.0001)

<12442x6889 sparse matrix of type '<class 'numpy.int64'>'
	with 57939 stored elements in Compressed Sparse Row format>

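Setting min_df cuts our vocabulary roughly in half (14198 → 6889 terms), which is a big improvement in terms of scalability. Both settings give the same result here because 0.0001 × 12442 ≈ 1.24, i.e. a term must appear in at least 2 headlines in either case.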

### Maximum Document Frequency

We might have a corpus with a lot of repeating terms so we can remove them via max_df parameter.

In [12]:
# Float max_df: drop terms that appear in more than 10% of the headlines,
# on top of the stop-word removal.
count_vectorizer_fn(stop_words=list(stopwords), max_df=0.1)

<12442x14198 sparse matrix of type '<class 'numpy.int64'>'
	with 65248 stored elements in Compressed Sparse Row format>

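That didn't reduce anything. Once the stop words are removed, no remaining term appears in more than 10% of the headlines, so max_df=0.1 has nothing left to drop. Without the stop-word list (next cell), max_df=0.1 removes only 3 terms (14416 → 14413), so an explicit stop-word list is the more effective choice here.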

In [13]:
# Same max_df cut-off without the stop-word list, for comparison.
count_vectorizer_fn(max_df=0.1)

<12442x14413 sparse matrix of type '<class 'numpy.int64'>'
	with 75808 stored elements in Compressed Sparse Row format>

### N-Grams

We can also add groups of consecutive words (n-grams) to the vocabulary. This increases the feature dimension, but it captures context that single words miss

In [14]:
# Unigrams and bigrams; every n-gram must still occur in at least 2 headlines.
count_vectorizer_fn(stop_words=list(stopwords), min_df=2, ngram_range=(1, 2))

<12442x9763 sparse matrix of type '<class 'numpy.int64'>'
	with 66475 stored elements in Compressed Sparse Row format>

In [15]:
# Extend the range to trigrams as well.
count_vectorizer_fn(stop_words=list(stopwords), min_df=2, ngram_range=(1, 3))

<12442x10073 sparse matrix of type '<class 'numpy.int64'>'
	with 67236 stored elements in Compressed Sparse Row format>

## TF-IDF Models

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
# TF-IDF with the same options as the unigram+bigram count vectorizer above:
# the vocabulary is built the same way, but entries are float weights, not counts.
tfidf = TfidfVectorizer(stop_words=list(stopwords), min_df=2, ngram_range=(1, 2))
dt = tfidf.fit_transform(sample_df['headline_text'])
dt

<12442x9763 sparse matrix of type '<class 'numpy.float64'>'
	with 66475 stored elements in Compressed Sparse Row format>

## Lemmas

We can apply some linguistic techniques to improve our features. Here we add lemmatized versions of the headlines; for news headlines, lemmatization is unlikely to lose much information.


In [21]:
from tqdm.auto import tqdm

import spacy


# Load spaCy's small English pipeline.
nlp = spacy.load("en_core_web_sm")
# Part-of-speech tags kept when building the "nav" column. Despite the variable
# name, the list also contains proper nouns (PROPN) and adverbs (ADV).
nouns_adjectives_verbs = ["NOUN", "PROPN", "ADJ", "ADV", "VERB"]

In [22]:
# Lemmatize every sampled headline once. nlp.pipe streams the texts through
# spaCy in batches instead of invoking the pipeline separately for each row,
# and collecting results in lists avoids growing the DataFrame cell by cell.
lemma_texts = []
nav_texts = []
headlines = sample_df["headline_text"].astype(str)
for doc in tqdm(nlp.pipe(headlines), total=len(sample_df)):
    # "lemmas": every token replaced by its lemma.
    lemma_texts.append(" ".join(token.lemma_ for token in doc))
    # "nav": lemmas of tokens whose POS tag is in nouns_adjectives_verbs.
    nav_texts.append(" ".join(token.lemma_ for token in doc if token.pos_ in nouns_adjectives_verbs))

# Assign whole columns at once; list order matches the row order of sample_df.
sample_df["lemmas"] = lemma_texts
sample_df["nav"] = nav_texts

  0%|          | 0/12442 [00:00<?, ?it/s]

### Using Lemmas instead of Words for Vectorizing

In [23]:
# Vectorize the lemmatized headlines instead of the raw text; stop words are
# still removed. .map(str) guarantees every entry is a string.
tfidf = TfidfVectorizer(stop_words=list(stopwords))
dt = tfidf.fit_transform(sample_df["lemmas"].map(str))
dt



<12442x11544 sparse matrix of type '<class 'numpy.float64'>'
	with 64085 stored elements in Compressed Sparse Row format>

### Limit Word Types

Focusing on a subset of word types (here nouns, proper nouns, adjectives, adverbs and verbs) could work better

In [24]:
# Vectorize only the POS-filtered lemmas stored in the "nav" column.
tfidf = TfidfVectorizer(stop_words=list(stopwords))
dt = tfidf.fit_transform(sample_df["nav"].map(str))
dt

<12442x11265 sparse matrix of type '<class 'numpy.float64'>'
	with 62853 stored elements in Compressed Sparse Row format>

### Remove Most Common Words

Let's remove the 10,000 most common English words from our vocabulary by using them as stop words

In [25]:
# Download the list of the 10,000 most common English words (one word per line,
# no header). keep_default_na=False stops pandas from parsing entries that look
# like missing-value markers (e.g. "null" or "na") as NaN, so every word arrives
# as a string and can actually act as a stop word.
top_10000 = pd.read_csv(
    "https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english.txt",
    header=None,
    keep_default_na=False,
)

In [26]:
# Use the de-duplicated common-word list (first column of top_10000) as the
# stop-word list, then vectorize the POS-filtered lemmas.
tfidf = TfidfVectorizer(stop_words=list(set(top_10000.iloc[:, 0].values)))
dt = tfidf.fit_transform(sample_df["nav"].map(str))
dt

<12442x7004 sparse matrix of type '<class 'numpy.float64'>'
	with 15768 stored elements in Compressed Sparse Row format>

### Adding Context via N-Gram

In [27]:
# Same common-word stop list, now with unigrams+bigrams that occur in at least
# 2 headlines.
tfidf = TfidfVectorizer(stop_words=list(set(top_10000.iloc[:, 0].values)), min_df=2, ngram_range=(1, 2))
dt = tfidf.fit_transform(sample_df["nav"].map(str))
dt

<12442x2536 sparse matrix of type '<class 'numpy.float64'>'
	with 11470 stored elements in Compressed Sparse Row format>

## Syntactic Similarity in the Dataset

Let's take a look at finding similar documents in our dataset

### Finding Most Similar Headlines to a Made-Up Headline

Imagine you want to find a headline in our data that matches a headline you vaguely remember. Let's try to do this

In [28]:
# Fit the vectorizer used for the similarity search: lemmatized headlines,
# stop words removed, terms required to occur in at least 2 headlines.
tfidf = TfidfVectorizer(stop_words=list(stopwords), min_df=2)
dt = tfidf.fit_transform(sample_df["lemmas"].map(str))
dt



<12442x5709 sparse matrix of type '<class 'numpy.float64'>'
	with 58250 stored elements in Compressed Sparse Row format>

In [29]:
from sklearn.metrics.pairwise import cosine_similarity

# Project the made-up headline into the already-fitted TF-IDF space
# (transform only, the vocabulary is not refitted).
made_up = tfidf.transform(["australia and new zealand discuss optimal apple size"])

# One row of cosine similarities: the query against every sampled headline.
sim = cosine_similarity(made_up, dt)
sim[0]

array([0.        , 0.07257882, 0.        , ..., 0.        , 0.        ,
       0.        ])

In [30]:
# Order row positions by similarity to the query, highest first, and display
# the five best-matching headlines.
ranked_positions = np.argsort(sim[0])[::-1]
best_five = ranked_positions[:5]
sample_df.iloc[best_five][["publish_date", "lemmas"]]

Unnamed: 0,publish_date,lemmas
633450,20110818,federal push to ban new zealand apple
552450,20100724,new headache for apple as iphone 4 delay
1238723,20210722,fire the size of los angeles rip through oregon
626690,20110714,new label to hit aussie apple
1202108,20200610,new zealand art music open up impact on austra...


### Finding the Two Most Similar Documents in a Large Corpus

Our main dataset has about 1.2 million rows. Vectorizing it and computing all pairwise cosine similarities with a single dot product is a natural first idea, but the resulting similarity matrix would have more than 1 trillion entries (1.2M × 1.2M), which does not fit in RAM. Because the similarity matrix is symmetric (the similarity of a and b equals that of b and a), we only need to compute half of it, which halves the work. To skip the redundant half while staying vectorized, we compute the similarity matrix block by block: we take one batch of rows at a time, compute its similarities only against the batches on or below the diagonal, and keep just the running maximum instead of the full matrix.

In [31]:
# Refit on the raw sampled headlines with unigrams+bigrams. norm='l2' scales
# every row to unit length, so a plain dot product of two rows is their cosine
# similarity.
tfidf = TfidfVectorizer(stop_words=list(stopwords), ngram_range=(1,2), min_df=2, norm='l2')
dt = tfidf.fit_transform(sample_df["headline_text"])



#### Timing Cosine Similarity

In [32]:
%%time
# Pairwise similarity of the first 10000 rows, kept sparse (dense_output=False).
cosine_similarity(dt[0:10000], dt[0:10000], dense_output=False)

CPU times: user 30.9 ms, sys: 12 ms, total: 43 ms
Wall time: 41.2 ms


<10000x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 1320492 stored elements in Compressed Sparse Row format>

In [33]:
%%time
# Dense variant: the full 10000 x 10000 similarity matrix is materialized.
r = cosine_similarity(dt[0:10000], dt[0:10000])
# Zero out (near-)identical pairs, which includes every row compared with itself.
r[r > 0.9999] = 0
# Flat (row-major) index of the largest remaining similarity.
print(np.argmax(r))

43206829
CPU times: user 569 ms, sys: 1.76 s, total: 2.33 s
Wall time: 2.33 s


In [34]:
%%time
# Sparse variant of the same computation (dense_output=False).
r = cosine_similarity(dt[0:10000], dt[0:10000], dense_output=False)
# Zero out (near-)identical pairs, which includes every row compared with itself.
r[r > 0.9999] = 0
# Flat (row-major) index of the largest remaining similarity.
print(np.argmax(r))

43206829
CPU times: user 370 ms, sys: 101 ms, total: 471 ms
Wall time: 470 ms


#### Timing Dot Product

In [35]:
%%time
# The rows of dt are L2-normalized (norm='l2'), so the matrix product of the
# block with its own transpose is the cosine similarity. The sparse `@` operator
# is used instead of np.dot, which is not aware of SciPy sparse matrices.
r = dt[0:10000] @ dt[0:10000].T
# Zero out (near-)identical pairs, which includes every row compared with itself.
r[r > 0.9999] = 0
# Flat (row-major) index of the largest remaining similarity.
print(np.argmax(r))

43206829
CPU times: user 335 ms, sys: 29 ms, total: 364 ms
Wall time: 362 ms


#### Batch Data

In [49]:
# There are "test" headlines in the corpus, so treat "test" as a stop word too.
# Build a new set instead of calling stopwords.add(): `stopwords` is spaCy's
# shared STOP_WORDS object, and mutating it in place would silently change the
# result of every other cell that uses it, depending on execution order.
stopwords_with_test = set(stopwords) | {"test"}
tfidf = TfidfVectorizer(stop_words=list(stopwords_with_test), ngram_range=(1,2), min_df=2, norm='l2')
dt = tfidf.fit_transform(df["headline_text"])     # Use the whole data

In [50]:
%%time
batch = 10000
# Similarities above this value are treated as identical headlines and ignored.
duplicate_threshold = 0.9999
max_sim = 0.0
max_a = None
max_b = None

for a in tqdm(range(0, dt.shape[0], batch)):
    # The row block does not change in the inner loop, so slice it once.
    rows_a = dt[a:a+batch]
    # Similarity is symmetric, so only column blocks on or below the diagonal
    # (b <= a) are computed.
    for b in range(0, a+batch, batch):
        r = cosine_similarity(rows_a, dt[b:b+batch], dense_output=False)
        # Eliminate identical vectors (including each row compared with itself)
        # by setting their similarity to 0, so they can never be the maximum.
        # Masking r.data only touches the stored entries of the sparse matrix.
        r.data[r.data > duplicate_threshold] = 0
        sim = r.max()
        if sim > max_sim:
            # argmax returns a single flat index which we have to
            # map to the two dimensions of this block
            (max_a, max_b) = np.unravel_index(r.argmax(), r.shape)
            # adjust offsets in corpus (this is a submatrix)
            max_a += a
            max_b += b
            max_sim = sim

  0%|          | 0/125 [00:00<?, ?it/s]

CPU times: user 11min, sys: 894 ms, total: 11min 1s
Wall time: 11min 1s


In [51]:
# Row positions (in dt, and therefore in df) of the most similar non-identical pair.
print(max_a, max_b)

1131410 471938


In [52]:
# Cosine similarity of that pair.
print(max_sim)

0.9898072551008147


In [53]:
# Look up the two headlines by position; dt was fitted on df["headline_text"],
# so its row order matches df.
df.iloc[[max_a, max_b]][["publish_date", "headline_text"]]

Unnamed: 0,publish_date,headline_text
1131410,20180620,money money money
471938,20090630,money in money out


### Finding Related Words

Words should be more related if they appear together in the corpus. 

In [45]:
# Word-level TF-IDF on the full corpus, restricted to frequent words:
# min_df=1000 keeps only words that occur in at least 1000 headlines.
tfidf_word = TfidfVectorizer(stop_words=list(stopwords), min_df=1000)       # Words should appear more
dt_word = tfidf_word.fit_transform(df["headline_text"])

In [46]:
# Transpose so that each row is a word (its weights across all headlines);
# the result is a dense word-by-word similarity matrix.
r = cosine_similarity(dt_word.T, dt_word.T)
# Every word is trivially most similar to itself, so blank out the diagonal.
np.fill_diagonal(r, 0)

In [47]:
voc = tfidf_word.get_feature_names_out()
size = r.shape[0] # quadratic

# r is symmetric, so each pair appears twice among the largest flat entries
# (as (a, b) and as (b, a)); the top 20 entries therefore yield up to 10 pairs.
for index in np.argsort(r.ravel())[::-1][0:20]:
    # Recover (row, col) with exact integer arithmetic. int(index / size) goes
    # through float division, which can round incorrectly for large indices.
    a, b = divmod(int(index), size)
    if a > b:  # avoid repetitions
        print('"%s" related to "%s"' % (voc[a], voc[b]))

"kong" related to "hong"
"sri" related to "lanka"
"covid" related to "19"
"seekers" related to "asylum"
"springs" related to "alice"
"trump" related to "donald"
"hour" related to "country"
"pleads" related to "guilty"
"hill" related to "broken"
"vs" related to "summary"
