# Using vectorizers 

In [2]:
import pandas as pd
import numpy as np
# Load the ABC News headline corpus; later cells read its "headline_text" column.
csv_path = 'abcnews-date-text.csv'
headlines = pd.read_csv(csv_path)

# Preview the first five rows.
headlines.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [3]:
# Number of rows (headlines) in the corpus.
headlines.shape[0]

1103663

In [4]:
# scikit-learn's TfidfVectorizer does count vectorization and TF-IDF weighting in one step (it builds on CountVectorizer internally)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Baseline vectorizer: default settings, no stop-word removal or frequency cut-offs.
tfidf = TfidfVectorizer()

# Learn the vocabulary and build the document-term matrix in one pass.
headline_texts = headlines["headline_text"]
dt = tfidf.fit_transform(headline_texts)

In [6]:
# Display the sparse document-term matrix summary (shape, dtype, stored elements).
dt

<1103663x95878 sparse matrix of type '<class 'numpy.float64'>'
	with 7001357 stored elements in Compressed Sparse Row format>

In [7]:
# Size in bytes of the array of stored (non-zero) values only;
# the sparse index arrays are not included in this figure.
dt.data.nbytes

56010856

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

# Similarity between the first two headlines; each slice is a single-row sparse matrix.
first_doc, second_doc = dt[0], dt[1]
cosine_similarity(first_doc, second_doc)

array([[0.]])

In [10]:
%%time
cosine_similarity(dt[0:10000], dt[0:10000])

Wall time: 390 ms


array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.16913596,
        0.16792138],
       [0.        , 0.        , 0.        , ..., 0.16913596, 1.        ,
        0.33258708],
       [0.        , 0.        , 0.        , ..., 0.16792138, 0.33258708,
        1.        ]])

In [11]:
# Reducing the number of feature dimensions helps machine-learning problems scale

### Part of feature engineering is therefore focused on reducing the features to the ones that are really necessary.

In [12]:
# Removing stop words

In [13]:
# We pass the stop words as a named parameter to TfidfVectorizer:

In [15]:
from spacy.lang.en.stop_words import STOP_WORDS as stopwords

# How many English stop words does spaCy ship with?
stopword_count = len(stopwords)
print(stopword_count)

326


In [16]:
# spaCy's STOP_WORDS is a set, but recent scikit-learn releases validate
# `stop_words` and accept only 'english', a list, or None, so convert it first.
tfidf = TfidfVectorizer(stop_words=list(stopwords))
dt = tfidf.fit_transform(headlines["headline_text"])

# Show the matrix summary to compare vocabulary size and stored elements.
dt



<1103663x95600 sparse matrix of type '<class 'numpy.float64'>'
	with 5644186 stored elements in Compressed Sparse Row format>

In [17]:
# The number of stored elements drops again: from 7,001,357 to 5,644,186,
# so the matrix needs less memory.

### Minimum frequency

In [19]:
# We can ignore words that occur in fewer than two documents (using min_df=2)

In [20]:
# min_df=2 drops terms that appear in fewer than two headlines.
# The stop-word set is converted to a list because recent scikit-learn
# releases reject a set for `stop_words`.
tfidf = TfidfVectorizer(stop_words=list(stopwords), min_df=2)
dt = tfidf.fit_transform(headlines["headline_text"])

# Show the matrix summary to see the reduced vocabulary.
dt



<1103663x58527 sparse matrix of type '<class 'numpy.float64'>'
	with 5607113 stored elements in Compressed Sparse Row format>

### Maximum frequency

The `max_df` parameter eliminates terms that occur too often in the corpus.

In [21]:
# max_df=0.1 drops terms that appear in more than 10% of the headlines.
# The stop-word set is converted to a list because recent scikit-learn
# releases reject a set for `stop_words`.
tfidf = TfidfVectorizer(stop_words=list(stopwords), max_df=0.1)
dt = tfidf.fit_transform(headlines["headline_text"])

# Show the matrix summary to check whether any terms were removed.
dt



<1103663x95600 sparse matrix of type '<class 'numpy.float64'>'
	with 5644186 stored elements in Compressed Sparse Row format>

In [22]:
# Setting max_df to a low value of 10% does not eliminate a single word!

## Linguistic Analysis

In [23]:
# Lemmatize

In [26]:
import spacy

# Load the small English pipeline; without this `nlp` is undefined on a fresh kernel.
nlp = spacy.load("en_core_web_sm")

# Part-of-speech tags to keep for the "nav" column.
# Despite the name, the list also includes proper nouns and adverbs.
nouns_adjectives_verbs = ["NOUN", "PROPN", "ADJ", "ADV", "VERB"]

# nlp.pipe processes the texts in batches, which is much faster than calling
# nlp() once per row inside DataFrame.iterrows().
lemma_texts = []
nav_texts = []
for doc in nlp.pipe(headlines["headline_text"].astype(str), batch_size=1000):
    # All lemmas of the headline, space-separated.
    lemma_texts.append(" ".join(token.lemma_ for token in doc))
    # Only the lemmas whose part of speech is in the list above.
    nav_texts.append(" ".join(token.lemma_ for token in doc
                              if token.pos_ in nouns_adjectives_verbs))

# Assign whole columns at once instead of writing cell by cell with .at.
headlines["lemmas"] = lemma_texts
headlines["nav"] = nav_texts

KeyboardInterrupt: 

**Using Lemmas Instead of Words for Vectorizing Documents**

This will reduce the vocabulary size.

In [None]:
# Vectorize the lemmatized headlines instead of the raw text.
# The stop-word set is converted to a list because recent scikit-learn
# releases reject a set for `stop_words`.
tfidf = TfidfVectorizer(stop_words=list(stopwords))
# .map(str) turns any non-string cell into its string form before vectorizing.
dt = tfidf.fit_transform(headlines["lemmas"].map(str))

# Show the matrix summary to compare the vocabulary size.
dt

## Limiting word types

In [None]:
# Vectorize only the lemmas kept in the "nav" column (selected word types).
# The stop-word set is converted to a list because recent scikit-learn
# releases reject a set for `stop_words`.
tfidf = TfidfVectorizer(stop_words=list(stopwords))
# .map(str) turns any non-string cell into its string form before vectorizing.
dt = tfidf.fit_transform(headlines["nav"].map(str))

# Show the matrix summary to compare the vocabulary size.
dt

### Adding Context via N-Grams

In [None]:
# Compare matrix shape and value storage when adding bigrams, then trigrams.
# After the loop, `tfidf` and `dt` hold the (1, 3) variant, as before.
for ngram_range in [(1, 2), (1, 3)]:
    # The stop-word set is converted to a list because recent scikit-learn
    # releases reject a set for `stop_words`.
    tfidf = TfidfVectorizer(stop_words=list(stopwords), ngram_range=ngram_range,
                            min_df=2)
    dt = tfidf.fit_transform(headlines["headline_text"])
    print(dt.shape)
    print(dt.data.nbytes)

## Syntactic Similarity

Find similar documents in the ABC dataset.

In [None]:
# NOTE: `stopwords` is the STOP_WORDS set imported from spaCy, so this adds
# "test" to that shared set in place (set.add is a no-op on re-run).
stopwords.add("test")

# Unigrams and bigrams that occur in at least two headlines, L2-normalised rows.
# The stop-word set is converted to a list because recent scikit-learn
# releases reject a set for `stop_words`.
tfidf = TfidfVectorizer(stop_words=list(stopwords), ngram_range=(1, 2), min_df=2,
                        norm='l2')
dt = tfidf.fit_transform(headlines["headline_text"])

**Finding Most Similar Headlines to a Made-up Headline**

In [None]:
# Project a made-up headline into the fitted TF-IDF space.
query_text = "australia and new zealand discuss optimal apple \
                            size"
made_up = tfidf.transform([query_text])

# Cosine similarity of the query against every headline in the corpus.
sim = cosine_similarity(made_up, dt)

# argmax over the scores gives the row position of the most similar headline.
best_match = sim.argmax()
headlines.iloc[best_match]

## Finding the Two Most Similar Documents in a Large Corpus (Much More Difficult)

**Because the similarity matrix is symmetric, we only need to calculate the elements below its diagonal.**