## Bag-of-Words Models

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Bag-of-words vectorizer created with its default settings (no arguments
# are passed); it is fitted on the example sentences in a later cell.
cv = CountVectorizer()

In [17]:
# Toy corpus: four short sentences that share most of their vocabulary,
# used to illustrate the bag-of-words representation.
sentences = [
    "It was the best of times",
    "it was the worst of times",
    "it was the age of wisdom",
    "it was the age of foolishness",
]

In [18]:
# Extend the toy corpus with two sentences on an unrelated topic; a new list
# is built, so `sentences` itself is left unchanged.
more_sentences = [
    *sentences,
    "John likes to watch movies. Mary likes movies too.",
    "Mary also likes to watch football games.",
]

In [19]:
# Learn the vocabulary from all six example sentences. fit() returns the
# vectorizer itself, which is why the cell output is its repr.
cv.fit(more_sentences)

CountVectorizer()

In [20]:
# Show the learned vocabulary. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2; get_feature_names_out() is the
# replacement (the TF-IDF table further down already uses it). It returns a
# NumPy array, so convert to a list to keep the plain-list printout.
print(cv.get_feature_names_out().tolist())

['age', 'also', 'best', 'foolishness', 'football', 'games', 'it', 'john', 'likes', 'mary', 'movies', 'of', 'the', 'times', 'to', 'too', 'was', 'watch', 'wisdom', 'worst']


In [21]:
# Encode every sentence as a row of term counts using the fitted vocabulary;
# the result is a sparse document-term matrix (one row per sentence).
dt = cv.transform(more_sentences)

In [22]:
dt

<6x20 sparse matrix of type '<class 'numpy.int64'>'
	with 38 stored elements in Compressed Sparse Row format>

In [23]:
# Dense view of the document-term matrix: one row per sentence, one column
# per vocabulary term, each cell a raw count. Only sensible for tiny corpora.
# get_feature_names_out() replaces get_feature_names(), which was removed in
# scikit-learn 1.2, and matches the call used for the TF-IDF table.
pd.DataFrame(dt.toarray(), columns=cv.get_feature_names_out())

Unnamed: 0,age,also,best,foolishness,football,games,it,john,likes,mary,movies,of,the,times,to,too,was,watch,wisdom,worst
0,0,0,1,0,0,0,1,0,0,0,0,1,1,1,0,0,1,0,0,0
1,0,0,0,0,0,0,1,0,0,0,0,1,1,1,0,0,1,0,0,1
2,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,1,0,1,0
3,1,0,0,1,0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,1,2,1,2,0,0,0,1,1,0,1,0,0
5,0,1,0,0,1,1,0,0,1,1,0,0,0,0,1,0,0,1,0,0


### Calculating Similarity

In [24]:
from sklearn.metrics.pairwise import cosine_similarity

In [30]:
# Two arbitrary 8-dimensional example vectors.
word1 = [1, 3, 3, 45, 5, 6, 6, 6]
word2 = [2, 3, 54, 5, 6, 2, 4, 5]

# cosine_similarity works on 2-D inputs (one row per sample), so each vector
# is wrapped in a list; the result is a 1x1 array.
cosine_similarity([word1], [word2])

array([[0.19218451]])

In [33]:
# Pairwise cosine similarity between all rows (sentences) of the count
# matrix, wrapped in a DataFrame so it renders as a table.
pd.DataFrame(cosine_similarity(dt, dt))

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.833333,0.666667,0.666667,0.0,0.0
1,0.833333,1.0,0.666667,0.666667,0.0,0.0
2,0.666667,0.666667,1.0,0.833333,0.0,0.0
3,0.666667,0.666667,0.833333,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.524142
5,0.0,0.0,0.0,0.0,0.524142,1.0


## TF-IDF Models

In [34]:
from sklearn.feature_extraction.text import TfidfTransformer

In [35]:
# Re-weight the raw count matrix `dt` into TF-IDF values. The transformer
# works on an existing count matrix rather than on raw text.
tfidf = TfidfTransformer()

tfidf_dt = tfidf.fit_transform(dt)

In [39]:
# Dense view of the TF-IDF matrix, with columns labelled by the vocabulary
# of the count vectorizer that produced the underlying counts.
pd.DataFrame(tfidf_dt.toarray(), columns=cv.get_feature_names_out())

Unnamed: 0,age,also,best,foolishness,football,games,it,john,likes,mary,movies,of,the,times,to,too,was,watch,wisdom,worst
0,0.0,0.0,0.56978,0.0,0.0,0.0,0.338027,0.0,0.0,0.0,0.0,0.338027,0.338027,0.467228,0.0,0.0,0.338027,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.338027,0.0,0.0,0.0,0.0,0.338027,0.338027,0.467228,0.0,0.0,0.338027,0.0,0.0,0.56978
2,0.467228,0.0,0.0,0.0,0.0,0.0,0.338027,0.0,0.0,0.0,0.0,0.338027,0.338027,0.0,0.0,0.0,0.338027,0.0,0.56978,0.0
3,0.467228,0.0,0.0,0.56978,0.0,0.0,0.338027,0.0,0.0,0.0,0.0,0.338027,0.338027,0.0,0.0,0.0,0.338027,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.305609,0.501208,0.250604,0.611219,0.0,0.0,0.0,0.250604,0.305609,0.0,0.250604,0.0,0.0
5,0.0,0.419233,0.0,0.0,0.419233,0.419233,0.0,0.0,0.343777,0.343777,0.0,0.0,0.0,0.0,0.343777,0.0,0.0,0.343777,0.0,0.0


In [40]:
# Same pairwise comparison as for the raw counts, now on the TF-IDF weighted
# rows, shown as a table.
pd.DataFrame(cosine_similarity(tfidf_dt, tfidf_dt))

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.675351,0.457049,0.457049,0.0,0.0
1,0.675351,1.0,0.457049,0.457049,0.0,0.0
2,0.457049,0.457049,1.0,0.675351,0.0,0.0
3,0.457049,0.457049,0.675351,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.43076
5,0.0,0.0,0.0,0.0,0.43076,1.0


### ABC Dataset

In [43]:
from pathlib import Path

# Dataset directory, given relative to the notebook's working directory.
BASE_PATH = Path("../DATASETS/5")

In [46]:
# Load the gzip-compressed headlines CSV from the dataset directory and parse
# the `publish_date` column as datetimes.
headlines = pd.read_csv(
    BASE_PATH / "abcnews-date-text.csv.gz",
    parse_dates=["publish_date"],
)

In [48]:
len(headlines)

1103663

In [49]:
headlines.head(4)

Unnamed: 0,publish_date,headline_text
0,2003-02-19,aba decides against community broadcasting lic...
1,2003-02-19,act fire witnesses must be aware of defamation
2,2003-02-19,a g calls for infrastructure protection summit
3,2003-02-19,air nz staff in aust strike for pay rise


In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [52]:
# Vectorize the raw headline text directly into a TF-IDF matrix.
# Note: `tfidf` and `dt` are rebound here; earlier in the notebook they held
# the TfidfTransformer and the toy-corpus count matrix respectively.
tfidf = TfidfVectorizer()

dt = tfidf.fit_transform(headlines['headline_text'])

### Reducing Feature Dimensions

In [55]:
# Removing Stop words

In [56]:
from spacy.lang.en.stop_words import STOP_WORDS

In [57]:
print(len(STOP_WORDS))

326


In [58]:
# Drop spaCy's English stop words from the vocabulary.
# spaCy exposes STOP_WORDS as a set, but scikit-learn (>= 1.2) validates
# `stop_words` and accepts only 'english', a list or None, so pass a list.
tfidf = TfidfVectorizer(stop_words=list(STOP_WORDS))

In [60]:
# Re-fit on the headlines with the vectorizer configured in the previous cell
# and display the resulting sparse matrix summary (shape and stored elements).
dt = tfidf.fit_transform(headlines["headline_text"])
dt

<1103663x95600 sparse matrix of type '<class 'numpy.float64'>'
	with 5644186 stored elements in Compressed Sparse Row format>

In [61]:
# minimum frequency

In [62]:
# An integer min_df is an absolute document count: terms that occur in fewer
# than 2 headlines are dropped from the vocabulary.
# STOP_WORDS is a set; scikit-learn (>= 1.2) only accepts 'english', a list
# or None for `stop_words`, hence the list() conversion.
tfidf = TfidfVectorizer(stop_words=list(STOP_WORDS), min_df=2)
dt = tfidf.fit_transform(headlines["headline_text"])
dt



<1103663x58527 sparse matrix of type '<class 'numpy.float64'>'
	with 5607113 stored elements in Compressed Sparse Row format>

In [63]:
# A float min_df is a proportion of documents: with 1,103,663 headlines,
# 1e-5 corresponds to roughly 11 documents, so rarer terms are dropped.
# STOP_WORDS is a set; scikit-learn (>= 1.2) only accepts 'english', a list
# or None for `stop_words`, hence the list() conversion.
tfidf = TfidfVectorizer(stop_words=list(STOP_WORDS), min_df=1e-5)
dt = tfidf.fit_transform(headlines["headline_text"])
dt

<1103663x24047 sparse matrix of type '<class 'numpy.float64'>'
	with 5457800 stored elements in Compressed Sparse Row format>

In [65]:
# Maximum frequency

In [67]:
# A float max_df is a proportion of documents: terms occurring in more than
# that share of headlines are dropped.
# NOTE(review): 1e-5 is the same threshold used for min_df above, so this keeps
# only the rare terms (the vocabulary is the complement of the min_df=1e-5
# run: 95600 - 24047 = 71553). A "maximum frequency" filter normally uses a
# much larger proportion to remove very common terms -- confirm this value
# is intended.
# STOP_WORDS is a set; scikit-learn (>= 1.2) only accepts 'english', a list
# or None for `stop_words`, hence the list() conversion.
tfidf = TfidfVectorizer(stop_words=list(STOP_WORDS), max_df=1e-5)
dt = tfidf.fit_transform(headlines["headline_text"])
dt



<1103663x71553 sparse matrix of type '<class 'numpy.float64'>'
	with 186386 stored elements in Compressed Sparse Row format>

## Improving Features by Making Them More Specific

In [68]:
# Performing Linguistic analysis

In [95]:
import spacy


In [96]:
# Load the small English spaCy pipeline with the named-entity recognizer and
# the parser switched off; only lemmas and part-of-speech tags are used below.
nlp = spacy.load(
    "en_core_web_sm",
    disable=["ner", "parser"],
)

# Part-of-speech tags whose tokens are kept for the "nav" feature. Despite
# the name, the list also contains proper nouns (PROPN) and adverbs (ADV).
nouns_adjectives_verbs = ["NOUN", "PROPN", "ADJ", "ADV", "VERB"]

In [None]:
# Add two text columns derived from each headline:
#   "lemmas" - the lemma of every token, space-joined
#   "nav"    - the lemmas of tokens whose POS tag is in nouns_adjectives_verbs
#
# The headlines are streamed through nlp.pipe() and each column is assigned
# once, instead of calling nlp() per row inside iterrows() and writing single
# cells with .at, which is very slow on a frame with over a million rows.
# astype(str) mirrors the original str(...) conversion of each headline.
keep_pos = set(nouns_adjectives_verbs)
lemmas = []
nav = []
for doc in nlp.pipe(headlines["headline_text"].astype(str)):
    lemmas.append(" ".join(token.lemma_ for token in doc))
    nav.append(" ".join(token.lemma_ for token in doc if token.pos_ in keep_pos))

# Lists are assigned positionally, one entry per row in row order.
headlines["lemmas"] = lemmas
headlines["nav"] = nav

In [None]:
headlines.head(4)