In [1]:
import string
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Location of the abcnews-date-text.csv file (raw GitHub download link).
url = 'https://raw.githubusercontent.com/lucascheng24/COMP4432ML-DataProduct-A_Million_News_Headlines/main/raw_data/abcnews-date-text.csv'

# Read the whole CSV into a DataFrame and keep a handle on the headline column.
df = pd.read_csv(filepath_or_buffer=url)
headlines = df['headline_text']

In [3]:
# Sampling configuration.
analyze_random_state = 4432          # seed passed to df.sample so the subsample is repeatable
sampleSize = len(headlines) // 20    # one twentieth of the headlines, i.e. 5%

print("sampleSize: ", sampleSize)

sampleSize:  62209


In [4]:
# Replace df with a seeded random subsample of sampleSize rows.
# NOTE: this rebinds df, so the full frame is no longer reachable afterwards.
df = df.sample(
    random_state=analyze_random_state,
    n=sampleSize,
)

In [6]:
# Preview the first rows of the sampled frame (publish_date and headline_text).
df.head()

Unnamed: 0,publish_date,headline_text
157663,20050415,govt urged to release sustainability grants
53129,20031104,whatmore names side for one dayers
946324,20150422,driverless cars adelaide trials closer accordi...
1062810,20161216,star wars quiz: test your knowledge of a galax...
210891,20060106,student success ascribed to support network


In [7]:
# Drop rows whose headline text repeats an earlier row (the first occurrence is kept),
# then re-derive the headline Series so it matches the de-duplicated frame.
df = df.drop_duplicates(subset='headline_text', keep='first')
headlines = df['headline_text']

In [8]:
# Fetch the NLTK data this cell and preprocess() rely on. Without it a fresh
# environment raises LookupError in stopwords.words() / word_tokenize().
# 'punkt' is the tokenizer model used by older NLTK releases; newer releases
# look for 'punkt_tab' instead, so both are requested. Already-installed
# resources are not downloaded again.
for _resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(_resource, quiet=True)

# Lookup sets used by preprocess() to filter tokens.
stop_words = set(stopwords.words('english'))   # English stop words from the NLTK corpus
punctuation = set(string.punctuation)          # individual ASCII punctuation characters

def preprocess(text):
    """Tokenize a headline and filter out uninformative tokens.

    The text is lower-cased and split with ``word_tokenize``; tokens found in
    the module-level ``stop_words`` or ``punctuation`` sets are discarded.

    Args:
        text: The headline string to process.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    kept = []
    for word in word_tokenize(text.lower()):
        # `punctuation` holds single characters, so only one-character
        # punctuation tokens are removed here.
        if word in stop_words or word in punctuation:
            continue
        kept.append(word)
    return kept

# Tokenize every headline; each entry is a list of tokens.
preprocessed_headlines = [preprocess(headline) for headline in headlines]

# Convert preprocessed_headlines to a 1-D NumPy object array (one token list per slot).
# The token lists have different lengths, so np.array(preprocessed_headlines)
# would be building a ragged array: that emits a VisibleDeprecationWarning on
# older NumPy and raises ValueError on NumPy >= 1.24. Filling a pre-allocated
# object array element by element is well-defined regardless of list lengths.
preprocessed_headlines_np = np.empty(len(preprocessed_headlines), dtype=object)
for _idx, _tokens in enumerate(preprocessed_headlines):
    preprocessed_headlines_np[_idx] = _tokens

  preprocessed_headlines_np = np.array(preprocessed_headlines)


In [15]:
# Store the token lists in the 'headline_text' column.
# NOTE: preprocessed_df is a second name for the same DataFrame object as df
# (no copy is made), so df['headline_text'] holds the token lists as well.
df['headline_text'] = preprocessed_headlines_np
preprocessed_df = df

In [9]:
# Shared Porter stemmer instance used by stemmer().
stem = PorterStemmer()
def stemmer(txt_arr):
    """Stem every token in ``txt_arr`` with the shared Porter stemmer.

    Args:
        txt_arr: An iterable of token strings.

    Returns:
        A new list with the stem of each token, in the same order.
    """
    return list(map(stem.stem, txt_arr))

# Shared WordNet lemmatizer instance used by lemma().
lem = WordNetLemmatizer()
def lemma(txt_arr, pos='n'):
    """Lemmatize every token in ``txt_arr`` with the shared WordNet lemmatizer.

    Args:
        txt_arr: An iterable of token strings.
        pos: WordNet part-of-speech code handed to ``lemmatize`` for every
            token ('n' noun, 'v' verb, 'a' adjective, 'r' adverb). Defaults to
            'n', which is what ``lemmatize`` uses when no POS is given, so
            existing single-argument calls behave exactly as before.

    Returns:
        A new list with the lemma of each token, in the same order.

    Note:
        With the default every token is treated as a noun, so verb forms are
        not reduced to their base form and some come out mangled (the
        notebook's own output contains "dy", presumably from "dies").
        Pass ``pos='v'`` to lemmatize tokens as verbs instead.
    """
    return [lem.lemmatize(w, pos=pos) for w in txt_arr]

In [None]:
# NLTK data packages requested by this notebook: the WordNet corpus, the
# 'omw-1.4' package, and the averaged-perceptron POS tagger model.
for resource_name in ('wordnet', 'omw-1.4', 'averaged_perceptron_tagger'):
    nltk.download(resource_name)

In [24]:
# Display the preprocessed frame; the rendered output is abbreviated for long frames.
preprocessed_df

Unnamed: 0,publish_date,headline_text
157663,20050415,"[govt, urged, release, sustainability, grant]"
53129,20031104,"[whatmore, name, side, one, dayers]"
946324,20150422,"[driverless, car, adelaide, trial, closer, acc..."
1062810,20161216,"[star, war, quiz, test, knowledge, galaxy, far..."
210891,20060106,"[student, success, ascribed, support, network]"
...,...,...
1082965,20170523,"[former, world, champion, nicky, hayden, dy]"
465365,20090528,"[swine, flu, ship, quarantined]"
987189,20151021,"[disgraced, former, png, police, chief, geoffr..."
871078,20140501,"[diesel, rebate, cut, fear]"


In [22]:
# Lemmatize each headline's token list, write the result back, and tag the
# lemmatized tokens with parts of speech (one list of (token, tag) pairs per row).
lemmatized_tokens = preprocessed_df['headline_text'].apply(lemma)
preprocessed_df['headline_text'] = lemmatized_tokens
preprocessed_df['pos_tags'] = lemmatized_tokens.apply(nltk.pos_tag)

In [None]:
# preprocessed_df['headline_text'].apply(nltk.pos_tag)

In [29]:
# Preview the first rows of preprocessed_df (token lists and, once tagged, pos_tags).
preprocessed_df.head()

Unnamed: 0,publish_date,headline_text,pos_tags
157663,20050415,"[govt, urged, release, sustainability, grant]","[(govt, NN), (urged, VBD), (release, NN), (sus..."
53129,20031104,"[whatmore, name, side, one, dayers]","[(whatmore, NN), (name, NN), (side, NN), (one,..."
946324,20150422,"[driverless, car, adelaide, trial, closer, acc...","[(driverless, NN), (car, NN), (adelaide, IN), ..."
1062810,20161216,"[star, war, quiz, test, knowledge, galaxy, far...","[(star, JJ), (war, NN), (quiz, JJ), (test, NN)..."
210891,20060106,"[student, success, ascribed, support, network]","[(student, NN), (success, NN), (ascribed, VBD)..."


Feature Extraction/Vectorization

In [32]:
# TF-IDF vectorizer with L2-normalised rows. min_df=2 ignores terms found in
# fewer than two headlines; max_df=0.3 ignores terms found in more than 30% of them.
tf = TfidfVectorizer(stop_words = 'english', use_idf = True, norm = 'l2', min_df=2, max_df=0.3)

# The vectorizer expects one string per document, so each token list is joined
# back into a space-separated string before fitting.
text_tokens_tfidf = tf.fit_transform([' '.join(x) for x in preprocessed_df['headline_text']])

# Vocabulary terms, in the same order as the columns of text_tokens_tfidf.
feature_names = tf.get_feature_names_out()

# Show the first ten rows only. Slice the sparse matrix BEFORE converting to
# dense: calling .toarray() on the full matrix would allocate a dense
# (n_headlines x n_terms) array just to throw almost all of it away.
feature_matrix = pd.DataFrame(text_tokens_tfidf[:10].toarray(), columns = feature_names)

feature_matrix

Unnamed: 0,000,01,02,03,06,07,0702,09,10,100,...,zoe,zombie,zone,zoning,zoo,zookeeper,zoom,zuckerberg,zuma,zurich
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


K-Means Clustering

In [33]:
# Elbow method: fit K-means for k = 1..20 and record each model's inertia
# (within-cluster sum of squares) so the curve can be inspected for a bend.
# NOTE: this performs 20 full K-means fits; the recorded run of this cell
# ended in a KeyboardInterrupt.
cluster_counts = range(1, 21)
wcss = []
for i in cluster_counts:
    kmeans = KMeans(n_clusters=i, random_state=0).fit(text_tokens_tfidf)
    wcss.append(kmeans.inertia_)

plt.plot(cluster_counts, wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
#plt.savefig('elbow.png')
plt.show()

KeyboardInterrupt: 

Exception ignored in: 'sklearn.cluster._k_means_common._relocate_empty_clusters_sparse'
Traceback (most recent call last):
  File "<__array_function__ internals>", line 2, in where
KeyboardInterrupt: 


Hierarchical Clustering

In [None]:
# Fit K-means with a fixed number of clusters and list the highest-weighted
# vocabulary terms of each cluster centre.
num_clusters = 10       # number of clusters to fit and report
top_terms_shown = 50    # how many terms to print per cluster

# random_state is fixed (same seed as the elbow-method fits) so the cluster
# assignment, and therefore the printed terms, are the same on every run.
kMean = KMeans(n_clusters=num_clusters, random_state=0)
kMean.fit(text_tokens_tfidf)

print("Top terms per cluster:")
# argsort sorts ascending; reversing the columns puts the indices of the
# largest centre weights first in every row.
common = kMean.cluster_centers_.argsort()[:, ::-1]
print(common)
terms = tf.get_feature_names_out()
for i in range(num_clusters):
    print("Cluster:", i)
    for ind in common[i, :top_terms_shown]:
        print(terms[ind])
    print('\n')