# Bengaluru News Headlines, from 2001 to 2020 : Clustering

## 1. Initial setup

In [1]:
import time

In [2]:
# Total Notebook Run: Start Time
total_time_start = time.time()

### 1.1. Importing required packages and variables

In [3]:
import os

import random

import numpy as np
import pandas as pd

import spacy

from sklearn.feature_extraction import text

from wordcloud import WordCloud
import matplotlib.pyplot as plt

import plotly.express as px

In [4]:
# Google Colab Essentials

#from google.colab import drive
#drive.mount("/content/drive")

In [5]:
# Loading environment variables
%load_ext dotenv
%dotenv

### 1.2. Code runtime measurement

In [6]:
def compute_time_difference(time_start, time_end):
    """
    Function compute_time_difference: To compute time difference between the two provided timestamps
    @param time_start: Start time
    @param time_end: End time
    @return: time difference string
    """
    
    # Computing time difference
    time_diff = time_end - time_start
    
    # Initializing time string to store time in seconds
    time_str = str(round(time_diff, 4))+" seconds"
    
    # Checking if the seconds value amounts to more than a minute
    if time_diff > 60:
        time_str = str(round(time_diff/60, 4))+" minutes"
        
    # Returning time difference string
    return time_str

### 1.3. Loading NLP model

In [7]:
time_start = time.time()

In [8]:
# Spacy essentials

# Run below commands in the Anaconda prompt for the first time setup
#python -m spacy download en_core_web_md
#python -m spacy link en_core_web_md en_md

# Spacy English language model of medium size
nlp_en = spacy.load("en_md")
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

In [9]:
time_end = time.time()
print("Time taken to load the spacy model and stopwords into memory: "+compute_time_difference(time_start, time_end))

Time taken to load the spacy model and stopwords into memory: 3.0618 seconds


## 2. Data Analysis and Clustering

### 2.1. Obtaining dataframes from csv files

In [10]:
# CSV file import from local path
news_df = pd.read_csv(os.environ["FILE_PATH"])

In [11]:
# Google drive import for Google Colab
#news_df = pd.read_csv("/content/drive/MyDrive/<rel_path>")

### 2.2. Filtering subset of data and sanity

In [12]:
bng_news_df = news_df.loc[news_df["headline_category"]=="city.bengaluru"]

In [13]:
print("Data dimensions:")
bng_news_df.shape

Data dimensions:


(91857, 3)

In [14]:
print("Data types:")
bng_news_df.dtypes

Data types:


publish_date          int64
headline_category    object
headline_text        object
dtype: object

In [15]:
print("First few entries:")
bng_news_df.head()

First few entries:


Unnamed: 0,publish_date,headline_category,headline_text
274,20010104,city.bengaluru,Three in race for chief secy's post
278,20010104,city.bengaluru,He's not so inscrutable
4428,20010518,city.bengaluru,Don't take that biscuit; you dope
4429,20010518,city.bengaluru,'I've done my bit when it comes to preparing'
4436,20010518,city.bengaluru,He is etched in the chapters of Bangalore


In [16]:
print("Last few entries:")
bng_news_df.tail()

Last few entries:


Unnamed: 0,publish_date,headline_category,headline_text
3296799,20200630,city.bengaluru,Covid-19: Bengaluru's 738 fresh cases take Kar...
3296812,20200630,city.bengaluru,Now; apartments in Bengaluru offer their facil...
3296823,20200630,city.bengaluru,Karnataka: Many bristle over cap on online cla...
3297158,20200630,city.bengaluru,what bengaluru can do to tackle covid surge
3297161,20200630,city.bengaluru,karnataka may adopt keralas triple lockdown plan


In [17]:
print("Columnar statistics:")
bng_news_df.describe(include="all")

Columnar statistics:


Unnamed: 0,publish_date,headline_category,headline_text
count,91857.0,91857,91857
unique,,1,91189
top,,city.bengaluru,Kid falls from school's compound wall; dies
freq,,91857,5
mean,20109460.0,,
std,58067.29,,
min,20010100.0,,
25%,20060730.0,,
50%,20120110.0,,
75%,20160820.0,,


In [18]:
print("Checking for null values:")
bng_news_df.isnull().sum()

Checking for null values:


publish_date         0
headline_category    0
headline_text        0
dtype: int64

### 2.3. Data Clustering

### 2.3.2. Getting sentence vectors after preprocessing

In [19]:
def is_token_allowed(token):
    """
    Only allow valid tokens which are not stop words
    and punctuation symbols.
    """
    if (not token or not token.string.strip() or
        token.is_stop or token.is_punct):
        return False
    return True

def preprocess_token(token):
    """
    Reduce token to its lowercase lemma form
    """
    return token.lemma_.strip().lower()

In [20]:
# 10K Samples List: Headlines that constitutes bag of words
# Sampling is done for faster code execution
random.seed(56)
bng_news_text = random.sample(bng_news_df["headline_text"].tolist(), k=10000)

In [21]:
time_start = time.time()

In [22]:
sent_vecs = {}
for doc in nlp_en.pipe(bng_news_text,n_threads=16,batch_size=1024):
    tokens = [preprocess_token(token) for token in doc if is_token_allowed(token)]
    clean_doc = nlp_en(" ".join([x for x in tokens]))
    sent_vecs.update({doc: clean_doc.vector})

In [23]:
time_end = time.time()
print("Time taken to get all sentence vectors after preprocessing: "+compute_time_difference(time_start, time_end))

Time taken to get all sentence vectors after preprocessing: 1.0252 minutes


In [24]:
print("Headline and Embedding size samples:")
random.seed(0)
random.sample(list({(k,len(v)) for k,v in sent_vecs.items()}), k=4)

Headline and Embedding size samples:


[(Double murder in Devanahalli, 300),
 (Bangalore Utsav turns the spotlight on local artistes and art forms, 300),
 (Ex-cricketers' club on BBMP ground cleared, 300),
 (Kids all set to have fun judging movies, 300)]

In [25]:
print("Embedding array sample:")
random.seed(0)
random.sample(list(sent_vecs.items()), k=1)

Embedding array sample:


[(Certified corruption,
  array([-5.45729995e-01, -3.04504991e-01,  5.59735000e-01,  9.71225053e-02,
          4.43449974e-01, -8.58057514e-02,  2.44004011e-01, -4.06399012e-01,
         -9.47499946e-02,  2.05749989e+00,  8.76154974e-02, -3.37065011e-01,
         -9.69850048e-02, -2.96199918e-02, -7.74525046e-01, -3.78271997e-01,
          2.97470003e-01,  9.54682469e-01, -7.75599927e-02,  6.71000034e-02,
          1.79998502e-01, -2.30628997e-01,  2.45200023e-02, -7.65900016e-02,
          1.27789974e-01, -5.11240005e-01, -3.44124496e-01,  2.05085009e-01,
          2.45322496e-01, -9.05299932e-02,  9.51260030e-02,  2.50894994e-01,
         -9.43187550e-02, -4.76950034e-02, -3.16567004e-01,  3.56040001e-01,
         -3.88920486e-01,  4.23260003e-01, -9.49724540e-02, -4.92190003e-01,
          5.01099974e-02,  3.02891523e-01,  1.33179992e-01, -1.55169994e-01,
         -3.13044995e-01,  1.17324993e-01,  2.97836989e-01,  4.46539998e-01,
         -5.18599972e-02,  8.74600038e-02,  1.323480

In [None]:
sentences = list(se)

In [26]:
# Total Notebook Run: End Time
total_time_end = time.time()
print("Total time taken to run the entire notebook: "+compute_time_difference(total_time_start, total_time_end))

Total time taken to run the entire notebook: 1.16 minutes
