In [1]:
#importing the required libraries
# for pre-processing
from pyspark.sql.functions import regexp_replace, col, row_number
from pyspark.sql.window import Window

#for time evaluation
import time #changes unit - check why

In [2]:
# for running spark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql import SQLContext

In [3]:
#for TF-IDF vectorization
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
#create spark session

start = time.time()

spark = SparkSession.builder \
            .appName("Topic Modeling")\
            .config("spark.driver.extraClassPath")\
            .getOrCreate()
sc = spark.sparkContext
sc.setLogLevel('WARN')

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/09 09:35:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# reading the JSON file
news_df = spark.read.json('News_Category_Dataset_v3.json') #reading json data

# defining the categories that need to be preserved
keep_categories = ['TECH', 'SPORTS', 'HEALTHY LIVING', 'STYLE', 'ENVIRONMENT']

                                                                                

In [6]:
#preserving the selected categories
news_df = news_df.filter((news_df.category).isin(keep_categories))

In [7]:
#extracting 2000 headlines from each category 
windowDept = Window.partitionBy("category").orderBy(col("date").desc())
df2=news_df.withColumn("row",row_number().over(windowDept))
news_df=df2.filter(col("row") <= 2000)
news_df.show()

+--------------------+-----------+----------+--------------------+--------------------+--------------------+---+
|             authors|   category|      date|            headline|                link|   short_description|row|
+--------------------+-----------+----------+--------------------+--------------------+--------------------+---+
|    Drew Costley, AP|ENVIRONMENT|2022-09-18|First Public Glob...|https://www.huffp...|On Monday, the wo...|  1|
|BECKY BOHRER, MAR...|ENVIRONMENT|2022-09-17|Alaska Prepares F...|https://www.huffp...|“In 10 years, peo...|  2|
|     DÁNICA COTO, AP|ENVIRONMENT|2022-09-17|Puerto Rico Brace...|https://www.huffp...|Puerto Rico was u...|  3|
|   Nathalie Baptiste|ENVIRONMENT|2022-09-17|Privatization Isn...|https://www.huffp...|Studies have repe...|  4|
|JULIE WATSON and ...|ENVIRONMENT|2022-09-10|Severe Winds Batt...|https://www.huffp...|After a 10-day he...|  5|
|TERRY CHEA and OL...|ENVIRONMENT|2022-09-01|Toxic Algae Cause...|https://www.huffp...|The unpre

In [8]:
news_df.show(5)

+--------------------+-----------+----------+--------------------+--------------------+--------------------+---+
|             authors|   category|      date|            headline|                link|   short_description|row|
+--------------------+-----------+----------+--------------------+--------------------+--------------------+---+
|    Drew Costley, AP|ENVIRONMENT|2022-09-18|First Public Glob...|https://www.huffp...|On Monday, the wo...|  1|
|BECKY BOHRER, MAR...|ENVIRONMENT|2022-09-17|Alaska Prepares F...|https://www.huffp...|“In 10 years, peo...|  2|
|     DÁNICA COTO, AP|ENVIRONMENT|2022-09-17|Puerto Rico Brace...|https://www.huffp...|Puerto Rico was u...|  3|
|   Nathalie Baptiste|ENVIRONMENT|2022-09-17|Privatization Isn...|https://www.huffp...|Studies have repe...|  4|
|JULIE WATSON and ...|ENVIRONMENT|2022-09-10|Severe Winds Batt...|https://www.huffp...|After a 10-day he...|  5|
+--------------------+-----------+----------+--------------------+--------------------+---------

In [9]:
# view the topic distribution
news_df.groupBy("category").count().show()

+--------------+-----+
|      category|count|
+--------------+-----+
|   ENVIRONMENT| 1444|
|HEALTHY LIVING| 2000|
|        SPORTS| 2000|
|         STYLE| 2000|
|          TECH| 2000|
+--------------+-----+



In [10]:
#selecting the headline column and creating documents dataframe
col_name = 'headline'
documents = news_df.select(col_name).distinct()

In [11]:
# checking the number of documents
print(documents.count())

9321


In [12]:
#cleaning the strings using regex
#removing the special characters
reg_exp1 = '[^A-Za-z0-9 ]'
reg1 = regexp_replace(col(col_name), reg_exp1,"")

documents1 = documents.select(col_name, \
    reg1.alias('clean_text'), \
    )

In [13]:
#removing the words that are at the end and begining of some articles and don't contribute to the content
reg_exp2 = ' PHOTOS?| VIDEOS?| WATCH| new| New| NEW'
reg2 = regexp_replace(col("clean_text"), reg_exp2,"")

documents1 = documents1.select(col_name, \
    reg2.alias("clean_text1"), \
    )


In [14]:
#time for pre-processing the data
elapsed_time1 = (time.time() - start)
print('time for pre-processing the data: {}ms'.format(elapsed_time1))

time for pre-processing the data: 7.737543106079102ms


In [15]:
#converting data to pandas to perform tf-idf
headlines = documents1.select('clean_text1').toPandas()

In [16]:
# since using tfidf, there isnt much difference in the results if stopwords are removed
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=10000,
                                stop_words='english')
tf = tf_vectorizer.fit_transform(headlines['clean_text1'])

vocab = tf_vectorizer.vocabulary_ #vocabulary generated by CountVectorizer
#print(vocab)


In [17]:
# obtain document representation form the count vectoriser sparse matrix
docs = []
for row in tf.toarray():
    words = np.where(row != 0)[0].tolist()
    words_count = []
    for idx in words:
        for count in range(row[idx]):
            words_count.append(idx)
    docs.append(words_count) # generating 
    
# display(docs)

In [18]:
#time for pre-processing the data - TF-IDF
elapsed_time2 = (time.time() - elapsed_time1)
print('time for pre-processing the data - TF-IDF: {}ms'.format(elapsed_time2))

time for pre-processing the data - TF-IDF: 1670596517.072829ms


In [25]:
# the Latent Directlet Allocation function

# parameters - 
# docs = list of documents
# vocab = the generated vocabulary
# T = number of Topics
# m = number of Topic words
# itr_num = number of times the algorithm runs

def LDA(docs, vocab, T, m, itr_num):
    start_time = time.time()
    
    N = len(docs)        # number of documents
    V = len(vocab)  # size of the vocabulary 
    
    
    alpha = 1 / T         # Dirichlet prior for per-document topic distributions
    beta = 1 / T        #Dirichlet prior for per-topic word distribution
    
    #innitializing the matrices
    z_dn = [[0 for _ in range(len(d))] for d in docs]
    t_dz = np.zeros((N, T))        # the theta matrix
    p_zw = np.zeros((T, V))      # the phi matrix
    nd = np.zeros((N))            # document array
    nz = np.zeros((T))            # topics array


    ## Initializing the parameters
    for d, doc in enumerate(docs): #first pass over the corpus
        # n = word id for the particular document
        # w = global word id

        for n, w in enumerate(doc):
#             print(d,'----',doc)
            # assigning a topic randomly to words
            z_dn[d][n] = n % T 
            # getting the topic for word n in document d
            z = z_dn[d][n]

            # incermenting counts
            nz[z] += 1
            nd[d] += 1
            t_dz[d][z] += 1
        
            p_zw[z, w] += 1
            
    for iteration in range(itr_num):

        for d, doc in enumerate(docs):

            for n, w in enumerate(doc):
                # topic for word n in document d
                z = z_dn[d][n]

                # decrementing counts for word w with associated topic z
                p_zw[z, w] -= 1
                t_dz[d][z] -= 1
                nz[z] -= 1

                # sample new topic from according to the formula
                p_d_t = (t_dz[d] + alpha) / (nd[d] - 1 + T * alpha)
                p_t_w = (p_zw[:, w] + beta) / (nz + V * beta)
                p_z = p_d_t * p_t_w
                p_z /= np.sum(p_z)
                new_z = np.random.multinomial(1, p_z).argmax()

                # setting z as the new topic and increment counts
                z_dn[d][n] = new_z
                t_dz[d][new_z] += 1
                p_zw[new_z, w] += 1
                nz[new_z] += 1

                
    #genarating and print the topic words
    vocab_words = {value: key for key, value in vocab.items()} #vocab generated by countVectorizer is a dictionary
    
    for idx, topic in enumerate(p_zw):
        topics = "Topic #"+ str(idx) +": "
        topics += " ".join([vocab_words[i] for i in topic.argsort()[:-m - 1:-1]])
        print(topics)
        
    #time for pre-processing the data - TF-IDF
    elapsed_time = (time.time() - start_time)
    print('')
    print('Time taken to run LDA for {} topics and {} topic words for {} iterations: {}s'\
          .format(T, m, itr_num, elapsed_time))

In [26]:
LDA(docs, vocab, 3, 10, 10)

Topic #0: week apple photos iphone best fashion people 10 things day
Topic #1: health facebook climate change just list cancer says red finds
Topic #2: olympic olympics team winter gold make game rio time win

Time taken to run LDA for 3 topics and 10 topic words for 10 iterations: 13.03121304512024s


In [27]:
LDA(docs, vocab, 5, 10, 10)

Topic #0: week best photos animal day fashion things world 10 black
Topic #1: climate change just health list gold olympic wins best beauty
Topic #2: facebook trump climate like health world james change just fight
Topic #3: nfl know heres help player red olympic star want style
Topic #4: apple iphone people just olympics instagram 10 says week twitter

Time taken to run LDA for 5 topics and 10 topic words for 10 iterations: 12.929238080978394s


In [28]:
LDA(docs, vocab, 5, 5, 10)

Topic #0: apple week photos million says
Topic #1: climate change week apple iphone
Topic #2: just best beauty fashion list
Topic #3: health people facebook make women
Topic #4: olympics olympic team rio gold

Time taken to run LDA for 5 topics and 5 topic words for 10 iterations: 12.982478141784668s


In [29]:
LDA(docs, vocab, 10, 5, 10)

Topic #0: week red carpet 11 fashion
Topic #1: change climate day best heres
Topic #2: climate make beauty change big
Topic #3: hair dont health people cancer
Topic #4: just health people facebook mental
Topic #5: apple iphone olympics facebook week
Topic #6: photos olympic week animal nfl
Topic #7: olympic team gold world zika
Topic #8: game world says nba water
Topic #9: world video twitter dead trumps

Time taken to run LDA for 10 topics and 5 topic words for 10 iterations: 12.936904191970825s


In [30]:
LDA(docs, vocab, 3, 10, 50)

Topic #0: week best fashion like photos day make look beauty things
Topic #1: health olympic climate olympics change team gold says cancer rio
Topic #2: apple facebook google iphone game twitter nfl just trump nba

Time taken to run LDA for 3 topics and 10 topic words for 50 iterations: 64.3887619972229s


In [31]:
LDA(docs, vocab, 5, 10, 50)

Topic #0: best week fashion photos beauty style looks dress like red
Topic #1: apple climate iphone change facebook says google super week nfl
Topic #2: olympic olympics team gold rio win winter wins just nba
Topic #3: health people dont trump care know need things life cancer
Topic #4: world day video game google national future nfl major hurricane

Time taken to run LDA for 5 topics and 10 topic words for 50 iterations: 65.23653292655945s


In [32]:
LDA(docs, vocab, 3, 10, 5)

Topic #0: week photos 10 best like beauty just things animal looks
Topic #1: apple facebook climate iphone just change olympic heres says team
Topic #2: fashion google olympics olympic just people time health world best

Time taken to run LDA for 3 topics and 10 topic words for 5 iterations: 6.5506370067596436s


In [33]:
LDA(docs, vocab, 5, 10, 5)

Topic #0: 10 best apple beauty facebook week care health just day
Topic #1: apple week iphone climate just change cancer best photos olympic
Topic #2: world team fashion olympic week trump says people like google
Topic #3: facebook says just like make look time twitter trump hair
Topic #4: olympics people watch winter player health tech williams nfl online

Time taken to run LDA for 5 topics and 10 topic words for 5 iterations: 6.560553073883057s
