In [1]:
import os               # import os to walk file path
import pandas as pd     # import pandas for dataframes
import numpy as np      
import json

In [24]:
# Relative path
path_to_data = 'rssDevData/'

# Names of 5 blog categories to import 
blogs = ['DavidWalsh','DeveloperDotCom','DZone','GeeksForGeeks','SCAND','SDTimes']

# Identify key to blog text in JSON file
key = 'text'

# Initialize empty data frame
text_data = pd.DataFrame()

#Initialize counter to keep track of files that fail
files_not_read = 0

category_sizes = [] # To store number of blog articles by category
labels = [] # To store "true" labels ->  [0, 1, 2, 3, 4]
label = 0

for blog in blogs :
    current_blog_category_count = 0
    
    path_to_blogs = path_to_data + blog 
    
    for root, dir, files in os.walk(path_to_blogs) :
        # get list of only json files
        json_files = [pos_json for pos_json in files if pos_json.endswith('.json')]

        for j in json_files :
            with open(root + '/' + j, 'r') as f:
                try:
                    data = json.load(f)
                    if len(data[key]) > 200 :
                        try : # fails when there is more than one json object in the file 
                            text_data = pd.concat([text_data, pd.DataFrame(data, index=[0])], ignore_index=True) 
                            current_blog_category_count += 1 
                            labels.append(label) 
                        except : 
                            files_not_read += 1
                except:
                    print(j)
print('Files not read in: ', str(files_not_read))
print('Files read in: ', str(len(text_data)))
print(text_data.head())

5-web-design-trends-.json
locate-empty-directo.json
overview-of-spiral-s.json
13-code-quality-metr.json
can-value-stream-man.json
challenges-and-check.json
competition-of-the-m.json
connect-memphis-as-a.json
correlations-made-ea.json
creating-crap-faster.json
Files not read in:  0
Files read in:  1050
                                     header_title  \
0   Convert Fahrenheit to Celsius with JavaScript   
1     Create a Thumbnail From a Video with ffmpeg   
2                      CSS ::file-selector-button   
3                Customizing HTML Form Validation   
4  Detect Browser Bars Visibility with JavaScript   

                              date  \
0  Wed, 26 Oct 2022 10:19:49 +0000   
1  Tue, 25 Oct 2022 09:28:58 +0000   
2  Mon, 20 Feb 2023 09:50:57 +0000   
3  Mon, 09 Jan 2023 10:57:00 +0000   
4  Fri, 30 Dec 2022 01:36:35 +0000   

                                                text  \
0  The United States is one of the last bodies th...   
1  Creating a thumbnail to represent 

In [25]:
text_data.head()

Unnamed: 0,header_title,date,text,summary,url
0,Convert Fahrenheit to Celsius with JavaScript,"Wed, 26 Oct 2022 10:19:49 +0000",The United States is one of the last bodies th...,The article discusses the importance of provid...,https://davidwalsh.name/fahrenheit-celsius-jav...
1,Create a Thumbnail From a Video with ffmpeg,"Tue, 25 Oct 2022 09:28:58 +0000",Creating a thumbnail to represent a video is a...,The article discusses several topics related t...,https://davidwalsh.name/create-thumbnail-ffmpeg
2,CSS ::file-selector-button,"Mon, 20 Feb 2023 09:50:57 +0000",We all love beautifully styled form controls b...,This article discusses various web development...,https://davidwalsh.name/css-file-selector-button
3,Customizing HTML Form Validation,"Mon, 09 Jan 2023 10:57:00 +0000",Form validation has always been my least favor...,The article discusses how to control native fo...,https://davidwalsh.name/html5-validation
4,Detect Browser Bars Visibility with JavaScript,"Fri, 30 Dec 2022 01:36:35 +0000",It's one thing to know about what's in the bro...,The article discusses how developers can gain ...,https://davidwalsh.name/detect-browser-bars


## Preprocessing

In [26]:
from sklearn.feature_extraction.text import CountVectorizer

**`max_df`**` : float in range [0.0, 1.0] or int, default=1.0`<br>
When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words). If float, the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None.

**`min_df`**` : float in range [0.0, 1.0] or int, default=1`<br>
When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float, the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None.

In [27]:
cv = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')

In [28]:
dtm = cv.fit_transform(text_data['text'])

In [29]:
dtm

<1050x11620 sparse matrix of type '<class 'numpy.int64'>'
	with 250434 stored elements in Compressed Sparse Row format>

## LDA :Latent Dirichlet Allocation

In [30]:
from sklearn.decomposition import LatentDirichletAllocation

In [31]:
LDA = LatentDirichletAllocation(n_components=7,random_state=42)

In [32]:
LDA.fit(dtm)

## Showing Stored Words

In [33]:
len(cv.get_feature_names_out())

11620

In [34]:
len(LDA.components_)

7

In [35]:
for index,topic in enumerate(LDA.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([cv.get_feature_names_out()[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['management', 'technical', 'help', 'interview', 'important', 'round', 'software', 'teams', 'development', 'questions', 'project', 'time', 'product', 'work', 'team']


THE TOP 15 WORDS FOR TOPIC #1
['tools', 'cloud', 'code', 'ensure', 'process', 'user', 'access', 'data', 'development', 'tests', 'application', 'software', 'security', 'test', 'testing']


THE TOP 15 WORDS FOR TOPIC #2
['service', 'web', 'applications', 'code', 'used', 'file', 'need', 'dzone', 'aws', 'create', 'database', 'application', 'using', 'use', 'data']


THE TOP 15 WORDS FOR TOPIC #3
['information', 'consumer', 'http', 'service', 'client', 'services', 'income', 'public', 'private', 'used', 'server', 'azure', 'goods', 'key', 'message']


THE TOP 15 WORDS FOR TOPIC #4
['function', 'string', 'approach', 'time', 'elements', 'number', 'output', 'value', 'learning', 'code', 'use', 'example', 'array', 'using', 'used']


THE TOP 15 WORDS FOR TOPIC #5
['platform', 'technology', 'systems', 'lik

### Attaching Discovered Topic Labels to Original Articles

In [36]:
topic_results = LDA.transform(dtm)

In [37]:
topic_results.shape

(1050, 7)

### Combining with Original Data

In [38]:
text_data['Topic'] = topic_results.argmax(axis=1)

In [41]:
text_data.tail(20)

Unnamed: 0,header_title,date,text,summary,url,Topic
1030,Speed – and other stuff – drives the need for ...,"Wed, 22 Feb 2023 04:42:41 +0000",It started with working from home. That’s what...,,https://sdtimes.com/ai-testing/speed-and-other...,5
1031,Talend Winter ‘23 release introduces cloud mig...,"Tue, 28 Feb 2023 19:50:18 +0000",Data integration company Talend has announced ...,,https://sdtimes.com/data/talend-winter-23-rele...,5
1032,Temporal Investors Expand Funding with $75M Round,"Tue, 28 Feb 2023 20:51:35 +0000","Temporal, the leader in durable execution syst...",,https://sdtimes.com/temporal-investors-expand-...,5
1033,The first developer preview of Android 14 now ...,"Wed, 08 Feb 2023 19:26:28 +0000",The Android development team today announced t...,,https://sdtimes.com/software-development/the-f...,6
1034,The latest in generative AI: OpenAI releases A...,"Wed, 08 Mar 2023 19:11:44 +0000","ChatGPT, and other generative AIs, have contin...",,https://sdtimes.com/ai/the-latest-in-generativ...,6
1035,The Layers and Phases of Effective Digital Tra...,"Mon, 06 Feb 2023 20:18:30 +0000","Technology is a tool, not a strategy. When a c...",,https://sdtimes.com/cloud/the-layers-and-phase...,5
1036,The open source projects Grafana Phlare and Py...,"Wed, 15 Mar 2023 15:21:22 +0000","Grafana Labs, the company behind the worldâs...",,https://sdtimes.com/the-open-source-projects-g...,5
1037,The recipients of the 2022 Free Software Award...,"Mon, 20 Mar 2023 20:11:35 +0000","Eli Zaretskii, Tad (SkewedZepplin), and GNU Ja...",,https://sdtimes.com/software-development/the-r...,6
1038,Time to hide your API,"Wed, 01 Feb 2023 16:31:35 +0000",The need for robust API security is growing ra...,,https://sdtimes.com/security/time-to-hide-your...,5
1039,Tricentis extends Testim platform to mobile de...,"Fri, 17 Feb 2023 17:30:59 +0000",Tricentis is attempting to meet the growing de...,,https://sdtimes.com/test/tricentis-extends-tes...,1
