In [476]:
import pandas as pd
import numpy as np

import os, json

## Read RSS data

In [477]:
# List the names of the JSON files stored in the RSS data directory
path_to_json = 'rssData'
json_files = list(
    filter(lambda entry_name: entry_name.endswith('.json'), os.listdir(path_to_json))
)

In [478]:
# Load every JSON file in the data directory into its own DataFrame
base_dir = 'rssData'

# One DataFrame per JSON file, in the order os.listdir returns the names
data_list = []
for file in os.listdir(base_dir):

    # Select files by their '.json' extension (a plain substring test would
    # also pick up unrelated names such as 'json_notes.txt'), then build the
    # file's full path and parse it
    if file.endswith('.json'):
        json_path = os.path.join(base_dir, file)
        # lines=True: the file is read as one JSON object per line
        json_data = pd.read_json(json_path, lines=True)
        data_list.append(json_data)

In [479]:
len(data_list)

198

In [480]:
data_list[0].text

0    
Name: text, dtype: object

In [481]:
data_list[0].text.values

array([''], dtype=object)

In [482]:
# Collect the texts of every loaded frame, dropping empty strings.
# Each kept entry of all_text is the list of non-empty texts from one frame,
# so all_text[i][0] is the first text of the i-th kept frame.
all_text = []
for frame in data_list:
    # Filter text by text: truth-testing the whole comparison array
    # (`if frame.text.values != ''`) raises ValueError as soon as a frame
    # holds more than one row.
    texts = [text for text in frame.text if text != '']
    if texts:
        all_text.append(texts)

In [483]:
len(all_text)

147

In [484]:
# all_text_df['lenght'] = all_text_df.apply(lambda row:len(row),axis=1)

In [485]:
# all_text_df.head()

## Implement NLP techniques

In [486]:
import spacy

In [487]:
# Load the 'en_core_web_sm' spaCy pipeline once; cleanData and Token below both call this shared `nlp` object
nlp = spacy.load('en_core_web_sm')

### Preprocessing and cleaning

In [488]:
# Text cleaning helper: lower-case, drop stop words and punctuation, lemmatize
def cleanData(doc):
    """Normalize raw text into a space-separated string of lemmas.

    The text is lower-cased, parsed with the module-level spaCy pipeline
    ``nlp``, stripped of stop-word and punctuation tokens, and each remaining
    token is replaced by its lemma.

    Parameters
    ----------
    doc : str
        Raw text to clean.

    Returns
    -------
    str
        Lemmas of the kept tokens, joined by single spaces.
    """
    # Lower-casing happens before parsing, so the pipeline only ever sees
    # lower-case text and no separate per-token lower-casing pass is needed.
    parsed = nlp(doc.lower())
    lemmas = [
        token.lemma_
        for token in parsed
        if not token.is_stop and not token.is_punct
    ]
    return " ".join(lemmas)

In [489]:
# all_text[0][0]

In [490]:
# Spot check: take the first text of the first kept entry and look at its length in characters
doc1= all_text[0][0]
len(doc1)
# print(doc1)

10348

In [491]:
# Clean the sample document; comparing this length with len(doc1) shows how much the cleaning step shortens the text
cleaned_doc1 =cleanData(doc1)
len(cleaned_doc1)

6580

In [492]:
# Put the collected texts into a single-column DataFrame.
# NOTE(review): this assumes every entry of all_text holds exactly one text — confirm against the data.
all_text_df= pd.DataFrame(all_text, columns=['text'])
all_text_df.head()

Unnamed: 0,text
0,"The 911 service as it existed until July 28, 2..."
1,"DanielMiessler Created/Updated: July 25, 2022 ..."
2,The 911 service as it exists today.For the pas...
3,DanielMiesslerMy first thought on the whole di...
4,"DanielMiesslerWell, our congressional heroes f..."


In [493]:
# Sanity check: clean the first row's text taken from the DataFrame and look at the resulting length
cl = cleanData(all_text_df['text'][0])
len(cl)

6580

In [494]:
# Clean every document; applying cleanData to the 'text' column directly avoids building a row object per document
all_text_df['clean'] = all_text_df['text'].apply(cleanData)

In [495]:
all_text_df.head()

Unnamed: 0,text,clean
0,"The 911 service as it existed until July 28, 2...",911 service exist july 28 2022.911[.]re proxy ...
1,"DanielMiessler Created/Updated: July 25, 2022 ...",danielmiessler create update july 25 2022 read...
2,The 911 service as it exists today.For the pas...,911 service exist today.for past seven year on...
3,DanielMiesslerMy first thought on the whole di...,danielmiesslermy think discussion sure musk ar...
4,"DanielMiesslerWell, our congressional heroes f...",danielmiesslerwell congressional hero finally ...


In [496]:
all_text_df['clean'][1]

'danielmiessler create update july 25 2022 reading term concept bourgeoisie come afford light grasp learn mis learn forget relearn definition time force create page remind myself.here summary language town like word start like burg”.what gpt-3 think modern bourgeois party look likehere openai gpt-3 think bourgeoisie mean term bourgeoisie refer social class consider elite society typically high level education hold position power influence characterize wealth luxurious lifestyle.gpt-3so weight elite.the word bourgeoisie describe middle upper class society people typically educate high pay job enjoy comfortable lifestyle material possessions.gpt-3this say middle upper.the word bourgeoisie describe middle class society typically country significant difference wealthy poor class bourgeois class typically see wealthy wealthy aristocracy consider educated cultured work class.gpt-3and say middle mention educated cultured workers.these gpt-3 summary interesting capture conflicting messaging ba

In [497]:
#function to Tokenization, POS, NER
# def TAG(clean):
#     token_list=[]
#     pos_list=[]
#     ner_list=[]
#     clean = nlp(clean)
#     for token in clean:
#         token_list.append(token)
#         pos_list.append(token.pos_)
#         for ent in clean.ents:
#             ner_list.append(ent.label_)
        
#     return(token_list, pos_list, ner_list)      

### Tokenization

In [498]:
# Tokenization helper
def Token(clean):
    """Parse cleaned text with the shared ``nlp`` pipeline and return its tokens.

    Parameters
    ----------
    clean : str
        Text to tokenize (the notebook passes the output of cleanData).

    Returns
    -------
    list
        The token objects obtained by iterating over the parsed text, in order.
    """
    return list(nlp(clean))

In [499]:
# Token(all_text_df['clean'][0])

In [500]:
# Tokenize every cleaned document; Series.apply stores each returned token list as one cell,
# without the result-shape inference that row-wise DataFrame.apply performs on list results
all_text_df['token'] = all_text_df['clean'].apply(Token)

In [501]:
all_text_df.head()

Unnamed: 0,text,clean,token
0,"The 911 service as it existed until July 28, 2...",911 service exist july 28 2022.911[.]re proxy ...,"[911, service, exist, july, 28, 2022.911[.]re,..."
1,"DanielMiessler Created/Updated: July 25, 2022 ...",danielmiessler create update july 25 2022 read...,"[danielmiessler, create, update, july, 25, 202..."
2,The 911 service as it exists today.For the pas...,911 service exist today.for past seven year on...,"[911, service, exist, today.for, past, seven, ..."
3,DanielMiesslerMy first thought on the whole di...,danielmiesslermy think discussion sure musk ar...,"[danielmiesslermy, think, discussion, sure, mu..."
4,"DanielMiesslerWell, our congressional heroes f...",danielmiesslerwell congressional hero finally ...,"[danielmiesslerwell, congressional, hero, fina..."
