In [45]:
# To store data
import pandas as pd

# To do linear algebra
import numpy as np

# To create models
from sklearn.decomposition import LatentDirichletAllocation
from gensim.models.ldamulticore import LdaMulticore
from gensim.models import LdaModel, CoherenceModel
from gensim import corpora

# To search directories
import os

# To use regex
import re

# To get punctuation
import string

# To parse html
from bs4 import BeautifulSoup

# To get progression bars
from tqdm import tqdm

# To measure time
from time import time

# To get simple counters
from collections import Counter

# To process natural language
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

import nltk

# To use sparse matrices
from scipy.sparse import csr_matrix

# To create plots
import matplotlib.pyplot as plt

In [46]:
# Directory that holds the NYT CSV files
curr_dir = '/home/nick/Documents/NYT/'

# Collect the headlines of ONE "Articles" file only (note the `break`).
# os.listdir returns entries in arbitrary order, so the listing is sorted to
# make the selected file the same on every run.
all_headlines = []
for filename in sorted(os.listdir(curr_dir)):
    if 'Articles' in filename:
        article_df = pd.read_csv(os.path.join(curr_dir, filename))
        all_headlines.extend(list(article_df.headline.values))
        break

# Drop entries whose headline is the literal string "Unknown"
all_headlines = [h for h in all_headlines if h != "Unknown"]
len(all_headlines)

1214

In [47]:
# Path to the data
path = '/home/nick/Documents/NYT/'

# Create file lists.
# os.listdir gives no ordering guarantee; sort once so that the `[:1]` slice
# below always selects the same comment file.
data_files = sorted(os.listdir(path))
files_comments = [os.path.join(path, file) for file in data_files if file.startswith('C')]
files_articles = [os.path.join(path, file) for file in data_files if file.startswith('A')]

# Load data: only the first comment file is read; rows without a commentBody are dropped
comments = []
for file in files_comments[:1]:
    comments.extend(pd.read_csv(file, low_memory=False)['commentBody'].dropna().values)

print('Loaded Comments: {}'.format(len(comments)))

Loaded Comments: 243832


In [48]:
# Fetch only the stopword list used below. A bare nltk.download() opens the
# interactive downloader, which stalls a non-interactive "Run All".
nltk.download('stopwords', quiet=True)

# Number of comments to use in the LDA
n = 5000

# To remove punctuation. string.punctuation contains regex metacharacters
# (backslash, ']', '^', '-'); left unescaped, the "\]" pair is parsed as an
# escaped bracket and the backslash itself is never matched.
re_punctuation = re.compile('[' + re.escape(string.punctuation) + ']')

# To tokenize the comments (raw string: '\w' is an invalid escape in a plain literal)
tokenizer = RegexpTokenizer(r'\w+')

# Get stopwords. `stop` keeps its original list type; the set is used for
# constant-time membership tests inside the loop.
stop = stopwords.words('english')
stop_set = set(stop)

# Draw a reproducible sample of distinct comments. Sampling indices rather
# than the strings avoids converting every comment into one fixed-width NumPy
# string array, and replace=False keeps a comment from being drawn twice.
rng = np.random.RandomState(42)
sample_indices = rng.choice(len(comments), size=min(n, len(comments)), replace=False)

# Iterate over the sampled comments
preprocessed_comments = []
for index in tqdm(sample_indices):
    # Remove html
    comment = BeautifulSoup(comments[index], 'lxml').get_text().lower()

    # Remove punctuation
    comment = re_punctuation.sub(' ', comment)

    # Tokenize comments
    comment = tokenizer.tokenize(comment)

    # Remove stopwords
    comment = [word for word in comment if word not in stop_set]
    preprocessed_comments.append(comment)


# Count overall word frequency
wordFrequency = Counter()
for comment in preprocessed_comments:
    wordFrequency.update(comment)
print('Unique Words In Comments: {}'.format(len(wordFrequency)))


# Remove rare words: a word is kept only if it occurs strictly MORE than
# minimumWordOccurrences times across the sampled comments.
minimumWordOccurrences = 5
texts = [[word for word in comment if wordFrequency[word] > minimumWordOccurrences] for comment in preprocessed_comments]


# Create word dictionary (token id <-> token) and the list of its tokens
dictionary = corpora.Dictionary(texts)
vocabulary = [dictionary[i] for i in dictionary.keys()]
print('Documents/Comments: {}'.format(len(texts)))


# Create corpus: one bag-of-words list of (token_id, count) pairs per comment
corpus = [dictionary.doc2bow(doc) for doc in texts]


# Create sparse matrix
def makesparse(mycorpus, ncolumns):
    """Convert a bag-of-words corpus into a document-term CSR matrix.

    Parameters
    ----------
    mycorpus : iterable
        One entry per document; each document is an iterable of pairs whose
        first item is the column (token) index and second item is the value.
    ncolumns : int
        Number of columns of the resulting matrix.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape (number of documents, ncolumns). Documents without
        entries become all-zero rows; an empty corpus yields a (0, ncolumns)
        matrix instead of failing on an unbound loop variable.
    """
    data, row, col = [], [], []
    n_docs = 0  # counted explicitly so empty corpora and generators both work
    for doc_index, doc in enumerate(mycorpus):
        n_docs = doc_index + 1
        for entry in doc:
            row.append(doc_index)
            col.append(entry[0])
            data.append(entry[1])
    # Explicit integer dtype: an empty list would otherwise become a float
    # array, which is not a valid index array.
    rows = np.array(row, dtype=np.intp)
    cols = np.array(col, dtype=np.intp)
    X = csr_matrix((np.array(data), (rows, cols)), shape=(n_docs, ncolumns))
    return X


# Build the document-term matrix: one row per document in `corpus`,
# one column per entry in `dictionary`, then report its shape.
X = makesparse(mycorpus=corpus, ncolumns=len(dictionary))
print('Train Shape:\t{}'.format(X.shape))

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


100%|██████████| 5000/5000 [00:01<00:00, 3667.35it/s]
2020-08-08 23:07:22,522 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-08-08 23:07:22,640 : INFO : built Dictionary(5101 unique tokens: ['10', '100', '3', '5', 'aging']...) from 5000 documents (total 158222 corpus positions)


Unique Words In Comments: 22035
Documents/Comments: 5000
Train Shape:	(5000, 5101)


\* Note: I got this code from https://www.kaggle.com/morrisb/compare-lda-topic-modeling-in-sklearn-and-gensim