## Objective: Use Latent Dirichlet Allocation to identify common themes in articles nominated for deletion
* Following along with a tutorial: https://markroxor.github.io/gensim/static/notebooks/lda_training_tips.html

In [1]:
from bs4 import BeautifulSoup
import boto3
import config as cfg
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models import Phrases
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from pprint import pprint
import yaml

In [2]:
%run "../libraries/aws_utils.ipynb"

In [3]:
%run "../libraries/general_utils.ipynb"

In [4]:
with open('../data_engineering/config.yml', 'r') as file:
   config_files = yaml.safe_load(file)

In [5]:
NUM_TOPICS = 5 # number of topics to fit on LDA model

## Load wiki articles

In [6]:
s3_reader = boto3.resource('s3',
                    region_name='us-east-1',
                    aws_access_key_id=cfg.aws_reader['accessCode'],
                    aws_secret_access_key=cfg.aws_reader['secretCode'])

In [7]:
article_to_afd_join_key = read_parquet_file(s3_reader, 
                                  config_files['INTEREDIARY_OUTPUT_BUCKET'], 
                      config_files['JOINED_ARTICLE_SCRAPE_DATES_AND_AFD_NAMES'],
                                          )
test_primary_key(article_to_afd_join_key, ['article_id', 'file_name'])

In [8]:
pronoun_data = read_parquet_file(s3_reader, 
                                  config_files['INTEREDIARY_OUTPUT_BUCKET'], 
                      config_files['INFERRED_GENDER_BY_PRONOUN_COUNT'],
                                          )
test_primary_key(pronoun_data, ['article_id', 'file_name'])

In [9]:
original_rows = pronoun_data.shape[0]
pronoun_data = pronoun_data.merge(article_to_afd_join_key[['article_id', 'file_name', 'afd_result', 'discussion']],
                                on = ['article_id', 'file_name'])
assert original_rows == pronoun_data.shape[0]

In [10]:
article_text = read_parquet_file(s3_reader, 
                                  config_files['INTEREDIARY_OUTPUT_BUCKET'], 
                      config_files['SCRAPED_ARTICLE_TEXT_AND_REFERENCE_TEXT'],
                                          )
test_primary_key(article_text, ['article_id', 'file_name'])

In [11]:
original_rows = pronoun_data.shape[0]
pronoun_data = pronoun_data.merge(article_text[['article_id', 'file_name', 'articles_text']],
                                on = ['article_id', 'file_name'])
assert original_rows == pronoun_data.shape[0]

In [12]:
pronoun_data[0:3]

Unnamed: 0,article_id,file_name,scraped_path,num_male_tokens,num_female_tokens,num_non_binary_tokens,num_neo_tokens,max_pronoun_column,afd_result,discussion,articles_text
0,A.S.D._Villabiagio,daily_afd_log/2023-01-03/2022_December_23.txt,individual_afd_page_html/2023-01-01/A.S.D._Vil...,1,0,0,0,male,keep,"<div class=""boilerplate afd vfd xfd-closed arc...","<!DOCTYPE html>\n<html class=""client-nojs"" lan..."
1,Aaron_Kemmer,daily_afd_log/2023-01-19/2023_January_8.txt,individual_afd_page_html/2023-01-01/Aaron_Kemm...,5,0,2,0,male,delete,"<div class=""boilerplate afd vfd xfd-closed arc...","<!DOCTYPE html>\n<html class=""client-nojs"" lan..."
2,Abbas_Sajwani,daily_afd_log/2023-01-07/2022_December_27.txt,individual_afd_page_html/2023-01-01/Abbas_Sajw...,1,0,0,0,male,delete,"<div class=""boilerplate afd vfd xfd-closed arc...","<!DOCTYPE html>\n<html class=""client-nojs"" lan..."


## Pre-process Wikipedia articles
* extract text from HTML
* remove standard Wikipedia banner messages that are not actual article text

In [13]:
pronoun_data['article_soup'] = pronoun_data['articles_text'].apply(lambda x: BeautifulSoup(x, "html.parser"))

### Extract text

In [14]:
pronoun_data['article_body'] = pronoun_data['article_soup'].apply(lambda x: x.find_all('div', 
                                                                                       class_='mw-body-content'))
pronoun_data['article_body_text'] = pronoun_data['article_body'].apply(lambda x: x[0].get_text(separator=' '))

### Remove standard Wikipedia banner messages

In [15]:
def get_afd_warning_element(soup):
    '''
    Retrieves Articles for Deletion (AFD) warning elements from a BeautifulSoup object.

    Parameters:
        soup (BeautifulSoup): The BeautifulSoup object representing the HTML page.

    Returns:
        list: A list of AFD warning elements found in the HTML page. Each element is a string.

    Description:
        This method searches for AFD warning elements within the given BeautifulSoup object. 
        An AFD warning element indicates that the article is being considered for deletion. 
        The method looks for <div> elements with the class
        'mbox-text-span' and checks if they contain the specific text 'This article is being considered for deletion'.
        If a match is found, the warning element is added to the list of found_warnings.

        Note:
            The returned warning elements may contain additional HTML tags and formatting.

    '''
    found_warnings = []
    possible_afd_warnings = soup.find_all('div', class_ = 'mbox-text-span')
    if len(possible_afd_warnings) > 0:
        for possible_afd_warning in possible_afd_warnings:
            if 'This article is being considered for deletion' in possible_afd_warning.text:
                found_warnings = found_warnings + [possible_afd_warning.get_text(separator=' ')]
    return found_warnings
    
def get_notability_warning_element(soup):
    '''
    Retrieves notability warning elements from a BeautifulSoup object.

    Parameters:
        soup (BeautifulSoup): The BeautifulSoup object representing the HTML page.

    Returns:
        list: A list of notability warning elements found in the HTML page. Each element is a string.

    Description:
        This method searches for notability warning elements within the given BeautifulSoup object. 
        It iterates over a list of classes and a list of specific warning texts. 
        For each class, it finds <div> elements with that class
        and checks if they contain any of the specified warning texts. 
        If a match is found, the warning element is added
        to the list of found_warnings.

        Note:
            The returned warning elements may contain additional HTML tags and formatting.

    '''
    classes = ['multiple-issues-text', 'mbox-text-span']
    found_warnings = []
    
    for this_class in classes:
        possible_warnings = soup.find_all('div', class_ = this_class)
        warnings = ['This article has multiple issues',
                    'deletion policy', 'notability guideline', 'nominated for deletion', 
                    'You can help Wikipedia by expanding it',
                   'This article does not cite any sources',
                   'improve this article',
                   'needs additional citations',
                   'Please help improve',
                   'link rot', 
                   'no other articles link to it',
                   'The neutrality of this article is disputed']
        for warning in warnings:
            if len(possible_warnings) > 0:
                for possible_warning in possible_warnings:
                    if warning.lower() in possible_warning.text.lower():
                        found_warnings = found_warnings + [possible_warning.get_text(separator=' ')]
    return found_warnings

In [16]:
pronoun_data['afd_warning_element'] = pronoun_data['article_soup'].apply(lambda x: get_afd_warning_element(x) )
pronoun_data['notability_warning_element'] = pronoun_data['article_soup'].apply(lambda x: get_notability_warning_element(x) )


In [17]:
pronoun_data['afd_warning_element'].value_counts().reset_index()[0:5]

Unnamed: 0,index,afd_warning_element
0,[],105
1,[This article is being considered for deletion...,2
2,[This article is being considered for deletion...,2
3,[This article is being considered for deletion...,2
4,[This article is being considered for deletion...,2


In [18]:
pronoun_data['notability_warning_element'].value_counts().reset_index()[0:5]

Unnamed: 0,index,notability_warning_element
0,[],77
1,[This article is being considered for deletion...,2
2,[The topic of this article may not meet Wikip...,2
3,[This article is being considered for deletion...,2
4,[This article is being considered for deletion...,2


In [19]:
def remove_warning(body_text, warning_text):
    '''
    Removes warning texts from the body text.

    Parameters:
        body_text (str): The original body text.
        warning_text (list): A list of warning texts to be removed from the body text.

    Returns:
        str: The modified body text with the specified warning texts removed.

    Description:
        This method removes specific warning texts from the given body text. It iterates over each warning text in the
        provided list and uses the `replace()` method to remove each occurrence of the warning text from the body text.
        The modified body text is then returned.

    Example:
        # Original body text
        body_text = "This article has multiple issues. Please help improve it. KEEP"

        # Warning texts to be removed
        warning_texts = ["This article has multiple issues", "Please help improve it."]

        # Remove warning texts
        modified_body_text = remove_warning(body_text, warning_texts)

        # Print the modified body text
        print(modified_body_text)
        # Output: " KEEP"
    '''
    try:
        for this_warning_text in warning_text:
            body_text = body_text.replace(this_warning_text, "")
        return body_text
    except:
        return body_text

In [20]:
pronoun_data['article_body_text_wo_warning'] = pronoun_data[['article_body_text', 'afd_warning_element']].apply(lambda x: remove_warning(x[0], x[1]), axis=1)

In [21]:
pronoun_data['article_body_text_wo_warning'] = pronoun_data[['article_body_text_wo_warning', 'notability_warning_element']].apply(lambda x: 
                                                                                                                remove_warning(x[0], x[1]), axis=1)

In [22]:
def remove_common_warning_messages(text_to_clean):
    '''
    Removes warning texts from the body text.

    Parameters:
        text_to_clean (str): The text to clean

    Returns:
        str: The modified body text with the specified warning texts removed.

    Description:
        This method removes specific warning texts from the given body text. It iterates over each warning text in the
        provided list and uses the `replace()` method to remove each occurrence of the warning text from the body text.
        The modified body text is then returned.
    '''
    common_warning_text = ['Learn how and when to remove this template message', 
                          'citation needed']
    for substring in common_warning_text:
        text_to_clean = text_to_clean.replace(substring, ' ')
    return text_to_clean

In [23]:
pronoun_data['article_body_text_wo_common_warning'] = pronoun_data['article_body_text_wo_warning'].apply(lambda x: remove_common_warning_messages(x))



In [24]:
pronoun_data['article_body_text_wo_warning_remove_edit'] = pronoun_data['article_body_text_wo_common_warning'].apply(
    lambda x: x.replace("[edit]","").replace("\n"," "))

## Pre-process and vectorize the documents
Among other things, we will:

Split the documents into tokens.
Lemmatize the tokens.
Compute bigrams.
Compute a bag-of-words representation of the data.
First we tokenize the text using a regular expression tokenizer from NLTK. We remove numeric tokens and tokens that are only a single character, as they don't tend to be useful, and the dataset contains a lot of them.

In [25]:
female_docs = pronoun_data[pronoun_data['max_pronoun_column']=="female"]['article_body_text_wo_warning_remove_edit'].values
male_docs = pronoun_data[pronoun_data['max_pronoun_column']=="male"]['article_body_text_wo_warning_remove_edit'].values

In [26]:
def tokenize(docs):
    # Split the documents into tokens.
    tokenizer = RegexpTokenizer(r'\w+')
    for idx in range(len(docs)):
        docs[idx] = docs[idx].lower()  # Convert to lowercase.
        docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

    # Remove numbers, but not words that contain numbers.
    docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

    # Remove words that are only one character.
    docs = [[token for token in doc if len(token) > 1] for doc in docs]
    return docs

female_docs = tokenize(female_docs)
male_docs = tokenize(male_docs)

In [27]:
female_docs[0][0:10]

['american',
 'sports',
 'journalist',
 'born',
 'abby',
 'chin',
 'born',
 'is',
 'an',
 'american']

## Lemmatize

In [28]:
# Lemmatize the documents.

def lemmatize(docs):
    # Lemmatize all words in documents.
    lemmatizer = WordNetLemmatizer()
    docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]
    return docs


female_docs = lemmatize(female_docs)
male_docs = lemmatize(male_docs)

In [29]:
female_docs[0][-10:]

['covering', 'the', 'nba', 'she', 'is', 'married', 'and', 'ha', 'two', 'child']

## Bigrams

In [30]:
def bigramize(docs):
    # Add bigrams and trigrams to docs (only ones that appear 20 times or more).
    bigram = Phrases(docs, min_count=5)
    for idx in range(len(docs)):
        for token in bigram[docs[idx]]:
            if '_' in token:
                # Token is a bigram, add to document.
                docs[idx].append(token)
    return docs
                
female_docs = bigramize(female_docs)
male_docs = bigramize(male_docs)

In [31]:
female_docs[0][-20:]

['and',
 'television',
 'before',
 'becoming',
 'sideline',
 'reporter',
 'covering',
 'the',
 'nba',
 'she',
 'is',
 'married',
 'and',
 'ha',
 'two',
 'child',
 'is_an',
 'a_well',
 'she_studied',
 'is_married']

## Remove stop words

In [32]:
# Remove rare and common tokens.
def create_dictionary(docs):
    # Create a dictionary representation of the documents.
    dictionary = Dictionary(docs)

    # Filter out words that occur less than 20 documents, or more than 50% of the documents.
    dictionary.filter_extremes(no_below=5, no_above=0.25)
    return dictionary

female_dict = create_dictionary(female_docs)
male_dict = create_dictionary(male_docs)

In [33]:
def create_corpus(dictionary, docs):
    # Bag-of-words representation of the documents.
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    return corpus

female_corpus = create_corpus(female_dict, female_docs)
male_corpus = create_corpus(male_dict, male_docs)

In [34]:
print('Number of unique tokens: %d' % len(female_dict))
print('Number of documents: %d' % len(female_corpus))

Number of unique tokens: 825
Number of documents: 113


In [35]:
print('Number of unique tokens: %d' % len(male_dict))
print('Number of documents: %d' % len(male_corpus))

Number of unique tokens: 2880
Number of documents: 321


In [36]:
def train_model(dictionary, corpus, NUM_TOPICS):
    
    # Train LDA model.
    # Set training parameters.
    chunksize = 2000
    passes = 20
    iterations = 400
    eval_every = None  # Don't evaluate model perplexity, takes too much time.

    # Make a index to word dictionary.
    temp = dictionary[0]  # This is only to "load" the dictionary.
    id2word = dictionary.id2token

    model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                           alpha='auto', eta='auto', \
                           iterations=iterations, num_topics=NUM_TOPICS, \
                           passes=passes, eval_every=eval_every)
    
    return model

In [38]:
female_model = train_model(female_dict, female_corpus, NUM_TOPICS)
male_model = train_model(male_dict, male_corpus, NUM_TOPICS)

In [39]:
def get_topics(model, corpus, NUM_TOPICS):
    top_topics = model.top_topics(corpus, topn=10)

    # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
    avg_topic_coherence = sum([t[1] for t in top_topics]) / NUM_TOPICS
    print('Average topic coherence: %.4f.' % avg_topic_coherence)
    pprint(top_topics)
    return top_topics

In [41]:
male_topic_inference = get_topics(male_model, male_corpus, NUM_TOPICS)

Average topic coherence: -1.4621.
[([(0.016238948, 'resident'),
   (0.013869418, 'her'),
   (0.010968405, 'game'),
   (0.010883464, 'him'),
   (0.008019179, 'she'),
   (0.0069982074, 'their'),
   (0.006972156, 'character'),
   (0.006567767, 'voiced'),
   (0.006071224, 'into'),
   (0.0059863324, 'but')],
  -1.0996710920521855),
 ([(0.012814283, 'film'),
   (0.012085512, 'music'),
   (0.009329081, 'song'),
   (0.008887604, 'released'),
   (0.008577433, 'album'),
   (0.008043028, 'producer'),
   (0.006873356, 'band'),
   (0.0066807517, 'artist'),
   (0.0058935923, 'actor'),
   (0.005799477, 'television')],
  -1.1273823222256376),
 ([(0.02411203, 'goal'),
   (0.017787322, 'club'),
   (0.015033065, 'league'),
   (0.011836219, 'player'),
   (0.010548769, 'against'),
   (0.009900373, 'win'),
   (0.009179929, 'season'),
   (0.008526809, 'cup'),
   (0.008425416, 'match'),
   (0.008239074, 'scored')],
  -1.3628980823800323),
 ([(0.0048934757, 'book'),
   (0.0048593073, 'director'),
   (0.0044888

In [55]:
female_topic_inference = get_topics(female_model, female_corpus, NUM_TOPICS)

Average topic coherence: -3.3244.
[([(0.04718732, 'day'),
   (0.037078783, 'week'),
   (0.027459877, 'house'),
   (0.020416556, 'not'),
   (0.02030535, 'were'),
   (0.016610358, 'nominated'),
   (0.015976118, 'nomination'),
   (0.013755698, 'no'),
   (0.01183838, 'news'),
   (0.010747429, 'will')],
  -1.4519191974010985),
 ([(0.013237526, 'production'),
   (0.013074175, 'star'),
   (0.008824809, 'music'),
   (0.008807724, 'episode'),
   (0.008431958, 'law'),
   (0.008051529, 'album'),
   (0.007973641, 'title'),
   (0.007892805, 'role'),
   (0.0073152785, 'singer'),
   (0.00694186, 'story')],
  -2.3011032217963416),
 ([(0.018396307, 'award'),
   (0.016261438, 'national'),
   (0.014673189, 'international'),
   (0.013882184, 'best'),
   (0.012874805, 'won'),
   (0.011157363, 'team'),
   (0.0077791447, 'club'),
   (0.0072798817, 'taylor'),
   (0.007257951, 'girl'),
   (0.00681258, 'goal')],
  -2.748130012159997),
 ([(0.021956863, 'award'),
   (0.021349683, 'digital'),
   (0.021242952, 'sin

## Classify documents

In [43]:
female_topics = [female_model.get_document_topics(doc) for doc in female_corpus]

In [49]:
# For each document, get the topic index with the maximum probability
most_likely_topic = [max(doc, key=lambda x: x[1])[0] for doc in female_topics]

In [53]:
def get_doc_indices(topic_index, most_likely_topic_index_list):
    """
    Retrieves the indices of elements equal to a specified topic index.

    Parameters:
        topic_index (int): The topic index to search for.
        most_likely_topic_index_list (list): A list of topic indices.

    Returns:
        list: A list of indices where the elements in most_likely_topic_index_list are equal to the specified topic_index.

    Description:
        This function takes a topic_index and a list of topic indices (most_likely_topic_index_list) as input. It searches
        through the most_likely_topic_index_list and returns a list of indices where the elements are equal to the specified
        topic_index.

    Example:
        # Define a list of topic indices
        topic_indices = [1, 3, 2, 3, 1, 3]

        # Get the indices of elements equal to 3
        indices = get_doc_indices(3, topic_indices)

        # Print the indices
        print(indices)
        # Output: [1, 3, 5]

    Note:
        The returned indices represent the positions of the elements in the most_likely_topic_index_list where the values
        are equal to the specified topic_index. The indices are 0-based.
    """
    indices = [index for index, value in enumerate(most_likely_topic_index_list) if value == topic_index]
    return indices


In [60]:
def print_matching_words(topic_inference, docs, topic_doc_indices):
    """
    Print the matching words between docs and words from topic_inference for the documents
    specified by topic_doc_indices.

    Parameters:
        topic_inference (list): List containing topic inference data.
        docs (list): List of documents.
        topic_doc_indices (list): List of indices representing the documents of interest.

    Returns:
        None

    Description:
        This function prints the docs that correspond to the topic_doc_indices. It then checks for the presence
        of matching words between the docs and words from topic_inference for each document. If a matching
        word is found, it is printed along with the statement 'FOUND'. The function adds a separator '*********' 
        between each document's output for clarity.

    Example:
        # Obtain topic_inference, docs, and topic_doc_indices

        # Call the function
        print_matching_words(topic_inference, docs, topic_doc_indices)

    Note:
        - The function assumes that the elements of topic_inference are in the format: (index, word).
        - docs and topic_inference should be appropriately populated prior to calling this function.
    """
    words = [word for _, word in topic_inference[0]]

    for doc in topic_doc_indices:
        for word in words:
            if word in docs[doc]:
                print(f'  FOUND {word}')
        print(docs[doc])
        print('*********')


### Review example output of topic 0

In [61]:
topic_0_doc_indices = get_doc_indices(0, most_likely_topic)

In [62]:
female_topic_inference[0]

([(0.04718732, 'day'),
  (0.037078783, 'week'),
  (0.027459877, 'house'),
  (0.020416556, 'not'),
  (0.02030535, 'were'),
  (0.016610358, 'nominated'),
  (0.015976118, 'nomination'),
  (0.013755698, 'no'),
  (0.01183838, 'news'),
  (0.010747429, 'will')],
 -1.4519191974010985)

In [63]:
print_matching_words(female_topic_inference[0], female_docs, topic_0_doc_indices)

['aiyshwarya', 'mahadev', 'secretary', 'all', 'india', 'mahila', 'congress', 'incumbent', 'assumed', 'office', 'march', 'aicc', 'spokesperson', 'in', 'office', 'january', 'march', 'personal', 'detail', 'born', 'december', 'age', 'bangalore', 'political', 'party', 'indian', 'national', 'congress', 'spouse', 'karthik', 'parent', 'manchanahalli', 'mahadev', 'and', 'anuradha', 'residence', 'krishnarajanagara', 'alma', 'mater', 'christ', 'university', 'bangalore', 'occupation', 'politician', 'lawyer', 'aiyshwarya', 'mahadev', 'is', 'politician', 'from', 'karnataka', 'state', 'she', 'is', 'member', 'of', 'indian', 'national', 'congress', 'now', 'she', 'is', 'working', 'a', 'secretary', 'of', 'all', 'india', 'mahila', 'congress', 'content', 'personal', 'life', 'political', 'life', 'reference', 'external', 'link', 'personal', 'life', 'edit', 'she', 'is', 'the', 'daughter', 'of', 'former', 'mla', 'of', 'nagara', 'manchanahalli', 'mahadev', 'she', 'completed', 'her', 'schooling', 'in', 'bishop',

### Review example output of topic 1

In [64]:
topic_1_doc_indices = get_doc_indices(1, max_topic)

In [66]:
female_topic_inference[1]

([(0.013237526, 'production'),
  (0.013074175, 'star'),
  (0.008824809, 'music'),
  (0.008807724, 'episode'),
  (0.008431958, 'law'),
  (0.008051529, 'album'),
  (0.007973641, 'title'),
  (0.007892805, 'role'),
  (0.0073152785, 'singer'),
  (0.00694186, 'story')],
 -2.3011032217963416)

In [67]:
print_matching_words(female_topic_inference[1], female_docs, topic_1_doc_indices)

  FOUND music
  FOUND album
  FOUND title
  FOUND singer
['ayesha', 'erotica', 'background', 'information', 'also', 'known', 'a', 'ayesha', 'erotica', 'ayesha', 'nicole', 'smith', 'born', 'august', 'age', 'origin', 'huntington', 'beach', 'california', 'genre', 'pop', 'rap', 'electro', 'clash', 'occupation', 'singer', 'rapper', 'songwriter', 'producer', 'instrument', 'vocal', 'year', 'active', 'partner', 'brendon', 'jones', 'present', 'musical', 'artist', 'ayesha', 'huntington', 'jones', 'born', 'august', 'known', 'professionally', 'a', 'ayesha', 'erotica', 'is', 'an', 'american', 'producer', 'songwriter', 'and', 'former', 'singer', 'rapper', 'residing', 'in', 'irvine', 'california', 'she', 'is', 'known', 'for', 'her', 'image', 'and', 'music', 'that', 'resembles', '2000s', 'pop', 'hip', 'hop', 'while', 'focusing', 'on', 'theme', 'of', 'eroticism', 'and', 'sexual', 'liberation', 'ayesha', 'began', 'releasing', 'music', 'in', 'under', 'different', 'name', 'and', 'in', 'under', 'ayesha', '

### Review example output of topic 3

In [68]:
topic_3_doc_indices = get_doc_indices(3, max_topic)

In [69]:
female_topic_inference[3]

([(0.021956863, 'award'),
  (0.021349683, 'digital'),
  (0.021242952, 'single'),
  (0.016077897, 'actress'),
  (0.012689218, 'may'),
  (0.012518483, 'article'),
  (0.01017796, 'wikipedia'),
  (0.008870527, 'actor'),
  (0.00824975, 'album'),
  (0.008219023, 'mother')],
 -3.2175911908956514)

In [70]:
print_matching_words(female_topic_inference[3], female_docs, topic_3_doc_indices)

['american', 'sport', 'journalist', 'born', 'abby', 'chin', 'born', 'is', 'an', 'american', 'sport', 'journalist', 'she', 'is', 'boston', 'celtic', 'basketball', 'pregame', 'and', 'postgame', 'reporter', 'a', 'well', 'a', 'court', 'side', 'reporter', 'she', 'studied', 'broadcasting', 'at', 'the', 'university', 'of', 'colorado', 'boulder', 'and', 'later', 'interned', 'in', 'colorado', 'sport', 'radio', 'and', 'television', 'before', 'becoming', 'sideline', 'reporter', 'covering', 'the', 'nba', 'she', 'is', 'married', 'and', 'ha', 'two', 'child', 'is_an', 'a_well', 'she_studied', 'is_married']
*********
['character', 'in', 'battlestar', 'galactica', 'fictional', 'character', 'anastasia', 'dualla', 'battlestar', 'galactica', 'character', 'anastasia', 'dee', 'dualla', 'first', 'appearance', 'miniseries', 'last', 'appearance', 'sometimes', 'great', 'notion', 'portrayed', 'by', 'kandyse', 'mcclure', 'in', 'universe', 'information', 'nickname', 'dee', 'specie', 'human', 'gender', 'female', 't