# Popular Words

In this example you will learn how to:

1. Read the article data
2. Given some date range, find articles in that range
3. Find the frequent words in the specified articles
4. Display results

In [None]:
# import dependencies 
from os import listdir
from os.path import isfile, join
from datetime import datetime, timedelta
import csv

## 1. Read article data from corpus

In [None]:
# path to corpus directory; change this value as necessary
directory_path = '../corpus'

# names of the regular files directly inside the corpus directory
# (sub-directories are skipped by the isfile check)
all_files = [f for f in listdir(directory_path) if isfile(join(directory_path, f))]

# accumulates the data rows of every file, in the order the files are listed
articles = []

# every file in the directory is read as comma-separated data
for f in all_files:

    # newline='' is what the csv module expects: it lets the reader do its own
    # newline handling, so quoted fields containing line breaks parse correctly
    # NOTE(review): no encoding is given, so the platform default is used --
    # confirm the corpus encoding and pass encoding=... explicitly
    with open(join(directory_path, f), newline='') as csvfile:

        readCSV = csv.reader(csvfile, delimiter=',')

        # skip header row (the None default makes this a no-op for an empty file)
        next(readCSV, None)

        # every remaining row is one article
        articles.extend(readCSV)

print('Sanity check! Got', len(articles), 'articles.')

## 1.2. Add some helper functions to make life easier later

The purpose of each of these functions is described in its docstring.

In [None]:
def parse_date(string_date):
    """Parse a timestamp string and return its calendar date.

    A string containing 'T' is read as '%Y-%m-%dT%H:%M:%S.%f%z'; any other
    string is read as '%Y-%m-%d %H:%M:%S'. The time of day is discarded.
    """
    if 'T' in string_date:
        layout = '%Y-%m-%dT%H:%M:%S.%f%z'
    else:
        layout = '%Y-%m-%d %H:%M:%S'

    parsed = datetime.strptime(string_date, layout)

    # .date() already drops hours, minutes, seconds and microseconds
    return parsed.date()


def longest_base_word(word_list):
    """Return the longest prefix shared by every word in word_list.

    Duplicates are ignored. A list holding a single distinct word returns that
    word; an empty list, or words with no common beginning, return ''.
    """
    unique_words = set(word_list)

    if not unique_words:
        return ''

    # Every word sorts between the lexicographic minimum and maximum, so a
    # prefix shared by those two is shared by all words; comparing just this
    # pair avoids re-slicing the whole list for each candidate length.
    first, last = min(unique_words), max(unique_words)

    for idx, char in enumerate(first):
        # `last` can never be shorter than the matching part of `first`,
        # otherwise it would sort before it
        if char != last[idx]:
            return first[:idx]

    return first


def most_frequent(word_list, n):
    """Return the n most frequent strings in word_list, most frequent first.

    Strings with equal counts are ordered in descending (reverse-alphabetical)
    order. Fewer than n strings are returned if word_list has fewer distinct
    values.
    """
    from collections import Counter

    # single pass over the list instead of one full scan per distinct word
    counts = Counter(word_list)

    # sort on (count, word) so ties are broken by the word itself
    ranked = sorted(counts.items(), key=lambda item: (item[1], item[0]), reverse=True)

    return [word for word, _ in ranked[0:n]]


def strip_nonalpha(str):
    """Lower-case the text and keep only the letters a-z, 'ä', 'ö' and spaces."""
    # NOTE: the parameter name shadows the builtin `str`; it is kept because
    # it is part of the function's signature
    kept = []
    for ch in str.lower():
        if 'a' <= ch <= 'z' or ch in (' ', 'ä', 'ö'):
            kept.append(ch)
    return ''.join(kept)

## 2. Create date range for analysis

Here we study the articles published during the 90 days ending on March 12, 2020, which was the last full day of data in the corpus at the time of writing. You may adjust these limits however you want.

In [None]:
# last day of the analysis window: March 12, 2020
base = datetime(2020, 3, 12)

# size of the window in days, counting the last day itself
num_days = 90

# calendar dates (no time part), newest first: base, base - 1 day, ...
date_list = [base.date() - timedelta(days=offset) for offset in range(num_days)]

print('Sanity check! Analyzing dates:', date_list[0], 'through', date_list[-1])

## 3. Find words in article titles occurring within the date range

In [None]:
# Specify minimum word length
# we will ignore all words that are shorter than this length!
# This will eliminate many stop words
min_word_len = 4

# ignore rare words that occurred fewer times than this:
# (adjust this limit as necessary)
min_occurrence = 100

# Maps a base word (the first min_word_len characters) to every title word
# that starts with it; duplicates are kept so the list length is the frequency
words = {}

# a set gives constant-time membership tests, instead of scanning the whole
# date list once per article
dates_in_range = set(date_list)

for article in articles:

    # column 0 is parsed as the publish date; skip articles outside the range
    if parse_date(article[0]) not in dates_in_range:
        continue

    # sanitize article title (column 1), split into words
    for w in strip_nonalpha(article[1]).split(' '):

        # ignore short words
        if len(w) < min_word_len:
            continue

        # group the word under its base word
        words.setdefault(w[0:min_word_len], []).append(w)

# Make a list of most popular words
# key (k) - represents a base word, length of this key is equal to min_word_len
# value (v) - a list of actual words that match that base word, and may contain duplicates
# len(v) - the total number of occurrences that were found within the specified date range
# >= keeps words seen exactly min_occurrence times; only words seen fewer times are dropped
# reverse=True - gives us descending order, i.e. highest frequency items first
top_words = sorted(
    [(len(v), k, v) for k, v in words.items() if len(v) >= min_occurrence],
    reverse=True)

## 4. Display results

In [None]:
def pad_right(value, width):
    """Return value as text, padded with trailing spaces to the given width."""
    # deliberately not called `display`: in a notebook that name would hide
    # IPython's display() for the rest of the session
    return str(value).ljust(width, ' ')


# one output row per base word, ranked from 1 in the order of top_words
for rank, (freq, k, v) in enumerate(top_words, start=1):

    # longest common base word
    base_word = longest_base_word(v)
    # number of different words that share the same base
    unique_word_count = len(set(v))
    # top 5 most frequent words
    full_words = most_frequent(v, 5)

    # columns: rank, base word, frequency, distinct word count, example words
    print(pad_right(str(rank) + '.', 5),
          pad_right(base_word, 10),
          pad_right(freq, 5),
          pad_right(unique_word_count, 5),
          ', '.join(full_words))

## Top 20 Topics

In [None]:
# heading: the analysed period, oldest date first
print('IL Suosituimmat jutun aiheet', date_list[-1], '-', date_list[0], '\n')

# running rank shown in front of each topic
n = 1

# each entry of top_words is (frequency, base word, matching words)
for _freq, _base, matches in top_words[:20]:
    label = str(n).ljust(3, ' ')

    # the single most frequent full word stands for the whole group
    winners = most_frequent(matches, 1)

    print(label, ',\n'.join(winners))
    n += 1

# [&laquo; Previous Lab](getting_started.ipynb)