# COMM313 Computational Text Analysis for Communication Research

## Spring 2019 - Annenberg School for Communication, UPenn
### **Instructor**: Matthew Brook O'Donnell, Ph.D. (mbod@asc.upenn.edu)

-----------


# Lab Session 4 



### Setup

In [1]:
import os

import requests
from bs4 import BeautifulSoup
from collections import Counter

In [2]:
def tokenize(text, lowercase=True, strip_chars=''):
    
    if lowercase:
        text=text.lower()
        
    rdict=str.maketrans('','',strip_chars)
    text=text.translate(rdict)
    
    tokens = text.split()
    return tokens

In [61]:
def make_kwic(kw, token_dict, span=4):

    kwic_line = "{: >20}||{: >40}  {}  {}"
    
    kw = kw.split() if type(kw) is str else kw

    
    
    for txt, tokens in token_dict.items():
        for pos, token in enumerate(tokens):
            
            try:
                kw_test = [tokens[pos+i]==w for i,w in enumerate(kw)]

                if sum(kw_test)==len(kw):

                        print(kwic_line.format(txt[:18],
                                               ' '.join(tokens[pos-span:pos]),
                                               ' '.join(tokens[pos:pos+len(kw)]),
                                               ' '.join(tokens[pos+len(kw):pos+span+len(kw)])))
            except:
                pass

## Fairy Tale Corpus

In [4]:
ft_files=os.listdir('data/fairy_tales')

In [7]:
ft_files[10]

'rumpelstiltskin.txt'

In [8]:
for ft in ft_files:
    print(ft)

the_story_of_the_youth_who_went_forth_to_learn_what_fear_was.txt
iron_hans.txt
the_valiant_little_tailor.txt
the_blue_light.txt
the_adventures_of_chanticleer_and_partlet.txt
the_twelve_huntsmen.txt
frederick_and_catherine.txt
the_water_of_life.txt
the_travelling_musicians.txt
first_story.txt
rumpelstiltskin.txt
the_turnip.txt
tom_thumb.txt
clever_hans.txt
the_four_clever_brothers.txt
the_salad.txt
the_king_of_the_golden_mountain.txt
the_queen_bee.txt
mother_holle.txt
the_juniper-tree.txt
clever_elsie.txt
hans_in_luck.txt
briar_rose.txt
cat_and_mouse_in_partnership.txt
the_three_languages.txt
ashputtel.txt
the_dog_and_the_sparrow.txt
second_story.txt
hansel_and_gretel.txt
the_wolf_and_the_seven_little_kids.txt
the_goose-girl.txt
the_pink.txt
the_willow-wren_and_the_bear.txt
the_twelve_dancing_princesses.txt
king_grisly-beard.txt
rapunzel.txt
sweetheart_roland.txt
the_mouse_the_bird_and_the_sausage.txt
little_red-cap_little_red_riding_hood.txt
the_elves_and_the_shoemaker.txt
the_golden_g

In [9]:
ft_tokens={}

for ft_file in ft_files:
    
    ft = ft_file.replace('.txt','')
    
    print('Processing', ft)
    
    text = open('data/fairy_tales/{}'.format(ft_file)).read()
    
    tokens = tokenize(text, lowercase=True, strip_chars='!,?.’;:-`"')
    
    ft_tokens[ft]=tokens

Processing the_story_of_the_youth_who_went_forth_to_learn_what_fear_was
Processing iron_hans
Processing the_valiant_little_tailor
Processing the_blue_light
Processing the_adventures_of_chanticleer_and_partlet
Processing the_twelve_huntsmen
Processing frederick_and_catherine
Processing the_water_of_life
Processing the_travelling_musicians
Processing first_story
Processing rumpelstiltskin
Processing the_turnip
Processing tom_thumb
Processing clever_hans
Processing the_four_clever_brothers
Processing the_salad
Processing the_king_of_the_golden_mountain
Processing the_queen_bee
Processing mother_holle
Processing the_juniper-tree
Processing clever_elsie
Processing hans_in_luck
Processing briar_rose
Processing cat_and_mouse_in_partnership
Processing the_three_languages
Processing ashputtel
Processing the_dog_and_the_sparrow
Processing second_story
Processing hansel_and_gretel
Processing the_wolf_and_the_seven_little_kids
Processing the_goose-girl
Processing the_pink
Processing the_willow-wre

In [15]:
toks=ft_tokens['old_sultan']

freq_list = Counter(toks)

In [19]:
freq_list.most_common(20)

[('the', 59),
 ('and', 53),
 ('to', 25),
 ('his', 21),
 ('a', 17),
 ('he', 16),
 ('sultan', 15),
 ('him', 15),
 ('wolf', 13),
 ('was', 12),
 ('it', 11),
 ('you', 11),
 ('they', 11),
 ('of', 10),
 ('so', 10),
 ('said', 9),
 ('for', 9),
 ('with', 9),
 ('in', 9),
 ('be', 9)]

In [63]:
make_kwic(['old','woman'], ft_tokens)

      the_blue_light||                      again next day the  old woman  took him to the
           the_salad||                        came up a little  old woman  and said to him
           the_salad||                     happens just as the  old woman  said then he shot
           the_salad||                     huntsman did as the  old woman  told him cut open
           the_salad||                    the windows stood an  old woman  with a very beautiful
           the_salad||                      about them now the  old woman  was a witch and
           the_salad||                     she wished then the  old woman  said ‘now is the
           the_salad||                     young ladys and the  old woman  took it away every
           the_salad||                          she did as the  old woman  told her and set
           the_salad||                         taste it as the  old woman  had done and ate
        mother_holle||                        there she saw an  old woman 

In [24]:
make_kwic('little', ft_tokens)

  the_story_of_the_y||                         train the boy a  little  the sexton therefore took
  the_story_of_the_y||                       they had walked a  little  farther to where they
  the_story_of_the_y||                           up the fire a  little  for you when he
  the_story_of_the_y||                    that is certainly my  little  cousin who died only
  the_story_of_the_y||                  finger and cried ‘come  little  cousin come they placed
  the_story_of_the_y||                         will warm you a  little  and went to the
  the_story_of_the_y||                     said the youth ‘see  little  cousin have i not
  the_story_of_the_y||                         him so that the  little  fishes would sprawl about
           iron_hans||                        seen he kept his  little  cap on such a
           iron_hans||                        warm he took his  little  cap off that the
           iron_hans||                  had already fallen and  little  was want

In [62]:
make_kwic('once upon a time', ft_tokens)

           iron_hans||                     iron hans there was  once upon a time  a king who had
      the_blue_light||                    blue light there was  once upon a time  a soldier who for
         first_story||                   first story there was  once upon a time  an old fox with
       the_queen_bee||                      bee two kings sons  once upon a time  went into the world
        mother_holle||                                          once upon a time  there was a widow
          briar_rose||                        a king and queen  once upon a time  reigned in a country
  the_wolf_and_the_s||                   little kids there was  once upon a time  an old goat who
            the_pink||                      the pink there was  once upon a time  a queen to whom
   king_grisly-beard||                      made sport of them  once upon a time  the king held a
   sweetheart_roland||             sweetheart roland there was  once upon a time  a woman who was
  the_mo

## Loop and filter idiom

In [25]:
little_what=[]

for ft in ft_tokens:
    tokens = ft_tokens[ft]
    
    for i, t in enumerate(tokens):
        if t=='little':
            little_what.append(tokens[i+1])

In [27]:
little_what

['the',
 'farther',
 'for',
 'cousin',
 'cousin',
 'and',
 'cousin',
 'fishes',
 'cap',
 'cap',
 'was',
 'cap',
 'tailor',
 'tailor',
 'tailor',
 'tailor',
 'tailor',
 'tailor',
 'tailor',
 'tailor',
 'respect',
 'better',
 'man',
 'mite',
 'tailor',
 'man',
 'tailor',
 'tailor',
 'tailor',
 'tailor',
 'tailor',
 'tailor',
 'tailor',
 'tailor',
 'tailor',
 'tailor',
 'tailor',
 'tailor',
 'tailor',
 'tailor',
 'tailor',
 'tailor',
 'tailor',
 'tailor',
 'tailor',
 'tailor',
 'tailor',
 'tailor',
 'tailor',
 'to',
 'black',
 'man',
 'man',
 'man',
 'man',
 'black',
 'man',
 'carriage',
 'way',
 'hearse',
 'hillock',
 'while',
 'old',
 'old',
 'ugly',
 'loaves',
 'friend',
 'dwarf',
 'way',
 'frightened',
 'eyes',
 'cat',
 'man',
 'friend',
 'man',
 'child',
 'child',
 'man',
 'gentleman',
 'hut',
 'dwarf',
 'does',
 'friend',
 'man',
 'man',
 'turnips',
 'longer',
 'while',
 'space',
 'matters',
 'boy',
 'as',
 'fellow',
 'man',
 'too',
 'urchin',
 'man',
 'man',
 'urchin',
 'way',
 'ur

In [26]:
Counter(little_what).most_common(15)

[('tailor', 32),
 ('man', 28),
 ('redcap', 14),
 ('marleen', 11),
 ('girl', 11),
 ('son', 10),
 ('dwarf', 9),
 ('while', 8),
 ('peasant', 8),
 ('way', 7),
 ('child', 7),
 ('daughter', 7),
 ('sister', 7),
 ('piece', 7),
 ('old', 6)]

In [28]:
old_what=[]


for ft in ft_tokens:
    tokens = ft_tokens[ft]
    
    for i, t in enumerate(tokens):
        if t=='old':
            old_what.append(tokens[i+1])

In [29]:
Counter(old_what).most_common()

[('woman', 54),
 ('man', 20),
 ('king', 14),
 ('mr', 7),
 ('witch', 6),
 ('cook', 6),
 ('fox', 5),
 ('and', 4),
 ('tower', 3),
 ('lady', 3),
 ('grey', 3),
 ('sultan', 3),
 ('castle', 2),
 ('song', 2),
 ('kings', 2),
 ('ones', 2),
 ('one', 2),
 ('womans', 2),
 ('the', 2),
 ('folks', 2),
 ('goat', 2),
 ('sinner', 2),
 ('dame', 2),
 ('grandfather', 2),
 ('sanna', 2),
 ('fairy', 2),
 ('rags', 1),
 ('however', 1),
 ('mans', 1),
 ('cheese', 1),
 ('dry', 1),
 ('key', 1),
 ('noble', 1),
 ('mousehole', 1),
 ('shape', 1),
 ('ass', 1),
 ('iron', 1),
 ('frock', 1),
 ('but', 1),
 ('world', 1),
 ('friend', 1),
 ('beast', 1),
 ('whoever', 1),
 ('nail', 1),
 ('palace', 1),
 ('as', 1),
 ('soldier', 1),
 ('mop', 1),
 ('custom', 1),
 ('grandmother', 1),
 ('tree', 1),
 ('she', 1),
 ('pedlar', 1),
 ('enemy', 1),
 ('withered', 1),
 ('acquaintance', 1),
 ('mother', 1),
 ('began', 1),
 ('miser', 1),
 ('cushion', 1),
 ('shepherds', 1),
 ('rogue', 1),
 ('threelegged', 1),
 ('walls', 1),
 ('forms', 1),
 ('fairys

In [32]:
what_girl=[]
what_boy=[]
small_what=[]

for ft in ft_tokens:
    tokens = ft_tokens[ft]
    
    for i, t in enumerate(tokens):
        if t=='girl':
            what_girl.append(tokens[i-1])
            
        if t=='boy':
            what_boy.append(tokens[i-1])
            
        if t=='small':
            small_what.append(tokens[i+1])

In [34]:
Counter(what_boy).most_common(10)

[('the', 44),
 ('little', 5),
 ('gardeners', 4),
 ('a', 3),
 ('his', 2),
 ('‘your', 1),
 ('poor', 1),
 ('innocent', 1)]

In [35]:
Counter(what_girl).most_common(10)

[('the', 28),
 ('little', 11),
 ('lazy', 3),
 ('poor', 3),
 ('silly', 2),
 ('pretty', 2),
 ('a', 2),
 ('young', 2),
 ('good', 1),
 ('strange', 1)]

In [41]:
my_list=[]

In [42]:
my_list.append('item1')

In [43]:
my_list

['item1']

In [44]:
my_list.append('item1', 'item2')

TypeError: append() takes exactly one argument (2 given)

In [45]:
my_list.append(['item1', 'item2'])

In [46]:
my_list

['item1', ['item1', 'item2']]

In [47]:
my_list2 = ['item1']

In [48]:
my_list2.extend(['item2','item3'])

In [49]:
my_list2

['item1', 'item2', 'item3']

In [50]:
loved_what = []

for ft in ft_tokens:
    tokens = ft_tokens[ft]
    
    for i, t in enumerate(tokens):
        if t=='loved':
            loved_what.extend(tokens[i+1:i+5])

In [51]:
Counter(loved_what).most_common(20)

[('loved', 24),
 ('all', 10),
 ('me', 8),
 ('best', 8),
 ('of', 8),
 ('her', 5),
 ('and', 4),
 ('dearly', 4),
 ('very', 3),
 ('much', 3),
 ('each', 3),
 ('other', 3),
 ('so', 2),
 ('she', 2),
 ('you', 2),
 ('the', 2),
 ('with', 2),
 ('was', 2),
 ('his', 1),
 ('wife', 1)]

In [64]:
make_kwic('loved',ft_tokens)

  the_story_of_the_y||                     much the young king  loved  his wife and however
  the_twelve_huntsme||                         a bride whom he  loved  very much and when
  the_twelve_huntsme||               former betrothed whom she  loved  so dearly then she
           the_salad||                       will for i always  loved  you very much your
        mother_holle||                 lazy the mother however  loved  the ugly and lazy
    the_juniper-tree||                 and beautiful wife they  loved  each other dearly but
    the_juniper-tree||                      as snow the mother  loved  her daughter very much
    the_juniper-tree||                      was gone my sister  loved  me best of all
    the_juniper-tree||                      was gone my sister  loved  me best of all
    the_juniper-tree||                      was gone my sister  loved  me best of all
    the_juniper-tree||                      was gone my sister  loved  me best of all
    the_juniper-tr

In [37]:
loved_what

['his',
 'wife',
 'and',
 'however',
 'very',
 'much',
 'and',
 'when',
 'so',
 'dearly',
 'then',
 'she',
 'you',
 'very',
 'much',
 'your',
 'the',
 'ugly',
 'and',
 'lazy',
 'each',
 'other',
 'dearly',
 'but',
 'her',
 'daughter',
 'very',
 'much',
 'me',
 'best',
 'of',
 'all',
 'me',
 'best',
 'of',
 'all',
 'me',
 'best',
 'of',
 'all',
 'me',
 'best',
 'of',
 'all',
 'me',
 'best',
 'of',
 'all',
 'me',
 'best',
 'of',
 'all',
 'me',
 'best',
 'of',
 'all',
 'me',
 'best',
 'of',
 'all',
 'her',
 'it',
 'happened',
 'that',
 'them',
 'with',
 'all',
 'the',
 'her',
 'dearly',
 'and',
 'was',
 'each',
 'other',
 'with',
 'all',
 'you',
 'i',
 'am',
 'also',
 'her',
 'thought',
 '‘sooner',
 'than',
 'because',
 'she',
 'was',
 'her',
 'by',
 'everyone',
 'who',
 'looked',
 'each',
 'other',
 'so',
 'dearly']

### Making a corpus of _AirPods_ web pages

 * https://bgr.com/2016/12/28/airpods-review-running-walking-sound-quality-more/
 * https://www.carlytheprepster.com/2018/01/i-love-my-airpods.html
 * https://www.tomsguide.com/us/apple-airpods-worth-buying,review-5837.html
 * https://www.thecut.com/2018/04/apple-airpod-headphones-new-york-subway-trend.html

In [65]:
headers = {'User-Agent': 'Mozilla 5.0'}

In [66]:
page_text = requests.get('https://www.tomsguide.com/us/apple-airpods-worth-buying,review-5837.html', headers=headers)

In [68]:
page_text.text

'<!DOCTYPE html>\n<html lang="en" xmlns="https://www.w3.org/1999/xhtml" xmlns:og="//opengraphprotocol.org/schema/" xmlns:fb="https://www.facebook.com/2008/fbml" class="prod body tgu-site en_us desktop">\n    <head>\n        <!-- META TAGS -->\n        <meta charset="UTF-8"/>\n                                                                                                                                                                \n                            <title>How I Finally Learned to Love Apple\'s AirPods</title>\n        <link rel="canonical" href="https://www.tomsguide.com/us/apple-airpods-worth-buying,review-5837.html" itemprop="url" />                        \n\n        <head itemscope itemtype="https://schema.org/WebSite">\n        <head itemprop="name" content="Tom\'s Guide">\n        <meta name="content-language" content="en" />\n        <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />\n        <meta name="viewport" content="width=device-width, ini

### LexisNexis _AirPods_ export

* Look at the file `LN_AirPods.txt`

* Can we split it into a series of articles like we did with the Grimm's Fairy Tales file