In [1]:
# snippet from ~/Library/Jupyter/nbextensions/snippets/snippets.json
# basic
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
import csv
import os, sys
import dill
import seaborn as sns

from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
# Tokenize on runs of ASCII letters only, so digits, underscores and
# punctuation never end up in a token (the broader alternative is r'\w+').
re_tokenizer = RegexpTokenizer(r'[a-zA-Z]+')

# Create Stopwords

In [2]:
from nltk.corpus import stopwords
from nltk.corpus import wordnet
# Start from NLTK's English stop-word list; the cells below extend this set.
stop_words = set(stopwords.words('english'))

## Add month and day names to stopwords

In [3]:
# Calendar vocabulary to be treated as stop words. Each list holds every
# entry exactly once; all entries are lower-case.
month_names = [
    'january', 'february', 'march', 'april', 'may', 'june',
    'july', 'august', 'september', 'october', 'november', 'december',
]

# 'may' is its own abbreviation; 'sept' is kept as a variant of 'sep'.
month_abbrv = [
    'jan', 'feb', 'mar', 'apr', 'may', 'jun',
    'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
    'sept',
]

day_names = [
    'sunday', 'monday', 'tuesday', 'wednesday',
    'thursday', 'friday', 'saturday',
]

# Three-letter forms first, then the longer and shorter variants.
day_abbrv = [
    'sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat',
    'tues', 'weds', 'th', 'fr',
]

# Fold all four calendar word lists into the stop-word set in a single call;
# set.union accepts any number of iterables.
stop_words = stop_words.union(month_names, month_abbrv, day_names, day_abbrv)

# Helpers used further down when the stop words are reduced to stems:
# an English Snowball stemmer and a letters-only tokenizer.
stemmer = SnowballStemmer('english')
tokenizer = RegexpTokenizer(r'[a-zA-Z]+')

In [4]:
# Size of the stop-word set after adding the month and day names.
len(stop_words)

221

## Add city names to stopwords

In [5]:
# Root of the cleaned corpus. The loop below reads it as one sub-directory
# per state, each holding one entry per town.
# NOTE(review): absolute, machine-specific path; consider making it configurable.
alnc_path = '/Users/nknezek/Documents/Insight_local/project/data/ALNC/Cleaned/NewspaperMapCorpus_03_03_2014_cleaned/'


def _visible_entries(path):
    """Return the sorted entry names in `path`, skipping hidden ones.

    Hidden entries (names starting with '.', e.g. '.DS_Store') are file-system
    metadata, not place names, so they must not become stop words.
    """
    return sorted(name for name in os.listdir(path) if not name.startswith('.'))


# Keep only directories: each state is listed again below, and os.listdir
# raises on a plain file.
states = [
    st for st in _visible_entries(alnc_path)
    if os.path.isdir(os.path.join(alnc_path, st))
]
print('{} states found'.format(len(states)))

for st in states:
    stop_words.add(st)
    towns = _visible_entries(os.path.join(alnc_path, st))
    for town in towns:
        # Town entries use '_' as a word separator; store them with spaces.
        stop_words.add(town.replace('_', ' '))
len(stop_words)

51 states found


1948

## Add states and demonyms to stopwords

In [6]:
# Names of the 50 U.S. states, capitalized as proper nouns.
states = [
    'Alabama',
    'Alaska',
    'Arizona',
    'Arkansas',
    'California',
    'Colorado',
    'Connecticut',
    'Delaware',
    'Florida',
    'Georgia',
    'Hawaii',
    'Idaho',
    'Illinois',
    'Indiana',
    'Iowa',
    'Kansas',
    'Kentucky',
    'Louisiana',
    'Maine',
    'Maryland',
    'Massachusetts',
    'Michigan',
    'Minnesota',
    'Mississippi',
    'Missouri',
    'Montana',
    'Nebraska',
    'Nevada',
    'New Hampshire',
    'New Jersey',
    'New Mexico',
    'New York',
    'North Carolina',
    'North Dakota',
    'Ohio',
    'Oklahoma',
    'Oregon',
    'Pennsylvania',
    'Rhode Island',
    'South Carolina',
    'South Dakota',
    'Tennessee',
    'Texas',
    'Utah',
    'Vermont',
    'Virginia',
    'Washington',
    'West Virginia',
    'Wisconsin',
    'Wyoming',
]
# Add every state name to the stop-word set.
stop_words = stop_words.union(states)

In [7]:
# Names for residents of each state, including alternate spellings and
# nicknames. Entries carry no surrounding whitespace so they can be matched
# against tokens as-is.
demonyms = [
    'Alabamans',
    'Alabamians',
    'Alaskans',
    'Arizonans',
    'Arkansans',
    'Californians',
    'Coloradans',
    'Coloradoans',
    'Connecticuters',
    'Nutmeggers',
    'Delawareans',
    'Floridians',
    'Georgians',
    'Hawaiian',
    'Idahoans',
    'Illinoisans',
    'Illinoisians',
    'Indianan',
    'Hoosiers',
    'Iowans',
    'Kansans',
    'Kentuckians',
    'Louisianans',
    'Louisianians',
    'Mainers',
    'Down Easters',
    'Marylanders',
    'Massachusettsan',
    'Bay Staters',
    'Michiganders',
    'Michiganians',
    'Minnesotans',
    'Mississippians',
    'Missourians',
    'Montanans',
    'Nebraskans',
    'Nevadans',
    'New Hampshirites',
    'New Jerseyites',
    'New Jerseyans',
    'New Mexicans',
    'New Yorkers',
    'North Carolinians',
    'Tarheels',
    'North Dakotans',
    'Ohioans',
    'Oklahomans',
    'Oklahomians',
    'Sooners',
    'Okies',
    'Oregonians',
    'Pennsylvanians',
    'Rhode Islanders',
    'South Carolinians',
    'South Dakotans',
    'Tennesseeans',
    'Texans',
    'Utahns',
    'Utahans',
    'Vermonters',
    'Virginians',
    'Washingtonians',
    'West Virginians',
    'Wisconsinites',
    'Wyomingites',
]
# Add every demonym to the stop-word set.
stop_words = stop_words.union(demonyms)

## Add journalism words to stopwords

In [8]:
# Journalism and web boilerplate terms to be treated as stop words.
jwords = [
    'print', 'writer', 'author', 'daily', 'times', 'record', 'chronicle',
    'font', 'publish', 'story', 'planet', 'comment', 'blotter', 'via',
    'photo', 'picture', 'donate', 'staff', 'reporter',
    'www', 'com', 'pm', 'am', 'email', 'update', 'columnist',
]
# Add the journalism terms to the stop-word set.
stop_words = stop_words.union(jwords)

## Get extra words from csv


In [9]:
# Read every cell as a literal string. With pandas' default NA handling,
# words such as 'null', 'nan' or 'NA' (and empty cells) are parsed as float
# NaN, which would later break the `w.lower()` call when stop words are stemmed.
df = pd.read_csv('./words_to_filter.csv', header=None, dtype=str, keep_default_na=False)

# Only the first column is used; empty cells carry no word, so skip them.
stop_words = stop_words.union(word for word in df[0] if word)


## Newspaper Names

In [10]:
# NOTE: dill.load can execute arbitrary code while unpickling; only load this
# file if it comes from a trusted source.
# The `with` block closes the file handle as soon as loading finishes.
with open('../../../20city/papername_stop_dict.m', 'rb') as paper_names_file:
    paper_names = dill.load(paper_names_file)

# The dict keys are not needed here: the items of every value are added to
# the stop-word set in a single union call.
stop_words = stop_words.union(*paper_names.values())

# Convert words to stems

In [11]:
# Lower-case each stop word, split it into tokens, and collect the stem of
# every token. Multi-word entries therefore contribute one stem per token,
# and the set discards repeated stems.
stop_stems = {
    stemmer.stem(token)
    for word in stop_words
    for token in tokenizer.tokenize(word.lower())
}
len(stop_stems)

2060

In [12]:
# Persist both sets with dill. Each `with` block closes its file as soon as
# the dump finishes, so the data is flushed to disk instead of waiting for
# the handle to be garbage-collected.
with open("/Users/nknezek/Documents/Insight_local/project/data/wordlists/stop_words/stop_stems.m", 'wb') as stems_file:
    dill.dump(stop_stems, stems_file)
with open("/Users/nknezek/Documents/Insight_local/project/data/wordlists/stop_words/stop_words.m", 'wb') as words_file:
    dill.dump(stop_words, words_file)