In [1]:
# snippet from ~/Library/Jupyter/nbextensions/snippets/snippets.json
# basic
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
import csv
import os, sys
import dill
import seaborn as sns

from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Shared seed constant. No cell visible in this notebook section reads it.
# NOTE(review): presumably meant to be passed as `random_state` to the model — confirm.
RANDOM_STATE = 777

# Set up Pipeline and Model

## Load stop-word and vocabulary stems

In [2]:
# Root directory of the cleaned corpus; the trailing slash is kept because
# other cells build sub-paths from it by plain string concatenation.
alnc_path = '/Users/nknezek/Documents/Insight_local/project/data/ALNC/Cleaned/NewspaperMapCorpus_03_03_2014_cleaned/'

# Load the serialized stop-word stems and vocabulary stems. The files are
# opened in context managers so the handles are closed deterministically.
# NOTE(review): dill, like pickle, can execute arbitrary code while loading;
# only load files from a trusted source.
with open("/Users/nknezek/Documents/Insight_local/project/data/wordlists/stop_words/stop_stems.m", 'rb') as stop_file:
    stop_stems = dill.load(stop_file)
with open("/Users/nknezek/Documents/Insight_local/project/data/wordlists/SCOWL-custom/vocab_stems.m", 'rb') as vocab_file:
    vocab_stems = dill.load(vocab_file)

## Make TF-IDF vectorizer with tokenizer and stemmer

In [3]:
# Tokens are the runs of ASCII letters matched by the pattern; everything
# else (digits, punctuation, whitespace) is dropped.
tokenizer = RegexpTokenizer(pattern=r'[a-zA-Z]+')
# Snowball stemmer configured for English.
stemmer = SnowballStemmer(language='english')

def tokenize(text):
    """Lower-case `text`, split it with the module-level `tokenizer`, and stem every token.

    Returns the list of stems in the order the tokens occur in `text`.
    """
    words = tokenizer.tokenize(text.lower())
    return list(map(stemmer.stem, words))

# Build the (not yet fitted) TF-IDF vectorizer: documents are split into stems
# by `tokenize`, `stop_stems` is passed as the stop-word list, and the feature
# set is fixed to the entries of `vocab_stems`.
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=tokenize,
    stop_words=stop_stems,
    vocabulary=list(vocab_stems),
)

## Fit to a sample of articles

### Get filenames of articles and states and towns

In [4]:
# Every entry directly under the corpus root is counted as a state; the
# directory listing is not filtered.
states = os.listdir(alnc_path)
state_count = len(states)
print('{} states found'.format(state_count))

51 states found


In [5]:
# Map each state directory name to the listing of the entries inside it.
towns = {st: os.listdir(alnc_path + st + '/') for st in states}
# Number of towns per state, in the same order as `states`.
town_counts = [len(towns[st]) for st in states]

In [6]:
# Example state/town pair for trying out the file-listing helper. Both parts
# keep their trailing slash so they can be concatenated straight into a path.
state, town = 'AK/', 'Anchorage/'
statetown = ''.join([state, town])

def get_filenames_for_town(alnc_path, statetown, verbose=False):
    """Return the paths of all article files stored for one town.

    The directory layout walked is ``<alnc_path>/<statetown>/<paper>/<file>``.

    Parameters
    ----------
    alnc_path : str
        Root directory of the corpus. A trailing slash is optional.
    statetown : str
        State and town sub-path relative to ``alnc_path``, e.g.
        ``'AK/Anchorage/'``. A trailing slash is optional.
    verbose : bool, optional
        If true, print how many files were found for ``statetown``.

    Returns
    -------
    list of str
        One path per entry found inside each paper directory, in
        directory-listing order.
    """
    # os.path.join only inserts a separator where one is missing, so inputs
    # that already end in '/' produce the same strings as plain concatenation.
    town_dir = os.path.join(alnc_path, statetown)
    files_list = []
    for paper in os.listdir(town_dir):
        paper_dir = os.path.join(town_dir, paper)
        # Skip stray regular files (e.g. a macOS .DS_Store) that sit next to
        # the paper directories; os.listdir would raise on them.
        if not os.path.isdir(paper_dir):
            continue
        files_list.extend(os.path.join(paper_dir, name) for name in os.listdir(paper_dir))
    if verbose:
        print('{} files found for {}'.format(len(files_list), statetown))
    return files_list

# Spot-check the helper on the example town and preview the first four paths.
file_list = get_filenames_for_town(alnc_path, statetown, verbose=False)
file_list[0:4]

['/Users/nknezek/Documents/Insight_local/project/data/ALNC/Cleaned/NewspaperMapCorpus_03_03_2014_cleaned/AK/Anchorage/www.thebristolbaytimes.com/2014-1-28-article38.cleaned',
 '/Users/nknezek/Documents/Insight_local/project/data/ALNC/Cleaned/NewspaperMapCorpus_03_03_2014_cleaned/AK/Anchorage/www.thebristolbaytimes.com/2014-1-28-article28.cleaned',
 '/Users/nknezek/Documents/Insight_local/project/data/ALNC/Cleaned/NewspaperMapCorpus_03_03_2014_cleaned/AK/Anchorage/www.thebristolbaytimes.com/2013-9-21-article1.cleaned',
 '/Users/nknezek/Documents/Insight_local/project/data/ALNC/Cleaned/NewspaperMapCorpus_03_03_2014_cleaned/AK/Anchorage/www.thebristolbaytimes.com/2014-1-21-article60.cleaned']

In [7]:
def load_one_file(file):
    """Return the entire content of the text file at path `file` as one string.

    The file is opened in text mode with no explicit encoding and is closed
    before the function returns.
    """
    with open(file, 'r') as handle:
        return handle.read()

def make_corpus(file_list):
    """Lazily yield the text of each file in `file_list`, one document at a time.

    This is a generator: files are read only as the consumer iterates, and the
    result can be iterated once.
    """
    for path in file_list:
        document = load_one_file(path)
        yield document


# Test the TF-IDF fit on 3 cities

In [8]:
# One town per state for a small trial fit. State keys carry a trailing
# slash; town names do not.
towns2fit = {
    'AK/': ['Anchorage'],
    'CA/': ['Berkeley'],
    'TX/': ['Denton'],
}
states2fit = ['AK/', 'CA/', 'TX/']

In [14]:
# Gather the article paths of every selected town into one flat list.
# Note: this rebinds the notebook-level names `statetown` and `file_list`.
files_to_fit = []
for st in states2fit:
    for tn in towns2fit[st]:
        statetown = ''.join((st, tn, '/'))
        file_list = get_filenames_for_town(alnc_path, statetown)
        files_to_fit.extend(file_list)

In [15]:
# Feed the documents to the vectorizer through a generator so the article
# texts are read one at a time instead of being held in a list.
corpus = make_corpus(files_to_fit)
tfidf_vectorizer = tfidf_vectorizer.fit(raw_documents=corpus)

### Save the trained tfidf vectorizer

In [16]:
# Persist the vectorizer to disk. The context manager closes (and therefore
# flushes) the output file even if serialization raises.
with open('tfidf_vectorizer.m', 'wb') as out_file:
    dill.dump(tfidf_vectorizer, out_file)

### Test to make sure it transforms

In [None]:
# The generator handed to `fit` has been consumed, so build a new one here.
corpus = make_corpus(files_to_fit)
# Transform with `tfidf_vectorizer`, the object that was fitted and saved in
# the preceding cells, then display the shape of the resulting matrix.
tfidf_matrix = tfidf_vectorizer.transform(corpus)
tfidf_matrix.shape