In [1]:
import pickle
import pandas as pd
import re

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report 
from sklearn.decomposition import NMF,LatentDirichletAllocation
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [3]:
us = pickle.load(open("wikivoyage_text_US.p","rb"))

In [4]:
df = pd.DataFrame.from_dict(us,orient='index')

In [5]:
df = df.reset_index()

In [6]:
df = df.rename(columns=({ 'index' : 'Name'}))

In [7]:
df.head()

Unnamed: 0,Name,text,type,loc
0,AbercrombiePAGE,{{pagebanner|Abercrombie WikiVoyage Banner ND....,page,//tools.wmflabs.org/wikivoyage/w/poimap2.php?l...
1,Aberdeen (Maryland)PAGE,{{pagebanner|Aberdeen MD WikiVoyage Banner.jpg...,page,//tools.wmflabs.org/wikivoyage/w/poimap2.php?l...
2,Aberdeen (South Dakota)PAGE,{{pagebanner|Pagebanner default.jpg|pgname=Abe...,page,//tools.wmflabs.org/wikivoyage/w/poimap2.php?l...
3,Aberdeen (Washington)PAGE,{{Pagebanner|pgname=Aberdeen |Wikivoyage page ...,page,//tools.wmflabs.org/wikivoyage/w/poimap2.php?l...
4,AbernathyPAGE,{{pagebanner|Abernathy Texas Wikivoyage Banner...,page,//tools.wmflabs.org/wikivoyage/w/poimap2.php?l...


In [8]:
def unwiki(wiki):
        """
       Remove wiki markup from the text.
       """
        wiki = re.sub(r'(?i)\{\{IPA(\-[^\|\{\}]+)*?\|([^\|\{\}]+)(\|[^\{\}]+)*?\}\}', lambda m: m.group(2), wiki)
        wiki = re.sub(r'(?i)\{\{Lang(\-[^\|\{\}]+)*?\|([^\|\{\}]+)(\|[^\{\}]+)*?\}\}', lambda m: m.group(2), wiki)
        wiki = re.sub(r'\{\{[^\{\}]+\}\}', '', wiki)
        wiki = re.sub(r'(?m)\{\{[^\{\}]+\}\}', '', wiki)
        wiki = re.sub(r'(?m)\{\|[^\{\}]*?\|\}', '', wiki)
        wiki = re.sub(r'(?i)\[\[Category:[^\[\]]*?\]\]', '', wiki)
        wiki = re.sub(r'(?i)\[\[Image:[^\[\]]*?\]\]', '', wiki)
        wiki = re.sub(r'(?i)\[\[File:[^\[\]]*?\]\]', '', wiki)
        wiki = re.sub(r'\[\[[^\[\]]*?\|([^\[\]]*?)\]\]', lambda m: m.group(1), wiki)
        wiki = re.sub(r'\[\[([^\[\]]+?)\]\]', lambda m: m.group(1), wiki)
        wiki = re.sub(r'\[\[([^\[\]]+?)\]\]', '', wiki)
        wiki = re.sub(r'(?i)File:[^\[\]]*?', '', wiki)
        wiki = re.sub(r'\[[^\[\]]*? ([^\[\]]*?)\]', lambda m: m.group(1), wiki)
        wiki = re.sub(r"''+", '', wiki)
        wiki = re.sub(r'(?m)^\*$', '', wiki)
       
        return wiki

In [9]:
def unhtml(html):
        """
       Remove HTML from the text.
       """
        html = re.sub(r'(?i)&nbsp;', ' ', html)
        html = re.sub(r'(?i)<br[ \\]*?>', '\n', html)
        html = re.sub(r'(?m)<!--.*?--\s*>', '', html)
        html = re.sub(r'(?i)<ref[^>]*>[^>]*<\/ ?ref>', '', html)
        html = re.sub(r'(?m)<.*?>', '', html)
        html = re.sub(r'(?i)&amp;', '&', html)
       
        return html

In [249]:
def f(x):
    return unhtml(unwiki(x))

In [250]:
df['text'] = df['text'].apply(f)

In [12]:
vect = CountVectorizer(stop_words="english", max_features = 5000)

In [13]:
features = df['text']

In [14]:
x_features = vect.fit_transform(features)

In [15]:
nmf = NMF(n_components=20, random_state=1, alpha=0, l1_ratio=0)
W = nmf.fit_transform(x_features)
H = nmf.components_

In [16]:
vector_features = vect.get_feature_names()

In [17]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print "Topic #%d:" % topic_idx
        print " ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
          
print_top_words(nmf, vector_features, 50)

Topic #0:
seattle downtown washington area lake city south portland center north airport international sound local pacific university northwest puget oregon district way car hill service alaska free bus hour especially place away capitol tacoma public union coffee areas columbia terminal available train offer options sea transit hours states channel restaurants flights
Topic #1:
park national miles glacier area trails located visitor available entrance water camping center mile summer road day visitors valley hiking yosemite areas backcountry east year parks feet permits species trail winter fees mountain include station lake wilderness open lodging 000 canyon campground wildlife 15 fee yellowstone forest hwy food including
Topic #2:
new york city manhattan jersey brooklyn subway bus long queens train island airport hudson midtown world trains staten station river major buses service park ride east route museum bronx borough newark bridge terminal street times home day state 50 express

In [18]:
for topic in range(W.shape[1]):
    #max_t = np.argmax(W[:,topic])
    print "TOPIC: #",topic
    #print features[max_t][0:150]
    
    for t in range(10):
        n = int(-1-t)
        max_2t = W[:,topic].argsort()[n]
        print df['Name'][max_2t]

TOPIC: # 0
SeattlePAGE
Washington (state)PAGE
Pacific NorthwestPAGE
Puget SoundPAGE
Seattle-Tacoma International AirportPAGE
Portland (Oregon)PAGE
OregonPAGE
Kitsap PeninsulaPAGE
TacomaPAGE
Seattle/SouthPAGE
TOPIC: # 1
Glacier National ParkPAGE
Denali National ParkPAGE
Yosemite National ParkPAGE
Big Bend National ParkPAGE
Rocky Mountain National ParkPAGE
Zion National ParkPAGE
Yellowstone National ParkPAGE
Olympic National ParkPAGE
Ohio State ParksPAGE
Biscayne National ParkPAGE
TOPIC: # 2
New York CityPAGE
New York (state)PAGE
ManhattanPAGE
New JerseyPAGE
New EnglandPAGE
New OrleansPAGE
BrooklynPAGE
Mid-Hudson and CatskillsPAGE
Metro New YorkPAGE
New MexicoPAGE
TOPIC: # 3
BuffaloPAGE
Buffalo/South BuffaloPAGE
Buffalo/North BuffaloPAGE
Buffalo/DowntownPAGE
Buffalo/East SidePAGE
Buffalo/West SidePAGE
Buffalo/Elmwood VillagePAGE
Buffalo/Allentown and the Delaware DistrictPAGE
Buffalo National RiverPAGE
ClarencePAGE
TOPIC: # 4
Walt Disney WorldPAGE
Walt Disney World/EpcotPAGE
Walt Disney 

In [19]:
list(df[df['Name']=='Touring prestigious and notable universities in the U.S.PAGE']['loc'])

[u'http://mit.edu/']

In [251]:
pages = df[df['type'] == 'page']

In [252]:
def f(x):
    return re.sub('PAGE', '', x)

In [253]:
pages['Name'] = pages['Name'].apply(f)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [38]:
len("//tools.wmflabs.org/wikivoyage/w/poimap2")

40

In [42]:
pages.head()

Unnamed: 0,Name,text,type,loc
0,Abercrombie,\nAbercrombie is a small township in the Red R...,page,//tools.wmflabs.org/wikivoyage/w/poimap2.php?l...
1,Aberdeen (Maryland),\nAberdeen is in Maryland.\n\n==Understand==\n...,page,//tools.wmflabs.org/wikivoyage/w/poimap2.php?l...
2,Aberdeen (South Dakota),\nAberdeen is a city in South Dakota. It is h...,page,//tools.wmflabs.org/wikivoyage/w/poimap2.php?l...
3,Aberdeen (Washington),\n\nAberdeen is a city in the Olympic Peninsul...,page,//tools.wmflabs.org/wikivoyage/w/poimap2.php?l...
4,Abernathy,\nAbernathy is a city in the Llano Estacado re...,page,//tools.wmflabs.org/wikivoyage/w/poimap2.php?l...


In [254]:
pages = pages[pages['loc'].str.contains("//tools.wmflabs.org/wikivoyage/w/poimap2")==True]

In [66]:
from nltk.corpus import stopwords

In [75]:
StopWords = stopwords.words('english')

In [91]:
StopPlaces = list(pages['Name'])

In [94]:
StopPlaces = map(lambda x: re.sub("\(|\)", '',x), StopPlaces)

In [101]:
StopPlaces = map(lambda x: x.lower().split(), StopPlaces)

In [103]:
StopPlaces = [item for sublist in StopPlaces for item in sublist]

In [105]:
StopWords = StopPlaces + StopWords

In [106]:
StopWords = list(set(StopWords))

In [142]:
vect = CountVectorizer(stop_words=StopWords)

In [143]:
features = pages['text'].apply(lambda x: x.lower())

In [144]:
x_features = vect.fit_transform(features)

In [145]:
nmf = NMF(n_components=20, random_state=1, alpha=0, l1_ratio=0)
W = nmf.fit_transform(x_features)
H = nmf.components_

In [146]:
vector_features = vect.get_feature_names()

In [147]:
print_top_words(nmf, vector_features, 50)

Topic #0:
one near also many well good cta best line time every like although see train bike museum public even neighborhoods blues still take free around along re major streets get several african bars minutes known find neighborhood summer museums go ll music metrorail first restaurants jazz stop food hours clubs
Topic #1:
mdash preserves set civil usfs sites including former first contains near blm president spectacular major well revolution century noted several along famous one famed system commemorates 19th built many native largest british numerous ancient situated years dating visitor ride areas based around birthplace carved preserved wwii protects landscape surrounding life
Topic #2:
first well local via though also downtown years century still around streets one located community nfta visitors time grant among public today industrial bike re events transportation much olmsted best system whose fm however st like want bus class grain irish see number despite service places be

In [148]:
for topic in range(W.shape[1]):
    #max_t = np.argmax(W[:,topic])
    print "TOPIC: #",topic
    #print features[max_t][0:150]
    
    for t in range(10):
        n = int(-1-t)
        max_2t = W[:,topic].argsort()[n]
        print list(pages['Name'])[max_2t]

TOPIC: # 0
Chicago
Washington, D.C.
San Francisco
Minneapolis
New Orleans
Charlotte
Santa Fe (New Mexico)
Milwaukee
Manhattan
Pittsburgh
TOPIC: # 1
United States National Parks
Disneyland
Takoma Park
Big Bend National Park
Finger Lakes
Walt Disney World
Rochester and Suburbs
Montana
Rochester (New York)
Early United States history
TOPIC: # 2
Buffalo
Buffalo/South Buffalo
Buffalo/West Side
Buffalo/East Side
Buffalo/Downtown
Buffalo/Allentown and the Delaware District
Rochester (New York)
Buffalo/Elmwood Village
Buffalo/North Buffalo
Charlotte
TOPIC: # 3
Seattle
Los Angeles
Buffalo/Downtown
Detroit
Los Angeles/Downtown
Portland (Oregon)
Lansing (Michigan)
Milwaukee
Boston/Downtown
Indianapolis
TOPIC: # 4
Big Bend National Park
Grand Canyon
Clarkston (Michigan)
Bandelier National Monument
Cahokia Mounds State Historic Site
Four Corners
Glacier National Park
Holland (Pennsylvania)
Lake Wales
Canyonlands National Park
TOPIC: # 5
Saint Thomas
Vinalhaven
Sonoma Valley
Jefferson (Maine)
San An

In [149]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [150]:
### TF-IDF Vectorizer

In [166]:
StopWords = StopWords + map(lambda x: x.split("/"), StopWords)

In [168]:
StopWords = [item for sublist in StopWords for item in sublist]

In [185]:
TFIDFvect = TfidfVectorizer(stop_words=StopWords)

In [186]:
x_features = TFIDFvect.fit_transform(features)

In [178]:
nmf = NMF(n_components=20, random_state=1, alpha=0, l1_ratio=0)
W = nmf.fit_transform(x_features)
H = nmf.components_

In [190]:
vector_features = TFIDFvect.get_feature_names()

In [180]:
print_top_words(nmf, vector_features, 50)

Topic #0:
get localities dmoz regional buy around wikipedia go next drink see eat carpet known plateau suburb 99 car hotels ski plane german located halfway 83 border allowing pronounced barrier nonstop municipal closest famous camping nicknamed part opposite valleys traveler metroplex cope strand considered 101 referred snowmobiling usa best antiques boating
Topic #1:
many one also areas well even much time however large known like largest people major re popular often find part several mdash especially local although still along summer best good first towns make visitors usually common including though found years travel generally visit number ll restaurants places tourist driving attractions
Topic #2:
destinations safe stay talk get understand itineraries regions around go next drink eat see wikipedia counties seat following mdash part adjacent tier extends largest towns sr _florida includes dade across considered meadow parts police areas goes populated section department portion c

In [181]:
for topic in range(W.shape[1]):
    #max_t = np.argmax(W[:,topic])
    print "TOPIC: #",topic
    #print features[max_t][0:150]
    
    for t in range(10):
        n = int(-1-t)
        max_2t = W[:,topic].argsort()[n]
        print list(pages['Name'])[max_2t]

TOPIC: # 0
Midwest City
North Highlands
Willows
Santa Nella
Pinole
Boulder Creek
Sayre (Oklahoma)
Greers Ferry
Blue Lake
Grove (Oklahoma)
TOPIC: # 1
New Mexico
South (United States of America)
Idaho
Texas
Oregon
Alaska
New England
Hawaii
San Diego
Michigan
TOPIC: # 2
Lucas County
Putnam County (Ohio)
Miami County (Ohio)
Shelby County (Ohio)
Gallia County (Ohio)
Pickaway County
Union County (Ohio)
Clinton County (Ohio)
Guernsey County (Ohio)
Muskingum County
TOPIC: # 3
Tukwila
Mount Kisco
Petersburg (Virginia)
Brentwood (Maryland)
Chelan
Chehalis
Holualoa
Nanuet
Bordentown
Penn Yan
TOPIC: # 4
Dover (Ohio)
Sparta (Kentucky)
Horn Lake
Huntington (Indiana)
Three Oaks
Richfield (Ohio)
Clinton (Mississippi)
Paintsville
Phenix City
Lamar (Missouri)
TOPIC: # 5
Willow Creek
Oakville (California)
Milton-Freewater
Tuolumne County
Hastings (Minnesota)
Hurricane (Utah)
Calistoga
Tok
El Verano
Columbia Falls
TOPIC: # 6
Scotland Neck
Middlefield (Ohio)
Cabarrus County
Safety Harbor
Kalamazoo
Woodstow

### LDA

In [187]:
lda = LatentDirichletAllocation(n_topics=20, random_state=1)
W = lda.fit_transform(x_features)
H = lda.components_

In [191]:
print_top_words(lda,vector_features, 50)

Topic #0:
abiquiú 7181 crafmanship tscycles indianhillsinn kyoo sakakawea get watford highway 34 85 category around randsburg localities regional go next buy dmoz see commons people wikipedia connect 165 us 395 one eat drink elevation sloulin bisects german vehicle redcliff camping feet car ct fees 20 permits population fauna located kayakers backcountry
Topic #1:
allen_park unm alverthorpe mdjwagner 6432 vcr aladdinwymotel swedenborgian daysinn horsham forerunners ookaa revivalist gpage2 vcn cehejde superlative athen luminarias balloonists checkin albuquerqueans luminaria rojo biopark glowdeos rebuked isotopes immature suffix 5141 unser 12345 deprecating kkob alburquerque blower insides smother burque natillas menaul autobuses symptom inflate wholesome get windward 3542 festivals
Topic #2:
1344 boreasinn 8069 98631 boreas long_beach quorum 0454 abita_springs rosiestavern 22067 lenawee hadrian steiner 4458 6979 ione 20680 montevina 0945 95669 renwood 12225 monteverde 6942 watertower ka

In [192]:
for topic in range(W.shape[1]):
    #max_t = np.argmax(W[:,topic])
    print "TOPIC: #",topic
    #print features[max_t][0:150]
    
    for t in range(10):
        n = int(-1-t)
        max_2t = W[:,topic].argsort()[n]
        print list(pages['Name'])[max_2t]

TOPIC: # 0
Albia
Abiquiu
Chicago
Big Bend National Park
United States National Parks
Buffalo
Buffalo/East Side
New York City
Boston
Buffalo/South Buffalo
TOPIC: # 1
Abington Township
Allen Park
Aladdin
Albuquerque
Chicago
Big Bend National Park
United States National Parks
Buffalo
Buffalo/East Side
New York City
TOPIC: # 2
Long Beach (Washington)
Amador County
Abita Springs
Adrian (Michigan)
Addison
Adirondacks
Long Beach (New York)
Chicago
Big Bend National Park
United States National Parks
TOPIC: # 3
Aberdeen (Washington)
Allamuchy
Buffalo/East Side
Chicago
Big Bend National Park
United States National Parks
Buffalo
New York City
Boston
Buffalo/South Buffalo
TOPIC: # 4
Anamoose
Andrews (North Carolina)
Plainview (Texas)
Lamesa
Abernathy
Post
Arvilla
Ames
Bloomington-Normal
Snyder (Texas)
TOPIC: # 5
Chicago
Big Bend National Park
United States National Parks
Buffalo
Buffalo/East Side
New York City
Boston
Buffalo/South Buffalo
Buffalo/North Buffalo
Walt Disney World
TOPIC: # 6
Chicago


In [261]:
pages[pages['Name'] == 'Buffalo/East Side']

Unnamed: 0,Name,text,type,loc
801,Buffalo/East Side,\n\nIf you're a visitor in Buffalo and you ask...,page,//tools.wmflabs.org/wikivoyage/w/poimap2.php?l...


In [263]:
print len(pages['text'].ix[801])

72853


In [235]:
def f(x):
    x = re.sub(r'WikiPedia:([\s\S]*)','', x)
    x = re.sub(r'Dmoz:([\s\S]*)','', x)
    x = re.sub(r'\[([\s\S]*)\]','', x)
    x = re.sub(r'[0-9]','', x)
    x = re.sub(r'\=\=([\S ]*)\=\=', '', x)
    return x

In [236]:
pages['text'] = pages['text'].apply(f)

In [275]:
pages['len'] = map(lambda x: len(x), pages['text'])

In [281]:
pages2 = pages[pages['len']> 1000]

In [287]:
print pages2.ix[3].text



Aberdeen is a city in the Olympic Peninsula region of Washington. The city is the economic center of Grays Harbor County, bordering the cities of Hoquiam and Cosmopolis. The population was 16,461 according to the 2000 census.

==Understand==

Aberdeen is called the "Gateway to the Olympic Peninsula," because of its proximity to the southern end of the peninsula and to Olympic National Park. In recent history Aberdeen has become more famous as being the "Birthplace of Grunge," and the hometown of Nirvana members Kurt Cobain and Krist Novoselic. A sign welcoming visitors to Aberdeen proclaims "Come As You Are" as a tribute to the band. Although it's hard to imagine today, at one point Aberdeen was a notorious Western outpost at the turn of the 20th century, with a number of saloons, brothels, and gambling halls in and around the town. Because of the wild atmosphere associated with these establishments, Aberdeen was nicknamed "The Hellhole of the Pacific."

==Get in==

The nearest major

In [242]:
features = pages2['text'].apply(lambda x: x.lower())
TFIDFvect = TfidfVectorizer(stop_words=StopWords)
x_features = TFIDFvect.fit_transform(features)

In [246]:
nmf = NMF(n_components=10, random_state=1, alpha=0, l1_ratio=0)
W = nmf.fit_transform(x_features)
H = nmf.components_

In [247]:
vector_features = TFIDFvect.get_feature_names()

print_top_words(nmf, vector_features, 50)

Topic #0:
many one also well restaurants around road local get areas summer food popular drive best see known good along large several year visitors time find people available trails shops fishing open take like offers including visit much first re go part hiking places within even largest car stores often restaurant
Topic #1:
small community population people seat plateau along outside nearest car iditarod residents farming towns outskirts drive heritage next nine nm border nearby railroad northwestern walk reservoir sometimes away public festival accommodations minutes nice around fishing lies mecklenburg approximately named berkshires closest get settlement covered hotels founded garrett across foot tomales
Topic #2:
wikipedia seat suburb northwestern name part nine largest metroplex unincorporated hare commons second municipal rd mdash category homes garrett appalachians core suburban wabash triad phone roslindale outlet known railroad northeasternmost founded seven census notably 

In [248]:
for topic in range(W.shape[1]):
    #max_t = np.argmax(W[:,topic])
    print "TOPIC: #",topic
    #print features[max_t][0:150]
    
    for t in range(10):
        n = int(-1-t)
        max_2t = W[:,topic].argsort()[n]
        print list(pages['Name'])[max_2t]

TOPIC: # 0
Colorado
Walt Disney World
San Diego
New Mexico
Alaska
Santa Fe (New Mexico)
Rochester (New York)
New Orleans
Seattle
Buffalo
TOPIC: # 1
Maxwell (Nebraska)
Gustine
Rome (New York)
Eagle (Colorado)
Wellfleet (Nebraska)
Woodside
Stanton (North Dakota)
Pioneer
Chatsworth (Georgia)
Tuscumbia
TOPIC: # 2
Bradley Beach
Highland County
Hanover (Virginia)
Marshall (Texas)
Cahokia (Illinois)
Cabarrus County
Dallas/West Dallas
Riverview
Belmar
Fort Wright
TOPIC: # 3
Kingsburg
Lancaster (Massachusetts)
Chowchilla
Warrensburg
Ashland (Massachusetts)
Lorain
Nevada (Ohio)
Snyder County
East Haddam
Woodstock (Vermont)
TOPIC: # 4
Tonopah
Bridgeport (Texas)
Belmont (New Hampshire)
Redstone
Wellington (Texas)
Wheeler
Ozark (Arkansas)
Clarksville (Arkansas)
Mountainburg
Russellville (Arkansas)
TOPIC: # 5
St. James
Imperial (Missouri)
Gray Summit
Luling (Louisiana)
Sullivan
Kelso (Washington)
St. Francisville
St. George (South Carolina)
St. Joseph
Ferguson
TOPIC: # 6
Lincoln Park (Michigan)
Moun

## TODO

In [60]:
## Consider breaking text into sections (e.g. DO, EAT, etc.)