In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
#%matplotlib inline
pd.options.display.float_format = '{:,.2f}'.format
plt.style.use('seaborn-white')
# colorblind safe
plt.style.use('seaborn-colorblind')
plt.style.use('tableau-colorblind10')

# width and precision for f strings
width = 10
precision = 4

# default sizes for plots
# https://matplotlib.org/3.3.0/tutorials/introductory/customizing.html#customizing-with-matplotlibrc-files
plt.rcParams['figure.figsize'] = [10, 6]
plt.rcParams['font.size'] = 16
plt.rcParams['legend.fontsize'] = 'large'
plt.rcParams['figure.titlesize'] = 'medium'
plt.rcParams['lines.linewidth'] = 2

# other settings
pd.options.display.float_format = '{:,.4f}'.format
pd.set_option("display.precision", 3)
np.set_printoptions(precision=3, suppress=True)
%load_ext autoreload
%autoreload 2
pd.set_option('display.max_columns', None)
%config IPCompleter.greedy=True

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import NMF
from sklearn.metrics import silhouette_score
from collections import Counter

In [3]:
data = pd.read_csv('../data/USdata_est_campgrounds_zip_states_combined_cleaned.csv')
data.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1, inplace=True)

In [4]:
# maybe just do for state subset I have images for
states = ['CO', 'CA', 'AZ', 'OR', 'UT', 'WA']
data_states = data[data['State'].isin(states)].copy()

In [5]:
len(data_states)

5364

In [6]:
data_states.groupby(['State'])['Description'].agg('count')

State
AZ     886
CA    1698
CO     707
OR     729
UT     651
WA     692
Name: Description, dtype: int64

In [7]:
data_states.groupby(['Category'])['Description'].agg('count')

Category
Eco-Friendly                16
Established Campground    1408
Informal Campsite          887
Short-term Parking          11
Showers                    139
Water                      253
Wild Camping              2649
Name: Description, dtype: int64

In [8]:
data_states = data_states[~data_states['Description'].isna()]

In [9]:
descriptions = data_states['Description']

In [10]:
tfidf = TfidfVectorizer(stop_words='english', max_features=10_000, max_df=0.9)
desc_tfidf_matrix = tfidf.fit_transform(data_states['Description'])

In [11]:
# features = tfidf.vocabulary_.keys()
# or better:
features = tfidf.get_feature_names()
len(features)

9493

In [12]:
# Fit the NMF model
clusters = 6
nmf = NMF(n_components=clusters, max_iter=500, verbose=False, alpha=0.1, l1_ratio=.5)
nmf_fit = nmf.fit(desc_tfidf_matrix)

In [13]:
print('reconstruction error:', nmf_fit.reconstruction_err_)

reconstruction error: 71.14398844974363


In [14]:
for topic_idx, topic in enumerate(nmf_fit.components_):
    print(f'"Topic # {topic_idx}:')
    print(" ".join([features[i] for i in topic.argsort()[:-20 - 1:-1]]))

"Topic # 0:
road spot spots dirt forest just camping great service views right pull flat view camp area large small little gravel
"Topic # 1:
parking lot overnight street signs park quiet area big walmart rv casino stayed free parked sign allowed stay large camping
"Topic # 2:
showers park hot rv clean tent shower wifi laundry pool hookups beach 25 bathrooms site state nice available paid campground
"Topic # 3:
water free station potable dump hose gas spigot drinking available gallon air rest area tank bathrooms propane toilets fountain filling
"Topic # 4:
sites campground river pit toilets tables nice lake picnic forest toilet site pits rings creek national free small camping established
"Topic # 5:
night place good quiet nice close highway stayed spend ok near sleep stay traffic noise little stop view noisy big


In [15]:
topic_feats = {}
for num, topic in enumerate(nmf_fit.components_):
    topic_feats[num] = " ".join([features[i] for i in topic.argsort()[:-20 - 1:-1]])

In [16]:
topic_feats

{0: 'road spot spots dirt forest just camping great service views right pull flat view camp area large small little gravel',
 1: 'parking lot overnight street signs park quiet area big walmart rv casino stayed free parked sign allowed stay large camping',
 2: 'showers park hot rv clean tent shower wifi laundry pool hookups beach 25 bathrooms site state nice available paid campground',
 3: 'water free station potable dump hose gas spigot drinking available gallon air rest area tank bathrooms propane toilets fountain filling',
 4: 'sites campground river pit toilets tables nice lake picnic forest toilet site pits rings creek national free small camping established',
 5: 'night place good quiet nice close highway stayed spend ok near sleep stay traffic noise little stop view noisy big'}

In [17]:
assigned_cluster = nmf.fit_transform(desc_tfidf_matrix).argmin(axis=1) # get the cluster assigned to each row (site)

In [18]:
for i in range(nmf_fit.components_.shape[0]): # for each cluster
    cluster = np.arange(0, desc_tfidf_matrix.shape[0])[assigned_cluster==i] # the cluster for the feat (row indices)
    categories = data_states.iloc[cluster]['Category'] # get the categories from each row (site) in that cluster)
    most_common = Counter(categories).most_common() # count up
    print(f"Cluster {i}:")
    print(f'top features: {topic_feats[i]}\n')
    for j in range (len(most_common)):
        print(f"     {most_common[j][0]} ({most_common[j][1]} site count for this category)")

Cluster 0:
top features: road spot spots dirt forest just camping great service views right pull flat view camp area large small little gravel

     Established Campground (440 site count for this category)
     Informal Campsite (294 site count for this category)
     Water (172 site count for this category)
     Wild Camping (123 site count for this category)
     Showers (93 site count for this category)
     Eco-Friendly (8 site count for this category)
     Short-term Parking (3 site count for this category)
Cluster 1:
top features: parking lot overnight street signs park quiet area big walmart rv casino stayed free parked sign allowed stay large camping

     Wild Camping (1410 site count for this category)
     Established Campground (645 site count for this category)
     Informal Campsite (126 site count for this category)
     Water (41 site count for this category)
     Showers (29 site count for this category)
     Eco-Friendly (4 site count for this category)
Cluster 2:
to

In [126]:
kmeans = KMeans(5)
kmeans.fit(desc_tfidf_matrix)
top_centroids = kmeans.cluster_centers_.argsort()[:,-1:-21:-1]
print("\ntop 20 features for each cluster with 10,000 max features:\n")
cluster_feats = {}
for num, centroid in enumerate(top_centroids):
    cluster_feats[num] = ', '.join(features[i] for i in centroid)
    print(f"{num}: {', '.join(features[i] for i in centroid)}")


top 20 features for each cluster with 10,000 max features:

0: water, station, potable, free, dump, hose, spigot, park, gas, gallon, drinking, showers, bathrooms, available, area, rv, rest, toilets, hot, use
1: campground, sites, nice, showers, toilets, night, river, water, pit, tables, site, lake, park, tent, free, clean, forest, picnic, available, rv
2: road, spot, forest, spots, dirt, just, nice, service, camping, right, quiet, great, pull, views, good, river, gravel, little, camp, view
3: place, night, nice, area, camping, free, quiet, good, park, great, spot, rv, highway, just, near, big, showers, lots, beautiful, site
4: parking, lot, overnight, quiet, street, night, park, signs, big, area, free, rv, stayed, walmart, parked, good, casino, road, stay, near


In [127]:
assigned_cluster = kmeans.fit_transform(desc_tfidf_matrix).argmin(axis=1) # get the cluster assigned to each row (site)

In [128]:
for i in range(kmeans.n_clusters):
    cluster = np.arange(0, desc_tfidf_matrix.shape[0])[assigned_cluster==i]
    categories = data_states.iloc[cluster]['Category']
    most_common = Counter(categories).most_common()
    print(f"Cluster {i}:")
    print(f'top features: {cluster_feats[i]}\n')
    for j in range (len(most_common)):
        print(f"     {most_common[j][0]} ({most_common[j][1]} sites categoried)")

Cluster 0:
top features: water, station, potable, free, dump, hose, spigot, park, gas, gallon, drinking, showers, bathrooms, available, area, rv, rest, toilets, hot, use

     Wild Camping (1036 sites categoried)
     Informal Campsite (354 sites categoried)
     Established Campground (285 sites categoried)
     Showers (31 sites categoried)
     Water (19 sites categoried)
     Eco-Friendly (15 sites categoried)
     Short-term Parking (3 sites categoried)
Cluster 1:
top features: campground, sites, nice, showers, toilets, night, river, water, pit, tables, site, lake, park, tent, free, clean, forest, picnic, available, rv

     Water (221 sites categoried)
     Established Campground (68 sites categoried)
     Informal Campsite (37 sites categoried)
     Wild Camping (30 sites categoried)
     Showers (13 sites categoried)
     Eco-Friendly (1 sites categoried)
     Short-term Parking (1 sites categoried)
Cluster 2:
top features: road, spot, forest, spots, dirt, just, nice, service, 

In [129]:
# save in dict
cluster_cats = {}

for i in range(kmeans.n_clusters):
    cluster = np.arange(0, desc_tfidf_matrix.shape[0])[assigned_cluster==i]
    categories = data_states.iloc[cluster]['Category']
    most_common = Counter(categories).most_common()
    cluster_cats[i] = {}
    for j in range (len(most_common)):
        cluster_cats[i].update({most_common[j][0] : most_common[j][1]})

In [139]:
cat_df = pd.concat({k: pd.DataFrame.from_dict(v, 'index') for k, v in cluster_cats.items()}, axis=0).reset_index()

In [140]:
cat_df.columns = ['cluster', 'category', 'count']

In [141]:
cat_df['cluster'] = cat_df['cluster'].astype('str')

In [142]:
cat_df['pct_total'] = round(cat_df['count'].div(cat_df.groupby('cluster')['count'].transform('sum'))*100, 2)

In [143]:
max_indices = cat_df.groupby(['cluster'])['pct_total'].transform(max) == cat_df['pct_total']

In [151]:
cat_max = cat_df[max_indices].copy()

In [153]:
cat_max['top words'] = pd.Series(cluster_feats).values

In [156]:
cluster_feats

{0: 'water, station, potable, free, dump, hose, spigot, park, gas, gallon, drinking, showers, bathrooms, available, area, rv, rest, toilets, hot, use',
 1: 'campground, sites, nice, showers, toilets, night, river, water, pit, tables, site, lake, park, tent, free, clean, forest, picnic, available, rv',
 2: 'road, spot, forest, spots, dirt, just, nice, service, camping, right, quiet, great, pull, views, good, river, gravel, little, camp, view',
 3: 'place, night, nice, area, camping, free, quiet, good, park, great, spot, rv, highway, just, near, big, showers, lots, beautiful, site',
 4: 'parking, lot, overnight, quiet, street, night, park, signs, big, area, free, rv, stayed, walmart, parked, good, casino, road, stay, near'}

In [157]:
cat_max

Unnamed: 0,cluster,category,count,pct_total,top words
0,0,Wild Camping,1036,59.44,"water, station, potable, free, dump, hose, spi..."
7,1,Water,221,59.57,"campground, sites, nice, showers, toilets, nig..."
14,2,Informal Campsite,385,55.0,"road, spot, forest, spots, dirt, just, nice, s..."
20,3,Wild Camping,1247,87.45,"place, night, nice, area, camping, free, quiet..."
24,4,Established Campground,922,82.1,"parking, lot, overnight, quiet, street, night,..."


In [158]:
cat_max['cluster name'] = ['service station', 'RV park type', 'pull off camp', 'open camping', 'informal']

In [159]:
cat_max

Unnamed: 0,cluster,category,count,pct_total,top words,cluster name
0,0,Wild Camping,1036,59.44,"water, station, potable, free, dump, hose, spi...",service station
7,1,Water,221,59.57,"campground, sites, nice, showers, toilets, nig...",RV park type
14,2,Informal Campsite,385,55.0,"road, spot, forest, spots, dirt, just, nice, s...",pull off camp
20,3,Wild Camping,1247,87.45,"place, night, nice, area, camping, free, quiet...",open camping
24,4,Established Campground,922,82.1,"parking, lot, overnight, quiet, street, night,...",informal


In [161]:
def get_cat_summary(cat_dict):
    cat_df = pd.concat({k: pd.DataFrame.from_dict(v, 'index') for k, v in cat_dict.items()}, axis=0).reset_index()
    cat_df.columns = ['cluster', 'category', 'count']
    cat_df['cluster'] = cat_df['cluster'].astype('str')
    cat_df['pct_total'] = round(cat_df['count'].div(cat_df.groupby('cluster')['count'].transform('sum'))*100, 2)
    max_indices = cat_df.groupby(['cluster'])['pct_total'].transform(max) == cat_df['pct_total']
    cat_max = cat_df[max_indices].copy()
    cat_max['top words'] = pd.Series(cluster_feats).values
    cat_max['cluster name'] = ['service station', 'RV park type', 'pull off camp', 'open camping', 'informal']
    return cat_max

In [166]:
data = pd.read_csv('../data/USdata_est_campgrounds_zip_states_combined_cleaned.csv')
data.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1, inplace=True)
data = data[~data['Description'].isna()]

In [164]:
def run_kmeans(X):
    kmeans = KMeans(5)
    kmeans.fit(desc_tfidf_matrix)
    top_centroids = kmeans.cluster_centers_.argsort()[:,-1:-21:-1]
    cluster_feats = {}
    for num, centroid in enumerate(top_centroids):
        cluster_feats[num] = ', '.join(features[i] for i in centroid)
        #print(f"{num}: {', '.join(features[i] for i in centroid)}")

    assigned_cluster = kmeans.fit_transform(desc_tfidf_matrix).argmin(axis=1) # get the cluster assigned to each row (site)

    # save in dict
    cluster_cats = {}

    for i in range(kmeans.n_clusters):
        cluster = np.arange(0, desc_tfidf_matrix.shape[0])[assigned_cluster==i]
        categories = data_states.iloc[cluster]['Category']
        most_common = Counter(categories).most_common()
        cluster_cats[i] = {}
        for j in range (len(most_common)):
            cluster_cats[i].update({most_common[j][0] : most_common[j][1]})
    return cluster_cats

In [167]:
tfidf = TfidfVectorizer(stop_words='english', max_features=10_000, max_df=0.9)
all_desc_tfidf_matrix = tfidf.fit_transform(data['Description'])

In [168]:
features = tfidf.get_feature_names()
len(features)

10000

In [169]:
all_data_cluster_cats = run_kmeans(all_desc_tfidf_matrix)

In [170]:
all_data_cat_summary = get_cat_summary(all_data_cluster_cats)

In [171]:
all_data_cat_summary

Unnamed: 0,cluster,category,count,pct_total,top words,cluster name
0,0,Informal Campsite,381,54.98,"water, station, potable, free, dump, hose, spi...",service station
6,1,Established Campground,924,82.13,"campground, sites, nice, showers, toilets, nig...",RV park type
10,2,Wild Camping,1259,87.61,"road, spot, forest, spots, dirt, just, nice, s...",pull off camp
14,3,Wild Camping,1032,59.14,"place, night, nice, area, camping, free, quiet...",open camping
21,4,Water,221,60.88,"parking, lot, overnight, quiet, street, night,...",informal


In [172]:
all_data_cat_summary.to_csv('../data/NLPall_data_cat_summary.csv')

In [78]:
# Fit the NMF model
clusters = 6
nmf = NMF(n_components=clusters, max_iter=500, verbose=False, alpha=0.1, l1_ratio=.5)
nmf_fit = nmf.fit(desc_tfidf_matrix)

In [79]:
for topic_idx, topic in enumerate(nmf_fit.components_):
    print(f'"Topic # {topic_idx}:')
    print(" ".join([features[i] for i in topic.argsort()[:-20 - 1:-1]]))

"Topic # 0:
road spot forest spots camping dirt just area gravel right service small great pull camp creek views little flat view
"Topic # 1:
parking lot overnight walmart quiet signs night area big street park stayed large trucks parked 24 free stay noisy rv
"Topic # 2:
park showers rv hot clean tent state wifi laundry site shower hookups bathrooms camping sites 20 pool hook beach 25
"Topic # 3:
night place nice good quiet stay spend highway view near sleep close great little stop spot clean really noisy ok
"Topic # 4:
campground sites pit lake toilets tables picnic river free nice toilet forest rings site national pits table campsites established 10
"Topic # 5:
water station potable dump free gas spigot hose drinking available rest area restrooms tank air building electric gallon propane right


In [80]:
topic_feats = {}
for num, topic in enumerate(nmf_fit.components_):
    topic_feats[num] = " ".join([features[i] for i in topic.argsort()[:-20 - 1:-1]])

In [81]:
assigned_cluster = nmf.fit_transform(desc_tfidf_matrix).argmin(axis=1) # get the cluster assigned to each row (site)

In [83]:
for i in range(nmf_fit.components_.shape[0]): # for each cluster
    cluster = np.arange(0, desc_tfidf_matrix.shape[0])[assigned_cluster==i] # the cluster for the feat (row indices)
    categories = data.iloc[cluster]['Category'] # get the categories from each row (site) in that cluster)
    most_common = Counter(categories).most_common() # count up
    print(f"Cluster {i}:")
    print(f'top features: {topic_feats[i]}\n')
    for j in range (len(most_common)):
        print(f"     {most_common[j][0]} ({most_common[j][1]} site count for this category)")

Cluster 0:
top features: road spot forest spots camping dirt just area gravel right service small great pull camp creek views little flat view

     Established Campground (1284 site count for this category)
     Informal Campsite (935 site count for this category)
     Water (283 site count for this category)
     Wild Camping (255 site count for this category)
     Showers (178 site count for this category)
     Eco-Friendly (8 site count for this category)
     Short-term Parking (6 site count for this category)
Cluster 1:
top features: parking lot overnight walmart quiet signs night area big street park stayed large trucks parked 24 free stay noisy rv

     Wild Camping (2430 site count for this category)
     Established Campground (1583 site count for this category)
     Informal Campsite (276 site count for this category)
     Water (88 site count for this category)
     Showers (46 site count for this category)
     Eco-Friendly (7 site count for this category)
Cluster 2:
top f