In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
#%matplotlib inline
pd.options.display.float_format = '{:,.2f}'.format
plt.style.use('seaborn-white')
# colorblind safe
plt.style.use('seaborn-colorblind')
plt.style.use('tableau-colorblind10')

# width and precision for f strings
width = 10
precision = 4

# default sizes for plots
# https://matplotlib.org/3.3.0/tutorials/introductory/customizing.html#customizing-with-matplotlibrc-files
plt.rcParams['figure.figsize'] = [10, 6]
plt.rcParams['font.size'] = 16
plt.rcParams['legend.fontsize'] = 'large'
plt.rcParams['figure.titlesize'] = 'medium'
plt.rcParams['lines.linewidth'] = 2

# other settings
pd.options.display.float_format = '{:,.4f}'.format
pd.set_option("display.precision", 3)
np.set_printoptions(precision=3, suppress=True)
%load_ext autoreload
%autoreload 2
pd.set_option('display.max_columns', None)
%config IPCompleter.greedy=True

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [3]:
# Read the combined, cleaned site table and remove the two 'Unnamed' columns
# (presumably index columns left over from earlier CSV exports — verify).
data = (
    pd.read_csv('../data/USdata_est_campgrounds_zip_states_combined_cleaned.csv')
    .drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])
)

In [4]:
# Print the column dtypes and non-null counts of the freshly loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12412 entries, 0 to 12411
Data columns (total 38 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Location                   2241 non-null   object 
 1   Name                       12412 non-null  object 
 2   Category                   12412 non-null  object 
 3   Description                12411 non-null  object 
 4   Latitude                   12412 non-null  float64
 5   Longitude                  12412 non-null  float64
 6   Altitude                   12103 non-null  float64
 7   Date verified              12412 non-null  object 
 8   Open                       12412 non-null  object 
 9   Electricity                11609 non-null  object 
 10  Wifi                       11609 non-null  object 
 11  Kitchen                    11609 non-null  object 
 12  Parking                    0 non-null      float64
 13  Restaurant                 11609 non-null  obj

In [5]:
# Number of non-null descriptions recorded for each state.
data.groupby('State')['Description'].count()

State
AB     10
AK    452
AL     89
AR     96
AZ    886
     ... 
WA    692
WI     83
WV     39
WY    298
YT      6
Name: Description, Length: 69, dtype: int64

In [7]:
# Number of non-null descriptions recorded for each site category.
data.groupby('Category')['Description'].count()

Category
Eco-Friendly                21
Established Campground    3739
Informal Campsite         2745
Short-term Parking          28
Showers                    264
Water                      490
Wild Camping              5124
Name: Description, dtype: int64

In [8]:
# Keep only rows that have a description; the text vectoriser cannot take NaN.
has_description = data['Description'].notna()
data = data[has_description]

In [10]:
# Re-print the frame summary to confirm the rows without a Description are gone.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12411 entries, 0 to 12411
Data columns (total 38 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Location                   2240 non-null   object 
 1   Name                       12411 non-null  object 
 2   Category                   12411 non-null  object 
 3   Description                12411 non-null  object 
 4   Latitude                   12411 non-null  float64
 5   Longitude                  12411 non-null  float64
 6   Altitude                   12103 non-null  float64
 7   Date verified              12411 non-null  object 
 8   Open                       12411 non-null  object 
 9   Electricity                11608 non-null  object 
 10  Wifi                       11608 non-null  object 
 11  Kitchen                    11608 non-null  object 
 12  Parking                    0 non-null      float64
 13  Restaurant                 11608 non-null  obj

In [33]:
# Turn each site description into a TF-IDF vector: English stop words are
# removed and the vocabulary is capped at 10,000 terms.
tfidf = TfidfVectorizer(
    max_features=10_000,
    stop_words='english',
)
# Learn the vocabulary and build the document-term matrix in one step.
desc_tfidf_matrix = tfidf.fit_transform(data['Description'])

In [28]:
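# desc_tfidf_matrix.todense()  # converts the sparse TF-IDF matrix into a dense numpy matrix (one cell per document/term pair); left disabled because that is memory-heavy at this size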

In [34]:
# Vocabulary terms ordered by their column index in desc_tfidf_matrix
# (tfidf.vocabulary_.keys() would give the same terms but not in column order).
# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed the old
# get_feature_names(), so prefer the new accessor and fall back for older
# installations. The result is converted to a list so it has the same type
# under either version.
if hasattr(tfidf, 'get_feature_names_out'):
    features = list(tfidf.get_feature_names_out())
else:
    features = list(tfidf.get_feature_names())
len(features)

10000

In [48]:
# Cluster the TF-IDF description vectors into 8 groups with k-means.
# random_state pins the centroid initialisation so cluster numbering and
# membership are the same after every kernel restart; n_init is spelled out
# because its default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=8, n_init=10, random_state=42)
# fit_predict both fits the model and returns each row's cluster label, so a
# separate fit() call beforehand would only repeat the work.
y = kmeans.fit_predict(desc_tfidf_matrix)

# For each centroid: column indices of its 20 largest weights, largest first.
top_centroids = kmeans.cluster_centers_.argsort()[:,-1:-21:-1]
print("\ntop 20 features for each cluster with 10,000 max features:\n")
# cluster index -> comma-separated string of that cluster's top terms
cluster_feats = {}
for num, centroid in enumerate(top_centroids):
    cluster_feats[num] = ', '.join(features[i] for i in centroid)
    print(f"{num}: {cluster_feats[num]}")


top 20 features for each cluster with 10,000 max features:

0: water, station, potable, dump, free, spigot, hose, gas, park, drinking, rv, available, gallon, parking, area, air, building, right, rest, inside
1: showers, hot, clean, park, shower, laundry, nice, rv, campground, wifi, water, night, pool, site, tent, hookups, bathrooms, free, sites, available
2: place, nice, night, quiet, good, road, great, stay, near, free, overnight, view, big, just, river, sleep, small, park, rv, highway
3: camping, area, spot, night, nice, park, quiet, free, great, river, good, site, beautiful, large, just, camp, highway, big, spots, small
4: parking, lot, overnight, walmart, night, quiet, park, street, signs, stayed, big, area, rv, free, parked, near, good, 24, stay, large
5: road, spot, forest, dirt, spots, just, service, nice, quiet, gravel, good, creek, camping, little, small, pull, right, site, great, camp
6: tables, picnic, pit, toilets, free, area, water, toilet, campground, nice, sites, rings,

In [42]:
# Display the cluster-index -> top-feature-string mapping built by the k-means cell.
# NOTE(review): the saved output below lists only clusters 0-5 although the model
# is created with 8 clusters, so it appears to come from an earlier run — re-run to refresh.
cluster_feats

{0: 'water, station, potable, dump, free, spigot, hose, gas, park, area, rv, drinking, available, parking, gallon, nebraska, rest, picnic, nice, air',
 1: 'campground, sites, pit, toilets, tables, lake, picnic, nice, water, free, river, forest, night, site, toilet, rings, national, small, quiet, camping',
 2: 'parking, lot, overnight, walmart, night, quiet, park, signs, street, big, area, rv, stayed, free, good, near, parked, stay, large, 24',
 3: 'showers, park, hot, clean, rv, campground, nice, sites, shower, night, tent, laundry, water, site, wifi, hookups, pool, bathrooms, state, 20',
 4: 'road, spot, forest, dirt, spots, nice, just, quiet, good, gravel, camping, service, small, creek, little, right, pull, site, camp, river',
 5: 'place, night, area, nice, camping, quiet, spot, good, great, free, park, river, highway, beautiful, just, big, view, near, large, camp'}

In [49]:
# Count how many sites of each Category fall into each k-means cluster.
from collections import Counter

# transform() gives every row's distance to each centroid; the argmin over
# centroids is therefore the index of the nearest cluster for that row.
assigned_cluster = kmeans.transform(desc_tfidf_matrix).argmin(axis=1)

# One entry per clustered description. The row count is taken from the matrix
# that was clustered, so the cell does not rely on an `X` left over from
# another session.
n_rows = desc_tfidf_matrix.shape[0]

for i in range(kmeans.n_clusters):
    # Positional row numbers of the members of cluster i. They are positions,
    # not index labels, hence .iloc below.
    cluster = np.arange(0, n_rows)[assigned_cluster==i]
    categories = data.iloc[cluster]['Category']
    most_common = Counter(categories).most_common()
    print(f"Cluster {i}:")
    print(f'top features: {cluster_feats[i]}\n')
    # NOTE(review): the number printed is a count of sites, although the
    # message text labels it "categories".
    for category, n_sites in most_common:
        print(f"     {category} ({n_sites} categories)")

Cluster 0:
top features: water, station, potable, dump, free, spigot, hose, gas, park, drinking, rv, available, gallon, parking, area, air, building, right, rest, inside

     Water (398 categories)
     Established Campground (139 categories)
     Informal Campsite (95 categories)
     Wild Camping (32 categories)
     Showers (11 categories)
     Eco-Friendly (1 categories)
Cluster 1:
top features: showers, hot, clean, park, shower, laundry, nice, rv, campground, wifi, water, night, pool, site, tent, hookups, bathrooms, free, sites, available

     Established Campground (728 categories)
     Showers (210 categories)
     Informal Campsite (37 categories)
     Wild Camping (10 categories)
     Water (2 categories)
     Short-term Parking (1 categories)
Cluster 2:
top features: place, nice, night, quiet, good, road, great, stay, near, free, overnight, view, big, just, river, sleep, small, park, rv, highway

     Wild Camping (392 categories)
     Informal Campsite (180 categories)
   