In [1]:
# generic imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# notebook settings
# Turn on IPython's greedy tab-completion.
%config IPCompleter.greedy=True
# Load the autoreload extension; mode 2 re-imports modules before each cell
# runs, so edits to local helper modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2 
# numeric display precision shared by NumPy and pandas
num_precision = 3


def _format_float(value):
    """Render a float with thousands separators and `num_precision` decimals."""
    # num_precision is looked up at call time, so later changes to it apply.
    return f'{value:,.{num_precision}f}'


np.set_printoptions(suppress=True, precision=num_precision)
for _option, _value in {
    'display.float_format': _format_float,
    'display.precision': num_precision,
    'display.max_columns': None,  # never truncate wide frames
}.items():
    pd.set_option(_option, _value)

# plot appearance: colour-blind-friendly palette plus shared defaults
plt.style.use('tableau-colorblind10')
plt.rcParams.update({
    'figure.figsize': [10, 6],
    'font.size': 16,
    'legend.fontsize': 'large',
    'figure.titlesize': 'medium',
    'lines.linewidth': 2,
})

In [2]:
# setup dir and import helper functions
import os
import sys

# Resolve the sibling 'src' folder from the working directory. The notebook
# already assumes it runs from its own folder (data is read via '../data/...'),
# whereas sys.path[0] may be '' or another entry depending on how the kernel
# was started, which would turn the old expression into a bare relative 'src'.
src_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, 'src'))
# Guard against piling up duplicate entries when the cell is re-run.
if src_dir not in sys.path:
    sys.path.append(src_dir)
import helper_funcs as my_funcs

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from collections import Counter

In [4]:
# Load the combined campground CSV and keep only rows that have a description.
data = (
    pd.read_csv('../data/USdata_est_campgrounds_zip_states_combined.csv')
    # .drop(['Unnamed: 0'], axis=1)
    .dropna(subset=['Description'])
)

In [5]:
# Try with two categories only -- the smaller categories may be influencing
# the clusters too much.
keep_categories = ['Established Campground', 'Wild Camping']
in_keep_categories = data['Category'].isin(keep_categories)
data_wild_est = data[in_keep_categories]

In [6]:
# Sanity check: only the two selected categories should remain after filtering.
data_wild_est['Category'].unique()

array(['Established Campground', 'Wild Camping'], dtype=object)

In [7]:
# Column overview of the filtered frame: non-null counts and dtypes.
data_wild_est.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8863 entries, 0 to 12410
Data columns (total 38 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Location                   1739 non-null   object 
 1   Name                       8863 non-null   object 
 2   Category                   8863 non-null   object 
 3   Description                8863 non-null   object 
 4   Latitude                   8863 non-null   float64
 5   Longitude                  8863 non-null   float64
 6   Altitude                   8637 non-null   float64
 7   Date verified              8863 non-null   object 
 8   Open                       8863 non-null   object 
 9   Electricity                8863 non-null   object 
 10  Wifi                       8863 non-null   object 
 11  Kitchen                    8863 non-null   object 
 12  Parking                    0 non-null      float64
 13  Restaurant                 8863 non-null   obje

In [9]:
# Vectorise the descriptions with TF-IDF: drop English stop words, cap the
# vocabulary at 10,000 terms and ignore terms found in more than 90% of documents.
tfidf = TfidfVectorizer(stop_words='english', max_features=10_000, max_df=0.9)
all_desc_tfidf_matrix = tfidf.fit_transform(data_wild_est['Description'])

k = 3
# Names are attached to clusters by position, so there must be exactly one per cluster.
clust_names = ['roadsite spot', 'wild campground', 'est campground'] #['service station', 'RV park type', 'pull off camp', 'open camping', 'informal']
assert len(clust_names) == k, 'clust_names needs one label per cluster'

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and convert to a list so the helper receives
# the same type as before.
if hasattr(tfidf, 'get_feature_names_out'):
    wild_est_features = tfidf.get_feature_names_out().tolist()
else:
    wild_est_features = tfidf.get_feature_names()

# k-means numbers its clusters arbitrarily, and clust_names relies on that
# numbering. Seeding NumPy's global RNG makes re-runs repeatable when the
# helper builds KMeans without an explicit random_state.
# NOTE(review): run_kmeans lives in helper_funcs and is not visible here --
# confirm how it seeds KMeans, and re-check that clust_names still matches the
# clusters' top words after any change of seed.
KMEANS_SEED = 42
np.random.seed(KMEANS_SEED)
wild_est_data_cluster_cats, wild_est_cluster_feats = my_funcs.run_kmeans(all_desc_tfidf_matrix, data_wild_est, wild_est_features, k)
# NOTE(review): in the recorded output, cluster 0's top words
# ('campground, sites, water, showers, ...') sit next to a ~90% 'Wild Camping'
# membership -- verify that cluster ids line up between the two helper outputs.
wild_est_data_cat_summary, wild_est_cat_df = my_funcs.get_cat_summary(wild_est_data_cluster_cats, wild_est_cluster_feats, clust_names)

In [10]:
# Top terms per cluster id, as returned by my_funcs.run_kmeans.
wild_est_cluster_feats

{0: 'campground, sites, water, showers, nice, night, park, toilets, lake, site, clean, rv, free, tent, tables, pit, hot, picnic, 20, available',
 1: 'road, spot, forest, dirt, spots, nice, just, good, small, service, quiet, river, right, creek, gravel, camping, little, site, great, camp',
 2: 'parking, place, night, camping, quiet, area, park, nice, free, lot, good, overnight, great, near, spot, street, big, river, just, lake'}

In [11]:
# One row per cluster: a category (appears to be the largest one in the cluster),
# its count and pct_total, the cluster's top words and the assigned cluster name.
wild_est_data_cat_summary

Unnamed: 0,cluster,category,count,pct_total,top words,cluster name
0,0,Wild Camping,2539,89.91,"campground, sites, water, showers, nice, night...",roadsite spot
2,1,Established Campground,2568,94.66,"road, spot, forest, dirt, spots, nice, just, g...",wild campground
4,2,Wild Camping,2440,73.36,"parking, place, night, camping, quiet, area, p...",est campground


In [12]:
# Full cluster x category breakdown: count and pct_total for every pair.
wild_est_cat_df

Unnamed: 0,cluster,category,count,pct_total
0,0,Wild Camping,2539,89.91
1,0,Established Campground,285,10.09
2,1,Established Campground,2568,94.66
3,1,Wild Camping,145,5.34
4,2,Wild Camping,2440,73.36
5,2,Established Campground,886,26.64


In [15]:
# Rows per category, counted on the non-null 'Name' column.
data_wild_est.groupby('Category')['Name'].count()

Category
Established Campground    3739
Wild Camping              5124
Name: Name, dtype: int64

In [58]:
#wild_est_data_cat_summary.to_csv('../data/NLPall_data_cat_summary_FINAL.csv')