In [1]:
# generic imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# notebook settings
# Switch on the `greedy` option of IPython's tab completer.
%config IPCompleter.greedy=True
# Load the autoreload extension; mode 2 re-imports modules before each cell
# runs, so edits to local .py helpers are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2 
# precision and plot settings
# One knob controlling how many decimals numpy and pandas display.
num_precision = 3

# pandas: never elide columns, and render floats with thousands separators.
pd.options.display.max_columns = None
pd.options.display.precision = num_precision
# The formatter reads num_precision when it is called, not when it is defined.
pd.options.display.float_format = lambda value: format(value, f',.{num_precision}f')

# numpy: same precision, and no scientific notation for small values.
np.set_printoptions(suppress=True, precision=num_precision)

# Shared matplotlib look for every figure: colour-blind-safe palette first,
# then size, font and line-width overrides applied in a single update.
plt.style.use('tableau-colorblind10')
plt.rcParams.update({
    'figure.figsize': [10, 6],
    'font.size': 16,
    'legend.fontsize': 'large',
    'figure.titlesize': 'medium',
    'lines.linewidth': 2,
})

In [2]:
# setup dir and import helper functions
import os
import sys

# Make the project's `src` directory importable: take the parent of the first
# sys.path entry and append its `src` sub-directory.
# NOTE(review): this assumes sys.path[0] is the notebook's directory — confirm
# for the kernel in use.
src_dir = os.path.join(os.path.dirname(sys.path[0]), 'src')
# Guard the append so re-running this cell does not pile up duplicate entries.
if src_dir not in sys.path:
    sys.path.append(src_dir)
import helper_funcs as my_funcs

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from collections import Counter

In [4]:
# Load the cleaned campground table as one chained expression (no in-place
# mutation), so `data` always comes from a single, re-runnable statement.
#  * The two 'Unnamed' columns are discarded (presumably index columns left by
#    earlier CSV round-trips — confirm); errors='ignore' keeps the cell working
#    if the file is ever re-saved without them.
#  * Rows with a missing Description are removed, since the text is what gets
#    vectorised later on.
data = (
    pd.read_csv('../data/USdata_est_campgrounds_zip_states_combined_cleaned.csv')
    .drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')
    .dropna(subset=['Description'])
)

In [6]:
# Restrict the analysis to the two categories below; the smaller categories
# may be influencing the clusters too much.
# .copy() makes this an independent frame, so code that later adds columns to
# it does not trigger SettingWithCopyWarning or touch `data`.
data_wild_est = data[
    data['Category'].isin(['Established Campground', 'Wild Camping'])
].copy()

In [7]:
# Sanity check: list the distinct categories that survived the filter.
data_wild_est.loc[:, 'Category'].unique()

array(['Established Campground', 'Wild Camping'], dtype=object)

In [54]:
# Cluster the campsite descriptions: TF-IDF features -> k-means -> summary.
# Vectoriser settings: English stop words removed, terms appearing in more
# than 90% of descriptions ignored, vocabulary capped at 10,000 terms.
tfidf = TfidfVectorizer(stop_words='english', max_features=10_000, max_df=0.9)
all_desc_tfidf_matrix = tfidf.fit_transform(data_wild_est['Description'])
k = 3
# Human-readable labels, one per cluster.
# NOTE(review): these look positional (label i <-> cluster id i); if run_kmeans
# does not fix its random seed the ids can permute between runs — confirm.
# NOTE(review): 'roadsite' is probably a typo for 'roadside'; left unchanged
# because the saved outputs use this spelling.
clust_names = ['roadsite spot', 'wild campground', 'est campground']
# Alternative labels, presumably from an earlier five-cluster run:
# ['service station', 'RV park type', 'pull off camp', 'open camping', 'informal']

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and convert to a list so the helper receives
# the same type as before; fall back to the old method on older releases.
if hasattr(tfidf, 'get_feature_names_out'):
    wild_est_features = list(tfidf.get_feature_names_out())
else:
    wild_est_features = tfidf.get_feature_names()

# NOTE(review): run_kmeans and get_cat_summary are not defined or imported by
# name in any visible cell; presumably they live in helper_funcs (imported as
# my_funcs) — confirm, and qualify the calls so a fresh kernel can run this.
wild_est_data_cluster_cats, wild_est_cluster_feats = run_kmeans(all_desc_tfidf_matrix, data_wild_est, wild_est_features, k)
wild_est_data_cat_summary, wild_est_cat_df = get_cat_summary(wild_est_data_cluster_cats, wild_est_cluster_feats, clust_names)

In [57]:
# Show the terms attached to each cluster id (second value returned by run_kmeans).
wild_est_cluster_feats

{0: 'parking, place, night, quiet, area, lot, nice, camping, park, free, good, overnight, great, near, spot, street, big, just, view, river',
 1: 'road, spot, forest, spots, dirt, nice, just, service, good, small, river, right, camping, quiet, creek, site, camp, gravel, great, little',
 2: 'campground, sites, water, nice, showers, park, night, lake, toilets, site, rv, clean, free, tent, tables, pit, picnic, hot, camping, available'}

In [56]:
# Show the per-cluster summary table (first value returned by get_cat_summary).
wild_est_data_cat_summary

Unnamed: 0,cluster,category,count,pct_total,top words,cluster name
0,0,Wild Camping,2560,73.18,"parking, place, night, quiet, area, lot, nice,...",roadsite spot
2,1,Wild Camping,2428,89.86,"road, spot, forest, spots, dirt, nice, just, s...",wild campground
4,2,Established Campground,2527,94.89,"campground, sites, water, nice, showers, park,...",est campground


In [58]:
# Optional export of the cluster summary table to CSV. Left disabled —
# presumably so that re-running the notebook does not overwrite the saved
# file; uncomment to write it.
#wild_est_data_cat_summary.to_csv('../data/NLPall_data_cat_summary_FINAL.csv')