In [1]:
# generic imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# notebook settings
%config IPCompleter.greedy=True
%load_ext autoreload
%autoreload 2 
# precision and plot settings
num_precision = 3
np.set_printoptions(precision=num_precision, suppress=True)
pd.set_option('display.float_format', lambda x: f'{x:,.{num_precision}f}')
pd.set_option("display.precision", num_precision)
pd.set_option('display.max_columns', None)

plt.style.use('tableau-colorblind10')
plt.rcParams['figure.figsize'] = [10, 6]
plt.rcParams['font.size'] = 16
plt.rcParams['legend.fontsize'] = 'large'
plt.rcParams['figure.titlesize'] = 'medium'
plt.rcParams['lines.linewidth'] = 2

In [2]:
# setup dir and import helper functions
import sys, os
sys.path.append(os.path.join(os.path.dirname(sys.path[0]),'src'))
import helper_funcs as my_funcs

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from collections import Counter

In [4]:
# use the data that I have aligned with the images so it will be easier to update labels
# the images are here: unique_wild_est_for_aligned_model
data = pd.read_csv('../data/image_file_df_final_with_df_NO_DUPS.csv')
data.drop(['Unnamed: 0'], axis=1, inplace=True)

In [5]:
no_desc = data[data['Description'].isna()]

In [6]:
no_desc['filename'].to_list()

['satimg_WA_734_Established Campground_17_48.5126_-122.61267']

In [7]:
# remove one file without description
!rm /Users/pault/Desktop/github/CampsitePredict/data/symlink_data/unique_wild_est_for_aligned_model/Established\ Campground/satimg_WA_734_Established\ Campground_17_48.5126_-122.61267.png

rm: /Users/pault/Desktop/github/CampsitePredict/data/symlink_data/unique_wild_est_for_aligned_model/Established Campground/satimg_WA_734_Established Campground_17_48.5126_-122.61267.png: No such file or directory


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7835 entries, 0 to 7834
Data columns (total 47 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   filename                   7835 non-null   object 
 1   lat_from_file              7835 non-null   float64
 2   long_from_file             7835 non-null   float64
 3   order                      7835 non-null   int64  
 4   latlong_test               7835 non-null   object 
 5   Location                   1642 non-null   object 
 6   Name                       7835 non-null   object 
 7   Category                   7835 non-null   object 
 8   Description                7834 non-null   object 
 9   Latitude                   7835 non-null   float64
 10  Longitude                  7835 non-null   float64
 11  Altitude                   7649 non-null   float64
 12  Date verified              7835 non-null   object 
 13  Open                       7835 non-null   objec

In [9]:
data = data[~data['Description'].isna()]

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7834 entries, 0 to 7834
Data columns (total 47 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   filename                   7834 non-null   object 
 1   lat_from_file              7834 non-null   float64
 2   long_from_file             7834 non-null   float64
 3   order                      7834 non-null   int64  
 4   latlong_test               7834 non-null   object 
 5   Location                   1641 non-null   object 
 6   Name                       7834 non-null   object 
 7   Category                   7834 non-null   object 
 8   Description                7834 non-null   object 
 9   Latitude                   7834 non-null   float64
 10  Longitude                  7834 non-null   float64
 11  Altitude                   7649 non-null   float64
 12  Date verified              7834 non-null   object 
 13  Open                       7834 non-null   objec

In [11]:
data['Category'].unique()

array(['Wild Camping', 'Established Campground'], dtype=object)

In [12]:
# add words to stopwords
from sklearn.feature_extraction import text 
my_additional_stop_words = ['free', 'nice', 'good', 'spot', 'area']
stop_words = text.ENGLISH_STOP_WORDS.union(my_additional_stop_words)

In [13]:
tfidf = TfidfVectorizer(stop_words=stop_words, max_features=10_000, max_df=0.9)
all_desc_tfidf_matrix = tfidf.fit_transform(data['Description'])
k = 3
wild_est_features = tfidf.get_feature_names()
wild_est_data_cluster_cats, wild_est_cluster_feats, site_clusters = my_funcs.run_kmeans(all_desc_tfidf_matrix, data, wild_est_features, k, seed=42, provide_clusters=True)

In [14]:
wild_est_cluster_feats

{0: 'road, forest, dirt, spots, just, quiet, service, creek, river, right, small, camping, gravel, pull, little, site, camp, view, national, great',
 1: 'campground, sites, water, showers, park, night, toilets, lake, rv, site, clean, tent, tables, pit, hot, picnic, available, 20, river, camping',
 2: 'parking, place, quiet, night, camping, lot, park, great, overnight, river, big, near, street, beautiful, view, large, highway, just, lake, lots'}

In [15]:
clust_names = ['wild_camp', 'est_camp', 'road_spot'] #['service station', 'RV park type', 'pull off camp', 'open camping', 'informal']
wild_est_data_cat_summary, wild_est_cat_df = my_funcs.get_cat_summary(wild_est_data_cluster_cats, wild_est_cluster_feats, clust_names)

In [16]:
wild_est_data_cat_summary

Unnamed: 0,cluster,category,count,pct_total,top words,cluster name
0,0,Wild Camping,1948,88.38,"road, forest, dirt, spots, just, quiet, servic...",wild_camp
2,1,Established Campground,2301,94.73,"campground, sites, water, showers, park, night...",est_camp
4,2,Wild Camping,2335,72.95,"parking, place, quiet, night, camping, lot, pa...",road_spot


In [17]:
wild_est_cat_df

Unnamed: 0,cluster,category,count,pct_total
0,0,Wild Camping,1948,88.38
1,0,Established Campground,256,11.62
2,1,Established Campground,2301,94.73
3,1,Wild Camping,128,5.27
4,2,Wild Camping,2335,72.95
5,2,Established Campground,866,27.05


In [18]:
data.groupby(by='Category').agg('count')['Name']

Category
Established Campground    3423
Wild Camping              4411
Name: Name, dtype: int64

In [19]:
# assign each site (row) with the cluster label
data_clusters = data.copy()
data_clusters['cluster_label'] = site_clusters

In [20]:
for i, label in enumerate(clust_names):
    data_clusters['cluster_label'].replace(i, label, inplace=True)

In [21]:
data_clusters.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7834 entries, 0 to 7834
Data columns (total 48 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   filename                   7834 non-null   object 
 1   lat_from_file              7834 non-null   float64
 2   long_from_file             7834 non-null   float64
 3   order                      7834 non-null   int64  
 4   latlong_test               7834 non-null   object 
 5   Location                   1641 non-null   object 
 6   Name                       7834 non-null   object 
 7   Category                   7834 non-null   object 
 8   Description                7834 non-null   object 
 9   Latitude                   7834 non-null   float64
 10  Longitude                  7834 non-null   float64
 11  Altitude                   7649 non-null   float64
 12  Date verified              7834 non-null   object 
 13  Open                       7834 non-null   objec

In [22]:
data_clusters['filename'] = data_clusters['filename'] + '.png'

In [33]:
data_clusters.iloc[0,0]

'satimg_CO__352_Wild Camping_17_38.98102_-107.32651.png'

In [24]:
# create symlink dirs of these images with the clusters as labels

In [28]:
directory = '/Users/pault/Desktop/github/CampsitePredict/data/sat_images/'
destination = '/Users/pault/Desktop/github/CampsitePredict/data/symlink_data/'
dest_dir_name = 'wild_est_NLP_labels'
class_dirs = ['Wild Camping', 'Established Campground']

In [None]:
if 'satimg_CO__352_Wild Camping_17_38.98102_-107.32651.png' in filenames_dict.haskey():
    print('yes')

In [69]:
def make_symlinks_file_lists_new_dirs(directory, destination, dest_dir_name, class_dirs, df):
    counter = 0
    filedict = {}
    # make list of files with name and path in dict
    for root_path, dirs, files in os.walk(directory, followlinks=False):
        for file in files:
            if file.endswith(".png"):
                filedict[file] = str(os.path.join(root_path, file))
    # create symlink dir
    symlink_dir_path = os.path.join(destination + dest_dir_name)
#     print(symlink_dir_path)
    if not os.path.isdir(symlink_dir_path):
            os.makedirs(symlink_dir_path)
    # now go through files
    # get filenames and labels to copy
    filenames_dict = dict(zip(df['filename'], df['cluster_label']))
    test_list = []
    for file, filepath in filedict.items():
        # setup class directory name to check if it is a category we want to copy
#         parent = os.path.basename(os.path.dirname(os.path.dirname(filepath)))
#         print(parent)
        subdir = os.path.basename(os.path.dirname(filepath))
#         print(subdir)
#         fullparent = os.path.join(sobel_dir + os.sep + parent + os.sep + subdir)
        
        # only copy files if in directories we want
        if subdir in class_dirs:
#             print(type(file))
            
            if file in filenames_dict:
                test_list.append(file)
    #             print(subdir)
                # create symlink
#                 print(filepath)
                # use the filename / label dictionary to determine what the new directory will be called
                destination_filepath = os.path.join(destination + dest_dir_name + os.sep + filenames_dict[file] + os.sep + file)
#                 print(destination_filepath)
                # create class dir if it doesn't exist
                destination_class_dir = os.path.join(destination + dest_dir_name + os.sep + filenames_dict[file] + os.sep)
    #             print(destination_class_dir)
                if not os.path.isdir(destination_class_dir):
                    os.makedirs(destination_class_dir)
                # create destination filepath
                os.symlink(filepath, destination_filepath, target_is_directory=False)
                # ln -s ~/source/* wild_est_after_exc/Established\ Campground/
                counter += 1
    print(f'{counter} files were created as symlinks.')
    return test_list

In [70]:
file_dict = make_symlinks_file_lists_new_dirs(directory, destination, dest_dir_name, class_dirs, data_clusters)

7834 files were created as symlinks.


In [71]:
# while we are doing this, we might as well try wild vs not wild.. so make binary labels
data_clusters_binary = data_clusters.copy()

In [73]:
data_clusters_binary['cluster_label'].replace('road_spot', 'est_camp', inplace=True)

In [76]:
directory = '/Users/pault/Desktop/github/CampsitePredict/data/sat_images/'
destination = '/Users/pault/Desktop/github/CampsitePredict/data/symlink_data/'
dest_dir_name = 'wild_est_NLP_binary'
class_dirs = ['Wild Camping', 'Established Campground']

In [77]:
file_dict = make_symlinks_file_lists_new_dirs(directory, destination, dest_dir_name, class_dirs, data_clusters_binary)

7834 files were created as symlinks.
