In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
%matplotlib inline
pd.options.display.float_format = '{:,.2f}'.format
plt.style.use('seaborn-white')
# colorblind safe
plt.style.use('seaborn-colorblind')
plt.style.use('tableau-colorblind10')

# width and precision for f strings
width = 10
precision = 4

# default sizes for plots
# https://matplotlib.org/3.3.0/tutorials/introductory/customizing.html#customizing-with-matplotlibrc-files
plt.rcParams['figure.figsize'] = [10, 6]
plt.rcParams['font.size'] = 16
plt.rcParams['legend.fontsize'] = 'large'
plt.rcParams['figure.titlesize'] = 'medium'
plt.rcParams['lines.linewidth'] = 2

# other settings
pd.options.display.float_format = '{:,.4f}'.format
pd.set_option("display.precision", 3)
np.set_printoptions(precision=3, suppress=True)
%load_ext autoreload
%autoreload 2
pd.set_option('display.max_columns', None)
%config IPCompleter.greedy=True

In [2]:
# setup dir and import helper functions
import sys, os
sys.path.append(os.path.join(os.path.dirname(sys.path[0]),'src'))
import helper_funcs as my_funcs
import re

In [3]:
names = ['CA', 'AZ', 'CO', 'OR', 'UT', 'WA', 'TX', 'FL', 'AK', 'MT', 'NM', 'ID', 'WY', 'NV', 'NY', 'WV', 'MD', 'VA', 'NC', 'TN', 'MI', 'SD', 'LA', 'GA', 'ME', 'PA', 'NE', 'MN', 'AR', 'MS', 'OK', 'AL', 'KS', 'SC', 'WI']

In [4]:
states = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA", 
          "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", 
          "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", 
          "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", 
          "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]

In [5]:
names = states

In [6]:
len(names)

51

In [7]:
# filter US data for this - this is what I'm calling "all US data"
# forgot that I only pulled closed from all 50 states
USdata = pd.read_csv('../data/USdata_all_zip_states_combined_cleaned_NEW.csv')

In [8]:
list(USdata['Category'].unique())

['Established Campground',
 'Wild Camping',
 'Informal Campsite',
 'Showers',
 'Water',
 'Short-term Parking',
 'Eco-Friendly']

In [9]:
USdata['State'].unique()

array(['CA', 'UT', 'AK', 'OR', 'WA', 'WV', 'MD', 'MT', 'AZ', 'SE', 'NV',
       'CO', 'NM', 'TX', 'TN', 'KY', 'FL', 'ME', 'NC', 'VA', 'IL', 'SD',
       'NE', 'OH', 'WY', 'AR', 'MI', 'ID', 'MS', 'OK', 'GA', 'AL', 'LA',
       'SC', 'NY', 'VT', 'KS', 'MO', 'RV', 'US', 'NW', 'PA', 'WI', 'MA',
       'MN', 'NJ', 'SW', 'ND', 'IN', 'HI', 'IA', 'AB', 'NH', 'YT', 'DE',
       'CT', 'BC', 'RI', 'N.L.', 'NB', nan, 'DC', 'ON', 'QC', 'S.L.P.',
       'Ver.', 'Nay.', 'NS', 'B.C.', 'Pue.'], dtype=object)

In [10]:
All_USdata = USdata[USdata['State'].isin(names)]

In [11]:
All_USdata['State'].unique()

array(['CA', 'UT', 'AK', 'OR', 'WA', 'WV', 'MD', 'MT', 'AZ', 'NV', 'CO',
       'NM', 'TX', 'TN', 'KY', 'FL', 'ME', 'NC', 'VA', 'IL', 'SD', 'NE',
       'OH', 'WY', 'AR', 'MI', 'ID', 'MS', 'OK', 'GA', 'AL', 'LA', 'SC',
       'NY', 'VT', 'KS', 'MO', 'PA', 'WI', 'MA', 'MN', 'NJ', 'ND', 'IN',
       'HI', 'IA', 'NH', 'DE', 'CT', 'RI', 'DC'], dtype=object)

In [12]:
All_USdata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11786 entries, 0 to 12411
Data columns (total 38 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Location                   2212 non-null   object 
 1   Name                       11786 non-null  object 
 2   Category                   11786 non-null  object 
 3   Description                11785 non-null  object 
 4   Latitude                   11786 non-null  float64
 5   Longitude                  11786 non-null  float64
 6   Altitude                   11490 non-null  float64
 7   Date verified              11786 non-null  object 
 8   Open                       11786 non-null  object 
 9   Electricity                11007 non-null  object 
 10  Wifi                       11007 non-null  object 
 11  Kitchen                    11007 non-null  object 
 12  Parking                    0 non-null      float64
 13  Restaurant                 11007 non-null  obj

In [13]:
All_USdata.head()

Unnamed: 0,Location,Name,Category,Description,Latitude,Longitude,Altitude,Date verified,Open,Electricity,Wifi,Kitchen,Parking,Restaurant,Showers,Water,Toilets,Big rig friendly,Tent friendly,Pet friendly,Sanitation dump station,Outdoor gear,Groceries,Artesian goods,Bakery,Rarity in this area,Repairs vehicles,Repairs motorcycles,Repairs bicycles,Sells parts,Recycles batteries,Recycles oil,Bio fuel,Electric vehicle charging,Composting sawdust,Recycling center,zip_code,State
0,"Borrego Salton Seaway, Borrego Springs, CA 920...",Arroyo Salado Camping,Established Campground,"Free with two vault toilets, nothing else exis...",33.2802,-116.1458,0.0,2020-02-16 14:20:45 UTC,Yes,No,No,No,,No,No,No,Pit Toilets,Yes,Yes,Yes,Unknown,,,,,,,,,,,,,,,,,CA
1,"Stateline Campground Rd, Kanab, UT 84741, USA",State Line Campground,Established Campground,Cute free BLM camping in near TH baths.,37.0013,-112.0356,0.0,2019-09-29 09:53:41 UTC,Yes,No,No,No,,No,No,No,Pit Toilets,No,Yes,Yes,Unknown,,,,,,,,,,,,,,,,,UT
2,"Glenn Hwy, Glennallen, AK 99588, USA",Tolsona River RV Park and Campground,Established Campground,An old standby for us. Multiple sites for lar...,62.0954,-145.9805,0.0,2016-05-27 11:38:27 UTC,Yes,Unknown,Unknown,Unknown,,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,,,,,,,,,,,,,,,,,AK
3,"Beverly Beach State Park, Newport, OR 97365, USA",Beverly Beach State Park,Established Campground,This is a great beach for a first night on the...,44.7288,-124.0555,0.0,2020-02-07 00:00:00 UTC,Yes,Yes - At Sites,No,No,,No,Hot,Potable,Running Water,Yes,Yes,Yes,Unknown,,,,,,,,,,,,,,,,,OR
4,"Cottell Ln, Coos Bay, OR 97420, USA",Sunset Bay State Park,Established Campground,"Another night, another campground. Full hooku...",43.3308,-124.3707,2.6656,2020-06-27 00:00:00 UTC,Yes,Yes - At Sites,No,No,,No,Hot,Potable,Running Water,Yes,Yes,Yes,Unknown,,,,,,,,,,,,,,,,,OR


In [14]:
# look at potential binary columns for wild & est first
All_USdata_est_wild = All_USdata[All_USdata['Category'].isin(['Established Campground','Wild Camping'])]

In [15]:
All_USdata_est_wild.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8325 entries, 0 to 12410
Data columns (total 38 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Location                   1720 non-null   object 
 1   Name                       8325 non-null   object 
 2   Category                   8325 non-null   object 
 3   Description                8324 non-null   object 
 4   Latitude                   8325 non-null   float64
 5   Longitude                  8325 non-null   float64
 6   Altitude                   8108 non-null   float64
 7   Date verified              8325 non-null   object 
 8   Open                       8325 non-null   object 
 9   Electricity                8325 non-null   object 
 10  Wifi                       8325 non-null   object 
 11  Kitchen                    8325 non-null   object 
 12  Parking                    0 non-null      float64
 13  Restaurant                 8325 non-null   obje

In [16]:
All_USdata_est_wild['Toilets'].unique()

array(['Pit Toilets', 'Unknown', 'Running Water', 'No', 'Yes'],
      dtype=object)

In [17]:
All_USdata_est_wild.loc[:,['Category', 'Description']].groupby(['Category']).agg('count')

Unnamed: 0_level_0,Description
Category,Unnamed: 1_level_1
Established Campground,3713
Wild Camping,4611


In [18]:
cat_counts = All_USdata_est_wild.loc[:,['Category', 'Description']].groupby(['Category']).agg(
    description_pct = ('Description','count'))
cat_counts
total = cat_counts['description_pct'].sum()
description_pct = cat_counts.groupby('description_pct').apply(lambda x: 100 * x / total)
description_pct

Unnamed: 0_level_0,description_pct
Category,Unnamed: 1_level_1
Established Campground,44.606
Wild Camping,55.394


In [19]:
All_USdata_est_wild.loc[:,['Category', 'Description', 'Toilets']].groupby(['Toilets', 'Category']).agg('count')

Unnamed: 0_level_0,Unnamed: 1_level_0,Description
Toilets,Category,Unnamed: 2_level_1
No,Established Campground,255
No,Wild Camping,3960
Pit Toilets,Established Campground,1317
Pit Toilets,Wild Camping,394
Running Water,Established Campground,2002
Running Water,Wild Camping,165
Unknown,Established Campground,88
Unknown,Wild Camping,80
Yes,Established Campground,51
Yes,Wild Camping,12


In [20]:
All_USdata_est_wild.loc[:,['Category', 'Description', 'Electricity']].groupby(['Electricity', 'Category']).agg('count')

Unnamed: 0_level_0,Unnamed: 1_level_0,Description
Electricity,Category,Unnamed: 2_level_1
No,Established Campground,1662
No,Wild Camping,4520
Unknown,Established Campground,104
Unknown,Wild Camping,47
Yes,Established Campground,50
Yes - At Sites,Established Campground,1713
Yes - At Sites,Wild Camping,21
Yes - Not at Sites,Established Campground,184
Yes - Not at Sites,Wild Camping,23


In [21]:
All_USdata_est_wild.loc[:,['Category', 'Description', 'Water']].groupby(['Water', 'Category']).agg('count')

Unnamed: 0_level_0,Unnamed: 1_level_0,Description
Water,Category,Unnamed: 2_level_1
Natural Source,Established Campground,337
Natural Source,Wild Camping,792
No,Established Campground,617
No,Wild Camping,3491
Non-Potable,Established Campground,161
Non-Potable,Wild Camping,64
Potable,Established Campground,2125
Potable,Wild Camping,71
Unknown,Established Campground,166
Unknown,Wild Camping,118


In [22]:
# for water, toilets, electricity, make binary cols for each - these seem interesting and don't split with wild/est
All_USdata_est_wild['water_binary'] = 0
yes = ['Potable', 'Yes']
# df['color'] = np.where(df['Set']=='Z', 'green', 'red')
All_USdata_est_wild['water_binary'][All_USdata_est_wild['Water'].isin(yes)] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(~key, value, inplace=True)


In [23]:
All_USdata_est_wild.head()

Unnamed: 0,Location,Name,Category,Description,Latitude,Longitude,Altitude,Date verified,Open,Electricity,Wifi,Kitchen,Parking,Restaurant,Showers,Water,Toilets,Big rig friendly,Tent friendly,Pet friendly,Sanitation dump station,Outdoor gear,Groceries,Artesian goods,Bakery,Rarity in this area,Repairs vehicles,Repairs motorcycles,Repairs bicycles,Sells parts,Recycles batteries,Recycles oil,Bio fuel,Electric vehicle charging,Composting sawdust,Recycling center,zip_code,State,water_binary
0,"Borrego Salton Seaway, Borrego Springs, CA 920...",Arroyo Salado Camping,Established Campground,"Free with two vault toilets, nothing else exis...",33.2802,-116.1458,0.0,2020-02-16 14:20:45 UTC,Yes,No,No,No,,No,No,No,Pit Toilets,Yes,Yes,Yes,Unknown,,,,,,,,,,,,,,,,,CA,0
1,"Stateline Campground Rd, Kanab, UT 84741, USA",State Line Campground,Established Campground,Cute free BLM camping in near TH baths.,37.0013,-112.0356,0.0,2019-09-29 09:53:41 UTC,Yes,No,No,No,,No,No,No,Pit Toilets,No,Yes,Yes,Unknown,,,,,,,,,,,,,,,,,UT,0
2,"Glenn Hwy, Glennallen, AK 99588, USA",Tolsona River RV Park and Campground,Established Campground,An old standby for us. Multiple sites for lar...,62.0954,-145.9805,0.0,2016-05-27 11:38:27 UTC,Yes,Unknown,Unknown,Unknown,,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,,,,,,,,,,,,,,,,,AK,0
3,"Beverly Beach State Park, Newport, OR 97365, USA",Beverly Beach State Park,Established Campground,This is a great beach for a first night on the...,44.7288,-124.0555,0.0,2020-02-07 00:00:00 UTC,Yes,Yes - At Sites,No,No,,No,Hot,Potable,Running Water,Yes,Yes,Yes,Unknown,,,,,,,,,,,,,,,,,OR,1
4,"Cottell Ln, Coos Bay, OR 97420, USA",Sunset Bay State Park,Established Campground,"Another night, another campground. Full hooku...",43.3308,-124.3707,2.6656,2020-06-27 00:00:00 UTC,Yes,Yes - At Sites,No,No,,No,Hot,Potable,Running Water,Yes,Yes,Yes,Unknown,,,,,,,,,,,,,,,,,OR,1


In [24]:
All_USdata_est_wild['toilets_binary'] = 0
yes = ['Pit Toilets', 'Yes', 'Running Water']
# df['color'] = np.where(df['Set']=='Z', 'green', 'red')
All_USdata_est_wild['toilets_binary'][All_USdata_est_wild['Toilets'].isin(yes)] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(~key, value, inplace=True)


In [25]:
All_USdata_est_wild.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8325 entries, 0 to 12410
Data columns (total 40 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Location                   1720 non-null   object 
 1   Name                       8325 non-null   object 
 2   Category                   8325 non-null   object 
 3   Description                8324 non-null   object 
 4   Latitude                   8325 non-null   float64
 5   Longitude                  8325 non-null   float64
 6   Altitude                   8108 non-null   float64
 7   Date verified              8325 non-null   object 
 8   Open                       8325 non-null   object 
 9   Electricity                8325 non-null   object 
 10  Wifi                       8325 non-null   object 
 11  Kitchen                    8325 non-null   object 
 12  Parking                    0 non-null      float64
 13  Restaurant                 8325 non-null   obje

In [26]:
All_USdata_est_wild

Unnamed: 0,Location,Name,Category,Description,Latitude,Longitude,Altitude,Date verified,Open,Electricity,Wifi,Kitchen,Parking,Restaurant,Showers,Water,Toilets,Big rig friendly,Tent friendly,Pet friendly,Sanitation dump station,Outdoor gear,Groceries,Artesian goods,Bakery,Rarity in this area,Repairs vehicles,Repairs motorcycles,Repairs bicycles,Sells parts,Recycles batteries,Recycles oil,Bio fuel,Electric vehicle charging,Composting sawdust,Recycling center,zip_code,State,water_binary,toilets_binary
0,"Borrego Salton Seaway, Borrego Springs, CA 920...",Arroyo Salado Camping,Established Campground,"Free with two vault toilets, nothing else exis...",33.2802,-116.1458,0.0000,2020-02-16 14:20:45 UTC,Yes,No,No,No,,No,No,No,Pit Toilets,Yes,Yes,Yes,Unknown,,,,,,,,,,,,,,,,,CA,0,1
1,"Stateline Campground Rd, Kanab, UT 84741, USA",State Line Campground,Established Campground,Cute free BLM camping in near TH baths.,37.0013,-112.0356,0.0000,2019-09-29 09:53:41 UTC,Yes,No,No,No,,No,No,No,Pit Toilets,No,Yes,Yes,Unknown,,,,,,,,,,,,,,,,,UT,0,1
2,"Glenn Hwy, Glennallen, AK 99588, USA",Tolsona River RV Park and Campground,Established Campground,An old standby for us. Multiple sites for lar...,62.0954,-145.9805,0.0000,2016-05-27 11:38:27 UTC,Yes,Unknown,Unknown,Unknown,,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,,,,,,,,,,,,,,,,,AK,0,0
3,"Beverly Beach State Park, Newport, OR 97365, USA",Beverly Beach State Park,Established Campground,This is a great beach for a first night on the...,44.7288,-124.0555,0.0000,2020-02-07 00:00:00 UTC,Yes,Yes - At Sites,No,No,,No,Hot,Potable,Running Water,Yes,Yes,Yes,Unknown,,,,,,,,,,,,,,,,,OR,1,1
4,"Cottell Ln, Coos Bay, OR 97420, USA",Sunset Bay State Park,Established Campground,"Another night, another campground. Full hooku...",43.3308,-124.3707,2.6656,2020-06-27 00:00:00 UTC,Yes,Yes - At Sites,No,No,,No,Hot,Potable,Running Water,Yes,Yes,Yes,Unknown,,,,,,,,,,,,,,,,,OR,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12403,"Unnamed Road, Fallon, NV 89406, USA",hidden cave BLM site,Wild Camping,Parking lot for hidden cave. about 1 mile from...,39.4090,-118.6291,1207.0000,2020-05-21 00:00:00 UTC,Yes,No,Yes - Fast,No,,No,No,No,Pit Toilets,Yes,Unknown,Yes,,,,,,,,,,,,,,,,,,NV,0,1
12404,"2560 E Lucky Ln, Flagstaff, AZ 86004, USA",Lucky Lane street parking,Wild Camping,Flagstaff is extremely unfriendly to RVs and t...,35.1957,-111.6174,2091.3766,2020-08-08 00:00:00 UTC,Yes,No,Yes - Unknown,No,,Yes,No,Potable,No,Yes,No,Yes,,,,,,,,,,,,,,,,,,AZ,1,0
12408,"Mexican Hat Rock Rd, Mexican Hat, UT 84531, USA",Between San Juan river and Mexican Hat,Wild Camping,"A lot near the road along San Juan river, real...",37.1757,-109.8456,0.0000,2020-05-20 00:00:00 UTC,Yes,Unknown,Unknown,Unknown,,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,,,,,,,,,,,,,,,,,,UT,0,0
12409,"818-898 E Locust St, San Antonio, TX 78212, USA",Neighborhood Parking,Wild Camping,Super quiet and low profile place to park for ...,29.4439,-98.4824,0.0000,2020-03-12 00:00:00 UTC,Yes,No,No,No,,No,No,No,No,Unknown,No,Yes,,,,,,,,,,,,,,,,,,TX,0,0


In [27]:
# # make lat long col to match with image filenames {prefix}_{i}_{cat}_{zoomlevel}_{lat}_{long}.png'
# All_USdata_est_wild['filename_end'] = All_USdata_est_wild['Category'].astype(str) + '_17_' + \
#                                     All_USdata_est_wild['Latitude'].astype(str) + '_' + \
#                                     All_USdata_est_wild['Longitude'].astype(str) + '.png'

In [28]:
# pd.set_option('max_colwidth', None)
# All_USdata_est_wild['filename_end'].head()

In [30]:
directory = '/Users/pault/Desktop/github/CampsitePredict/data/symlink_data/only_unaugmented'

In [31]:
# get list of images in os.walk order
filelist = []
for root_path, dirs, files in os.walk(directory, followlinks=False):
    for file in files:
        if file.endswith(".png"):
            # only keep original files not augmented
            if not re.search('rot[0-9]{2,3}.png$', file):
                # parse out part of filename
                filelist.append(file[:-4])

In [32]:
filelist[:10]

['satimg_CO__352_Wild Camping_17_38.98102_-107.32651',
 'satimg_ID_7863_Wild Camping_17_43.149667_-111.052531',
 'satimg_TX_6214_Wild Camping_17_35.2375_-102.83496099999999',
 'satimg_CO__216_Wild Camping_17_39.337122_-107.660378',
 'satimg_AZ_6033_Wild Camping_17_34.169239000000005_-110.794278',
 'satimg_MI_6491_Wild Camping_17_46.76277_-85.02438000000002',
 'satimg_UT_2806_Wild Camping_17_37.84661_-111.428193',
 'satimg_AK_12112_Wild Camping_17_63.887170999999995_-149.348656',
 'satimg_OR_6000_Wild Camping_17_44.413897_-120.495699',
 'satimg_PA_2971_Wild Camping_17_41.430395_-78.883376']

In [33]:
# make index to reorder the df the same
image_file_df = pd.DataFrame(filelist, columns =['filename']) 

In [34]:
# image_file_df.to_csv('../data/example_to_split.csv')

In [71]:
def func(f, n=5):
    result =['.'.join([j[:n] for j in i.split('.')]) for i in f['filename'].split('_')[-2:]]
    return {'lat_from_file':result[0], 'long_from_file':result[1]}

In [72]:
latlongs = image_file_df[['filename']].apply(func, axis = 1, result_type = 'expand')

In [73]:
image_file_df = pd.concat([image_file_df, latlongs], axis=1)

In [38]:
# split = image_file_df['filename'].str.rsplit('_', 2, expand=True)

In [39]:
# split = split.rename(columns={0: "file_start", 1: "lat", 2: "long"})

In [40]:
# image_file_df['lat_from_file'] = split.iloc[:,0].astype('str')
# image_file_df['long_from_file'] = split.iloc[:,1].astype('str')
# image_file_df['lat_from_file']
# image_file_df['long_from_file'] float( '%.3f'%(x) )df
# image_file_df['lat_from_file'].apply(lambda x: '%.3f'%(x))

In [74]:
image_file_df['order'] = image_file_df.index

In [42]:
image_file_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7875 entries, 0 to 7874
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   filename        7875 non-null   object
 1   lat_from_file   7875 non-null   object
 2   long_from_file  7875 non-null   object
 3   order           7875 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 246.2+ KB


In [43]:
def func2(col, n=6):
    result = ['.'.join([j[:n] for j in x.split('.')]) for x in col.astype('str')]
#     ['.'.join([j[:n] for j in i.split('.')]) for i in f['filename'].split('_')[-2:]]
    return result

In [44]:
# [x.split('.') for x in All_USdata_est_wild['Latitude'].astype('str')]

In [45]:
# ['.'.join([j[:6] for j in x.split('.')]) for x in All_USdata_est_wild['Latitude'].astype('str')]

In [46]:
# All_USdata_est_wild['Latitude'].split('.')
# ['.'.join([All_USdata_est_wild[:n] for j in i.split('.')])

In [47]:
# All_USdata_est_wild['Latitude'].apply(func2)

In [62]:
All_USdata_est_wild['lat_from_df'] = ['.'.join([j[:5] for j in x.split('.')]) for x in All_USdata_est_wild['Latitude'].astype('str')]
All_USdata_est_wild['long_from_df'] = ['.'.join([j[:5] for j in x.split('.')]) for x in All_USdata_est_wild['Longitude'].astype('str')]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [63]:
All_USdata_est_wild.head()

Unnamed: 0,Location,Name,Category,Description,Latitude,Longitude,Altitude,Date verified,Open,Electricity,Wifi,Kitchen,Parking,Restaurant,Showers,Water,Toilets,Big rig friendly,Tent friendly,Pet friendly,Sanitation dump station,Outdoor gear,Groceries,Artesian goods,Bakery,Rarity in this area,Repairs vehicles,Repairs motorcycles,Repairs bicycles,Sells parts,Recycles batteries,Recycles oil,Bio fuel,Electric vehicle charging,Composting sawdust,Recycling center,zip_code,State,water_binary,toilets_binary,lat_from_df,long_from_df
0,"Borrego Salton Seaway, Borrego Springs, CA 920...",Arroyo Salado Camping,Established Campground,"Free with two vault toilets, nothing else exis...",33.2802,-116.1458,0.0,2020-02-16 14:20:45 UTC,Yes,No,No,No,,No,No,No,Pit Toilets,Yes,Yes,Yes,Unknown,,,,,,,,,,,,,,,,,CA,0,1,33.28018,-116.14578
1,"Stateline Campground Rd, Kanab, UT 84741, USA",State Line Campground,Established Campground,Cute free BLM camping in near TH baths.,37.0013,-112.0356,0.0,2019-09-29 09:53:41 UTC,Yes,No,No,No,,No,No,No,Pit Toilets,No,Yes,Yes,Unknown,,,,,,,,,,,,,,,,,UT,0,1,37.00129,-112.03558
2,"Glenn Hwy, Glennallen, AK 99588, USA",Tolsona River RV Park and Campground,Established Campground,An old standby for us. Multiple sites for lar...,62.0954,-145.9805,0.0,2016-05-27 11:38:27 UTC,Yes,Unknown,Unknown,Unknown,,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,,,,,,,,,,,,,,,,,AK,0,0,62.09536,-145.98049
3,"Beverly Beach State Park, Newport, OR 97365, USA",Beverly Beach State Park,Established Campground,This is a great beach for a first night on the...,44.7288,-124.0555,0.0,2020-02-07 00:00:00 UTC,Yes,Yes - At Sites,No,No,,No,Hot,Potable,Running Water,Yes,Yes,Yes,Unknown,,,,,,,,,,,,,,,,,OR,1,1,44.7288,-124.05551
4,"Cottell Ln, Coos Bay, OR 97420, USA",Sunset Bay State Park,Established Campground,"Another night, another campground. Full hooku...",43.3308,-124.3707,2.6656,2020-06-27 00:00:00 UTC,Yes,Yes - At Sites,No,No,,No,Hot,Potable,Running Water,Yes,Yes,Yes,Unknown,,,,,,,,,,,,,,,,,OR,1,1,43.33079,-124.37072


In [80]:
image_file_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7875 entries, 0 to 7874
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   filename        7875 non-null   object
 1   lat_from_file   7875 non-null   object
 2   long_from_file  7875 non-null   object
 3   order           7875 non-null   int64 
 4   lat_from_file   7875 non-null   object
 5   long_from_file  7875 non-null   object
dtypes: int64(1), object(5)
memory usage: 369.3+ KB


In [93]:
type(image_file_df.iloc[0,:][4])
# image_file_df['lat_from_file'].unique()

str

In [98]:
lats = list(image_file_df['lat_from_file'].values)
lats = list(image_file_df['lat_from_file'].to_list)

In [103]:
lats[0]

array(['38.98102', '38.98102'], dtype=object)

In [84]:
image_file_df.groupby('lat_from_file').agg({'filename':'count'})

ValueError: Grouper for 'lat_from_file' not 1-dimensional

In [66]:
All_USdata_est_wild.loc[All_USdata_est_wild['Latitude']==38.46495] 

Unnamed: 0,Location,Name,Category,Description,Latitude,Longitude,Altitude,Date verified,Open,Electricity,Wifi,Kitchen,Parking,Restaurant,Showers,Water,Toilets,Big rig friendly,Tent friendly,Pet friendly,Sanitation dump station,Outdoor gear,Groceries,Artesian goods,Bakery,Rarity in this area,Repairs vehicles,Repairs motorcycles,Repairs bicycles,Sells parts,Recycles batteries,Recycles oil,Bio fuel,Electric vehicle charging,Composting sawdust,Recycling center,zip_code,State,water_binary,toilets_binary,lat_from_df,long_from_df
9479,,I-5/ fishing place,Wild Camping,It’s a large area big enough to park for the n...,38.465,-121.5031,0.0,2020-03-08 00:00:00 UTC,Yes,No,No,No,,Unknown,No,No,No,No,No,Yes,,,,,,,,,,,,,,,,,95832,CA,0,0,38.46495,-121.50308


In [61]:
image_file_df.iloc[12]

filename          satimg_CA_5739_Wild Camping_17_38.46495_-121.5...
lat_from_file                                              38.46495
long_from_file                                           -121.50308
order                                                            12
Name: 12, dtype: object

In [75]:
# join with df of data, keep all images, hopefully they have data in the df
image_file_df_final_with_df = image_file_df.merge(All_USdata_est_wild, how='left', 
                                                  left_on=['lat_from_file', 'long_from_file'],
                                                 right_on=['lat_from_df', 'long_from_df'])

ValueError: The column label 'lat_from_file' is not unique.

In [69]:
image_file_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7875 entries, 0 to 7874
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   filename        7875 non-null   object
 1   lat_from_file   7875 non-null   object
 2   long_from_file  7875 non-null   object
 3   order           7875 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 246.2+ KB


In [70]:
image_file_df_final_with_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7887 entries, 0 to 7886
Data columns (total 46 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   filename                   7887 non-null   object 
 1   lat_from_file              7887 non-null   object 
 2   long_from_file             7887 non-null   object 
 3   order                      7887 non-null   int64  
 4   Location                   291 non-null    object 
 5   Name                       806 non-null    object 
 6   Category                   806 non-null    object 
 7   Description                805 non-null    object 
 8   Latitude                   806 non-null    float64
 9   Longitude                  806 non-null    float64
 10  Altitude                   699 non-null    float64
 11  Date verified              806 non-null    object 
 12  Open                       806 non-null    object 
 13  Electricity                806 non-null    objec

In [452]:
image_file_df_final_with_df.to_csv('../data/test_image_file_df_final_with_df.csv')