The purpose of this notebook is to answer the questions put forth here:

https://docs.google.com/document/d/1Tu8-XHeOP9LHn6BmkOEHk__p1JaFNdb6jCw2UQc-vA8/edit

- Mark Halverson
- February 26, 2012

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from survey_utils.change_column_names import change_column_names

In [None]:
# Load the anonymized survey responses; the path is relative to the notebook's working directory.
survey_csv = "data/BCMT Membership Survey Anonymized Responses.csv"
df = pd.read_csv(survey_csv)

In [None]:
# Rename the survey columns with the project helper.
# NOTE(review): the short labels used below (e.g. "membership reason") are
# presumably produced by this helper - confirm in survey_utils.
df = df.pipe(change_column_names)

In [None]:
# print out the column names as they stand after the renaming step above
df.columns

# Apply some string operations to fill gaps and make letter case and spelling consistent

In [None]:
# Normalise the free-text answers to lower case so that the keyword searches
# further down do not depend on how respondents capitalised their answers.
free_text_columns = [
    "activities",
    "primary activity",
    "membership reason",
    "bcmt.org use reason",
    "mobile apps used",
]
for column in free_text_columns:
    df[column] = df[column].str.lower()

In [None]:
# Gap fill: report how many free-text answers are missing, then replace the
# missing values with an explicit placeholder so later string matching always
# sees text.
print(f'Website use reason has {df["bcmt.org use reason"].isna().sum()} null values')
print(f'Mobile apps used has {df["mobile apps used"].isna().sum()} null values')

# Assign the filled column back rather than calling fillna(inplace=True) on the
# df[...] selection: the in-place form is chained assignment, which pandas 2.x
# warns about and which leaves df unchanged when Copy-on-Write is enabled.
df["bcmt.org use reason"] = df["bcmt.org use reason"].fillna("no answer provided")
df["mobile apps used"] = df["mobile apps used"].fillna("no answer provided")

In [None]:
# Normalise spelling variants and typos in the free-text answers.
# Each entry is (column, text to find, replacement); matching is literal (regex=False).
corrections = [
    ("membership reason", "camp site", "campsite"),
    ("bcmt.org use reason", "kajak", "kayak"),
    ("mobile apps used", "garmon", "garmin"),
]
for column, wrong, right in corrections:
    df[column] = df[column].str.replace(wrong, right, regex=False)

# Now the analysis

__Create a mask to pick out members__

In [None]:
# A respondent counts as a member unless their "membership duration" answer
# contains "not". na=True makes a missing answer match as well, so after the
# negation a blank answer is treated as "not a member"; without it the NaN
# returned by str.contains makes `~` fail on the resulting object Series.
# NOTE(review): this column is not lower-cased above, so the match is
# case-sensitive - confirm the non-member option is spelled with a lowercase "not".
is_member = ~df["membership duration"].str.contains("not", na=True)
print(f"{is_member.sum()} respondents report having a BCMT membership")

In [None]:
# Keep only the rows flagged as members (all columns retained),
# then show the (rows, columns) shape of the subset.
df_members = df[is_member]
df_members.shape

In [None]:
# Tally how many members chose each membership-duration answer.
print("for how long have the members had a membership?")
member_durations = df_members["membership duration"]
member_durations.value_counts()

__How often do the words "support" and "map" appear in the responses for why people purchased a BCMT membership?__

In [None]:
# Show up to 10 randomly chosen membership reasons. Capping the sample size at
# the number of available rows avoids the ValueError that sample() raises when
# asked for more rows than exist; the fixed random_state makes the displayed
# sample the same on every run.
member_reasons = df_members["membership reason"]
member_reasons.sample(min(10, len(member_reasons)), random_state=0).to_list()

In [None]:
# n_map: occurrences of "map" in each member's stated reason.
# has_map: boolean flag, True where the reason mentions "map" at least once.
member_reasons_for_map = df_members["membership reason"]
n_map = member_reasons_for_map.str.count("map")
has_map = n_map.ge(1)

In [None]:
# number of member responses that do (True) and do not (False) mention "map"
has_map.value_counts()

In [None]:
# Show every membership reason that mentions "map".
df_members["membership reason"][has_map].tolist()

In [None]:
# n_support: occurrences of "support" in each member's stated reason.
# has_support: boolean flag, True where the reason mentions "support" at least once.
member_reasons_for_support = df_members["membership reason"]
n_support = member_reasons_for_support.str.count("support")
has_support = n_support.ge(1)

In [None]:
# Show up to 25 randomly chosen reasons that mention "support". The size is
# capped because sample() raises ValueError when fewer rows match than are
# requested; the fixed random_state keeps the displayed sample reproducible.
support_reasons = df_members.loc[has_support, "membership reason"]
support_reasons.sample(min(25, len(support_reasons)), random_state=0).to_list()

In [None]:
# Share of member responses mentioning each keyword, as a percentage rounded to one decimal.
n_members = len(df_members)
pct_map = np.round(100 * int(has_map.sum()) / n_members, 1)
pct_support = np.round(100 * int(has_support.sum()) / n_members, 1)
print(f'Percentage of members mentioning "map": {pct_map}%')
print(f'Percentage of members mentioning "support": {pct_support}%')

__How often do the words "map", "resources", "information", "trip planning", and "ideas" appear in the responses for why people use BCMT.org?__

In [None]:
# list the column names again, to look up the exact label of the website-use column
df.columns

In [None]:
# Show up to 20 randomly chosen website-use answers. The size is capped at the
# number of rows so sample() cannot raise on a small data set, and the fixed
# random_state keeps the displayed examples the same on every run.
print("Example of reasons why respondents used the BCMT website")
website_reasons = df["bcmt.org use reason"]
website_reasons.sample(min(20, len(website_reasons)), random_state=0).to_list()

__Create a series of masks representing which responses contain words of interest__

In [None]:
# Keyword stems to look for in the free-text website-use answers.
strs = ["map","resource","info","plan","idea"]

# uses: percentage of responses containing each keyword, indexed by keyword.
uses = pd.Series(index=strs, dtype=float)
uses.name = "percent_responses"
website_reasons = df["bcmt.org use reason"]
# The loop variable is deliberately not named `str`: rebinding that name would
# shadow the builtin for every cell executed afterwards.
for keyword in strs:
    # na=False: a missing answer counts as "does not mention the keyword".
    has_str = website_reasons.str.contains(keyword, case=False, na=False)
    # Denominator is all responses, including ones with no answer provided.
    percent = 100*has_str.sum()/has_str.shape[0]
    uses[keyword] = percent
    print(f'Percentage of bcmt.org uses mentioning "{keyword.title()}": {np.round(percent,1)}%')

In [None]:
# Horizontal bar chart of the keyword percentages; sorting ascending puts the
# most frequently mentioned keyword at the top of the chart.
ax = uses.sort_values(ascending=True).plot.barh()
ax.set_title("Reasons cited for using bcmt.org")
ax.set_xlabel("Percentage of responses containing string");

__How often do the words Google map, Windy, Navionics, Aquamaps, Tides, Current atlas, Gaia, Strava, Predict Wind, Garmin, and "No" appear in the responses for which mobile apps people use?__

In [None]:
# display every response to the mobile-apps question (one list entry per respondent)
df["mobile apps used"].to_list()

In [None]:
# App names to look for, plus the two "no app" style answers: an explicit "no"
# and the placeholder written by the gap-fill step for missing answers.
strs = ["google","windy","navionics","aquamaps","tides","atlas","gaia","strava","predict wind","garmin","no","no answer provided"]

# uses: percentage of responses containing each keyword, indexed by keyword.
uses = pd.Series(index=strs, dtype=float)
uses.name = "percent_responses"
apps_used = df["mobile apps used"]
# Rows that hold the gap-fill placeholder; they are reported under their own label.
is_placeholder = apps_used == "no answer provided"
# The loop variable is deliberately not named `str`: rebinding that name would
# shadow the builtin for every cell executed afterwards.
for keyword in strs:
    if keyword == "no":
        # A plain substring search for "no" also matches the placeholder
        # "no answer provided" and words such as "know" or "not", inflating
        # the figure. Match "no" as a whole word and skip placeholder rows.
        has_str = apps_used.str.contains(r"\bno\b", case=False, na=False) & ~is_placeholder
    else:
        # na=False: a missing answer counts as "does not mention the keyword".
        has_str = apps_used.str.contains(keyword, case=False, na=False)
    # Denominator is all responses, including ones with no answer provided.
    percent = 100*has_str.sum()/has_str.shape[0]
    uses[keyword] = percent
    print(f'Percentage of responses containing "{keyword.title()}": {np.round(percent,1)}%')

In [None]:
# Capitalise each label for display on the chart (e.g. "predict wind" -> "Predict Wind").
uses.index = [label.title() for label in uses.index]

In [None]:
# Horizontal bar chart of the app-name percentages; sorting ascending puts the
# most frequently cited app at the top of the chart.
ax = uses.sort_values(ascending=True).plot.barh()
ax.set_title("App names cited by survey respondents")
ax.set_xlabel("Percentage of responses containing string");