The purpose of this notebook is to answer the questions put forth here:

https://docs.google.com/document/d/1Tu8-XHeOP9LHn6BmkOEHk__p1JaFNdb6jCw2UQc-vA8/edit

- Mark Halverson
- Last updated: March 09, 2021

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from survey_utils.change_column_names import change_column_names

In [None]:
# Load the anonymized survey responses (path is relative to the notebook's working directory).
survey_csv_path = "data/BCMT Membership Survey Anonymized Responses.csv"
df = pd.read_csv(survey_csv_path)

In [None]:
# Pass the frame through the project helper and keep its return value as `df`.
# NOTE(review): presumably this maps the raw survey questions to the short
# column names used below ("activities", "age group", ...) — confirm in survey_utils.
df = df.pipe(change_column_names)

In [None]:
# print out the column names
#df.columns

# Apply some string operations to fill gaps and ensure consistency in letter cases

In [None]:
# Lower-case the free-text answers so later keyword matching is case-consistent.
free_text_columns = (
    "activities",
    "primary activity",
    "membership reason",
    "bcmt.org use reason",
    "mobile apps used",
)
for text_column in free_text_columns:
    df[text_column] = df[text_column].str.lower()

In [None]:
# Gap fill: report how many answers are missing in three free-text columns,
# then replace the missing values with an explicit placeholder string.
print(f'Website use reason has {df["bcmt.org use reason"].isna().sum()} null values')
print(f'Mobile apps used has {df["mobile apps used"].isna().sum()} null values')
print(f'Membership reason has {df["membership reason"].isna().sum()} null values')

# Assign the filled column back instead of calling fillna(inplace=True) on
# df[col]: that form is chained assignment through an inplace method, which
# emits a FutureWarning on pandas 2.x and leaves `df` unchanged under
# Copy-on-Write (the default in pandas 3).
for gap_column in ["bcmt.org use reason", "mobile apps used", "membership reason"]:
    df[gap_column] = df[gap_column].fillna("no answer provided")

In [None]:
# Normalize a few spellings with literal (non-regex) replacements:
# (column, text to find, replacement)
spelling_fixes = [
    ("membership reason", "camp site", "campsite"),
    ("bcmt.org use reason", "kajak", "kayak"),
    ("mobile apps used", "garmon", "garmin"),
]
for fix_column, wrong_text, right_text in spelling_fixes:
    df[fix_column] = df[fix_column].str.replace(wrong_text, right_text, regex=False)

# Now the analysis

__Create a mask to pick out members__

In [None]:
# A respondent is treated as a member unless the membership-duration answer
# contains the substring "not".
says_not = df["membership duration"].str.contains("not")
is_member = ~says_not
print(f"{is_member.sum()} respondents report having a BCMT membership")

In [None]:
# Keep only the rows flagged by `is_member`, then show (n_rows, n_columns).
df_members = df[is_member]
df_members.shape

In [None]:
# Frequency of each membership-duration answer among members.
print("for how long have the members had a membership?")
member_durations = df_members["membership duration"]
member_durations.value_counts()

__How often do the words "support" and "map" appear in the responses for why people purchased a BCMT membership?__

In [None]:
# print out 10 random reasons
#df_members["membership reason"].sample(10).to_list()

In [None]:
# Per member: how many times "map" occurs in the membership reason, plus a
# boolean flag for reasons that contain it at least once.
member_reasons = df_members["membership reason"]
n_map = member_reasons.str.count("map")
has_map = n_map.ge(1)

In [None]:
# Tally of members whose membership reason does (True) / does not (False) contain "map".
has_map.value_counts()

In [None]:
#df_members.loc[has_map,"membership reason"].to_list()

In [None]:
# Per member: occurrences of "support" in the membership reason, and a flag
# for reasons that contain it at least once.
n_support = df_members["membership reason"].str.count("support")
has_support = n_support.ge(1)

In [None]:
#df_members.loc[has_support,"membership reason"].sample(25).to_list()

In [None]:
# Share of members (in percent, one decimal) whose reason mentions each keyword.
n_members = df_members.shape[0]
pct_map = np.round(100*has_map.sum()/n_members, 1)
pct_support = np.round(100*has_support.sum()/n_members, 1)
print(f"Percentage of members mentioning \"map\": {pct_map}%")
print(f"Percentage of members mentioning \"support\": {pct_support}%")

__How often do the words map, resources, information, trip planning, ideas appear in the responses for why people use BCMT.org?__

In [None]:
#df.columns

In [None]:
#print("Example of reasons why respondents used the BCMT website")
#df["bcmt.org use reason"].sample(20).to_list()

__Create a series of masks representing which responses contain words of interest__

In [None]:
# For each keyword, compute the percentage of all responses whose
# "bcmt.org use reason" contains it (case-insensitive substring match),
# store it in `uses`, and print it rounded to one decimal.
strs = ["map","resource","info","plan","idea"]

uses = pd.Series(index=strs, dtype=float)
uses.name = "percent_responses"
# The loop variable is deliberately not called `str`: rebinding the builtin
# would break any later cell in this kernel that calls str(...).
for term in strs:
    # The terms are plain words, so match them literally; na=False keeps the
    # result strictly boolean even if a response is missing.
    has_term = df["bcmt.org use reason"].str.contains(term, case=False, regex=False, na=False)
    percent = 100*has_term.sum()/has_term.shape[0]
    uses[term] = percent
    print(f'Percentage of bcmt.org uses mentioning "{term.title()}": {np.round(percent,1)}%')

In [None]:
# Horizontal bar chart of the keyword percentages, sorted ascending.
ax = uses.sort_values(ascending=True).plot.barh()
ax.set_title("Reasons cited for using bcmt.org")
ax.set_xlabel("Percentage of responses containing string");

__How often do the words Google map, Windy, Navionics, Aquamaps, Tides, Current atlas, Gaia, Strava, Predict Wind, Garmin, and "No" appear in the responses for which mobile apps people use?__

In [None]:
#df["mobile apps used"].to_list()

In [None]:
# For each app keyword, compute the percentage of all responses whose
# "mobile apps used" answer contains it (case-insensitive substring match),
# store it in `uses`, and print it rounded to one decimal.
strs = ["google","windy","navionics","aquamaps","tides","atlas","gaia","strava","predict wind","garmin","no","no answer provided"]

app_answers = df["mobile apps used"]
# Rows that hold the gap-fill placeholder, i.e. the question was left blank.
is_placeholder = app_answers == "no answer provided"

uses = pd.Series(index=strs, dtype=float)
uses.name = "percent_responses"
# The loop variable is deliberately not called `str`: rebinding the builtin
# would break any later cell in this kernel that calls str(...).
for term in strs:
    has_term = app_answers.str.contains(term, case=False, regex=False, na=False)
    if term == "no":
        # "no answer provided" itself contains "no"; without this exclusion
        # the "No" figure would include every non-responder, who are already
        # reported under their own category.
        # NOTE(review): plain substring matching still counts words such as
        # "know" or "not" as "no" — confirm whether that is intended.
        has_term = has_term & ~is_placeholder
    percent = 100*has_term.sum()/has_term.shape[0]
    uses[term] = percent
    print(f'Percentage of responses containing "{term.title()}": {np.round(percent,1)}%')

In [None]:
# "title-ize" the app names
# Capitalize each word of the index labels for nicer axis tick labels.
uses = uses.set_axis(uses.index.str.title())

In [None]:
# Horizontal bar chart of the app-name percentages, sorted ascending.
ax = uses.sort_values(ascending=True).plot.barh()
ax.set_title("App names cited by survey respondents")
ax.set_xlabel("Percentage of responses containing string");

-------------------------------

## What are the characteristics of the respondents who cite “support”, “contribute”, or “help” as a membership reason?  

Age group, Membership length

In [None]:
#df["membership reason"].head(10).to_list()

In [None]:
# Build a regex alternation ("a|b|c") from the support-type keywords.
support_terms = ("support", "contribute", "help")
matches = "|".join(support_terms)
matches

In [None]:
# Flag every respondent whose membership reason matches the `matches` regex,
# then show how many were flagged.
membership_reasons = df["membership reason"]
mask = membership_reasons.str.contains(matches, regex=True)
mask.sum()

In [None]:
#df.columns

In [None]:
# Number of flagged respondents (non-null "timestamp") per
# (age group, membership duration) pair, largest first.
n_support = (
    df.loc[mask]
    .groupby(["age group", "membership duration"])["timestamp"]
    .count()
    .sort_values(ascending=False)
    .rename("number citing support")
)

In [None]:
# Number of all respondents (non-null "timestamp") per
# (age group, membership duration) pair, largest first.
n_total = (
    df.groupby(["age group", "membership duration"])["timestamp"]
    .count()
    .sort_values(ascending=False)
    .rename("total number")
)

In [None]:
# Put the two counts side by side, aligned on the (age group, membership duration) index.
support_df = pd.concat([n_support, n_total], axis="columns")

In [None]:
# Flagged respondents as a percentage of all respondents in the same group.
support_df["as_percent"] = n_support.mul(100).div(n_total)

In [None]:
# Keep one decimal place in the percentage column.
support_df = support_df.round({"as_percent": 1})

In [None]:
# Groups with no flagged respondents come out of the concat as NaN; show them as 0.
support_df = support_df.fillna(0)

In [None]:
# Store the count column as integers.
support_df = support_df.astype({"number citing support": int})

In [None]:
# Display the per-group table: flagged count, total count, and percentage.
support_df

In [None]:
# Horizontal bar chart of the per-group percentage, sorted ascending.
support_percent_sorted = support_df["as_percent"].sort_values()
support_percent_sorted.plot(kind="barh", figsize=(10, 10));

In [None]:
# Sum the two count columns over membership duration, leaving one row per age group.
support_count_columns = ["number citing support", "total number"]
support_df.groupby("age group")[support_count_columns].sum()

In [None]:
# Sum the two count columns over age group, leaving one row per membership duration.
support_count_columns = ["number citing support", "total number"]
support_df.groupby("membership duration")[support_count_columns].sum()

In [None]:
# Per age group: rows selected by `mask` divided by all rows (a fraction, not a percent).
flagged_by_age = df.loc[mask, "age group"].value_counts()
everyone_by_age = df["age group"].value_counts()
flagged_by_age / everyone_by_age

## What are the characteristics of those who mentioned using “windy”, “Navionics”, “Tides”, “No”, or “null”?

Age group, years experience

In [None]:
#df["mobile apps used"]
#df.columns

In [None]:
# Select respondents who used any of the following apps (regex alternation).
# NOTE: the next two cells reassign `matches`; when the notebook is run top to
# bottom only the last assignment reaches the cell that builds `mask`.
app_terms = ("windy", "navionics", "tides")
matches = "|".join(app_terms)
matches

In [None]:
# Select the respondents who did not use apps (or did not answer the question).
# NOTE: `matches` is also assigned in the neighbouring cells; whichever of
# them ran last determines the `mask` built below.
no_app_terms = ("no", "no answer provided")
matches = "|".join(no_app_terms)
matches

In [None]:
# Select the respondents who use google.
# NOTE: this is the last of three cells assigning `matches`, so it is the one
# in effect when the notebook is run top to bottom.
google_term = "google"
matches = google_term
matches

In [None]:
# Flag respondents whose "mobile apps used" answer matches the current
# `matches` pattern, then show how many were flagged.
apps_answers = df["mobile apps used"]
mask = apps_answers.str.contains(matches)
mask.sum()

In [None]:
# Number of flagged respondents (non-null "timestamp") per
# (age group, years experience) pair, largest first.
n_apps = (
    df.loc[mask]
    .groupby(["age group", "years experience"])["timestamp"]
    .count()
    .sort_values(ascending=False)
    .rename("number using apps")
)

In [None]:
# Number of all respondents (non-null "timestamp") per
# (age group, years experience) pair, largest first.
n_total = (
    df.groupby(["age group", "years experience"])["timestamp"]
    .count()
    .sort_values(ascending=False)
    .rename("total number")
)

In [None]:
# Put the two counts side by side, aligned on the (age group, years experience) index.
apps_df = pd.concat([n_apps, n_total], axis="columns")

In [None]:
# Flagged respondents as a percentage of all respondents in the same group.
apps_df["as_percent"] = n_apps.mul(100).div(n_total)

In [None]:
# Keep one decimal place in the percentage column.
apps_df = apps_df.round({"as_percent": 1})

In [None]:
# Groups with no flagged respondents come out of the concat as NaN; show them as 0.
apps_df = apps_df.fillna(0)

In [None]:
# Store the count column as integers.
apps_df = apps_df.astype({"number using apps": int})

In [None]:
# Display the per-group table: flagged count, total count, and percentage.
apps_df

In [None]:
# Sum the two count columns over years of experience, leaving one row per age group.
apps_count_columns = ["number using apps", "total number"]
apps_df.groupby("age group")[apps_count_columns].sum()

In [None]:
# Respondent characteristics to break the results down by.
columns = [
    "age group",
    "gender",
    "membership duration",
    "years experience",
    "have volunteered",
]

In [None]:
# Counts the occurrences of one characteristic among respondents who cite
# support as their reason to join the BCMT.
#
# `df_support` was used here without ever being defined, so this cell raised
# NameError on a fresh kernel. Build it first. The keyword pattern is spelled
# out rather than reusing `mask`/`matches`, because the mobile-app section
# above has overwritten both by the time this cell runs.
support_pattern = "support|contribute|help"
cites_support = df["membership reason"].str.contains(support_pattern, regex=True, na=False)
df_support = df.loc[cites_support]

# NOTE(review): columns[1] is "gender" although the result is named
# df_age_counts — confirm which characteristic was intended.
df_age_counts = df_support[columns[1]].value_counts()
df_age_counts.name = "total number"
df_age_counts

In [None]:
# For every characteristic in `columns`, print the value counts among the
# respondents in `df_support` (those citing support as their reason to join).
for col_name in columns:
    support_counts = df_support[col_name].value_counts()
    print(support_counts, end="\n\n")

In [None]:
# For every characteristic in `columns`, print the value counts over the whole survey.
for col_name in columns:
    overall_counts = df[col_name].value_counts()
    print(overall_counts, end="\n\n")

In [None]:
# For every characteristic in `columns`: the respondents in `df_support` as a
# percentage of all respondents with the same answer, largest first.
for col_name in columns:
    support_counts = df_support[col_name].value_counts()
    overall_counts = df[col_name].value_counts()
    tdf = support_counts/overall_counts
    ranked_share = tdf.sort_values(ascending=False).round(2)
    print(100*ranked_share, end="\n\n")