In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import json
from pprint import pprint
import requests
import time
from scipy import stats
import seaborn as sn
%matplotlib inline

# Incorporated citipy to determine city based on latitude and longitude
from citipy import citipy

# Study data files
stanford_path = "Stanford_Data.csv"

stanford_df = pd.read_csv(stanford_path)

In [2]:
# Preview Data Frame
stanford_df.head()

Unnamed: 0,study_id,redcap_survey_identifier,introduction_timestamp,country,state_in_india,state_in_us,province_in_china,state_other_countries,age,race_and_ethnicity,...,sector_of_contribution___4,sector_of_contribution___5,other,where_helped___1,where_helped___2,where_helped___3,where_helped___4,where_helped___5,other_where_helped,social_contributions_complete
0,1,,[not completed],244.0,,,,,,,...,0,0,,0,0,0,0,0,,0
1,2,,4/16/20 14:47,244.0,,,,,2.0,,...,0,0,,0,0,0,0,0,,0
2,3,,4/16/20 14:51,244.0,,,,,3.0,,...,0,0,,0,0,0,0,0,,0
3,4,,4/16/20 14:53,9.0,,,,,3.0,,...,0,0,,0,0,0,0,0,,0
4,5,,4/16/20 15:01,2.0,,,,,5.0,,...,0,0,,0,0,0,1,0,,2


In [3]:
# DROPPING INCOMPLETE SURVEYS WHICH ARE ONES THAT DO NOT HAVE TIMESTAMPS FOR EACH SURVEY MILESTONE.

# Replace "incomplete" or a score of "0" for timestamps with blanks to drop null values.
stanford_df["introduction_timestamp"].replace("[not completed]", np.nan, inplace=True)
# stanford_df.replace("[not completed]", np.nan, inplace=True)
stanford_df["social_interactions_timestamp"].replace("0", np.nan, inplace=True)
stanford_df["hobbies_and_health_timestamp"].replace("0", np.nan, inplace=True)
stanford_df["introduction_timestamp"].replace("0", np.nan, inplace=True)
stanford_df["professional_life_timestamp"].replace("0", np.nan, inplace=True)
stanford_df["social_contributions_timestamp"].replace("0", np.nan, inplace=True)

In [4]:
# Drop Null timestamps to retain completed surveys.
completed_surveys = stanford_df.dropna(subset=["introduction_timestamp", "social_interactions_timestamp", 
                                               "hobbies_and_health_timestamp", "professional_life_timestamp", 
                                               "social_contributions_timestamp"])
completed_surveys.head()

Unnamed: 0,study_id,redcap_survey_identifier,introduction_timestamp,country,state_in_india,state_in_us,province_in_china,state_other_countries,age,race_and_ethnicity,...,sector_of_contribution___4,sector_of_contribution___5,other,where_helped___1,where_helped___2,where_helped___3,where_helped___4,where_helped___5,other_where_helped,social_contributions_complete
4,5,,4/16/20 15:01,2.0,,,,,5.0,,...,0,0,,0,0,0,1,0,,2
5,6,,4/16/20 15:07,2.0,,,,,3.0,,...,0,0,,0,0,0,1,0,,2
6,7,,4/16/20 15:12,7.0,,,,,2.0,,...,0,0,,0,0,0,1,0,,2
7,8,,4/16/20 15:34,244.0,,,,,2.0,,...,0,0,,0,1,0,0,0,,2
8,9,,4/16/20 17:22,244.0,,,,,2.0,,...,0,0,,0,0,0,0,1,,2


In [5]:
# DROPPING UNNECESSARY QUESTIONS

completed_surveys.drop(columns=["introduction_timestamp", "asian_category", "state_in_india", "province_in_china", 
                          "state_other_countries", "introduction_complete", "social_interactions_timestamp", 
                          "social_interactions_complete", "hobbies_and_health_timestamp", 
                          "hobbies_and_health_complete", "professional_life_timestamp", 
                          "professional_life_complete", "social_contributions_timestamp", 
                          "social_contributions_complete"])


# vacay_temp = (cities_df.Temp < 40)
# vacay_windspeed = cities_df.Wind < 10

Unnamed: 0,study_id,redcap_survey_identifier,country,state_in_us,age,race_and_ethnicity,sex,gender_other,marital_status,living_with___1,...,sector_of_contribution___3,sector_of_contribution___4,sector_of_contribution___5,other,where_helped___1,where_helped___2,where_helped___3,where_helped___4,where_helped___5,other_where_helped
4,5,,2.0,,5.0,,1.0,,3.0,0,...,0,0,0,,0,0,0,1,0,
5,6,,2.0,,3.0,,3.0,,8.0,0,...,0,0,0,,0,0,0,1,0,
6,7,,7.0,,2.0,,1.0,,6.0,0,...,0,0,0,,0,0,0,1,0,
7,8,,244.0,,2.0,,1.0,,3.0,1,...,0,0,0,,0,1,0,0,0,
8,9,,244.0,,2.0,,0.0,,3.0,1,...,0,0,0,,0,0,0,0,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3415,3416,,1.0,5.0,4.0,4.0,0.0,,3.0,1,...,0,0,0,,0,0,0,0,0,
3416,3417,,1.0,5.0,4.0,1.0,0.0,,3.0,1,...,1,0,1,ordered food from local restarants,0,0,1,0,0,
3418,3419,,2.0,,7.0,1.0,1.0,,3.0,1,...,0,0,0,,0,0,0,0,0,
3419,3420,,1.0,5.0,3.0,4.0,1.0,,4.0,0,...,0,0,0,,0,0,0,0,0,


In [6]:
# Narrowing Down to the US responses.
us_responses = completed_surveys[completed_surveys["country"] == 1]
us_responses

Unnamed: 0,study_id,redcap_survey_identifier,introduction_timestamp,country,state_in_india,state_in_us,province_in_china,state_other_countries,age,race_and_ethnicity,...,sector_of_contribution___4,sector_of_contribution___5,other,where_helped___1,where_helped___2,where_helped___3,where_helped___4,where_helped___5,other_where_helped,social_contributions_complete
39,40,,4/20/20 11:19,1.0,,5.0,,,6.0,1.0,...,1,0,,1,0,1,0,0,,2
40,41,,4/20/20 11:32,1.0,,5.0,,,2.0,1.0,...,0,0,,0,0,1,0,0,,2
41,42,,4/20/20 11:40,1.0,,1.0,,,2.0,1.0,...,0,0,,0,1,0,0,0,,2
43,44,,4/20/20 18:03,1.0,,5.0,,,3.0,2.0,...,0,1,I donated my time/efforts,1,0,0,1,1,In health care sector,2
46,47,,4/20/20 19:49,1.0,,5.0,,,7.0,8.0,...,1,1,srhrdjd,1,1,1,1,1,rshrdjdcv,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3411,3412,,10/9/20 16:34,1.0,,5.0,,,4.0,1.0,...,0,1,Helping neighbors,0,0,0,0,1,Community,2
3412,3413,,10/9/20 20:33,1.0,,47.0,,,1.0,1.0,...,1,0,,1,1,0,1,0,,2
3415,3416,,11/19/20 11:10,1.0,,5.0,,,4.0,4.0,...,0,0,,0,0,0,0,0,,2
3416,3417,,11/25/20 8:43,1.0,,5.0,,,4.0,1.0,...,0,1,ordered food from local restarants,0,0,1,0,0,,2


In [7]:
# KEY QUESTIONS AND DROPPING NULLs

# Drop Null Values in Physical Activities column that talks about physical activities post-COVID.
us_responses["covid_exercise"].dropna(inplace = True)
us_responses["physical_activities"].dropna(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [None]:
# # # Convert key preexisting disease columns to integers
# # (us_responses["diabetes"]us_responses(us_responses[cardiovascular_disorders] !=1) | 
# #                               us_responses(us_responses[obesity] !=1) | 
# #                               us_responses(us_responses[respiratory_infections] !=1) | 
# #                               us_responses(us_responses[respiratory_disorders_exam] !=1) | 
# #                               us_responses(us_responses[gastrointestinal_disorders] !=1) | 
# #                               us_responses(us_responses[chronic_kidney_disease] !=1) | 
# #                               us_responses(us_responses[autoimmune_disease] !=1) | 
# #                               us_responses(us_responses[chronic_fatigue_syndrome_a] !=1)]

# us_responses[["diabetes", "cardiovascular_disorders","obesity", "respiratory_infections", "respiratory_disorders_exam", 
#     "gastrointestinal_disorders", "chronic_kidney_disease", "autoimmune_disease", "chronic_fatigue_syndrome_a"]] = us_responses[["diabetes", "cardiovascular_disorders","obesity", "respiratory_infections", "respiratory_disorders_exam", 
#     "gastrointestinal_disorders", "chronic_kidney_disease", "autoimmune_disease", "chronic_fatigue_syndrome_a"]].fillna(0.0).astype(int)

In [9]:
preexisting_us = us_responses[(us_responses["diabetes"]=="1") | 
                              us_responses(us_responses["cardiovascular_disorders"]=="1") | 
                              us_responses(us_responses["obesity"]=="1") | 
                              us_responses(us_responses["respiratory_infections"]=="1") | 
                              us_responses(us_responses["respiratory_disorders_exam"]=="1") | 
                              us_responses(us_responses["gastrointestinal_disorders"]=="1") | 
                              us_responses(us_responses["chronic_kidney_disease"]=="1") | 
                              us_responses(us_responses["autoimmune_disease"]=="1") | 
                              us_responses(us_responses["chronic_fatigue_syndrome_a"]=="1")]

TypeError: 'DataFrame' object is not callable

In [None]:
physical_mean = us_responses.groupby(["country"],as_index=True).physical_activities.mean()
covid_mean = us_responses.groupby(["country"],as_index=True).covid_exercise.median()
# tumorvolume_variance = cleaned_df.groupby(["Drug_Regimen"],as_index=True).Tumor_Volume.max() - cleaned_df.groupby(["Drug_Regimen"],as_index=True).Tumor_Volume.min()
# tumorvolume_std = cleaned_df.groupby(["Drug_Regimen"],as_index=True).Tumor_Volume.std()
# tumorvolume_sem = cleaned_df.groupby(["Drug_Regimen"],as_index=True).Tumor_Volume.sem()
print(physical_mean)
print(covid_mean)

In [None]:
plt.bar(completed_surveys["physical_activities"])
plt.show() 

In [None]:
# Dependent Values for Scatter Plot
plt.figure(figsize=(20,10))
plt.bar(completed_surveys["physical_activities"])

# Size, Labels, and Grid
plt.title("Before COVID vs. During COVID Physical Activity")
plt.ylabel("During COVID Activity")
plt.xlabel("Before COVID Activity")
plt.grid(True)

# # Save the Plot
# plt.savefig("Script Outputs/Lat_v_Temp.png")

# Show plot
plt.show()