## Dataset: survey

In [22]:
import pandas as pd

### Import dataset

In [23]:
# Load the raw survey export. Despite the `_clean` suffix the frame is still
# raw at this point; the cleaning cell further down overwrites its columns in place.
df_survey_clean = pd.read_csv("data/raw/survey.csv")

### Exploratory data analysis

In [3]:
# Column names, non-null counts and dtypes of the survey frame before cleaning
df_survey_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5027 entries, 0 to 5026
Data columns (total 23 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Survey ResponseID           5027 non-null   object
 1   Q-demos-age                 5027 non-null   object
 2   Q-demos-hispanic            5027 non-null   object
 3   Q-demos-race                5027 non-null   object
 4   Q-demos-education           5027 non-null   object
 5   Q-demos-income              5027 non-null   object
 6   Q-demos-gender              5027 non-null   object
 7   Q-sexual-orientation        5027 non-null   object
 8   Q-demos-state               5027 non-null   object
 9   Q-amazon-use-howmany        5027 non-null   object
 10  Q-amazon-use-hh-size        5027 non-null   object
 11  Q-amazon-use-how-oft        5027 non-null   object
 12  Q-substance-use-cigarettes  5027 non-null   object
 13  Q-substance-use-marijuana   5027 non-null   obje

In [4]:
# Number of distinct response IDs, to compare against the frame's row count
df_survey_clean["Survey ResponseID"].nunique()

5027

In [5]:
# Inspect the raw answer labels of this column before they are cleaned below
df_survey_clean["Q-sexual-orientation"].value_counts()

Q-sexual-orientation
heterosexual (straight)    3858
LGBTQ+                     1111
prefer not to say            58
Name: count, dtype: int64

### Data cleaning

In [6]:
# Edit "Q-demos-gender" column
def replace_gender(gender):
    """Merge two gender answers into one combined label.

    Returns "Other or prefer not to say" when ``gender`` equals
    "Prefer not to say" or "Other"; any other value is returned unchanged.
    """
    merged_answers = ("Prefer not to say", "Other")
    return "Other or prefer not to say" if gender in merged_answers else gender


# Apply the function to the column, overwriting it in place
df_survey_clean["Q-demos-gender"] = df_survey_clean["Q-demos-gender"].apply(
    replace_gender
)


# Edit "Q-demos-age" column
def clean_age_category(age):
    """Shorten an age-bracket label.

    Any label containing "and older" becomes "65+"; otherwise every
    occurrence of " years" is removed from the label.
    """
    return "65+" if "and older" in age else age.replace(" years", "")


# Overwrite the age column with the shortened bracket labels
df_survey_clean["Q-demos-age"] = df_survey_clean["Q-demos-age"].apply(
    clean_age_category
)


# Edit "Q-demos-education" column
def remove_parentheses(text):
    """Return the part of ``text`` before the first "(", stripped of whitespace.

    Text without an opening parenthesis is returned whole (still stripped).
    """
    before_paren, _, _ = text.partition("(")
    return before_paren.strip()


# Keep only the text before the first "(" in each education answer
df_survey_clean["Q-demos-education"] = df_survey_clean["Q-demos-education"].apply(
    remove_parentheses
)


# Edit "Q-demos-income" column
def clean_income(value):
    """Shorten a survey income bracket to a compact label in thousands.

    Mapping:
      * contains "Less than"          -> "Under $25K"
      * contains "or more"            -> "Over $150K"
      * contains "Prefer not to say"  -> returned unchanged
      * "$25,000 - $49,999" style     -> "$25 - $49.9K"

    Labels already produced by this function are returned unchanged, so the
    cleaning cell can be re-run on an already-cleaned column without raising.

    Parameters
    ----------
    value : str
        Raw (or already cleaned) income answer.

    Returns
    -------
    str
        The compact label.

    Raises
    ------
    ValueError
        If ``value`` is none of the above and cannot be parsed as a
        "<min> - <max>" range of integers.
    """
    # Idempotence guard: output of a previous run passes straight through.
    if value in ("Under $25K", "Over $150K") or value.endswith(".9K"):
        return value

    if "Less than" in value:
        return "Under $25K"
    if "or more" in value:
        return "Over $150K"
    if "Prefer not to say" in value:
        return value

    # "$25,000 - $49,999" -> "25000-49999" -> ("25000", "49999")
    bounds = value.replace("$", "").replace(",", "").replace(" - ", "-")
    min_val, max_val = bounds.split("-")
    min_val = int(min_val) // 1000
    max_val = int(max_val) // 1000
    # The ".9K" suffix is fixed; it presumes the upper bound ends in ",999".
    return f"${min_val} - ${max_val}.9K"


# Overwrite the income column with the compact bracket labels
df_survey_clean["Q-demos-income"] = df_survey_clean["Q-demos-income"].apply(
    clean_income
)

# Edit "Q-sexual-orientation" column
# Keep only the text before the first "(" in each answer
df_survey_clean["Q-sexual-orientation"] = df_survey_clean["Q-sexual-orientation"].apply(
    remove_parentheses
)


def capitalized_case(s, preserve_acronyms=True):
    """Convert a label to sentence case, turning underscores into spaces.

    The first word is capitalised and the remaining words are lower-cased.
    With ``preserve_acronyms`` (the default), any space/underscore-separated
    token longer than one character that is entirely upper-case is kept
    verbatim, so "LGBTQ+" stays "LGBTQ+" instead of becoming "Lgbtq+"
    (``str.capitalize`` lower-cases everything after the first character).

    Parameters
    ----------
    s : str or missing value
        Label to normalise. Missing values (``pd.isna``) are returned as-is.
    preserve_acronyms : bool, default True
        Pass ``False`` to lower-case all-caps tokens as well, e.g. to turn
        "SOME_VALUE" into "Some value".

    Returns
    -------
    str or missing value
        The normalised label.
    """
    if pd.isna(s):
        return s

    def _is_acronym(token):
        # Single letters are excluded so a lone upper-case letter is still lower-cased.
        return preserve_acronyms and len(token) > 1 and token.isupper()

    tokens = s.replace("_", " ").split(" ")
    normalised = [tok if _is_acronym(tok) else tok.lower() for tok in tokens]
    if not _is_acronym(tokens[0]):
        normalised[0] = tokens[0].capitalize()
    return " ".join(normalised)


# Normalise the casing of the "Q-sexual-orientation" answers
df_survey_clean["Q-sexual-orientation"] = df_survey_clean["Q-sexual-orientation"].apply(
    capitalized_case
)

# Edit "Q-amazon-use-howmany" column: keep only the text before the first "("
df_survey_clean["Q-amazon-use-howmany"] = df_survey_clean["Q-amazon-use-howmany"].apply(
    remove_parentheses
)

# Edit "Q-amazon-use-hh-size" column: keep only the text before the first "("
df_survey_clean["Q-amazon-use-hh-size"] = df_survey_clean["Q-amazon-use-hh-size"].apply(
    remove_parentheses
)


# Edit "Q-amazon-use-how-oft" column
def clean_frequency(value):
    """Abbreviate a purchase-frequency answer.

    Answers containing "Less than" become "< 5 times/month", answers
    containing "More than" become "> 10 times/month"; in any other answer
    "times per month" is rewritten as "times/month".
    """
    # Checked in order; the first marker found decides the label.
    fixed_labels = (
        ("Less than", "< 5 times/month"),
        ("More than", "> 10 times/month"),
    )
    for marker, label in fixed_labels:
        if marker in value:
            return label
    return value.replace("times per month", "times/month")


# Overwrite the frequency column with the abbreviated labels
df_survey_clean["Q-amazon-use-how-oft"] = df_survey_clean["Q-amazon-use-how-oft"].apply(
    clean_frequency
)

# Edit "Q-demos-state" column
# Relabel the "did not reside in the United States" answer as "No information"
df_survey_clean["Q-demos-state"] = df_survey_clean["Q-demos-state"].replace(
    "I did not reside in the United States", "No information"
)

In [7]:
# Respondents per state in alphabetical order; the relabelled answer appears as "No information"
df_survey_clean["Q-demos-state"].value_counts().sort_index()

Q-demos-state
Alabama                  67
Alaska                   10
Arizona                  95
Arkansas                 46
California              505
Colorado                 91
Connecticut              39
Delaware                 13
District of Columbia     14
Florida                 327
Georgia                 161
Hawaii                   21
Idaho                    18
Illinois                216
Indiana                 121
Iowa                     46
Kansas                   39
Kentucky                 91
Louisiana                56
Maine                    19
Maryland                103
Massachusetts           117
Michigan                164
Minnesota                97
Mississippi              35
Missouri                 67
Montana                   8
Nebraska                 34
Nevada                   54
New Hampshire            21
New Jersey              117
New Mexico               27
New York                300
No information            2
North Carolina          189
North 

### Merge dataset: Survey + US Region

In [8]:
# Load the state lookup table (state name, state code, region, division)
df_census = pd.read_csv("data/raw/us_census_bureau_regions_divisions.csv")

In [9]:
# Column names, non-null counts and dtypes of the lookup table
df_census.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   State       51 non-null     object
 1   State Code  51 non-null     object
 2   Region      51 non-null     object
 3   Division    51 non-null     object
dtypes: object(4)
memory usage: 1.7+ KB


### Data cleaning

In [10]:
# Add a placeholder "No information" row so survey answers relabelled to
# "No information" still find a match in the lookup table when merged.
# The guard makes the cell idempotent: re-running it must not append a second
# placeholder row, which would duplicate those respondents in the merge.
# The state column is addressed by position because its name is "State"
# before the rename cell and "Q-demos-state" after it.
if not df_census.iloc[:, 0].eq("No information").any():
    df_census.loc[len(df_census)] = ["No information"] * 4

In [11]:
# Rename "State" to "Q-demos-state" so the lookup table shares the survey's
# state column name, which is used as the merge key
df_census = df_census.rename(columns={"State": "Q-demos-state"})

### Merge dataset

In [12]:
# Attach state code, region and division to every survey response.
# validate="one_to_many" makes pandas raise MergeError if a state occurs more
# than once in df_census, which would otherwise silently duplicate respondents.
df_survey = pd.merge(
    df_census,
    df_survey_clean,
    on="Q-demos-state",
    how="inner",
    validate="one_to_many",
)

# An inner join silently drops respondents whose state has no match in the
# lookup table; fail loudly instead of losing rows.
assert len(df_survey) == len(df_survey_clean), (
    f"merge changed the row count: {len(df_survey_clean)} -> {len(df_survey)}"
)

### Reduce dataset size

In [13]:
# Cast every object-dtype column except the respondent ID to "category"
df_survey = df_survey.astype(
    {
        name: "category"
        for name, dtype in df_survey.dtypes.items()
        if dtype == "object" and name != "Survey ResponseID"
    }
)

In [14]:
# Number of distinct response IDs in the merged frame, to compare with the count before the merge
df_survey["Survey ResponseID"].nunique()

5027

### Reorder columns

In [15]:
# Current column order of the merged frame (lookup-table columns come first)
df_survey.columns.tolist()

['Q-demos-state',
 'State Code',
 'Region',
 'Division',
 'Survey ResponseID',
 'Q-demos-age',
 'Q-demos-hispanic',
 'Q-demos-race',
 'Q-demos-education',
 'Q-demos-income',
 'Q-demos-gender',
 'Q-sexual-orientation',
 'Q-amazon-use-howmany',
 'Q-amazon-use-hh-size',
 'Q-amazon-use-how-oft',
 'Q-substance-use-cigarettes',
 'Q-substance-use-marijuana',
 'Q-substance-use-alcohol',
 'Q-personal-diabetes',
 'Q-personal-wheelchair',
 'Q-life-changes',
 'Q-sell-YOUR-data',
 'Q-sell-consumer-data',
 'Q-small-biz-use',
 'Q-census-use',
 'Q-research-society']

In [16]:
# Put the respondent ID first and keep every other column in its current
# relative order. Deriving the list from the frame (rather than hard-coding all
# column names) means a column added upstream is not silently dropped by the
# reordering step, and re-running this cell after reordering gives the same list.
new_order = ["Survey ResponseID"] + [
    column for column in df_survey.columns if column != "Survey ResponseID"
]

In [17]:
# Select the columns in the order given by `new_order`
df_survey = df_survey[new_order]

In [18]:
# Preview the first two rows of the reordered frame
df_survey.head(2)

Unnamed: 0,Survey ResponseID,Q-demos-state,State Code,Region,Division,Q-demos-age,Q-demos-hispanic,Q-demos-race,Q-demos-education,Q-demos-income,...,Q-substance-use-marijuana,Q-substance-use-alcohol,Q-personal-diabetes,Q-personal-wheelchair,Q-life-changes,Q-sell-YOUR-data,Q-sell-consumer-data,Q-small-biz-use,Q-census-use,Q-research-society
0,R_2dYGdPlFmvZpCk6,Alaska,AK,West,Pacific,25 - 34,No,White or Caucasian,Bachelor's degree,$100 - $149.9K,...,No,No,Yes,No,Moved place of residence,No,No,No,Yes,Yes
1,R_DAfQp2Y7dO2oVvX,Alaska,AK,West,Pacific,25 - 34,No,White or Caucasian,High school diploma or GED,$25 - $49.9K,...,I stopped in the recent past,Yes,No,No,Moved place of residence,Yes if I get part of the profit,Yes if consumers get part of the profit,Yes,Yes,Yes


In [19]:
# Check the gender recode: only the combined "Other or prefer not to say" label should remain alongside the unchanged answers
df_survey["Q-demos-gender"].value_counts()

Q-demos-gender
Female                        2589
Male                          2311
Other or prefer not to say     127
Name: count, dtype: int64

In [20]:
# Final overview: column order, non-null counts and dtypes after the category conversion
df_survey.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5027 entries, 0 to 5026
Data columns (total 26 columns):
 #   Column                      Non-Null Count  Dtype   
---  ------                      --------------  -----   
 0   Survey ResponseID           5027 non-null   object  
 1   Q-demos-state               5027 non-null   category
 2   State Code                  5027 non-null   category
 3   Region                      5027 non-null   category
 4   Division                    5027 non-null   category
 5   Q-demos-age                 5027 non-null   category
 6   Q-demos-hispanic            5027 non-null   category
 7   Q-demos-race                5027 non-null   category
 8   Q-demos-education           5027 non-null   category
 9   Q-demos-income              5027 non-null   category
 10  Q-demos-gender              5027 non-null   category
 11  Q-sexual-orientation        5027 non-null   category
 12  Q-amazon-use-howmany        5027 non-null   category
 13  Q-amazon-use-hh-si

### Save dataset

In [21]:
# Write the merged, reordered frame to a Parquet file
df_survey.to_parquet("data/ready/amazon_survey.parquet")