In [2]:
import pandas as pd
data = pd.read_csv("thanksgiving.csv", encoding = "Latin-1")
print(data.head())

   RespondentID Do you celebrate Thanksgiving?  \
0    4337954960                            Yes   
1    4337951949                            Yes   
2    4337935621                            Yes   
3    4337933040                            Yes   
4    4337931983                            Yes   

  What is typically the main dish at your Thanksgiving dinner?  \
0                                             Turkey             
1                                             Turkey             
2                                             Turkey             
3                                             Turkey             
4                                           Tofurkey             

  What is typically the main dish at your Thanksgiving dinner? - Other (please specify)  \
0                                                NaN                                      
1                                                NaN                                      
2                            

In [3]:
print(data.columns)

Index(['RespondentID', 'Do you celebrate Thanksgiving?',
       'What is typically the main dish at your Thanksgiving dinner?',
       'What is typically the main dish at your Thanksgiving dinner? - Other (please specify)',
       'How is the main dish typically cooked?',
       'How is the main dish typically cooked? - Other (please specify)',
       'What kind of stuffing/dressing do you typically have?',
       'What kind of stuffing/dressing do you typically have? - Other (please specify)',
       'What type of cranberry saucedo you typically have?',
       'What type of cranberry saucedo you typically have? - Other (please specify)',
       'Do you typically have gravy?',
       'Which of these side dishes aretypically served at your Thanksgiving dinner? Please select all that apply. - Brussel sprouts',
       'Which of these side dishes aretypically served at your Thanksgiving dinner? Please select all that apply. - Carrots',
       'Which of these side dishes aretypically served

In [10]:
data["Do you celebrate Thanksgiving?"].value_counts()

Yes    980
No      78
Name: Do you celebrate Thanksgiving?, dtype: int64

In [14]:
# filter out any people that don't celebrate thanksgiving
data = data.loc[data["Do you celebrate Thanksgiving?"] == "Yes"]

Yes    980
Name: Do you celebrate Thanksgiving?, dtype: int64


In [15]:
data["What is typically the main dish at your Thanksgiving dinner?"].value_counts()

Turkey                    859
Other (please specify)     35
Ham/Pork                   29
Tofurkey                   20
Chicken                    12
Roast beef                 11
I don't know                5
Turducken                   3
Name: What is typically the main dish at your Thanksgiving dinner?, dtype: int64

In [20]:
# see if people who have tofurkey for dinner typically have gravy too
tofurkey_filter = data.loc[data["What is typically the main dish at your Thanksgiving dinner?"] \
                           == "Tofurkey"]

gravy_and_tofurkey = tofurkey_filter["Do you typically have gravy?"]
gravy_and_tofurkey

4      Yes
33     Yes
69      No
72      No
77     Yes
145    Yes
175    Yes
218     No
243    Yes
275     No
393    Yes
399    Yes
571    Yes
594    Yes
628     No
774     No
820     No
837    Yes
860     No
953    Yes
Name: Do you typically have gravy?, dtype: object

In [39]:
# see how many people ate pies, if false means they ate at least one type of pie

apple_isnull = pd.isnull(data["Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Apple"])
pumpkin_isnull = pd.isnull(data["Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Pumpkin"])
pecan_isnull = pd.isnull(data["Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Pecan"])

ate_pies = apple_isnull & pumpkin_isnull & pecan_isnull
print(ate_pies.value_counts())

False    876
True     104
dtype: int64


In [50]:
# function to convert age strings to ints and remove the "+" from "60+"

def convert_ages(age_str):
    if pd.isnull(age_str):
        return None
    result = age_str.split(" ")[0]
    result = result.replace("+", "")
    result = int(result)
    return result

In [55]:
# create new column in the df with the converted int ages and show summary stats

data["int_age"] = data["Age"].apply(convert_ages)
data["int_age"].describe()

count    947.000000
mean      40.089757
std       15.352014
min       18.000000
25%       30.000000
50%       45.000000
75%       60.000000
max       60.000000
Name: int_age, dtype: float64

## We took just the first age in the ranges (e.g. 18 for 18-29) and assumed everyone was the first age (18) in the range when in reality they are not.  This is not a true depiction of the ages of the survey population as we are definitely low on all summary statistics.  

In [59]:
# function to convert incomes to ints from strings 

def convert_incomes(income_str):
    if pd.isnull(income_str):
        return None
    result = income_str.split(" ")[0]
    if result == "Prefer":
        return None
    result = result.replace("$", "")
    result = result.replace(",", "")
    result = int(result)
    return result

In [61]:
# display summary statistics of income

data["int_income"] = data["How much total combined money did all members of your HOUSEHOLD earn last year?"]\
.apply(convert_incomes)
data["int_income"].describe()

count       829.000000
mean      75965.018094
std       59068.636748
min           0.000000
25%       25000.000000
50%       75000.000000
75%      100000.000000
max      200000.000000
Name: int_income, dtype: float64

## Similar to the ages above, because we are just taking the first income in the range this is not a true depiction of income distributions.

In [62]:
# see how far people are willing to travel if their household income is <150K

under_150 = data.loc[data["int_income"]<150000]
under_150["How far will you travel for Thanksgiving?"].value_counts()

Thanksgiving is happening at my home--I won't travel at all                         281
Thanksgiving is local--it will take place in the town I live in                     203
Thanksgiving is out of town but not too far--it's a drive of a few hours or less    150
Thanksgiving is out of town and far away--I have to drive several hours or fly       55
Name: How far will you travel for Thanksgiving?, dtype: int64

In [63]:
# see how far people are willing to travel if their household income is >150K

over_150 = data.loc[data["int_income"] > 150000]
over_150["How far will you travel for Thanksgiving?"].value_counts()

Thanksgiving is happening at my home--I won't travel at all                         49
Thanksgiving is local--it will take place in the town I live in                     25
Thanksgiving is out of town but not too far--it's a drive of a few hours or less    16
Thanksgiving is out of town and far away--I have to drive several hours or fly      12
Name: How far will you travel for Thanksgiving?, dtype: int64

## It seems that people whose household incomes are over 150K travel less for Thanksgiving, although we would want to calculate proportions of each of these categorical responses to get a better idea.

In [64]:
# pivot table showing average age of people who have attended "Friendsgiving" and hometown friends

data.pivot_table(index = "Have you ever tried to meet up with hometown friends on Thanksgiving night?", columns = 'Have you ever attended a "Friendsgiving?"', values = "int_age")


"Have you ever attended a ""Friendsgiving?""",No,Yes
Have you ever tried to meet up with hometown friends on Thanksgiving night?,Unnamed: 1_level_1,Unnamed: 2_level_1
No,42.283702,37.010526
Yes,41.47541,33.976744


In [67]:
# pivot table showing average income of people who have spent Thanksgiving with friends

data.pivot_table(index = "Have you ever tried to meet up with hometown friends on Thanksgiving night?", columns = 'Have you ever attended a "Friendsgiving?"', values = "int_income")

"Have you ever attended a ""Friendsgiving?""",No,Yes
Have you ever tried to meet up with hometown friends on Thanksgiving night?,Unnamed: 1_level_1,Unnamed: 2_level_1
No,78914.549654,72894.736842
Yes,78750.0,66019.736842


## The above two tables show the average age and the average household income of those who have spent Thanksgiving with friends.  From the outset, it appears that those who have spent Thanksgiving with friends are on average younger and have lower household income.  Going forward it would be worthwhile to see if these are statistically significant differences or if this can be attributed to randomness

In [86]:
# figuring out the most common dessert
import numpy as np

dessert_list = data[['Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Apple cobbler', 
'Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Blondies', 
'Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Brownies',
'Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Carrot cake', 
'Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Cheesecake', 
'Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Cookies', 
'Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Fudge', 
'Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Ice cream', 
'Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Peach cobbler']]

dessert_dict = {}

for dessert in dessert_list:
    dessert_name = dessert.split("-")[1]
    dessert_dict[dessert_name] = (data[dessert]).dropna().count()
print(dessert_dict)
maximum = max(dessert_dict, key=dessert_dict.get)
minimum = min(dessert_dict, key = dessert_dict.get)
print(maximum, dessert_dict[maximum])
print(minimum, dessert_dict[minimum])

{' Blondies': 16, ' Peach cobbler': 103, ' Ice cream': 266, ' Cookies': 204, ' Carrot cake': 72, ' Cheesecake': 191, ' Fudge': 43, ' Apple cobbler': 110, ' Brownies': 128}
 Ice cream 266
 Blondies 16


## Of those who eat desserts, Ice Cream is the most popular and Blondies are the least popular.