# Preliminary Testing

Getting to know what the data actually contains and what can be done with it.

In [64]:
import pandas as pd

data = pd.read_csv("thanksgiving.csv", encoding = "Latin-1")
print(data.head(5))

   RespondentID Do you celebrate Thanksgiving?  \
0    4337954960                            Yes   
1    4337951949                            Yes   
2    4337935621                            Yes   
3    4337933040                            Yes   
4    4337931983                            Yes   

  What is typically the main dish at your Thanksgiving dinner?  \
0                                             Turkey             
1                                             Turkey             
2                                             Turkey             
3                                             Turkey             
4                                           Tofurkey             

  What is typically the main dish at your Thanksgiving dinner? - Other (please specify)  \
0                                                NaN                                      
1                                                NaN                                      
2                            

In [65]:
data.columns

Index([u'RespondentID', u'Do you celebrate Thanksgiving?',
       u'What is typically the main dish at your Thanksgiving dinner?',
       u'What is typically the main dish at your Thanksgiving dinner? - Other (please specify)',
       u'How is the main dish typically cooked?',
       u'How is the main dish typically cooked? - Other (please specify)',
       u'What kind of stuffing/dressing do you typically have?',
       u'What kind of stuffing/dressing do you typically have? - Other (please specify)',
       u'What type of cranberry saucedo you typically have?',
       u'What type of cranberry saucedo you typically have? - Other (please specify)',
       u'Do you typically have gravy?',
       u'Which of these side dishes aretypically served at your Thanksgiving dinner? Please select all that apply. - Brussel sprouts',
       u'Which of these side dishes aretypically served at your Thanksgiving dinner? Please select all that apply. - Carrots',
       u'Which of these side dishes arety

# Analyzing the number of people who celebrate Thanksgiving

In [66]:
y_n_col = data["Do you celebrate Thanksgiving?"]
#print(y_n_col)

print(y_n_col.value_counts())

Yes    980
No      78
Name: Do you celebrate Thanksgiving?, dtype: int64


In [67]:
# filter out the rows that do not celebrate thanksgiving

data = data[data["Do you celebrate Thanksgiving?"] == "Yes"]

# confirms that only the rows with "Yes" are present in the column
print(data["Do you celebrate Thanksgiving?"].value_counts())

Yes    980
Name: Do you celebrate Thanksgiving?, dtype: int64


# Main dish

In [68]:
print(data["What is typically the main dish at your Thanksgiving dinner?"].value_counts())

Turkey                    859
Other (please specify)     35
Ham/Pork                   29
Tofurkey                   20
Chicken                    12
Roast beef                 11
I don't know                5
Turducken                   3
Name: What is typically the main dish at your Thanksgiving dinner?, dtype: int64


In [69]:
main_dish_rows = data[data["What is typically the main dish at your Thanksgiving dinner?"] == "Tofurkey"]
print(main_dish_rows["Do you typically have gravy?"].value_counts())

Yes    12
No      8
Name: Do you typically have gravy?, dtype: int64


# Was a pie eaten at all?

In [70]:
apple_isnull = data["Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Apple"].isnull()
pumpkin_isnull = data["Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Pumpkin"].isnull()
pecan_isnull = data["Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Pecan"].isnull()

ate_pies = apple_isnull & pumpkin_isnull & pecan_isnull
print(ate_pies.value_counts())

False    876
True     104
dtype: int64


# Get the integer Age

In [71]:
def extractAge(ser):
    if pd.isnull(ser):
        return None
    
    # split() returns a list of strings, for example,
    # 18 - 29 would yield ["18", "-", "29"]
    # the [0] notation indicates we only want the first element
    # and not the while list
    ser = ser.split(" ")[0] 
    ser = ser.replace("+", " ")
    return int(ser)

In [72]:
data["int_age"] = data["Age"].apply(extractAge)
data["int_age"].describe()

count    947.000000
mean      40.089757
std       15.352014
min       18.000000
25%       30.000000
50%       45.000000
75%       60.000000
max       60.000000
Name: int_age, dtype: float64

# Findings so far

So far, I have found that majority of the people who celebrated thanksgiving (n = 980) had turkey as the main dish. About 89.3% of these people also had some sort pie as well. 

Of the people who had tofurkey as the main dish, 60% of them also had gravy. 

The results regarding the age are not a true depiction of the survey participants as we only took the first number of the age range. But, if we keep in mind the age-range each number represents then we can get a better idea of the ages of the partcipants.


# Get the integer Income

In [73]:
def extractIncome(str_income):
    if (pd.isnull(str_income)):
        return None
    
    str_income = str_income.split(" ")[0]
    
    if (str_income == "Prefer"):
        return None
    
    str_income = str_income.replace("$", "")
    str_income = str_income.replace(",", "")
    
    return int(str_income)
    

In [74]:
data["int_income"] = data["How much total combined money did all members of your HOUSEHOLD earn last year?"].apply(extractIncome)
data["int_income"].describe()

count       829.000000
mean      75965.018094
std       59068.636748
min           0.000000
25%       25000.000000
50%       75000.000000
75%      100000.000000
max      200000.000000
Name: int_income, dtype: float64

In [75]:
data["int_income"].value_counts()

25000.0     166
75000.0     127
50000.0     127
100000.0    109
200000.0     76
10000.0      60
0.0          52
125000.0     48
150000.0     38
175000.0     26
Name: int_income, dtype: int64

# Findings

We should be aware that this is not a true depiction of the incomes of the survey partcipants. We only took the first values of the incomes in the income ranges so the values are skewed a little bit downwards.

# Correlating travel distance with income

In [76]:
below_150000 = data[data["int_income"] < 150000]
print(below_150000["How far will you travel for Thanksgiving?"].value_counts())

Thanksgiving is happening at my home--I won't travel at all                         281
Thanksgiving is local--it will take place in the town I live in                     203
Thanksgiving is out of town but not too far--it's a drive of a few hours or less    150
Thanksgiving is out of town and far away--I have to drive several hours or fly       55
Name: How far will you travel for Thanksgiving?, dtype: int64


In [77]:
above_150000 = data[data["int_income"] > 150000]
print(above_150000["How far will you travel for Thanksgiving?"].value_counts())

Thanksgiving is happening at my home--I won't travel at all                         49
Thanksgiving is local--it will take place in the town I live in                     25
Thanksgiving is out of town but not too far--it's a drive of a few hours or less    16
Thanksgiving is out of town and far away--I have to drive several hours or fly      12
Name: How far will you travel for Thanksgiving?, dtype: int64


In [78]:
data.pivot_table(
    index="Have you ever tried to meet up with hometown friends on Thanksgiving night?", 
    columns='Have you ever attended a "Friendsgiving?"', 
    values="int_age")

"Have you ever attended a ""Friendsgiving?""",No,Yes
Have you ever tried to meet up with hometown friends on Thanksgiving night?,Unnamed: 1_level_1,Unnamed: 2_level_1
No,42.283702,37.010526
Yes,41.47541,33.976744


In [79]:
data.pivot_table(
    index="Have you ever tried to meet up with hometown friends on Thanksgiving night?", 
    columns='Have you ever attended a "Friendsgiving?"', 
    values="int_income")

"Have you ever attended a ""Friendsgiving?""",No,Yes
Have you ever tried to meet up with hometown friends on Thanksgiving night?,Unnamed: 1_level_1,Unnamed: 2_level_1
No,78914.549654,72894.736842
Yes,78750.0,66019.736842


# Findings

From the two pivot tables above, we find that younger people in their early thirties more often celebrate thanksgiving with their friends and attend a friendsgiving as opposed to a normal thanksgiving. The average income of this group is also lower than the other groups.

# What desert people ate?

In [80]:
apple_cobbler_isnull = data["Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Apple cobbler"].isnull()
blondies_isnull = data["Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Blondies"].isnull()
brownies_isnull = data["Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Brownies"].isnull()
cheesecake_isnull = data["Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Cheesecake"].isnull()
cookies_isnull = data["Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Cookies"].isnull()
fudge_isnull = data["Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Fudge"].isnull()
icecream_isnull = data["Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Ice cream"].isnull()
peach_cobbler_isnull = data["Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Peach cobbler"].isnull()
none_isnull = data["Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - None"].isnull()
other_isnull = data["Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Other (please specify)"].isnull()

ate_desserts = apple_cobbler_isnull & blondies_isnull & brownies_isnull & cheesecake_isnull & cookies_isnull & fudge_isnull & icecream_isnull & peach_cobbler_isnull & none_isnull & other_isnull

print(ate_desserts.value_counts())

False    952
True      28
dtype: int64


# Findings

It appears that out of the 980 people, about 97.1% of people had some sort of desert on Thanksgiving.