In [4]:
import pandas as pd
import numpy as np
#read data
data = pd.read_csv('thanksgiving.csv', encoding='Latin-1')
#show first row
print(data.head(1))

   RespondentID Do you celebrate Thanksgiving?  \
0    4337954960                            Yes   

  What is typically the main dish at your Thanksgiving dinner?  \
0                                             Turkey             

  What is typically the main dish at your Thanksgiving dinner? - Other (please specify)  \
0                                                NaN                                      

  How is the main dish typically cooked?  \
0                                  Baked   

  How is the main dish typically cooked? - Other (please specify)  \
0                                                NaN                

  What kind of stuffing/dressing do you typically have?  \
0                                        Bread-based      

  What kind of stuffing/dressing do you typically have? - Other (please specify)  \
0                                                NaN                               

  What type of cranberry saucedo you typically have?  \
0          

In [5]:
#show column values
print(data.columns)

Index(['RespondentID', 'Do you celebrate Thanksgiving?',
       'What is typically the main dish at your Thanksgiving dinner?',
       'What is typically the main dish at your Thanksgiving dinner? - Other (please specify)',
       'How is the main dish typically cooked?',
       'How is the main dish typically cooked? - Other (please specify)',
       'What kind of stuffing/dressing do you typically have?',
       'What kind of stuffing/dressing do you typically have? - Other (please specify)',
       'What type of cranberry saucedo you typically have?',
       'What type of cranberry saucedo you typically have? - Other (please specify)',
       'Do you typically have gravy?',
       'Which of these side dishes aretypically served at your Thanksgiving dinner? Please select all that apply. - Brussel sprouts',
       'Which of these side dishes aretypically served at your Thanksgiving dinner? Please select all that apply. - Carrots',
       'Which of these side dishes aretypically served

In [6]:
# see categories for 'Do you celebrate Thanksgiving?' column value
celebrate_counts = data['Do you celebrate Thanksgiving?'].value_counts()
print(celebrate_counts)

Yes    980
No      78
Name: Do you celebrate Thanksgiving?, dtype: int64


In [7]:
# there are rows which do not celebrate thanksgiving, we will drop them, they are useless to us

#get all row indexes that are not celebrating thanksgiving
celeb_no_rows_index = data[data['Do you celebrate Thanksgiving?'] != 'Yes'].index

#drop all rows that match indexes in the list, modify the DF in place
data.drop(celeb_no_rows_index,inplace=True)

#check if only celebrating rows reamined in the DF
celebrate_counts = data['Do you celebrate Thanksgiving?'].value_counts()
print(celebrate_counts)

Yes    980
Name: Do you celebrate Thanksgiving?, dtype: int64


In [8]:
#see what kind of dishes people do eat for thanksgiving
dishes_cnt = data['What is typically the main dish at your Thanksgiving dinner?'].value_counts()
print(dishes_cnt)

Turkey                    859
Other (please specify)     35
Ham/Pork                   29
Tofurkey                   20
Chicken                    12
Roast beef                 11
I don't know                5
Turducken                   3
Name: What is typically the main dish at your Thanksgiving dinner?, dtype: int64


In [9]:
#make filter for rows that have tofurkey for dinner
has_tofurkey_filter = data['What is typically the main dish at your Thanksgiving dinner?'] == 'Tofurkey'
#get rows that have tofurkey for dinner
tofurkey_dish_indexes = data[has_tofurkey_filter]

gravy_tofurkey = tofurkey_dish_indexes['Do you typically have gravy?'] 
print(gravy_tofurkey[0:5])

4     Yes
33    Yes
69     No
72     No
77    Yes
Name: Do you typically have gravy?, dtype: object


In [10]:
def pieStatsString(df, pie_type):
    many_pies = isinstance(pie_type, list)
    print((','.join(pie_type) + ' pies were eaten ' if many_pies else pie_type + ' pie was eaten ') + str(len(df)) + ' times out of total ' + str(len(data)) + ' dinners')

# make boolean series for ate apple pie
applepie_not_ate_filter = pd.isnull(data['Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Apple'])
#make boolean series for ate pumpkin pie
pumpkinpie_not_ate_filter = pd.isnull(data['Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Pumpkin'])
#make boolean series for ate pecan pie
pecanpie_not_ate_filter = pd.isnull(data['Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Pecan'])


ate_appliepie = data[applepie_not_ate_filter == False]['Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Apple']
pieStatsString(ate_appliepie, 'Apple')
#with value_counts()
print(ate_appliepie.value_counts())


ate_pumpkin = data[pumpkinpie_not_ate_filter == False]['Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Pumpkin']
pieStatsString(ate_pumpkin, 'Pumpkin')
#with value_counts()
print(ate_pumpkin.value_counts())

ate_pecanpie = data[pecanpie_not_ate_filter == False]['Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Pecan']
pieStatsString(ate_pecanpie, 'Pecan')
#with value_counts()
print(ate_pecanpie.value_counts())

#boolean series for ate pies
ate_all_pies_filter = (applepie_not_ate_filter==False) & (pumpkinpie_not_ate_filter==False) & (pecanpie_not_ate_filter==False)

ate_all_pies = data[ate_all_pies_filter]
pieStatsString(ate_all_pies, ['Apple','pumpkin','pecan'])

#let's add additiona column so we can easily get rows that ate all type of pies we were looking into at this cell

data['ate_all_pies'] = ate_all_pies_filter.astype(int)

#lets not fetch it from the table
ate_all_pies_column = data['ate_all_pies']

#let's show value counts for ate_all_pies_col
# 1 is true 0 is false

ate_all_pies_column.value_counts()


Apple pie was eaten 514 times out of total 980 dinners
Apple    514
Name: Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Apple, dtype: int64
Pumpkin pie was eaten 729 times out of total 980 dinners
Pumpkin    729
Name: Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Pumpkin, dtype: int64
Pecan pie was eaten 342 times out of total 980 dinners
Pecan    342
Name: Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Pecan, dtype: int64
Apple,pumpkin,pecan pies were eaten 137 times out of total 980 dinners


0    843
1    137
Name: ate_all_pies, dtype: int64

In [11]:
#maps age string with custom logic to int type
#try/except is used instead of checking for null values in argument, since this is known as pythonic way of programming
def parseAge(age_string):
    try:
        if '+' in age_string:
            return int(age_string[0:len(age_string)-1])
        
        return int(age_string.split(' ')[0])
        
    except TypeError as e:
        return None
    
age_col = data['Age']
# check type of age field in row
print(type(age_col.iloc[0]))
#check different categories of ages
#so we can make a parser
age_col.value_counts()

#make new column that will have age values as ints so we can perform some computation on it
data['int_age'] = age_col.apply(lambda age: parseAge(age))

#check if new column looks ok
print(type(data['int_age'].iloc[0]))

data['int_age'].describe()



<class 'str'>
<class 'numpy.float64'>


count    947.000000
mean      40.089757
std       15.352014
min       18.000000
25%       30.000000
50%       45.000000
75%       60.000000
max       60.000000
Name: int_age, dtype: float64

In [12]:
#Is this a true depiction of the ages of survey participants?
#Not really, summary statistics displayed above could be out of accuracy for the given data set because
#we were taking always the lowest possbile age for each category which means our mean is always lower
#which implies that standard deviation is lower as well, so the actual age population of this sample data set 
#is probably more sparse 

#for instance
#out of 218 people from category 18-29 we took each of them as 18 year old, which is quite bad 
#assumption(probably very bad assumption, especially when it comes to analyzing data)
data['Age'].value_counts()


45 - 59    269
60+        258
30 - 44    235
18 - 29    185
Name: Age, dtype: int64

In [13]:
import re

total_earned_money = data['How much total combined money did all members of your HOUSEHOLD earn last year?']

#display categories for total earned money, so we can write a parser
total_earned_money.value_counts()

#function for parsing money
def parseMoney(money_string):
    try:
        money_string = money_string.lower()
        if 'prefer' in money_string:
            return None

        return int(re.sub('\\$|,', '', money_string.split(' ')[0]))
    except (TypeError, AttributeError) as e:
        return None

#make integer income column so we can do computations 
data['int_income'] = total_earned_money.apply(lambda x: parseMoney(x))

data['int_income'].describe()

count       829.000000
mean      75965.018094
std       59068.636748
min           0.000000
25%       25000.000000
50%       75000.000000
75%      100000.000000
max      200000.000000
Name: int_income, dtype: float64

In [37]:
#Is there anything that we should be aware of about the results or our methodology?
#Is this a true depiction of the incomes of survey participants?

#i think there might be same problem as with age since we are always taking the lower boundary for wage
# which can be quite untrue and therefore describe function gives bad statistics information on the data-set

In [43]:
travel = data['How far will you travel for Thanksgiving?']
income = data['int_income']

income_less_150000 = income < 150000
income_less_150000_rows = data[income_less_150000]
income_less_150000_travel_col = income_less_150000_rows['How far will you travel for Thanksgiving?']
print('People with income lower than 150000')
print(income_less_150000_travel_col.value_counts())
percentage_people_do_travel_income_lower_150000 = 205/len(income_less_150000_travel_col)
percentage_people_do_travel_fly_income_lower_150000 = 55/len(income_less_150000_travel_col)
print(str(percentage_people_do_travel_income_lower_150000) + '% of people with lower income are traveling, going out of town for thanksgiving')
print(str(percentage_people_do_travel_fly_income_lower_150000) + '% of people with lower income might take a plane for thanksgiving')

income_greater_150000 = income > 150000
income_greater_150000_rows = data[income_greater_150000]
income_greater_150000_travel_col = income_greater_150000_rows['How far will you travel for Thanksgiving?']
print('People with income greater than 150000')
print(income_greater_150000_travel_col.value_counts())
percentage_people_do_travel_income_greater_150000 = 28 / len(income_greater_150000_travel_col)
percentage_people_do_travel_fly_income_greater_150000 = 12 / len(income_greater_150000_travel_col)
print(str(percentage_people_do_travel_income_greater_150000) + '% of people with higher income are traveling, goint out of town for thankgsiving')
print(str(percentage_people_do_travel_fly_income_greater_150000) + '% of people with higher income might take a plane for thanksgiving')

People with income lower than 150000
Thanksgiving is happening at my home--I won't travel at all                         281
Thanksgiving is local--it will take place in the town I live in                     203
Thanksgiving is out of town but not too far--it's a drive of a few hours or less    150
Thanksgiving is out of town and far away--I have to drive several hours or fly       55
Name: How far will you travel for Thanksgiving?, dtype: int64
0.29753265602322204% of people with lower income are traveling, going out of town for thanksgiving
0.07982583454281568% of people with lower income might take a plane for thanksgiving
People with income greater than 150000
Thanksgiving is happening at my home--I won't travel at all                         49
Thanksgiving is local--it will take place in the town I live in                     25
Thanksgiving is out of town but not too far--it's a drive of a few hours or less    16
Thanksgiving is out of town and far away--I have to drive several

In [44]:
#by looking at the numbers above
#we can say that higher percent of people with greater income are ready to go on longer trip
#that may require affording ticket for the plane


In [80]:
data.pivot_table(values='int_age',index="Have you ever tried to meet up with hometown friends on Thanksgiving night?", columns='Have you ever attended a "Friendsgiving?"')


"Have you ever attended a ""Friendsgiving?""",No,Yes
Have you ever tried to meet up with hometown friends on Thanksgiving night?,Unnamed: 1_level_1,Unnamed: 2_level_1
No,42.283702,37.010526
Yes,41.47541,33.976744


In [81]:
data.pivot_table(values='int_income', index="Have you ever tried to meet up with hometown friends on Thanksgiving night?", columns='Have you ever attended a "Friendsgiving?"')

"Have you ever attended a ""Friendsgiving?""",No,Yes
Have you ever tried to meet up with hometown friends on Thanksgiving night?,Unnamed: 1_level_1,Unnamed: 2_level_1
No,78914.549654,72894.736842
Yes,78750.0,66019.736842


In [82]:
#obesrving the pivot tables above we can make a conclusion that youngest people did attend friendsgiving and met up
#with hometown friends during thanksgiving night
#also those that do meet their friends on thanksgiving and attend a friendsgiving do have lowest incomes as well
#This analysis confirms our previous analysis that higher income people more often go on longer trips which excludes
#option to meet up with a friend in hometown and celebrate thanksgiving