# Dataquest.io - Thanksgiving Dinner Project
## 1. Import data and display first 5 rows and columns

In [138]:
import pandas as pd
import numpy as np
data = pd.read_csv("csv/thanksgiving-2015-poll-data.csv", encoding="Latin-1")
print(data.head())
print(data.columns)

   RespondentID Do you celebrate Thanksgiving?  \
0    4337954960                            Yes   
1    4337951949                            Yes   
2    4337935621                            Yes   
3    4337933040                            Yes   
4    4337931983                            Yes   

  What is typically the main dish at your Thanksgiving dinner?  \
0                                             Turkey             
1                                             Turkey             
2                                             Turkey             
3                                             Turkey             
4                                           Tofurkey             

  What is typically the main dish at your Thanksgiving dinner? - Other (please specify)  \
0                                                NaN                                      
1                                                NaN                                      
2                            

## 2. Remove people who do not celebrate Thanksgiving

Number of categories of answers in the question "Do you celebrate Thanksgiving?"

In [139]:
print(data["Do you celebrate Thanksgiving?"].value_counts())

Yes    980
No      78
Name: Do you celebrate Thanksgiving?, dtype: int64


Remove all the "No" answers.

In [140]:
data = data[data["Do you celebrate Thanksgiving?"] == "Yes"]
print(data["Do you celebrate Thanksgiving?"].value_counts())

Yes    980
Name: Do you celebrate Thanksgiving?, dtype: int64


## 3. Exploring main dishes

In [141]:
print(data["What is typically the main dish at your Thanksgiving dinner?"].value_counts())

Turkey                    859
Other (please specify)     35
Ham/Pork                   29
Tofurkey                   20
Chicken                    12
Roast beef                 11
I don't know                5
Turducken                   3
Name: What is typically the main dish at your Thanksgiving dinner?, dtype: int64


In [142]:
tofurkey_main_dish_df = data[data["What is typically the main dish at your Thanksgiving dinner?"] == "Tofurkey"]
print(tofurkey_main_dish_df["Do you typically have gravy?"])

4      Yes
33     Yes
69      No
72      No
77     Yes
145    Yes
175    Yes
218     No
243    Yes
275     No
393    Yes
399    Yes
571    Yes
594    Yes
628     No
774     No
820     No
837    Yes
860     No
953    Yes
Name: Do you typically have gravy?, dtype: object


## 4. Figuring out what pies people eat

In [143]:
apple_isnull = data["Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Apple"].isnull()
pumpkin_isnull = data["Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Pumpkin"].isnull()
pecan_isnull = data["Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Pecan"].isnull()
not_ate_pies = data[apple_isnull & pumpkin_isnull & pecan_isnull]
print(len(data) - len(not_ate_pies))

876


## 5. Converting age to numeric

In [144]:
def convert_age(age_s):
    age = 0
    if pd.isnull(age_s):
        age = None
    else:
        age_s = age_s.split(" ")
        age_s[0] = age_s[0].replace("+", "")
        age = int(age_s[0])
    return age

age_int = data["Age"].apply(convert_age)
data["Int_Age"] = age_int
data["Int_Age"].describe()

count    947.000000
mean      40.089757
std       15.352014
min       18.000000
25%       30.000000
50%       45.000000
75%       60.000000
max       60.000000
Name: Int_Age, dtype: float64

### Notes for this step
* Our methodology makes the population's age much lower than actual.
* A better assumption could be the mean of the age range instead of the lowest age.

## 6. Converting income to numeric

In [145]:
def convert_income(income_s):
    income = 0
    if pd.isnull(income_s):
        income = None
    else:
        income_s = income_s.split(" ")
        if income_s[0] == "Prefer":
            income = None
        else:
            income_s[0] = income_s[0].replace(",", "")
            income_s[0] = income_s[0].replace("$", "")
            income = int(income_s[0])
    return income

int_income = data["How much total combined money did all members of your HOUSEHOLD earn last year?"].apply(convert_income)
data["Int_Income"] = int_income
data["Int_Income"].describe()

count       829.000000
mean      75965.018094
std       59068.636748
min           0.000000
25%       25000.000000
50%       75000.000000
75%      100000.000000
max      200000.000000
Name: Int_Income, dtype: float64

### Notes for this step
* This is a very inaccurate method to convert income.
* As a result, the income range `0 to $9,999` would be converted to 0.

## 7. Correlating travel distance and income

In [146]:
inc_under_15k_travel = data["How far will you travel for Thanksgiving?"][data["Int_Income"] < 150000]
inc_under_15k_travel.value_counts()

Thanksgiving is happening at my home--I won't travel at all                         281
Thanksgiving is local--it will take place in the town I live in                     203
Thanksgiving is out of town but not too far--it's a drive of a few hours or less    150
Thanksgiving is out of town and far away--I have to drive several hours or fly       55
Name: How far will you travel for Thanksgiving?, dtype: int64

In [147]:
inc_over_15k_travel = data["How far will you travel for Thanksgiving?"][data["Int_Income"] > 150000]
inc_over_15k_travel.value_counts()

Thanksgiving is happening at my home--I won't travel at all                         49
Thanksgiving is local--it will take place in the town I live in                     25
Thanksgiving is out of town but not too far--it's a drive of a few hours or less    16
Thanksgiving is out of town and far away--I have to drive several hours or fly      12
Name: How far will you travel for Thanksgiving?, dtype: int64

### Let's look at the percentage

In [148]:
inc_under_15k_travel_perc = inc_under_15k_travel.value_counts() / inc_under_15k_travel.value_counts().sum()
inc_under_15k_travel_perc

Thanksgiving is happening at my home--I won't travel at all                         0.407837
Thanksgiving is local--it will take place in the town I live in                     0.294630
Thanksgiving is out of town but not too far--it's a drive of a few hours or less    0.217707
Thanksgiving is out of town and far away--I have to drive several hours or fly      0.079826
Name: How far will you travel for Thanksgiving?, dtype: float64

In [149]:
inc_over_15k_travel_perc = inc_over_15k_travel.value_counts() / inc_over_15k_travel.value_counts().sum()
inc_over_15k_travel_perc

Thanksgiving is happening at my home--I won't travel at all                         0.480392
Thanksgiving is local--it will take place in the town I live in                     0.245098
Thanksgiving is out of town but not too far--it's a drive of a few hours or less    0.156863
Thanksgiving is out of town and far away--I have to drive several hours or fly      0.117647
Name: How far will you travel for Thanksgiving?, dtype: float64

It seems that there is not too big of a difference in terms of Thanksgiving travel distance between the normal and the rich households. The rich does tend to have Thanksgiving at home more often and are more likely to travel far by airplanes.

## 8. Linking friendship and age

In [155]:
data.pivot_table(index="Have you ever tried to meet up with hometown friends on Thanksgiving night?", columns='Have you ever attended a "Friendsgiving?"', values="Int_Age")

"Have you ever attended a ""Friendsgiving?""",No,Yes
Have you ever tried to meet up with hometown friends on Thanksgiving night?,Unnamed: 1_level_1,Unnamed: 2_level_1
No,42.283702,37.010526
Yes,41.47541,33.976744


## 9. Some potential next steps
### 9.1. Figure out the most common desert people eat
The output would be a dictionary with the number of people who typically have each desert. In this step, I would work on only the first five desserts.

In [167]:
desserts_index = ['Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Apple cobbler', 'Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Blondies', 'Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Brownies', 'Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Carrot cake', 'Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Cheesecake']
dessert_dict = {}
for dessert in desserts_index:
    is_not_null = pd.notnull(data[dessert])
    dessert_dict[dessert] = len(data[dessert][is_not_null])
dessert_dict

{'Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Apple cobbler': 110,
 'Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Blondies': 16,
 'Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Brownies': 128,
 'Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Carrot cake': 72,
 'Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Cheesecake': 191}