In [3]:
import pandas as pd

# Load the survey responses.
# NOTE(review): a later cell writes the cleaned frame back to this same 'data.csv' path.
df = pd.read_csv('data.csv')

In [4]:
# Remove the Timestamp column.
# errors='ignore' keeps this cell re-runnable: a later cell overwrites 'data.csv'
# with the cleaned frame, so on the next fresh run the column is already gone
# and a plain drop would raise KeyError.
df = df.drop(columns=['Timestamp'], errors='ignore')

In [5]:
# Rename the columns: map each full survey question to a short column name.
# (The 'email' question used to be listed twice in this dict literal; a duplicate key is
# silently collapsed by Python, so it is listed once here.)
df = df.rename(columns={
    "Thank you for your participation. If you want to receive the survey results, enter your email address here. We won't share your email address with anyone, nor send you any ads.": 'email',
    "What's your age?": 'age',
    "What's your gender?": 'gender',
    "Where do you live?": 'homeland',
    "Where do you usually enjoy your holidays? Select all that apply.": 'target',
    "When on vacation, where do you usually stay? Select all that apply.": 'accommodation',
    "Who do you usually enjoy your holidays with? Select all that apply.": 'company',
    "What activities best describe your typical summer vacation? Select all that apply.": 'activities',
    "How do you choose where to go on vacation? Select all that apply.": 'decision',
    "What means of transport do you normally take when going on vacation? Select all that apply.": 'transport',
    "How much did you spend in total during your last summer holidays? Use this format: <cost> <currency> (e.g. 350 €)": 'cost',
    "How much did you enjoy your last summer vacation?": 'enjoyment',
})

In [6]:
# Remove the emails from the data and save the data back to the csv file.
# NOTE(review): this overwrites the input file 'data.csv' with the renamed, trimmed frame,
# so on later runs the 'email' column no longer exists; errors='ignore' keeps the cell
# re-runnable instead of raising KeyError: 'email'.
df = df.drop(columns=['email'], errors='ignore')
df.to_csv('data.csv', index=False)

In [7]:
# Extract the email column into a separate array
# emails = df['email'].values
# Remove the email column from the dataframe
# df = df.drop(columns=['email'])

KeyError: 'email'

In [None]:
# Keep 'Male' and 'Female' as they are; every other answer (missing values included) becomes 'Other'
df['gender'] = df['gender'].apply(lambda answer: answer if answer in ('Male', 'Female') else 'Other')

# Count the respondents per gender (shown as the cell output)
import matplotlib.pyplot as plt

genders = df.groupby('gender')['gender'].count()
genders

In [None]:
# Parse the raw cost values and convert to euros

import re
from typing import List, Tuple


# Conversion table to euros: each entry is ( [lower-case symbols / codes / names], rate ),
# where an amount in that currency is multiplied by `rate` to obtain euros.
# parse_cost walks the entries in order and stops at the first symbol it finds, so the
# ambiguous '¥' sign resolves to the yen entry (the renminbi '¥' below is never reached).
currency_table: Tuple[Tuple[List[str], float], ...] = (
    (['€', 'eur', 'euro', 'euros'], 1.00),
    (['$', 'usd', 'dollar', 'dollars'], 1.00),
    (['£', 'gbp', 'pound', 'pounds'], 1.15),
    (['cad'], 0.76),
    (['aud'], 0.67),
    (['chf'], 1.04),
    (['jpy', 'yen', 'yens', '¥', '円'], 0.0069),
    (['inr', '₹'], 0.013),
    (['php', '₱'], 0.018),
    (['sek'], 0.096),
    (['rmb', '¥', '元', 'renminbi'], 0.14),
    (['dkk'], 0.13),
    (['zar'], 0.057),
    (['pen', 'sol'], 0.26),
)


def assume_eur_if_no_symbol(cost: str):
    """Read `cost` as a bare number, or return None when it is not one.

    Every ',' and '+' is removed before the conversion, so '1,500' and '1000+'
    are accepted. Anything that float() still rejects (for instance text that
    carries a currency symbol) yields None.
    """
    stripped = cost.translate(str.maketrans('', '', ',+'))
    try:
        value = float(stripped)
    except ValueError:
        return None
    return value


def find_number(string) -> int | None:
    numbers = re.findall(r'\d+', string)

    if len(numbers) == 0:
        return None

    return max([int(x) for x in numbers])


def _mentions_currency(symbol: str, text: str) -> bool:
    """Tell whether `text` mentions the currency marker `symbol`."""
    if symbol.isascii() and symbol.isalpha():
        # Alphabetic codes and names must not be glued to other letters, otherwise
        # 'pen' would match inside 'spent' and 'sol' inside 'solo'.
        pattern = rf'(?<![a-z]){re.escape(symbol)}(?![a-z])'
        return re.search(pattern, text) is not None
    # Signs such as '€' or '$' may touch digits or letters freely.
    return symbol in text


def parse_cost(cost: str) -> float | None:
    """Convert one raw cost answer to an amount in euros.

    The answer is lower-cased first. The text 'nan' yields None. A bare number is
    taken as euros. Otherwise the first currency_table entry whose symbol is
    mentioned decides the rate, and the largest number in the text is multiplied
    by it.

    Returns None when no number or no known currency is found; in the latter case
    the unrecognised answer is printed.
    """
    cost = str(cost).lower()

    if cost == 'nan':
        return None

    assume_eur = assume_eur_if_no_symbol(cost)
    if assume_eur is not None:
        return assume_eur

    for symbols, change in currency_table:
        if any(_mentions_currency(symbol, cost) for symbol in symbols):
            number = find_number(cost)
            if number is None:
                return None
            return number * change

    print(f'Unknown currency: {cost}')
    return None


def parse_and_normalize(cost: str) -> int | None:
    """Parse a raw cost answer with parse_cost and cap the result at 10000.

    Returns None when parse_cost cannot produce a value.
    """
    parsed = parse_cost(str(cost))
    if parsed is None:
        return None

    # Anything above the ceiling is replaced by the ceiling itself.
    return 10000 if parsed > 10000 else parsed


# Replace the raw cost answers with parsed values capped at 10000 (None when unparseable)
df['cost'] = df['cost'].apply(parse_and_normalize)


In [None]:
# Normalize the homeland column

def parse_homeland(string) -> str:
    """Map a free-text homeland answer to a region label.

    The answer is lower-cased, then checked against keyword groups in this order:
    'europe', 'north america' (only when the text does not contain 'south'), 'asia'.
    Answers mentioning 'australia', 'south america' or 'africa' are returned as
    written (lower-cased). Anything else is printed and returned lower-cased.
    """
    text = str(string).lower()

    def mentions(*keywords):
        return any(keyword in text for keyword in keywords)

    if mentions('europe', 'italia', 'milano', 'israel'):
        return 'europe'

    if mentions('america', 'usa', 'canada') and 'south' not in text:
        return 'north america'

    if mentions('japan', 'china', 'asia', 'india', 'korea', 'singapore', 'thailand', 'vietnam', 'new zealand'):
        return 'asia'

    if not mentions('australia', 'south america', 'africa'):
        # Log other homelands
        print(text)

    return text
    

# Replace each homeland answer with the label returned by parse_homeland
df['homeland'] = df['homeland'].apply(parse_homeland)



In [None]:
# Parse the accomodation column

def parse_accomodation_name(accomodation) -> str:
    """Map one accommodation entry to a canonical name.

    An entry containing one of the predefined option texts is returned unchanged.
    Otherwise keyword rules are tried in order ('rent', 'house', hotel-like words,
    exactly 'caravan', 'hostel'). An entry matching nothing is printed and returned
    unchanged.
    """
    predefined = (
        'hotel or motel',
        'rent a house',
        "family or friend's house",
        'camper',
        'tent or hut',
    )
    for option in predefined:
        if option in accomodation:
            return accomodation

    # (keywords, canonical name) pairs, checked top to bottom.
    keyword_rules = (
        (('rent',), 'rent a house'),
        (('house',), "family or friend's house"),
        (('hotel', 'motel', 'resort'), 'hotel or motel'),
    )
    for keywords, canonical in keyword_rules:
        if any(keyword in accomodation for keyword in keywords):
            return canonical

    if accomodation == 'caravan':
        return 'camper'

    if 'hostel' in accomodation:
        return 'hostel'

    # Log other accomodations
    print(accomodation)
    return accomodation


def parse_accomodations(accomodations) -> str:
    """Normalise a ';'-separated accommodation answer, entry by entry.

    The answer is converted with str(), lower-cased and split on ';'. Each entry
    is stripped of surrounding whitespace (as parse_activities and parse_decisions
    do) so that padded entries such as ' caravan' still hit the exact-match rules,
    then mapped through parse_accomodation_name and re-joined with ';'.
    """
    entries = (entry.strip() for entry in str(accomodations).lower().split(';'))
    return ';'.join(parse_accomodation_name(entry) for entry in entries)


# Replace each accommodation answer with its normalised ';'-joined form
df['accommodation'] = df['accommodation'].apply(parse_accomodations)

In [None]:
# Parse target column

# Option texts that are accepted as they are
normal_targets = [
    'seaside',
    'mountains',
    'lake or river',
    'countryside',
]


def parse_target_name(target) -> str:
    """Map one holiday-destination entry to a canonical name.

    An entry containing one of `normal_targets` is returned unchanged. Town-like
    words map to 'city', shooting/hunting words map to 'shooting range'. Anything
    else is printed and returned unchanged.
    """
    def contains_any(keywords):
        return any(keyword in target for keyword in keywords)

    if contains_any(normal_targets):
        return target

    if contains_any(('city', 'town', 'village', 'cities', 'home', 'urban')):
        return 'city'

    if contains_any(('shooting', 'hunt', 'hunting', 'battle')):
        return 'shooting range'

    print(target)
    return target


def parse_targets(targets) -> str:
    """Normalise a ';'-separated destination answer, entry by entry.

    The answer is converted with str(), lower-cased and split on ';'. Each entry
    is stripped of surrounding whitespace (as parse_activities and parse_decisions
    do), mapped through parse_target_name and re-joined with ';'.
    """
    entries = (entry.strip() for entry in str(targets).lower().split(';'))
    return ';'.join(parse_target_name(entry) for entry in entries)


# Replace each destination answer with its normalised ';'-joined form
df['target'] = df['target'].apply(parse_targets)

In [None]:
# Parse company column

def parse_company_name(company) -> str:
    """Map one travel-company entry to a canonical name.

    Exactly 'boyfriend/girlfriend', or any entry containing 'partner', becomes
    'partner'. Every other entry is returned unchanged; it is also printed when
    it contains none of the predefined option texts.
    """
    if 'partner' in company or company == 'boyfriend/girlfriend':
        return 'partner'

    predefined = ('family', 'friends', 'boyfriend/girlfriend', 'alone')
    recognised = any(option in company for option in predefined)
    if not recognised:
        print(company)

    return company


def parse_companies(companies) -> str:
    """Normalise a ';'-separated travel-company answer, entry by entry.

    The answer is converted with str(), lower-cased and split on ';'. Each entry
    is stripped of surrounding whitespace (as parse_activities and parse_decisions
    do) so that padded entries such as ' boyfriend/girlfriend' still hit the
    exact-match rule, then mapped through parse_company_name and re-joined with ';'.
    """
    entries = (entry.strip() for entry in str(companies).lower().split(';'))
    return ';'.join(parse_company_name(entry) for entry in entries)


# Replace each company answer with its normalised ';'-joined form
df['company'] = df['company'].apply(parse_companies)

In [None]:
# Parse the transport column

def parse_transport_name(transport) -> str:
    """Map one means-of-transport entry to a canonical name.

    An entry containing one of the predefined option texts is returned unchanged.
    Otherwise keyword rules are tried in order: 'subway' -> 'train',
    'plane' -> 'airplane', road-service words -> 'bus or taxi', water-craft
    words -> 'boat'. An entry matching nothing is printed and returned unchanged.
    """
    def contains_any(keywords):
        return any(keyword in transport for keyword in keywords)

    if contains_any(('car', 'train', 'airplane', 'bus or taxi')):
        return transport

    # (keywords, canonical name) pairs, checked top to bottom.
    keyword_rules = (
        (('subway',), 'train'),
        (('plane',), 'airplane'),
        (('bus', 'taxi', 'uber', 'coach'), 'bus or taxi'),
        (('boat', 'ship', 'ferry'), 'boat'),
    )
    for keywords, canonical in keyword_rules:
        if contains_any(keywords):
            return canonical

    print(transport)
    return transport


def parse_transport(transports) -> str:
    """Normalise a ';'-separated means-of-transport answer, entry by entry.

    The answer is converted with str(), lower-cased and split on ';'. Each entry
    is stripped of surrounding whitespace (as parse_activities and parse_decisions
    do), mapped through parse_transport_name and re-joined with ';'.
    """
    entries = (entry.strip() for entry in str(transports).lower().split(';'))
    return ';'.join(parse_transport_name(entry) for entry in entries)


# Replace each transport answer with its normalised ';'-joined form
df['transport'] = df['transport'].apply(parse_transport)

In [None]:
# Parse the age column into ranges

def parse_age(age) -> str:
    """Translate an age answer into its short range label.

    Returns None for any value that is not one of the five known answer texts.
    """
    short_labels = (
        ('Less than 14 years old', '<14'),
        ('14 - 16 years old', '14-16'),
        ('16 - 18 years old', '16-18'),
        ('18 - 25 years old', '18-25'),
        ('More than 25 years old', '>25'),
    )
    return next((short for answer, short in short_labels if age == answer), None)
    

# Replace each age answer with its short range label (None for unrecognised answers)
df['age'] = df['age'].apply(parse_age)

# Show how many respondents fall into each age range
df['age'].value_counts()



In [None]:
# Parse the activities

def parse_activity_name(activity) -> str:
    """Map one activity entry to a canonical activity name.

    An entry containing one of the predefined option texts is returned unchanged.
    Otherwise the keyword groups below are tried from top to bottom and the first
    group with a keyword found in the entry decides the result. An entry matching
    nothing is printed and returned unchanged.
    """
    predefined = (
        'going to the beach',
        'trekking or walking',
        'visiting museums and cultural sites',
        'extreme sports',
        'working',
        'travelling, visiting cities and places of interest',
        'staying at home',
        'cruise',
        'camping',
        'attending concerts or live events (e.g. football match)',
    )
    if any(option in activity for option in predefined):
        return activity

    # (keywords, canonical name) pairs; the order is significant.
    keyword_rules = (
        (('concert', 'match', 'event', 'party', 'club'),
         'attending concerts or live events (e.g. football match)'),
        (('swimming', 'sunbathing', 'playing in the sand'),
         'going to the beach'),
        (('trekking', 'walking', 'hiking', 'mountain'),
         'trekking or walking'),
        (('visiting', 'tour', 'sightseeing', 'bar', 'shop'),
         'visiting museums and cultural sites'),
        (('sail', 'yacht', 'boat', 'surf', 'canoe', 'kayak'),
         'sailing'),
        (('relax', 'chill', 'rest', 'sleep', 'read', 'nature', 'fish'),
         'relaxing'),
        (('shooting', 'hunt', 'hunting', 'battle', 'arch'),
         'shooting'),
        (('zoo', 'cuisine'),
         'travelling, visiting cities and places of interest'),
        (('work', 'job', 'office', 'school', 'university', 'studio', 'study'),
         'working'),
    )
    for keywords, canonical in keyword_rules:
        if any(keyword in activity for keyword in keywords):
            return canonical

    print(activity)
    return activity


def parse_activities(activities) -> str:
    """Normalise a ';'-separated activities answer, entry by entry.

    The answer is converted with str() and lower-cased; every ';'-separated entry
    is stripped of surrounding whitespace, mapped through parse_activity_name and
    the results are re-joined with ';'.
    """
    lowered = str(activities).lower()
    return ';'.join(parse_activity_name(entry.strip()) for entry in lowered.split(';'))


# Replace each activities answer with its normalised ';'-joined form
df['activities'] = df['activities'].apply(parse_activities)



In [None]:
# Parse the deicsion column

def parse_decision_name(decision) -> str:
    """Map one "how do you choose" entry to a canonical reason.

    An entry containing one of the predefined option texts is returned unchanged.
    Otherwise the keyword groups below are tried from top to bottom and the first
    group with a keyword found in the entry decides the result. An entry matching
    nothing is printed and returned unchanged.
    """
    predefined = (
        'i have friends there',
        'my relatives live there',
        'i want to visit a certain place',
        "i always go there on vacation, it's a tradition",
        'i have a house there',
    )
    if any(option in decision for option in predefined):
        return decision

    # (keywords, canonical reason) pairs; the order is significant.
    keyword_rules = (
        (('nice', 'agree', 'group', 'partner', 'plan', 'interesting', 'good', 'research', 'weather', 'climate'),
         'i want to visit a certain place'),
        (('near', 'close', 'close to', 'far', 'nice', 'distance', 'cheap', 'price', 'avail', 'afford', 'budget'),
         'near and cheap'),
        (('parent', 'someone else', 'decides', 'chooses', 'takes me', 'am not', 'wants'),
         'someone else decides'),
    )
    for keywords, canonical in keyword_rules:
        if any(keyword in decision for keyword in keywords):
            return canonical

    print(decision)
    return decision


def parse_decisions(decisions) -> str:
    """Normalise a ';'-separated decision answer, entry by entry.

    The answer is converted with str() and lower-cased; every ';'-separated entry
    is stripped of surrounding whitespace, mapped through parse_decision_name and
    the results are re-joined with ';'.
    """
    lowered = str(decisions).lower()
    return ';'.join(parse_decision_name(entry.strip()) for entry in lowered.split(';'))
    

# Replace each decision answer with its normalised ';'-joined form
df['decision'] = df['decision'].apply(parse_decisions)


In [None]:
# Calculate average cost for each age range
# ('cost' holds the values produced by parse_and_normalize, i.e. capped at 10000)

costs = df.groupby('age')['cost'].mean()

# Reorder the costs so that the age ranges are in the correct order
costs = costs.reindex(['<14', '14-16', '16-18', '18-25', '>25'])

# Drop the <14 age range because it has too few samples
costs = costs.drop('<14')

# Plot the average cost for each age range
import matplotlib.pyplot as plt

print(costs)

plt.bar(costs.index, costs.values)
plt.show()

# People aged 14-16 have a small budget because they are still too young to do much on their own.
# Their parents pay for them and they don't have a job yet.

# People aged 16-18 have a high budget because they start being quite independent while their parents still pay for them;
# they start going on vacation with their friends.

# After 18, people stop getting money from their parents, so they have a smaller budget.
# If they have a job, they aren't paid much, so their budget stays small.

# People over 25 have a high budget because they have a job and can afford to pay for their vacations.
# Moreover, people over 25 spend more on their vacations because they do more things.

# Save the data to a csv file
costs.to_csv('cost_per_age_range.csv')

In [None]:
# Average cost for each homeland

cost_per_land = df.groupby('homeland')['cost'].mean()
# Number of respondents per homeland with a non-missing cost (count() skips missing values)
people_per_land = df.groupby('homeland')['cost'].count()

# Remove africa because it has too few people
# NOTE(review): drop() raises KeyError if no homeland is labelled exactly 'africa'
cost_per_land = cost_per_land.drop('africa')

# Sort from highest to lowest
cost_per_land = cost_per_land.sort_values(ascending=False)

import matplotlib.pyplot as plt

# Print cost per land and number of people per land on the same line
for land, cost in cost_per_land.items():
    print(f'{land}: {cost:.2f} ({people_per_land[land]} people)')


plt.bar(cost_per_land.index, cost_per_land.values)
plt.setp(plt.gca().get_xticklabels(), rotation=30, horizontalalignment='right')
plt.show()

# Asian people spend the most on their vacations. This may be due to different buying power or
# because they travel more than other people.

# North Americans spend more than European people.

# Australians spend the least on their vacations.

# Save the data to a csv file
cost_per_land.to_csv('cost_per_land.csv')


In [None]:
# Get the most popular activities for each age range

# One ';'-joined string per age range, holding every activity mentioned by that group
activities = df.groupby('age')['activities'].apply(lambda x: ';'.join(x))

# Drop the <14 age range because there is too little data
activities = activities.drop('<14')

# Split the activities into a list
activities = activities.apply(lambda x: x.split(';'))

# Count the number of times each activity appears
activities = activities.apply(lambda x: pd.Series(x).value_counts())

# Substitute NaN with 0
activities = activities.fillna(0)

# Sum the number of times each activity appears for each age range
activities = activities.groupby(activities.index).sum()

# Exponent applied to the counts before plotting, to exaggerate the differences between bars
POWER_EMPHASIS = 3

# For each age range: sort the counts from highest to lowest, keep only the top 3
# and raise them to POWER_EMPHASIS (so the plotted bar heights are not raw counts)
activities_14_16 = activities.loc['14-16'].sort_values(ascending=False).head(3).apply(lambda x: x ** POWER_EMPHASIS)
activities_16_18 = activities.loc['16-18'].sort_values(ascending=False).head(3).apply(lambda x: x ** POWER_EMPHASIS)
activities_18_25 = activities.loc['18-25'].sort_values(ascending=False).head(3).apply(lambda x: x ** POWER_EMPHASIS)
activities_25 = activities.loc['>25'].sort_values(ascending=False).head(3).apply(lambda x: x ** POWER_EMPHASIS)

# Plot the most popular activities for each age range
import matplotlib.pyplot as plt

plt.bar(activities_14_16.index, activities_14_16.values)
plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right')
plt.show()

plt.bar(activities_16_18.index, activities_16_18.values)
plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right')
plt.show()

plt.bar(activities_18_25.index, activities_18_25.values)
plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right')
plt.show()

plt.bar(activities_25.index, activities_25.values)
plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right')
plt.show()

# Show the full (un-emphasised) count table as the cell output
activities


In [None]:
# Track the popularity of travelling through the age ranges

# One ';'-joined string per age range, holding every activity mentioned by that group
age_ranges = df.groupby('age')['activities'].apply(lambda x: ';'.join(x))

# Split the activities into a list
age_ranges = age_ranges.apply(lambda x: x.split(';'))

# Remove the <14 age range because there is too little data
age_ranges = age_ranges.drop('<14')

# Count the number of times 'travelling, visiting cities and places of interest' is mentioned
travelling_per_age = age_ranges.apply(lambda x: x.count('travelling, visiting cities and places of interest'))
# Count the total number of activities for each age range
total_per_age = age_ranges.apply(lambda x: len(x))

# Calculate the percentage of travelling (int() truncates towards zero)
percentages = [int(trav / total * 100) for trav, total in zip(travelling_per_age, total_per_age)]

# Plot the popularity of travelling through the age ranges as a solid line with round markers
import matplotlib.pyplot as plt

plt.plot(age_ranges.index, percentages, linestyle='solid', linewidth=3, marker='o')
plt.show()

# 14-16 --> People travel with their parents and go wherever their parents take them.

# 16-18 --> People travel less and spend less time with their parents.

# The boom in travelling happens between 18 and 25 years old, when people are more independent
# and have a car to travel with.

# People stop travelling as much after 25, probably because of career, family, less time, etc.

# Save the percentages to a csv file (travelling_per_age is rebound from the counts to this table)
travelling_per_age = pd.DataFrame(zip(age_ranges.index, percentages), columns=['age', 'percentage'])
travelling_per_age.set_index('age', inplace=True)
travelling_per_age.to_csv('travelling_per_age.csv')


In [None]:
# Calculate the popularity of going to the beach through the age ranges

# One ';'-joined string per age range, holding every activity mentioned by that group
age_ranges = df.groupby('age')['activities'].apply(lambda x: ';'.join(x))

# Split the activities into a list
age_ranges = age_ranges.apply(lambda x: x.split(';'))

# Remove the <14 age range because there is too little data
age_ranges = age_ranges.drop('<14')

# Count the number of times 'going to the beach' is mentioned
beach_per_age = age_ranges.apply(lambda x: x.count('going to the beach'))
# Count the total number of activities for each age range
total_per_age = age_ranges.apply(lambda x: len(x))

# Calculate the percentage of going to the beach (int() truncates towards zero)
percentages = [int(beach / total * 100) for beach, total in zip(beach_per_age, total_per_age)]

# Plot the popularity of going to the beach through the age ranges as a solid line with round markers
import matplotlib.pyplot as plt

plt.plot(age_ranges.index, percentages, linestyle='solid', linewidth=3, marker='o')
plt.show()

# Going to the beach becomes less popular as people get older, in favor of other activities (and also because of the lack of free time)

# Save the percentages to a csv file (beach_per_age is rebound from the counts to this table)
beach_per_age = pd.DataFrame(zip(age_ranges.index, percentages), columns=['age', 'percentage'])
beach_per_age.set_index('age', inplace=True)
beach_per_age.to_csv('beach_popularity_per_age.csv')


In [None]:
# Plot the popularity of attending events through the age ranges

# One ';'-joined string per age range, holding every activity mentioned by that group
age_ranges = df.groupby('age')['activities'].apply(lambda x: ';'.join(x))

# Split the activities into a list
age_ranges = age_ranges.apply(lambda x: x.split(';'))

# Remove the <14 age range because there is too little data
age_ranges = age_ranges.drop('<14')

# Count the number of times "attending concerts or live events (e.g. football match)" is mentioned
events_per_age = age_ranges.apply(lambda x: x.count('attending concerts or live events (e.g. football match)'))
# Count the total number of activities for each age range
total_per_age = age_ranges.apply(lambda x: len(x))

# Calculate the percentage of attending events (int() truncates towards zero)
percentages = [int(events / total * 100) for events, total in zip(events_per_age, total_per_age)]

# Plot the popularity of attending events through the age ranges as a solid line with round markers
import matplotlib.pyplot as plt

plt.plot(age_ranges.index, percentages, linestyle='solid', linewidth=3, marker='o')
plt.show()

# Attending events becomes more popular with age, partially substituting going to the beach.

# Initially, people go to events with their parents, then they go with their friends (and go more often)
# between 18 and 25 years old, and then they do other things like travelling and thus attend fewer events (see the travelling graph).

# Save the percentages to a csv file (events_per_age is rebound from the counts to this table)
events_per_age = pd.DataFrame(zip(age_ranges.index, percentages), columns=['age', 'percentage'])
events_per_age.set_index('age', inplace=True)
events_per_age.to_csv('events_popularity_per_age.csv')


In [None]:
# Calculate the average enjoyment through the age ranges

enjoyment_per_age = df.groupby('age')['enjoyment'].apply(lambda x: x.mean())
# NOTE(review): this bare expression is not the last statement of the cell, so it displays nothing
enjoyment_per_age

# Remove the <14 age range because there is too little data
enjoyment_per_age = enjoyment_per_age.drop('<14')

# Plot the average enjoyment through the age ranges
import matplotlib.pyplot as plt

plt.plot(enjoyment_per_age.index, enjoyment_per_age.values, linestyle='solid', linewidth=3, marker='o')
plt.show()

# Calculate which age range has the highest percentage of people who didn't enjoy their vacation
non_enjoyment_per_age = df.groupby('age')['enjoyment']

# Calculate the percentage of people who didn't enjoy their vacation (enjoyment < 6)
# (the denominator len(x) counts every row of the group, including missing enjoyment values)
percentages = non_enjoyment_per_age.apply(lambda x: len(x[x < 6]) / len(x) * 100)

# Remove the <14 age range because there is too little data
percentages = percentages.drop('<14')

# Plot the percentage of people who didn't enjoy their vacation through the age ranges

plt.plot(percentages.index, percentages.values, linestyle='solid', linewidth=3, marker='o')
plt.show()

# People tend to enjoy their vacations more as they get older,
# probably because they can do more things and have more money to spend.

# Between 16 and 18, people are less happy because of a lack of freedom and opportunities.
# Moreover, they are more subject to emotional changes and mood swings, especially if they are alone.

# After 18, people are happier because they are more independent, more emotionally stable and have more money to spend.
# Also, they have less free time and thus are more selective about what they do (and shorter vacations imply less time to be unhappy).

# Save the data to a csv file

enjoyment_per_age = pd.DataFrame(zip(enjoyment_per_age.index, enjoyment_per_age.values), columns=['age', 'enjoyment'])
enjoyment_per_age.set_index('age', inplace=True)
enjoyment_per_age.to_csv('enjoyment_per_age.csv')

non_enjoyment_per_age = pd.DataFrame(zip(percentages.index, percentages.values), columns=['age', 'percentage'])
non_enjoyment_per_age.set_index('age', inplace=True)
non_enjoyment_per_age.to_csv('non_enjoyment_per_age.csv')



In [None]:
# Calculate enjoyment per company

# Split the companies into a list
companies = df['company'].apply(lambda x: x.split(';'))

# NOTE(review): despite its name, unique_activities holds the company types here
unique_activities = ['family', 'friends', 'partner', 'alone']
enjoyment = [0, 0, 0, 0]
# NOTE(review): counts is never read in this cell
counts = [0, 0, 0, 0]
for i, company_type in enumerate(unique_activities):
    # Count the number of times each company type is mentioned (exact match on the list entries)
    company_count = companies.apply(lambda x: x.count(company_type))

    # Get the enjoyment if the company type is mentioned
    enjoyment[i] = df[company_count > 0]['enjoyment'].mean()

# NOTE(review): this bare expression is not the last statement of the cell, so it displays nothing
enjoyment

# Plot the average enjoyment per company type
import matplotlib.pyplot as plt

# Emphasize the difference between the company types
# (from here on `enjoyment` holds the means raised to the 12th power, not the means themselves)
enjoyment = [enjoy ** 12 for enjoy in enjoyment]

plt.bar(unique_activities, enjoyment)
plt.show()

# Vacation with family is not so enjoyable.
# Vacation with friends is more enjoyable than vacation with family.
# Vacation with partner is the most enjoyable.
# Vacation alone is the least enjoyable.

# Save the data to a csv file
# NOTE(review): the values written are the emphasised ones (mean ** 12) — confirm this is intended
enjoyment = pd.DataFrame(zip(unique_activities, enjoyment), columns=['company', 'enjoyment'])
enjoyment.set_index('company', inplace=True)
enjoyment.to_csv('enjoyment_per_company.csv')

    

In [None]:
# Calculate the average enjoyment for each activity

# Split the activities into a list
activities = df['activities'].apply(lambda x: x.split(';'))


# Get all the unique activities
unique_activities = set()
for activity in activities:
    unique_activities.update(activity)
unique_activities = list(unique_activities)

enjoyment = [0] * len(unique_activities)

for i, activity_type in enumerate(unique_activities):
    # Count the number of times each activity type is mentioned
    activity_count = activities.apply(lambda x: x.count(activity_type))

    # Get the enjoyment if the activity type is mentioned
    enjoyment[i] = df[activity_count > 0]['enjoyment'].mean()

# NOTE(review): this bare expression is not the last statement of the cell, so it displays nothing
enjoyment

# Plot the average enjoyment per activity type
import matplotlib.pyplot as plt

# Sort the activities by their enjoyment, highest first (both become tuples)
enjoyment, unique_activities = zip(*reversed(sorted(zip(enjoyment, unique_activities))))

# Remove 'shooting'
enjoyment, unique_activities = zip(*[(enjoy, activity) for enjoy, activity in zip(enjoyment, unique_activities) if activity != 'shooting'])

# Keep only the top 5 activities
enjoyment_top_5 = enjoyment[:5]
unique_activities_top_5 = unique_activities[:5]

# Emphasize the difference between the activity types
# (the top-5 values are the means raised to the 5th power, not the means themselves)
enjoyment_top_5 = [enjoy ** 5 for enjoy in enjoyment_top_5]

plt.bar(unique_activities_top_5, enjoyment_top_5)
plt.setp(plt.gca().get_xticklabels(), rotation=20, horizontalalignment='right')
plt.show()


# NOTE(review): this bare expression is not the last statement of the cell, so it displays nothing
dict(zip(unique_activities, enjoyment))

# Save the top 5 data to a csv file
# NOTE(review): the values written are the emphasised ones (mean ** 5) — confirm this is intended
enjoyment = pd.DataFrame(zip(unique_activities_top_5, enjoyment_top_5), columns=['activity', 'enjoyment'])
enjoyment.set_index('activity', inplace=True)
enjoyment.to_csv('enjoyment_per_activity_top_5.csv')



In [None]:
# Calculate average cost for transport

# Split the transports into a list
transports = df['transport'].apply(lambda x: x.split(';'))

# Get all the unique transports
unique_transports = set()
for transport in transports:
    unique_transports.update(transport)
unique_transports = list(unique_transports)

cost_per_transport = [0] * len(unique_transports)

for i, transport_type in enumerate(unique_transports):
    # Count the number of times each transport type is mentioned
    transport_count = transports.apply(lambda x: x.count(transport_type))

    # Get the cost if the transport type is mentioned
    cost_per_transport[i] = df[transport_count > 0]['cost'].mean()


# Plot the average cost per transport type
import matplotlib.pyplot as plt

# Sort the transports by their cost, highest first (both become tuples)
cost_per_transport, unique_transports = zip(*reversed(sorted(zip(cost_per_transport, unique_transports))))

# Keep only the top 5 transports
# NOTE(review): despite its name, unique_activities_top_5 holds transport names here
cost_top_5 = cost_per_transport[:5]
unique_activities_top_5 = unique_transports[:5]

# Exponent 1 leaves the values unchanged: no emphasis is applied to the costs
cost_top_5 = [c ** 1 for c in cost_top_5]

plt.bar(unique_activities_top_5, cost_top_5)
plt.setp(plt.gca().get_xticklabels(), rotation=20, horizontalalignment='right')
plt.show()


# NOTE(review): this bare expression is not the last statement of the cell, so it displays nothing
dict(zip(unique_transports, cost_per_transport))

# Save the top 5 data to a csv file
cost_per_transport = pd.DataFrame(zip(unique_activities_top_5, cost_top_5), columns=['transport', 'cost'])
cost_per_transport.set_index('transport', inplace=True)
cost_per_transport.to_csv('cost_per_transport_top_5.csv')


In [None]:
# Calculate average cost per company

# Split the companies into a list
companies = df['company'].apply(lambda x: x.split(';'))

unique_companies = ['family', 'friends', 'partner', 'alone']
cost_per_company = [0, 0, 0, 0]
# NOTE(review): counts is never read in this cell
counts = [0, 0, 0, 0]
for i, company_type in enumerate(unique_companies):
    # Count the number of times each company type is mentioned (exact match on the list entries)
    company_count = companies.apply(lambda x: x.count(company_type))

    # Get the cost if the company type is mentioned
    cost_per_company[i] = df[company_count > 0]['cost'].mean()


# Plot the average cost per company type
import matplotlib.pyplot as plt

plt.bar(unique_companies, cost_per_company)
plt.show()

# NOTE(review): this bare expression is not the last statement of the cell, so it displays nothing
dict(zip(unique_companies, cost_per_company))

# Vacation with family is the most expensive, probably because of kids or because parents pay for their children.

# Vacation with friends is also expensive, probably because of more expensive activities, buying food and drinks for everyone, accommodation...

# Vacation with partner is the least expensive, probably because of less expensive activities, buying food and drinks for 2 people, accommodation...
# Often the partner pays for the vacation or the cost is shared.
# People often like to just chill out with their partner and don't do many expensive activities.

# People who spend their vacation alone spend little money, probably because they don't do much in terms of activities.
# Plus, they only have to pay for themselves.

# Save the data to a csv file
cost_per_company = pd.DataFrame(zip(unique_companies, cost_per_company), columns=['company', 'cost'])
cost_per_company.set_index('company', inplace=True)
cost_per_company.to_csv('average_cost_per_company.csv')


In [None]:
# Calculate average cost per gender

# Mean cost for each value of the 'gender' column
genders = df.groupby('gender')['cost'].apply(lambda x: x.mean())

# Plot the average using a bar chart
import matplotlib.pyplot as plt

genders.plot.bar()
plt.show()

# Males seem to spend more on their holidays than females.

# Save the data to a csv file
genders.to_csv('average_cost_per_gender.csv')


In [None]:
# Track accommodation popularity through the age ranges

# For each age range, split every answer on ';' and concatenate the resulting lists into one list
accomodation = df.groupby('age')['accommodation'].apply(lambda x: x.str.split(';').sum())

# Remove the <14 age range because there's too little data
accomodation = accomodation.drop('<14')

# For each age range, count the times each accommodation type is mentioned
accomodation = accomodation.apply(lambda x: pd.Series(x).value_counts())

# Fill the missing values with 0
accomodation = accomodation.fillna(0)

# Split into age ranges, most mentioned accommodation first, dropping the types with no mentions.
# Each row is sorted as a Series so that every count stays paired with its own label
# (sorting only the values would leave the labels in their original column order).
def _ranked_nonzero(row):
    """Return (counts, labels) of `row`, sorted by count descending, keeping only counts > 0."""
    ranked = row[row > 0].sort_values(ascending=False)
    return tuple(ranked.values), tuple(ranked.index)


# Rows are taken by position.
# NOTE(review): assumes the rows of `accomodation` are ordered 14-16, 16-18, 18-25, >25 — confirm
accomodation_14_16, labels_14_16 = _ranked_nonzero(accomodation.iloc[0])

accomodation_16_18, labels_16_18 = _ranked_nonzero(accomodation.iloc[1])

accomodation_18_25, labels_18_25 = _ranked_nonzero(accomodation.iloc[2])

accomodation_25_, labels_25_ = _ranked_nonzero(accomodation.iloc[3])

# Plot the accommodation popularity for each age range using a bar chart
# (one chart per age range, showing at most the first 5 entries of each list)
import matplotlib.pyplot as plt

plt.bar(labels_14_16[:5], accomodation_14_16[:5])
plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right')
plt.show()

plt.bar(labels_16_18[:5], accomodation_16_18[:5])
plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right')
plt.show()

plt.bar(labels_18_25[:5], accomodation_18_25[:5])
plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right')
plt.show()

plt.bar(labels_25_[:5], accomodation_25_[:5])
plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right')
plt.show()


# Track the popularity of single accommodation types through the age ranges

import matplotlib.pyplot as plt


def _track_accommodation_share(counts, accommodation_type, age_ranges, title, csv_path):
    """Plot and save how popular one accommodation type is in each age range.

    Args:
        counts: DataFrame of mention counts, one row per age range and one
            column per accommodation type.
        accommodation_type: Column of `counts` to track.
        age_ranges: Display labels for the rows, in row order.
        title: Title of the line chart.
        csv_path: Path of the csv file the result is written to.

    Returns:
        ``(percentages, table)``: the list of percentages (one per age range)
        and the DataFrame, indexed by ``age_range``, that was written to disk.

    Raises:
        KeyError: If `accommodation_type` is not a column of `counts`.
    """
    # Rows are matched to the labels by position, so only as many rows as
    # there are labels are taken into account
    rows = counts.iloc[:len(age_ranges)]

    # Share of this type among all accommodation mentions of each age range
    percentages = (rows[accommodation_type] / rows.sum(axis=1) * 100).tolist()

    # Plot the percentage using a line chart
    plt.title(title)
    plt.xlabel('Age range')
    plt.ylabel('Share of mentions (%)')
    plt.plot(age_ranges, percentages)
    plt.show()

    # Save the data to a csv file
    table = pd.DataFrame({'age_range': age_ranges, 'percentage': percentages}).set_index('age_range')
    table.to_csv(csv_path)
    return percentages, table


age_ranges = ['14-16', '16-18', '18-25', '25+']

# Track the popularity of 'family or friend's house' through the age ranges
percentages, family_friends_house = _track_accommodation_share(
    accomodation,
    "family or friend's house",
    age_ranges,
    title='Popularity of "family or friend\'s house" through the age ranges',
    csv_path='family_friends_house_popularity_per_age_range.csv',
)
print(dict(zip(age_ranges, percentages)))

# Track the popularity of 'hotel or motel' through the age ranges
percentages, hotel_motel = _track_accommodation_share(
    accomodation,
    'hotel or motel',
    age_ranges,
    title='Hotel or Motel through the age ranges',
    csv_path='hotel_motel_popularity_per_age_range.csv',
)
print(dict(zip(age_ranges, percentages)))

# Share of every accommodation type inside each age range

import matplotlib.pyplot as plt

# Get the age ranges
age_ranges = accomodation.index

# Divide every row by its own total: the result is the percentage of all
# accommodation mentions of an age range that went to each type (answers are
# multi-select, so this is a share of mentions rather than of respondents)
accomodation_percentage = accomodation.div(accomodation.sum(axis=1), axis=0) * 100

# Keep only the first 5 columns
# NOTE(review): the column order is whatever the value_counts expansion that
# built `accomodation` produced, so these are presumably not guaranteed to be
# the five most popular types overall - confirm before relying on "top 5".
accomodation_percentage = accomodation_percentage.iloc[:, :5]

# Plot the accommodation percentages using a multiline chart
ax = accomodation_percentage.plot.line()
ax.set(
    title='Accommodation popularity through the age ranges',
    xlabel='Age range',
    ylabel='Share of mentions (%)',
)
plt.show()

# Save the data to a csv file
accomodation_percentage.to_csv('top_5_accomodation_percentage_per_age_range.csv')

# Hotel or motel is the most popular accommodation type for all age ranges
# For age range 14-16, family or friend's house is roughly the same as hotel or motel
# The popularity of family or friend's house decreases as the age increases
# The popularity of hotel or motel increases as the age increases until it reaches 25+
# After 25+ the accommodation type is more varied, but hotel or motel is still the most popular

# Last expression of the cell, so the notebook renders the table
accomodation_percentage


In [None]:
# Correlate cost and enjoyment without counting NaN values

import matplotlib.pyplot as plt

# Keep only the rows where both the cost and the enjoyment are present
cost_enjoyment = df[['cost', 'enjoyment']].dropna()

# Average cost for every enjoyment rating
costs = cost_enjoyment.groupby('enjoyment')['cost'].mean()

# Plot a line chart
ax = costs.plot.line()
ax.set(
    title='Average cost per enjoyment rating',
    xlabel='Enjoyment',
    ylabel='Average cost',
)
plt.show()

# The enjoyment of a trip is proportional to the cost

# Save the data to a csv file
costs.to_csv('average_cost_per_enjoyment.csv')

# Last expression of the cell, so the notebook renders the averages
costs

In [None]:
# Calculate enjoyment per gender

import matplotlib.pyplot as plt

# Average enjoyment rating inside every gender group
enjoyment = df.groupby('gender')['enjoyment'].mean()

# Plot a bar chart
ax = enjoyment.plot.bar()
ax.set(
    title='Average enjoyment per gender',
    xlabel='Gender',
    ylabel='Average enjoyment',
)
plt.show()

# The gender doesn't seem to have a significant impact on the enjoyment

# Save the data to a csv file
enjoyment.to_csv('average_enjoyment_per_gender.csv')

# Last expression of the cell, so the notebook renders the averages
enjoyment

In [None]:
# Count how often every holiday target is chosen in each age range


def _count_targets(joined_answers):
    """Count the occurrences of each target in a ';'-separated string."""
    return pd.Series(joined_answers.split(';')).value_counts()


targets = (
    df.groupby('age')['target']
    # Glue all the answers of an age range into one ';'-separated string
    .apply(';'.join)
    # Drop the <14 age range because there is too little data
    .drop('<14')
    # Split the string back into single targets and count each of them
    .apply(_count_targets)
    # Targets that never occur in an age range come out as NaN: use 0 instead
    .fillna(0)
)

# Add up the counts of the rows that share the same age range
targets = targets.groupby(targets.index).sum()

# POWER_EMPHASIS = 1

# Split the targets into age ranges and sort them from highest to lowest and keep only the top 3
# targets_14_16 = targets.loc['14-16'].sort_values(ascending=False).head(3).apply(lambda x: x ** POWER_EMPHASIS)
# targets_16_18 = targets.loc['16-18'].sort_values(ascending=False).head(3).apply(lambda x: x ** POWER_EMPHASIS)
# targets_18_25 = targets.loc['18-25'].sort_values(ascending=False).head(3).apply(lambda x: x ** POWER_EMPHASIS)
# targets_25 = targets.loc['>25'].sort_values(ascending=False).head(3).apply(lambda x: x ** POWER_EMPHASIS)

# Plot the most popular targets for each age range
# import matplotlib.pyplot as plt

# plt.bar(targets_14_16.index, targets_14_16.values)
# plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right')
# plt.show()

# plt.bar(targets_16_18.index, targets_16_18.values)
# plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right')
# plt.show()

# plt.bar(targets_18_25.index, targets_18_25.values)
# plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right')
# plt.show()

# plt.bar(targets_25.index, targets_25.values)
# plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right')
# plt.show()

# targets


# Plot the top 5 targets for each age range using a line chart
# Use the age range as the x-axis and the share of target mentions as the y-axis
import matplotlib.pyplot as plt

# Get the age ranges
age_ranges = targets.index

# Divide every row by its own total: the result is the percentage of all
# target mentions of an age range that went to each target (answers are
# multi-select, so this is a share of mentions rather than of respondents)
targets_percentage = targets.div(targets.sum(axis=1), axis=0) * 100

# Keep only the first 5 columns
# NOTE(review): the column order is whatever the value_counts expansion that
# built `targets` produced, so these are presumably not guaranteed to be the
# five most popular targets overall - confirm before relying on "top 5".
targets_percentage = targets_percentage.iloc[:, :5]

# Plot the target percentages using a multiline chart
ax = targets_percentage.plot.line()
ax.set(
    title='Target popularity through the age ranges',
    xlabel='Age range',
    ylabel='Share of mentions (%)',
)
plt.show()

# The seaside is always the most popular target for all age ranges
# The mountains are the second most popular target for all age ranges, but far behind the seaside

# Save the data to a csv file
targets_percentage.to_csv('top_5_targets_per_age_range.csv')

# Last expression of the cell, so the notebook renders the table
targets_percentage




In [None]:
# Calculate transport popularity per age range
# and correlate with cost per transport type and cost per age range