# [Eurovision Data Analysis Exercise](https://eds-217-essential-python.github.io/course-materials/eod-practice/eod-day6.html)

In this exercise, you’ll analyze Eurovision Song Contest data using pandas. You’ll practice various data manipulation techniques and explore trends in the contest’s history.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset: contestants.csv from the 2020.0 release of the
# Spijkervet/eurovision-dataset GitHub repository (fetched over the network).
url = (
    "https://github.com/Spijkervet/eurovision-dataset"
    "/releases/download/2020.0/contestants.csv"
)
eurovision_df = pd.read_csv(url)

## Task 1: Data Exploration and Cleaning

In [None]:
# 1. Display the first few rows of the dataset.
# head() shows five rows by default; the count is spelled out here for clarity.
eurovision_df.head(n=5)

In [None]:
# 2. Check the data types of each column.
# Bound to a name first, then echoed as the cell's final expression.
column_dtypes = eurovision_df.dtypes
column_dtypes

In [None]:
# 3. Identify and handle any missing values.
# Count the NaN entries per column and print the result explicitly: a bare
# expression that is not the last statement of a cell is evaluated but never
# displayed, so the counts were previously computed and then lost.
missing_counts = eurovision_df.isnull().sum()
print(missing_counts)

# Handling: rows are intentionally kept rather than dropped. A blanket
# eurovision_df.dropna() would discard every row that has a NaN in *any*
# column (and has no effect at all unless its result is assigned).
# NOTE(review): presumably the score columns are empty for entries that never
# took part in that round -- confirm against the counts printed above.
print(eurovision_df.head())

In [None]:
# 4. Convert the 'year' column to datetime type.
# format='%Y' tells pandas to parse each value as a four-digit year.
# pd.to_datetime already returns a new Series, so no extra .copy() is needed
# before assigning it back to the column.
eurovision_df['year'] = pd.to_datetime(eurovision_df['year'], format='%Y')

# Echo the first rows to confirm the converted column.
eurovision_df.head()

## Task 2: Filtering and Transformation

In [None]:
# 1. Create a new dataframe containing only data from 1990 onwards
# 'year' is a datetime column at this point (converted in the previous task),
# so compare its calendar year as an integer instead of relying on pandas to
# coerce the string '1990' into a timestamp.
# .copy() makes euro_1990 an independent frame, so columns can be added to it
# later without touching (or warning about) eurovision_df.
euro_1990 = eurovision_df[eurovision_df['year'].dt.year >= 1990].copy()

# Only the last expression of a cell is displayed automatically, so the head
# preview has to be printed explicitly; the tail is echoed as the cell result.
print(euro_1990.head())
euro_1990.tail()


In [None]:
# List the column labels of the filtered frame.
available_columns = euro_1990.columns
available_columns

In [None]:
# 2. Calculate the difference between final points and semi-final points for each entry
# make a histogram of these values using the builtin dataframe .hist() command.

# Element-wise subtraction aligned on the row index; a NaN in either column
# yields a NaN difference for that row.
final_points = euro_1990['points_final']
semi_final_points = euro_1990['points_sf']
euro_1990['final_diff'] = final_points.sub(semi_final_points)

# Histogram of the new column only.
euro_1990.hist(column='final_diff')

## Task 3: Sorting and Aggregation

In [None]:
# 1. Find the top 10 countries with the most Eurovision appearances (use the entire dataset for this calculation)

# value_counts() tallies the rows per 'to_country' and returns the tallies
# ordered from most to least frequent, so the first ten entries are the top ten.
appearances_by_country = eurovision_df.value_counts(subset='to_country')
top_10 = appearances_by_country.iloc[:10]
top_10

In [None]:
# 2. Calculate the average final points for each country across all years. Make a simple bar plot of these data.

# One mean per country; mean() skips NaN scores by default.
final_avg = eurovision_df.groupby(by='to_country')['points_final'].mean()

# One bar per country, in the (alphabetical) group order produced by groupby.
final_avg.plot(kind='bar')

## Task 4: Grouping by Decade

In [None]:
# 1. Determine the country with the highest average final points for each decade.

# Floor each calendar year to the first year of its decade (e.g. 1994 -> 1990).
# Requires 'year' to be a datetime column (converted earlier in the notebook).
eurovision_df['decade'] = eurovision_df['year'].dt.year // 10 * 10

# Mean final score for every (decade, country) pair. A pair with no final
# score at all gets a NaN mean; those are dropped so that idxmax only ranks
# real averages. Without this, a decade whose means are all NaN gives a
# pandas-version-dependent result (NaN, a deprecation warning, or an error);
# with it, such a decade is simply absent from the output.
decade_country_avg = (
    eurovision_df.groupby(['decade', 'to_country'])['points_final']
    .mean()
    .dropna()
)

# idxmax returns the index label of the largest value in each decade; because
# the index is (decade, to_country), each result is a (decade, country) tuple.
highest_country = decade_country_avg.groupby('decade').idxmax()

highest_country