In [1]:
!pip install duckdb -q

[0m

In [2]:
import pandas as pd
import duckdb

In [3]:
athlete_events_df = pd.read_csv('/kaggle/input/120-years-of-olympic-history-athletes-and-results/athlete_events.csv')
athlete_events_df.sample(3)

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
119274,60383,Kim Yun-Man,M,20.0,174.0,72.0,South Korea,KOR,1994 Winter,1994,Winter,Lillehammer,Speed Skating,"Speed Skating Men's 1,000 metres",
38511,19805,Somchai Chanthavanij,M,41.0,175.0,75.0,Thailand,THA,1988 Summer,1988,Summer,Seoul,Shooting,Shooting Mixed Skeet,
108135,54662,Gunnar Jervill,M,26.0,174.0,70.0,Sweden,SWE,1972 Summer,1972,Summer,Munich,Archery,Archery Men's Individual,Silver


In [4]:
regions_df = pd.read_csv('/kaggle/input/120-years-of-olympic-history-athletes-and-results/noc_regions.csv')
regions_df.sample(3)

Unnamed: 0,NOC,region,notes
217,UZB,Uzbekistan,
66,EUN,Russia,
212,UKR,Ukraine,


## 1. How many olympics games have been held?

In [5]:
duckdb.query("""
        SELECT COUNT(distinct Games) as Total_olympic_games FROM athlete_events_df
""").df()

Unnamed: 0,Total_olympic_games
0,51


## 2. List down all Olympics games held so far.

In [6]:
duckdb.query("""
            SELECT Year, Season, City
            FROM athlete_events_df
            GROUP BY Year, Season, City
            ORDER BY Year
""").df()

Unnamed: 0,Year,Season,City
0,1896,Summer,Athina
1,1900,Summer,Paris
2,1904,Summer,St. Louis
3,1906,Summer,Athina
4,1908,Summer,London
5,1912,Summer,Stockholm
6,1920,Summer,Antwerpen
7,1924,Summer,Paris
8,1924,Winter,Chamonix
9,1928,Summer,Amsterdam


## 3. Mention the total no of nations who participated in each olympics game?

In [7]:
duckdb.query("""
            SELECT Games, COUNT(distinct NOC) as Total_countries
            FROM athlete_events_df
            GROUP BY Games
            ORDER BY Games
""").df()

Unnamed: 0,Games,Total_countries
0,1896 Summer,12
1,1900 Summer,31
2,1904 Summer,15
3,1906 Summer,21
4,1908 Summer,22
5,1912 Summer,29
6,1920 Summer,29
7,1924 Summer,45
8,1924 Winter,19
9,1928 Summer,46


## 4. Which year saw the highest and lowest no of countries participating in olympics

In [8]:
duckdb.query("""
            WITH t1 AS (SELECT Games, COUNT(distinct NOC) as Total_countries
                        FROM athlete_events_df
                        GROUP BY Games
                        ORDER BY Games),
                 t2 AS (SELECT MAX(Total_countries) AS highest_country , MIN(Total_countries) AS lowest_country FROM t1)
            SELECT * FROM t2
                 
""").df()

Unnamed: 0,highest_country,lowest_country
0,207,12


## 5. Which nation has participated in all of the olympic games

In [9]:
duckdb.query("""
            WITH t1 AS (SELECT REGIONS.Region as Region, COUNT(DISTINCT EVENTS.Games) AS Total_games
                        FROM athlete_events_df AS EVENTS
                        JOIN regions_df AS REGIONS
                        ON EVENTS.NOC = REGIONS.NOC
                        GROUP BY Region
                        ORDER BY Total_games DESC),
                        
                 t2 AS (SELECT *, DENSE_RANK() OVER(ORDER BY Total_games DESC) AS Rank FROM t1)
            SELECT * FROM t2 WHERE Rank = 1
""").df()

Unnamed: 0,Region,Total_games,Rank
0,Italy,51,1
1,Switzerland,51,1
2,UK,51,1
3,France,51,1


## 6. Identify the sport which was played in all summer olympics.

In [10]:
duckdb.query("""
            WITH t1 as (SELECT COUNT(DISTINCT Games) as num_summer_games 
                         FROM athlete_events_df
                         WHERE Season = 'Summer'),

            t2 as (SELECT DISTINCT Sport, Games
                        FROM athlete_events_df WHERE Season = 'Summer'
                        ORDER BY Games),

            t3 as (SELECT Sport, COUNT(Games) as num_of_games
                        FROM t2
                        GROUP BY Sport)

            SELECT * FROM t3
            INNER JOIN t1 on t1.num_summer_games = t3.num_of_games;
""").df()

Unnamed: 0,Sport,num_of_games,num_summer_games
0,Gymnastics,29,29
1,Fencing,29,29
2,Cycling,29,29
3,Swimming,29,29
4,Athletics,29,29


## 7. Which Sports were just played only once in the olympics.

In [11]:
duckdb.query("""
            SELECT Sport, COUNT (distinct Games) as num_of_games
            FROM athlete_events_df
            GROUP BY Sport
            HAVING num_of_games = 1
""").df()

Unnamed: 0,Sport,num_of_games
0,Rugby Sevens,1
1,Jeu De Paume,1
2,Cricket,1
3,Aeronautics,1
4,Roque,1
5,Motorboating,1
6,Basque Pelota,1
7,Military Ski Patrol,1
8,Racquets,1
9,Croquet,1


## 8. Fetch the total no of sports played in each olympic games.

In [12]:
duckdb.query("""
            SELECT Games, COUNT (distinct Sport) as num_of_sport
            FROM athlete_events_df
            GROUP BY Games
            ORDER BY num_of_sport DESC
""").df()

Unnamed: 0,Games,num_of_sport
0,2008 Summer,34
1,2016 Summer,34
2,2004 Summer,34
3,2000 Summer,34
4,2012 Summer,32
5,1996 Summer,31
6,1992 Summer,29
7,1988 Summer,27
8,1920 Summer,25
9,1984 Summer,25


## 9. Fetch oldest athletes to win a gold medal

In [13]:
duckdb.query("""
            WITH t1 as (SELECT Name, Sex, Age, Team, Games, City, Sport, Event, Medal
                        FROM athlete_events_df
                        WHERE Medal = 'Gold'),
                 t2 as (SELECT Name, Sex, MAX(Age) as Age, Team, Games, City, Sport, Event, Medal
                        FROM t1
                        GROUP BY Name, Sex, Age, Team, Games, City, Sport, Event, Medal
                        HAVING Age NOT NULL
                        ORDER BY Age DESC),
                 t3 AS (SELECT *, DENSE_RANK() OVER(ORDER BY Age DESC) AS Rank FROM t2)
            
            SELECT * FROM t3
            WHERE Rank = 1
""").df()

Unnamed: 0,Name,Sex,Age,Team,Games,City,Sport,Event,Medal,Rank
0,Oscar Gomer Swahn,M,64.0,Sweden,1912 Summer,Stockholm,Shooting,"Shooting Men's Running Target, Single Shot, Team",Gold,1
1,Charles Jacobus,M,64.0,United States,1904 Summer,St. Louis,Roque,Roque Men's Singles,Gold,1


## 10. Find the Ratio of male and female athletes participated in all olympic games.

In [14]:
duckdb.query("""
            WITH t1 AS (SELECT COUNT(Sex) AS Males 
                        FROM athlete_events_df
                        WHERE Sex = 'M' ),
                 t2 AS (SELECT COUNT(Sex) AS Females 
                        FROM athlete_events_df
                        WHERE Sex = 'F' )

            SELECT CONCAT( '1', ' : ', CAST(ROUND(t1.Males/t2.Females, 2) AS VARCHAR)) AS Ratio FROM t1, t2
""").df()

Unnamed: 0,Ratio
0,1 : 2.64


## 11. Fetch the top 5 athletes who have won the most gold medals.

In [15]:
duckdb.query("""
            WITH t1 AS (SELECT Name, Team, COUNT(Medal) as Total_medals
                        FROM athlete_events_df
                        WHERE Medal = 'Gold'
                        GROUP BY Name, Team
                        ORDER BY Total_medals DESC),
                 t2 AS (SELECT *, DENSE_RANK() OVER(ORDER BY Total_medals DESC) AS Rank FROM t1)
              
            SELECT Name, Team, Total_medals  FROM t2 WHERE Rank <= 5;
""").df()

Unnamed: 0,Name,Team,Total_medals
0,"Michael Fred Phelps, II",United States,23
1,"Raymond Clarence ""Ray"" Ewry",United States,10
2,Paavo Johannes Nurmi,Finland,9
3,"Frederick Carlton ""Carl"" Lewis",United States,9
4,Larysa Semenivna Latynina (Diriy-),Soviet Union,9
5,Mark Andrew Spitz,United States,9
6,"Matthew Nicholas ""Matt"" Biondi",United States,8
7,Usain St. Leo Bolt,Jamaica,8
8,Ole Einar Bjrndalen,Norway,8
9,Sawao Kato,Japan,8


## 12. Fetch the top 5 athletes who have won the most medals (gold/silver/bronze).

In [16]:
duckdb.query("""
            WITH t1 AS (SELECT Name, Team, COUNT(Medal) as Total_medals
                        FROM athlete_events_df
                        GROUP BY Name, Team
                        ORDER BY Total_medals DESC),
                 t2 AS (SELECT *, DENSE_RANK() OVER(ORDER BY Total_medals DESC) AS Rank FROM t1)

            SELECT Name, Team, Total_medals  FROM t2 WHERE Rank <= 5;
""").df()

Unnamed: 0,Name,Team,Total_medals
0,"Michael Fred Phelps, II",United States,28
1,Larysa Semenivna Latynina (Diriy-),Soviet Union,18
2,Nikolay Yefimovich Andrianov,Soviet Union,15
3,Edoardo Mangiarotti,Italy,13
4,Ole Einar Bjrndalen,Norway,13
5,Takashi Ono,Japan,13
6,Borys Anfiyanovych Shakhlin,Soviet Union,13
7,Aleksey Yuryevich Nemov,Russia,12
8,Paavo Johannes Nurmi,Finland,12
9,Ryan Steven Lochte,United States,12


## 13. Fetch the top 5 most successful countries in olympics. Success is defined by no of medals won.

In [17]:
duckdb.query("""
             WITH t1 AS (SELECT Team, COUNT(Medal) as Total_medals
                        FROM athlete_events_df
                        GROUP BY Team
                        ORDER BY Total_medals DESC),
                  t2 AS (SELECT *, DENSE_RANK() OVER(ORDER BY Total_medals DESC) AS Rank FROM t1)
                  
             SELECT *  FROM t2 WHERE Rank <=5;
""").df()

Unnamed: 0,Team,Total_medals,Rank
0,United States,5219,1
1,Soviet Union,2451,2
2,Germany,1984,3
3,Great Britain,1673,4
4,France,1550,5


## 14. List down total gold, silver and bronze medals won by each country.

In [18]:
duckdb.query("""
            WITH t1 AS (SELECT Team AS Country, COUNT(Medal) as Gold
                        FROM athlete_events_df
                        WHERE Medal = 'Gold'
                        GROUP BY Team
                        ORDER BY Gold DESC),
                 t2 AS (SELECT Team AS Country, COUNT(Medal) as Silver
                        FROM athlete_events_df
                        WHERE Medal = 'Silver'
                        GROUP BY Team
                        ORDER BY Silver DESC),
                 t3 AS (SELECT Team AS Country, COUNT(Medal) as Bronze
                        FROM athlete_events_df
                        WHERE Medal = 'Bronze'
                        GROUP BY Team
                        ORDER BY Bronze DESC),

                 t4 AS (SELECT t1.Country, t1.Gold, t2.Silver, t3.Bronze FROM t1
                        LEFT JOIN t2 ON t2.Country = t1.Country
                        LEFT JOIN t3 ON t3.Country = t2.Country)

            SELECT Country, coalesce(Gold, 0) AS Gold, coalesce(Silver, 0) AS Silver, coalesce(Bronze, 0) AS Bronze FROM t4;             
""").df()

Unnamed: 0,Country,Gold,Silver,Bronze
0,United States,2474,1512,1233
1,Soviet Union,1058,716,677
2,Germany,679,627,678
3,Italy,535,508,484
4,Great Britain,519,582,572
...,...,...,...,...
237,Hong Kong,1,0,0
238,Mozambique,1,0,0
239,Belgium-1,1,0,0
240,Suriname,1,0,0


## 15. List down total gold, silver and bronze medals won by each country corresponding to each olympic games.

In [19]:
duckdb.query("""
            SELECT EVENTS.Games,REG.Region as Region,
            COUNT(CASE WHEN EVENTS.Medal='Gold'   THEN 1 End) AS Gold,
            COUNT(CASE WHEN EVENTS.Medal='Silver' THEN 1 End) AS Silver,
            COUNT(CASE WHEN EVENTS.Medal='Bronze' THEN 1 End) AS Bronze
            FROM athlete_events_df as EVENTS
            JOIN regions_df as REG ON EVENTS.NOC = REG.NOC
            GROUP BY EVENTS.Games,REG.Region
            ORDER BY EVENTS.Games,REG.Region;             
""").df()

Unnamed: 0,Games,Region,Gold,Silver,Bronze
0,1896 Summer,Australia,2,0,1
1,1896 Summer,Austria,2,1,2
2,1896 Summer,Denmark,1,2,3
3,1896 Summer,France,5,4,2
4,1896 Summer,Germany,25,5,2
...,...,...,...,...,...
3785,2016 Summer,"Virgin Islands, US",0,0,0
3786,2016 Summer,Yemen,0,0,0
3787,2016 Summer,Zambia,0,0,0
3788,2016 Summer,Zimbabwe,0,0,0


## 16. Identify which country won the most gold, most silver and most bronze medals in each olympic games.

In [20]:
duckdb.query("""
            WITH t1 AS (SELECT distinct EVENTS.Games, REGION.Region as Region,
                        COUNT (CASE WHEN EVENTS.Medal = 'Gold'   THEN 1 END) AS Gold,
                        COUNT (CASE WHEN EVENTS.Medal = 'Silver' THEN 1 END) AS Silver,
                        COUNT (CASE WHEN EVENTS.Medal = 'Bronze' THEN 1 END) AS Bronze,
                        FROM athlete_events_df as EVENTS
                        JOIN regions_df as REGION
                        ON REGION.NOC = EVENTS.NOC
                        GROUP BY Games, Region
                        ORDER BY Games, Gold, Silver, Bronze)

            SELECT distinct Games, 

            CONCAT((FIRST_VALUE(Region) OVER(PARTITION by Games order by Gold DESC)),'-',
            (FIRST_VALUE(Gold) OVER(partition by Games order by Gold desc))) As Max_Gold,

            CONCAT((FIRST_VALUE(Region) OVER(PARTITION by Games order by Silver DESC)),'-',
            (FIRST_VALUE(Silver) OVER(PARTITION by Games order by Silver desc))) As Max_Silver,

            CONCAT((FIRST_VALUE(Region) OVER(PARTITION by Games order by Bronze desc)),'-',
            (FIRST_VALUE(Bronze) OVER(PARTITION by Games order by Bronze desc))) As Max_Bronze,           

            FROM t1
            ORDER BY Games;
""").df()

Unnamed: 0,Games,Max_Gold,Max_Silver,Max_Bronze
0,1896 Summer,Germany-25,Greece-18,Greece-20
1,1900 Summer,UK-59,France-101,France-82
2,1904 Summer,USA-128,USA-141,USA-125
3,1906 Summer,Greece-24,Greece-48,Greece-30
4,1908 Summer,UK-147,UK-131,UK-90
5,1912 Summer,Sweden-103,UK-64,UK-59
6,1920 Summer,USA-111,France-71,Belgium-66
7,1924 Summer,USA-97,France-51,USA-49
8,1924 Winter,UK-16,USA-10,UK-11
9,1928 Summer,USA-47,Netherlands-29,Germany-41


## 17. Identify which country won the most gold, most silver, most bronze medals and the most medals in each olympic games.

In [21]:
duckdb.query("""
            WITH t1 AS (SELECT distinct EVENTS.Games, REGION.Region as Region,
                        COUNT (CASE WHEN EVENTS.Medal = 'Gold'   THEN 1 END) AS Gold,
                        COUNT (CASE WHEN EVENTS.Medal = 'Silver' THEN 1 END) AS Silver,
                        COUNT (CASE WHEN EVENTS.Medal = 'Bronze' THEN 1 END) AS Bronze,
                        SUM   (CASE WHEN MEDAL<>'NA'then 1 ELSE 0 end) as Total_Medals
                        FROM athlete_events_df as EVENTS
                        JOIN regions_df as REGION
                        ON REGION.NOC = EVENTS.NOC
                        GROUP BY Games, Region
                        ORDER BY Games, Gold, Silver, Bronze)

            SELECT DISTINCT Games, 

            CONCAT((FIRST_VALUE(Region) OVER(PARTITION by Games order by Gold DESC)),'-',
            (FIRST_VALUE(Gold) OVER(partition by Games order by Gold desc))) As Max_Gold,

            CONCAT((FIRST_VALUE(Region) OVER(PARTITION by Games order by Silver DESC)),'-',
            (FIRST_VALUE(Silver) OVER(PARTITION by Games order by Silver desc))) As Max_Silver,

            CONCAT((FIRST_VALUE(Region) OVER(PARTITION by Games order by Bronze desc)),'-',
            (FIRST_VALUE(Bronze) OVER(PARTITION by Games order by Bronze desc))) As Max_Bronze,           

            Concat((FIRST_VALUE(Region) OVER(PARTITION BY GAMES ORDER BY Total_Medals DESC)),'-',
            (FIRST_VALUE(Total_Medals) OVER(PARTITION BY GAMES ORDER BY Total_Medals DESC))) As Max_Medals

            FROM t1
            ORDER BY Games;
""").df()

Unnamed: 0,Games,Max_Gold,Max_Silver,Max_Bronze,Max_Medals
0,1896 Summer,Germany-25,Greece-18,Greece-20,Greece-48
1,1900 Summer,UK-59,France-101,France-82,France-235
2,1904 Summer,USA-128,USA-141,USA-125,USA-394
3,1906 Summer,Greece-24,Greece-48,Greece-30,Greece-102
4,1908 Summer,UK-147,UK-131,UK-90,UK-368
5,1912 Summer,Sweden-103,UK-64,UK-59,Sweden-190
6,1920 Summer,USA-111,France-71,Belgium-66,USA-194
7,1924 Summer,USA-97,France-51,USA-49,USA-182
8,1924 Winter,UK-16,USA-10,UK-11,UK-31
9,1928 Summer,USA-47,Netherlands-29,Germany-41,USA-88


## 18. Which countries have never won gold medal but have won silver/bronze medals?

In [22]:
duckdb.query("""
            SELECT Region AS Region,
            COUNT (CASE WHEN EVENTS.Medal = 'Gold' THEN 1 END) AS Gold,
            COUNT (CASE WHEN EVENTS.Medal = 'Silver' THEN 1 END) AS Silver,
            COUNT (CASE WHEN EVENTS.Medal = 'Bronze' THEN 1 END) AS Bronze,

            FROM athlete_events_df AS EVENTS
            JOIN regions_df AS REGIONS
            ON REGIONS.NOC = EVENTS.NOC

            GROUP BY Region
            HAVING Gold = 0 
            ORDER BY Silver DESC  
""").df()

Unnamed: 0,Region,Gold,Silver,Bronze
0,Paraguay,0,17,0
1,Iceland,0,15,2
2,Montenegro,0,14,0
3,Malaysia,0,11,5
4,Namibia,0,4,0
...,...,...,...,...
102,Brunei,0,0,0
103,Micronesia,0,0,0
104,South Sudan,0,0,0
105,Equatorial Guinea,0,0,0


## 19. In which Sport/event, India has won highest medals.

In [23]:
duckdb.query("""
            SELECT REGIONS.Region AS Region, EVENTS.Sport, COUNT(EVENTS.Medal) AS Total_medals, 

            FROM athlete_events_df AS EVENTS
            JOIN regions_df AS REGIONS
            ON REGIONS.NOC = EVENTS.NOC

            GROUP BY Region, Sport
            HAVING Region = 'India'
            ORDER BY Total_medals DESC 
            LIMIT 1                       
""").df()

Unnamed: 0,Region,Sport,Total_medals
0,India,Hockey,173


## 20. Break down all olympic games where India won medal for Hockey and how many medals in each olympic games

In [24]:
duckdb.query("""
            SELECT REGIONS.Region AS Region, EVENTS.Sport, EVENTS.Games, COUNT(EVENTS.Medal) AS Total_medals, 

            FROM athlete_events_df AS EVENTS
            JOIN regions_df AS REGIONS
            ON REGIONS.NOC = EVENTS.NOC

            GROUP BY Region, Sport, Games
            HAVING Region = 'India' AND Sport = 'Hockey' AND Total_medals != 0
            ORDER BY Total_medals DESC                        
""").df()

Unnamed: 0,Region,Sport,Games,Total_medals
0,India,Hockey,1948 Summer,20
1,India,Hockey,1936 Summer,19
2,India,Hockey,1956 Summer,17
3,India,Hockey,1968 Summer,16
4,India,Hockey,1980 Summer,16
5,India,Hockey,1964 Summer,15
6,India,Hockey,1932 Summer,15
7,India,Hockey,1972 Summer,14
8,India,Hockey,1952 Summer,14
9,India,Hockey,1928 Summer,14
