## Load Library

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter

In [2]:
# Location of the Nobel laureates CSV, relative to the notebook.
NOBEL_CSV_PATH = "../input/nobel-laureates/archive.csv"

# Read the whole file into a DataFrame; every later cell works on this frame.
nobel_data = pd.read_csv(NOBEL_CSV_PATH)

## EDA

#### Looking at the first several winners

In [3]:
# Preview the first five rows of the frame.
nobel_data.head(n=5)

#### Number of Nobel prizes between 1901 - 2016

In [4]:
# One row per entry in the CSV; report how many rows were loaded.
prize_count = nobel_data.shape[0]
print(f"There are {prize_count} Nobel prizes handed out between 1901 and 2016")

#### Count of Nobel Prize winners by sex

In [5]:
# Bar chart with one bar per distinct value of the "Sex" column.
gender_ax = sns.countplot(data=nobel_data, x="Sex", palette="Set3")
gender_ax.set_title("Nobel Prize Winners by Gender")
plt.show()

In [6]:
# Decade of each award: round the year down to a multiple of ten (1987 -> 1980).
award_year = nobel_data["Year"]
nobel_data["Decade"] = (award_year // 10 * 10).astype(int)

# Boolean flag that is True where Sex is "Female". Rows with a missing Sex
# compare as False, so they are counted as non-female below.
nobel_data["Female Winner"] = nobel_data["Sex"].eq("Female")

# The mean of a boolean column is the share of True values, which gives the
# proportion of female laureates in each decade.
prop_female = (
    nobel_data
    .groupby("Decade", as_index=False)["Female Winner"]
    .mean()
)

In [7]:
# Apply seaborn's default plotting theme.
sns.set()

# Line chart of the share of female laureates per decade, drawn on an
# explicitly created 12x6 inch figure.
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(x="Decade", y="Female Winner", data=prop_female, ax=ax)

# The proportions are fractions in [0, 1]; show them as percentages.
ax.yaxis.set_major_formatter(PercentFormatter(1.0))
ax.set_title("The proportion of Female Laureates each Decade")

# Save BEFORE plt.show(): once show() returns the figure is closed, so a
# later plt.savefig() would write a new, blank figure to disk.
fig.savefig("proportion_female.png")
plt.show()

In [8]:
# Earliest female laureate: among rows whose Sex is "Female", keep the single
# row with the smallest Year.
is_female = nobel_data["Sex"] == "Female"
nobel_data.loc[is_female].nsmallest(1, "Year")

#### Top 5 birth countries of Nobel Prize winners

In [9]:
# The five most frequent birth countries, in descending order of laureate count.
top_birth_countries = nobel_data["Birth Country"].value_counts().head(5).index

# Horizontal bar chart restricted to (and ordered by) those five countries.
ax = sns.countplot(y="Birth Country", data=nobel_data, order=top_birth_countries)
ax.set_title("Top 5 Nobel Prize Winners by Nationality")

# Save BEFORE plt.show(): once show() returns the figure is closed, so a
# later plt.savefig() would write a new, blank figure to disk.
ax.figure.savefig("top_5_nationalities.png")
plt.show()

#### Looking at US Dominance per decade

In [10]:
# Boolean flag that is True where the recorded birth country is the USA.
# Rows with a missing birth country compare as False.
nobel_data["US Born"] = nobel_data["Birth Country"].eq("United States of America")

# Share of US-born laureates for every (decade, prize category) pair;
# the mean of a boolean column is the fraction of True values.
prop_us = (
    nobel_data
    .groupby(["Decade", "Category"], as_index=False)["US Born"]
    .mean()
)

In [11]:
# Line chart of the US-born share per decade, one line per prize category,
# drawn on an explicitly created 12x6 inch figure.
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(x="Decade", y="US Born", hue="Category", data=prop_us, ax=ax)

# The proportions are fractions in [0, 1]; show them as percentages.
ax.yaxis.set_major_formatter(PercentFormatter(1.0))
ax.set_title("The proportion of US win for each Decade")

# Save BEFORE plt.show(): once show() returns the figure is closed, so a
# later plt.savefig() would write a new, blank figure to disk.
fig.savefig('proportion_us.png')
plt.show()

#### Looking at laureates who have received more than one prize

In [12]:
# Keep only the rows whose "Full Name" occurs more than once in the frame.
# NOTE(review): this counts rows, not distinct prizes — confirm the CSV has
# exactly one row per laureate per prize.
nobel_data.groupby("Full Name").filter(
    lambda laureate_rows: laureate_rows.shape[0] > 1
)

In [13]:
# Print a summary of the frame: column names, non-null counts and dtypes.
nobel_data.info()

#### Age vs Year

In [14]:
# Parse the birth dates; values that cannot be parsed become NaT instead of
# raising, because of errors="coerce".
parsed_birth_dates = pd.to_datetime(nobel_data["Birth Date"], errors="coerce")
nobel_data["Birth Date"] = parsed_birth_dates

In [15]:
# Age at the time of the award, approximated as award year minus birth year.
# Rows whose birth date is missing (NaT) get a missing age.
birth_year = nobel_data["Birth Date"].dt.year
nobel_data["Age"] = nobel_data["Year"].sub(birth_year)

In [19]:
# Scatter of laureate age against award year with a LOWESS trend line
# drawn in black; aspect=2 makes the plot twice as wide as it is tall.
sns.lmplot(
    x="Year",
    y="Age",
    data=nobel_data,
    aspect=2,
    line_kws={'color': 'black'},
    lowess=True,
)
plt.title("Age Distribution")

# Save BEFORE plt.show(): once show() returns the figure is closed, so a
# later plt.savefig() would write a new, blank figure to disk.
# The output filename is kept exactly as it was so existing consumers still
# find the file.
plt.savefig('age_distibution.png')
plt.show()

In [17]:
# Oldest laureate: the single row with the largest "Age" value.
nobel_data.nlargest(1, "Age")

In [18]:
# Youngest laureate: the single row with the smallest "Age" value.
nobel_data.nsmallest(1, "Age")