# 1 - Using Pandas

### Importing pandas

In [1]:
import numpy as np
import pandas as pd

# 2 - Series and DataFrames

In [2]:
df = pd.read_csv('olympics.csv', skiprows=4)  # skiprows=4 skips the first four lines of the file

# age and height are synthetic columns for the exercises: normally distributed
# draws (std 5 around 25 and around 175) cast to int. The fixed seed makes the
# values reproducible on every run.
np.random.seed(824)
df['age'] = (5*np.random.randn(df.shape[0]) + 25).astype(int)
df['height'] = (5*np.random.randn(df.shape[0]) + 175).astype(int)

### Accessing the DataFrame

In [3]:
df.head()  # preview the first five rows, including the synthetic age/height columns

Unnamed: 0,City,Edition,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal,age,height
0,Athens,1896,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100m freestyle,M,Gold,17,176
1,Athens,1896,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100m freestyle,M,Silver,22,172
2,Athens,1896,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100m freestyle for sailors,M,Bronze,21,180
3,Athens,1896,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100m freestyle for sailors,M,Gold,32,179
4,Athens,1896,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100m freestyle for sailors,M,Silver,25,173


### Accessing Series

In [4]:
df.Athlete.head()  # attribute access returns the column as a Series

0         HAJOS, Alfred
1      HERSCHMANN, Otto
2     DRIVAS, Dimitrios
3    MALOKINIS, Ioannis
4    CHASAPIS, Spiridon
Name: Athlete, dtype: object

In [5]:
df['Athlete'].head()  # bracket access with a single label returns the same Series

0         HAJOS, Alfred
1      HERSCHMANN, Otto
2     DRIVAS, Dimitrios
3    MALOKINIS, Ioannis
4    CHASAPIS, Spiridon
Name: Athlete, dtype: object

In [6]:
df[['City','Edition','Athlete']].head()  # a list of labels selects several columns and yields a DataFrame

Unnamed: 0,City,Edition,Athlete
0,Athens,1896,"HAJOS, Alfred"
1,Athens,1896,"HERSCHMANN, Otto"
2,Athens,1896,"DRIVAS, Dimitrios"
3,Athens,1896,"MALOKINIS, Ioannis"
4,Athens,1896,"CHASAPIS, Spiridon"


In [7]:
type(df)  # the full table is a DataFrame

pandas.core.frame.DataFrame

In [8]:
type(df['City'])  # a single column selected with brackets is a Series

pandas.core.series.Series

In [9]:
type(df.City)  # attribute access gives the same Series type

pandas.core.series.Series

In [10]:
type(df[['City','Edition','Athlete']])  # selecting with a list of labels keeps the DataFrame type

pandas.core.frame.DataFrame

# 3 - Overview and validation of data
### Shape

In [11]:
df.shape  # (number of rows, number of columns)

(29216, 12)

In [12]:
df.shape[0]  # row count only

29216

### head() and tail()

In [13]:
df.head(3)  # head() accepts the number of rows to show

Unnamed: 0,City,Edition,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal,age,height
0,Athens,1896,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100m freestyle,M,Gold,17,176
1,Athens,1896,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100m freestyle,M,Silver,22,172
2,Athens,1896,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100m freestyle for sailors,M,Bronze,21,180


In [14]:
df.tail()  # last five rows of the frame

Unnamed: 0,City,Edition,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal,age,height
29211,Beijing,2008,Wrestling,Wrestling Gre-R,"ENGLICH, Mirko",GER,Men,84 - 96kg,M,Silver,26,180
29212,Beijing,2008,Wrestling,Wrestling Gre-R,"MIZGAITIS, Mindaugas",LTU,Men,96 - 120kg,M,Bronze,14,158
29213,Beijing,2008,Wrestling,Wrestling Gre-R,"PATRIKEEV, Yuri",ARM,Men,96 - 120kg,M,Bronze,16,170
29214,Beijing,2008,Wrestling,Wrestling Gre-R,"LOPEZ, Mijain",CUB,Men,96 - 120kg,M,Gold,21,171
29215,Beijing,2008,Wrestling,Wrestling Gre-R,"BAROEV, Khasan",RUS,Men,96 - 120kg,M,Silver,24,168


### info()

In [15]:
df.info()  # prints the index range, per-column non-null counts and dtypes, and memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29216 entries, 0 to 29215
Data columns (total 12 columns):
City            29216 non-null object
Edition         29216 non-null int64
Sport           29216 non-null object
Discipline      29216 non-null object
Athlete         29216 non-null object
NOC             29216 non-null object
Gender          29216 non-null object
Event           29216 non-null object
Event_gender    29216 non-null object
Medal           29216 non-null object
age             29216 non-null int64
height          29216 non-null int64
dtypes: int64(3), object(9)
memory usage: 2.7+ MB


### describe()

In [16]:
df.describe()  # count, mean, std, min, quartiles and max; only the numeric columns appear in the result

Unnamed: 0,Edition,age,height
count,29216.0,29216.0,29216.0
mean,1967.713171,24.499179,174.538506
std,32.406293,5.014502,5.004713
min,1896.0,5.0,153.0
25%,1948.0,21.0,171.0
50%,1976.0,25.0,175.0
75%,1996.0,28.0,178.0
max,2008.0,44.0,195.0


# 4 - Basic Analysis

In [17]:
df = pd.read_csv('olympics.csv', skiprows=4)  # start the section from a freshly loaded frame
np.random.seed(824)  # fixed seed keeps the synthetic columns reproducible
n_rows = len(df)
# age is drawn before height, so the random stream is consumed in the same order on every run.
df = df.assign(
    age=(5*np.random.randn(n_rows) + 25).astype(int),
    height=(5*np.random.randn(n_rows) + 175).astype(int),
)
df.head()

Unnamed: 0,City,Edition,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal,age,height
0,Athens,1896,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100m freestyle,M,Gold,17,176
1,Athens,1896,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100m freestyle,M,Silver,22,172
2,Athens,1896,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100m freestyle for sailors,M,Bronze,21,180
3,Athens,1896,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100m freestyle for sailors,M,Gold,32,179
4,Athens,1896,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100m freestyle for sailors,M,Silver,25,173


### value_counts()

In [18]:
df.Edition.value_counts().head()  # rows per Edition, most frequent first; head() keeps the top five

2008    2042
2000    2015
2004    1998
1996    1859
1992    1705
Name: Edition, dtype: int64

In [19]:
df.Gender.value_counts(ascending=True)  # ascending=True lists the least frequent value first

Women     7495
Men      21721
Name: Gender, dtype: int64

### sort_values()

In [20]:
ath = df['Athlete'].sort_values(ascending=True)  # ascending is the default, spelled out here
ath.head()  # the original row labels travel with the sorted values

651                 AABYE, Edgar
2849       AALTONEN, Arvo Ossian
2852       AALTONEN, Arvo Ossian
7716    AALTONEN, Paavo Johannes
7730    AALTONEN, Paavo Johannes
Name: Athlete, dtype: object

In [21]:
ath = df['Athlete'].sort_values(ascending=False)  # same column, descending order
ath.head()

8051    ÖSTRAND, Per-Olof
596            ÖSTMO, Ole
621            ÖSTMO, Ole
608            ÖSTMO, Ole
603            ÖSTMO, Ole
Name: Athlete, dtype: object

In [22]:
df1 = df.sort_values(
    by=['Edition', 'age'],       # primary key Edition, secondary key age
    ascending=[True, False],     # Editions in increasing order, ages decreasing within each Edition
)
df1.head()

Unnamed: 0,City,Edition,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal,age,height
114,Athens,1896,Gymnastics,Artistic G.,"WEINGÄRTNER, Hermann",GER,Men,vault,M,Bronze,40,174
13,Athens,1896,Athletics,Athletics,"BURKE, Thomas",USA,Men,100m,M,Gold,38,173
9,Athens,1896,Aquatics,Swimming,"NEUMANN, Paul",AUT,Men,400m freestyle,M,Gold,35,181
43,Athens,1896,Athletics,Athletics,"GARRETT, Robert",USA,Men,shot put,M,Gold,35,170
120,Athens,1896,Shooting,Shooting,"NIELSEN, Holger",DEN,Men,25m rapid fire pistol (60 shots),M,Bronze,35,175


### drop_duplicates()

In [23]:
df1 = df1.drop_duplicates(subset='Edition', keep='first')  # keep the first row of each Edition in df1's current order

In [24]:
df1  # one row per Edition: because of the earlier sort, the first row kept is the one with the highest age

Unnamed: 0,City,Edition,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal,age,height
114,Athens,1896,Gymnastics,Artistic G.,"WEINGÄRTNER, Hermann",GER,Men,vault,M,Bronze,40,174
338,Paris,1900,Croquet,Croquet,JOHIN,FRA,Men,individual 1 ball,M,Silver,38,165
762,St Louis,1904,Athletics,Athletics,"PILGRIM, Paul Harry",USA,Men,4miles team,M,Gold,42,170
1768,London,1908,Shooting,Shooting,"CARNELL, Arthur Ashton",GBR,Men,50m rifle prone (60 shots),M,Gold,40,171
2236,Stockholm,1912,Fencing,Fencing,"GOLLING, Friedrich",AUT,Men,sabre team,M,Silver,38,177
2918,Antwerp,1920,Archery,Archery,"CLOETENS, Edmond",BEL,Men,"fixed bird target small bird, teams",M,Gold,40,175
4853,Paris,1924,Shooting,Shooting,"LIBERG, Einar",NOR,Men,"100m running deer, double shots, team",M,Silver,39,177
5137,Amsterdam,1928,Athletics,Athletics,"THOMPSON, Jean",CAN,Women,4x100m relay,W,Gold,43,181
6265,Los Angeles,1932,Sailing,Sailing,"RATSEY, George Colin",GBR,Men,two-person keelboat open (Star),X,Silver,39,166
6514,Berlin,1936,Athletics,Athletics,"CSAK, Ibolya",HUN,Women,high jump,W,Gold,40,177


### Boolean indexing

In [25]:
is_gold = df.Medal == 'Gold'
in_beijing = df.City == 'Beijing'
is_sprint = (df.Event == '100m') | (df.Event == '200m')  # | is element-wise OR
df[is_gold & in_beijing & is_sprint]  # & is element-wise AND; only rows where every mask is True are kept

Unnamed: 0,City,Edition,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal,age,height
27552,Beijing,2008,Athletics,Athletics,"BOLT, Usain",JAM,Men,100m,M,Gold,28,167
27554,Beijing,2008,Athletics,Athletics,"FRASER, Shelly-ann",JAM,Women,100m,W,Gold,20,180
27570,Beijing,2008,Athletics,Athletics,"BOLT, Usain",JAM,Men,200m,M,Gold,24,177
27573,Beijing,2008,Athletics,Athletics,"CAMPBELL-BROWN, Veronica",JAM,Women,200m,W,Gold,28,181


### String handling

In [26]:
df[df.Athlete.str.contains('Usain', regex=False, na=False)]  # literal substring match; missing names count as non-matches

Unnamed: 0,City,Edition,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal,age,height
27552,Beijing,2008,Athletics,Athletics,"BOLT, Usain",JAM,Men,100m,M,Gold,28,167
27570,Beijing,2008,Athletics,Athletics,"BOLT, Usain",JAM,Men,200m,M,Gold,24,177
27603,Beijing,2008,Athletics,Athletics,"BOLT, Usain",JAM,Men,4x100m relay,M,Gold,32,179


# 5 - Indexing

In [27]:
df = pd.read_csv('olympics.csv', skiprows=4)  # start the section from a freshly loaded frame
np.random.seed(824)  # fixed seed keeps the synthetic columns reproducible
n_rows = len(df)
# age is drawn before height, so the random stream is consumed in the same order on every run.
df = df.assign(
    age=(5*np.random.randn(n_rows) + 25).astype(int),
    height=(5*np.random.randn(n_rows) + 175).astype(int),
)
df.head()

Unnamed: 0,City,Edition,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal,age,height
0,Athens,1896,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100m freestyle,M,Gold,17,176
1,Athens,1896,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100m freestyle,M,Silver,22,172
2,Athens,1896,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100m freestyle for sailors,M,Bronze,21,180
3,Athens,1896,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100m freestyle for sailors,M,Gold,32,179
4,Athens,1896,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100m freestyle for sailors,M,Silver,25,173


### Index

In [28]:
type(df.index)  # a freshly read frame gets a RangeIndex

pandas.core.indexes.range.RangeIndex

In [29]:
df.index[100]  # the index supports positional access; with a RangeIndex the label equals the position

100

### set_index() and reset_index()

In [30]:
df.set_index('Athlete').head()  # returns a new frame indexed by Athlete; df itself is left unchanged

Unnamed: 0_level_0,City,Edition,Sport,Discipline,NOC,Gender,Event,Event_gender,Medal,age,height
Athlete,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
"HAJOS, Alfred",Athens,1896,Aquatics,Swimming,HUN,Men,100m freestyle,M,Gold,17,176
"HERSCHMANN, Otto",Athens,1896,Aquatics,Swimming,AUT,Men,100m freestyle,M,Silver,22,172
"DRIVAS, Dimitrios",Athens,1896,Aquatics,Swimming,GRE,Men,100m freestyle for sailors,M,Bronze,21,180
"MALOKINIS, Ioannis",Athens,1896,Aquatics,Swimming,GRE,Men,100m freestyle for sailors,M,Gold,32,179
"CHASAPIS, Spiridon",Athens,1896,Aquatics,Swimming,GRE,Men,100m freestyle for sailors,M,Silver,25,173


In [31]:
df.head()  # df still has its integer index and the Athlete column

Unnamed: 0,City,Edition,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal,age,height
0,Athens,1896,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100m freestyle,M,Gold,17,176
1,Athens,1896,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100m freestyle,M,Silver,22,172
2,Athens,1896,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100m freestyle for sailors,M,Bronze,21,180
3,Athens,1896,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100m freestyle for sailors,M,Gold,32,179
4,Athens,1896,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100m freestyle for sailors,M,Silver,25,173


In [32]:
df.set_index('Athlete',inplace=True)  # inplace=True modifies df itself, so the cell displays nothing

In [33]:
df.head()  # Athlete is now the index rather than a column

Unnamed: 0_level_0,City,Edition,Sport,Discipline,NOC,Gender,Event,Event_gender,Medal,age,height
Athlete,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
"HAJOS, Alfred",Athens,1896,Aquatics,Swimming,HUN,Men,100m freestyle,M,Gold,17,176
"HERSCHMANN, Otto",Athens,1896,Aquatics,Swimming,AUT,Men,100m freestyle,M,Silver,22,172
"DRIVAS, Dimitrios",Athens,1896,Aquatics,Swimming,GRE,Men,100m freestyle for sailors,M,Bronze,21,180
"MALOKINIS, Ioannis",Athens,1896,Aquatics,Swimming,GRE,Men,100m freestyle for sailors,M,Gold,32,179
"CHASAPIS, Spiridon",Athens,1896,Aquatics,Swimming,GRE,Men,100m freestyle for sailors,M,Silver,25,173


In [34]:
df.reset_index(inplace=True)  # restores the integer index; Athlete returns as the first column

In [35]:
df.head()  # integer index is back; note that Athlete now precedes City in the column order

Unnamed: 0,Athlete,City,Edition,Sport,Discipline,NOC,Gender,Event,Event_gender,Medal,age,height
0,"HAJOS, Alfred",Athens,1896,Aquatics,Swimming,HUN,Men,100m freestyle,M,Gold,17,176
1,"HERSCHMANN, Otto",Athens,1896,Aquatics,Swimming,AUT,Men,100m freestyle,M,Silver,22,172
2,"DRIVAS, Dimitrios",Athens,1896,Aquatics,Swimming,GRE,Men,100m freestyle for sailors,M,Bronze,21,180
3,"MALOKINIS, Ioannis",Athens,1896,Aquatics,Swimming,GRE,Men,100m freestyle for sailors,M,Gold,32,179
4,"CHASAPIS, Spiridon",Athens,1896,Aquatics,Swimming,GRE,Men,100m freestyle for sailors,M,Silver,25,173


### sort_index()

In [36]:
df1 = df.set_index(keys='Athlete')  # new frame indexed by athlete name; df keeps its integer index
df1.head()

Unnamed: 0_level_0,City,Edition,Sport,Discipline,NOC,Gender,Event,Event_gender,Medal,age,height
Athlete,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
"HAJOS, Alfred",Athens,1896,Aquatics,Swimming,HUN,Men,100m freestyle,M,Gold,17,176
"HERSCHMANN, Otto",Athens,1896,Aquatics,Swimming,AUT,Men,100m freestyle,M,Silver,22,172
"DRIVAS, Dimitrios",Athens,1896,Aquatics,Swimming,GRE,Men,100m freestyle for sailors,M,Bronze,21,180
"MALOKINIS, Ioannis",Athens,1896,Aquatics,Swimming,GRE,Men,100m freestyle for sailors,M,Gold,32,179
"CHASAPIS, Spiridon",Athens,1896,Aquatics,Swimming,GRE,Men,100m freestyle for sailors,M,Silver,25,173


In [37]:
df1 = df1.sort_index(ascending=False)  # descending index order, rebound to df1 instead of mutating in place
df1.head()

Unnamed: 0_level_0,City,Edition,Sport,Discipline,NOC,Gender,Event,Event_gender,Medal,age,height
Athlete,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
"ÖSTRAND, Per-Olof",Helsinki,1952,Aquatics,Swimming,SWE,Men,400m freestyle,M,Bronze,22,175
"ÖSTMO, Ole",Paris,1900,Shooting,Shooting,NOR,Men,"free rifle, team",M,Silver,14,176
"ÖSTMO, Ole",Paris,1900,Shooting,Shooting,NOR,Men,"army rifle, 300m, standing",M,Silver,23,172
"ÖSTMO, Ole",Paris,1900,Shooting,Shooting,NOR,Men,"army rifle, 300m, prone",M,Bronze,16,177
"ÖSTMO, Ole",Paris,1900,Shooting,Shooting,NOR,Men,"army rifle, 300m, 3 positions",M,Bronze,29,178


### loc[...]

In [38]:
df1.loc['BOLT, Usain']  # label lookup on the Athlete index; the label is not unique, so every matching row is returned

Unnamed: 0_level_0,City,Edition,Sport,Discipline,NOC,Gender,Event,Event_gender,Medal,age,height
Athlete,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
"BOLT, Usain",Beijing,2008,Athletics,Athletics,JAM,Men,4x100m relay,M,Gold,32,179
"BOLT, Usain",Beijing,2008,Athletics,Athletics,JAM,Men,200m,M,Gold,24,177
"BOLT, Usain",Beijing,2008,Athletics,Athletics,JAM,Men,100m,M,Gold,28,167


In [39]:
df.head()  # df (unlike df1) still has the integer index, with Athlete as a regular column

Unnamed: 0,Athlete,City,Edition,Sport,Discipline,NOC,Gender,Event,Event_gender,Medal,age,height
0,"HAJOS, Alfred",Athens,1896,Aquatics,Swimming,HUN,Men,100m freestyle,M,Gold,17,176
1,"HERSCHMANN, Otto",Athens,1896,Aquatics,Swimming,AUT,Men,100m freestyle,M,Silver,22,172
2,"DRIVAS, Dimitrios",Athens,1896,Aquatics,Swimming,GRE,Men,100m freestyle for sailors,M,Bronze,21,180
3,"MALOKINIS, Ioannis",Athens,1896,Aquatics,Swimming,GRE,Men,100m freestyle for sailors,M,Gold,32,179
4,"CHASAPIS, Spiridon",Athens,1896,Aquatics,Swimming,GRE,Men,100m freestyle for sailors,M,Silver,25,173


In [40]:
df.loc[df.Athlete == 'BOLT, Usain']  # loc also accepts a boolean mask; the original integer labels are kept

Unnamed: 0,Athlete,City,Edition,Sport,Discipline,NOC,Gender,Event,Event_gender,Medal,age,height
27552,"BOLT, Usain",Beijing,2008,Athletics,Athletics,JAM,Men,100m,M,Gold,28,167
27570,"BOLT, Usain",Beijing,2008,Athletics,Athletics,JAM,Men,200m,M,Gold,24,177
27603,"BOLT, Usain",Beijing,2008,Athletics,Athletics,JAM,Men,4x100m relay,M,Gold,32,179


### iloc[...]

In [41]:
df.iloc[1700]  # a single integer position returns that row as a Series

Athlete         RABOT, Pierre
City                   London
Edition                  1908
Sport                 Sailing
Discipline            Sailing
NOC                       FRA
Gender                    Men
Event                      6m
Event_gender                X
Medal                  Bronze
age                        21
height                    180
Name: 1700, dtype: object

In [42]:
df.iloc[[1542, 2390,6000,15000]]  # a list of positions returns the selected rows as a DataFrame

Unnamed: 0,Athlete,City,Edition,Sport,Discipline,NOC,Gender,Event,Event_gender,Medal,age,height
1542,"DUCKETT, Richard Louis",London,1908,Lacrosse,Lacrosse,CAN,Men,lacrosse,M,Gold,24,170
2390,"SAASTAMOINEN, Eino",Stockholm,1912,Gymnastics,Artistic G.,FIN,Men,"team, free system",M,Silver,23,173
6000,"AGOSTONI, Carlo",Los Angeles,1932,Fencing,Fencing,ITA,Men,épée individual,M,Bronze,14,173
15000,"JENSEN, Poul Richard Hoj",Montreal,1976,Sailing,Sailing,DEN,Men,fleet/match race keelboat open (Soling),X,Gold,26,171


In [43]:
df.head()  # reference view of the first rows for comparing with the iloc slices that follow

Unnamed: 0,Athlete,City,Edition,Sport,Discipline,NOC,Gender,Event,Event_gender,Medal,age,height
0,"HAJOS, Alfred",Athens,1896,Aquatics,Swimming,HUN,Men,100m freestyle,M,Gold,17,176
1,"HERSCHMANN, Otto",Athens,1896,Aquatics,Swimming,AUT,Men,100m freestyle,M,Silver,22,172
2,"DRIVAS, Dimitrios",Athens,1896,Aquatics,Swimming,GRE,Men,100m freestyle for sailors,M,Bronze,21,180
3,"MALOKINIS, Ioannis",Athens,1896,Aquatics,Swimming,GRE,Men,100m freestyle for sailors,M,Gold,32,179
4,"CHASAPIS, Spiridon",Athens,1896,Aquatics,Swimming,GRE,Men,100m freestyle for sailors,M,Silver,25,173


In [44]:
df.iloc[1:4]  # positional slice with an exclusive end: rows at positions 1, 2 and 3

Unnamed: 0,Athlete,City,Edition,Sport,Discipline,NOC,Gender,Event,Event_gender,Medal,age,height
1,"HERSCHMANN, Otto",Athens,1896,Aquatics,Swimming,AUT,Men,100m freestyle,M,Silver,22,172
2,"DRIVAS, Dimitrios",Athens,1896,Aquatics,Swimming,GRE,Men,100m freestyle for sailors,M,Bronze,21,180
3,"MALOKINIS, Ioannis",Athens,1896,Aquatics,Swimming,GRE,Men,100m freestyle for sailors,M,Gold,32,179


In [45]:
df.iloc[1:4,0:4]  # rows 1-3 and the first four columns; both slices exclude their end position

Unnamed: 0,Athlete,City,Edition,Sport
1,"HERSCHMANN, Otto",Athens,1896,Aquatics
2,"DRIVAS, Dimitrios",Athens,1896,Aquatics
3,"MALOKINIS, Ioannis",Athens,1896,Aquatics


# 6 - Groupby

In [46]:
df = pd.read_csv('olympics.csv', skiprows=4)  # start the section from a freshly loaded frame
np.random.seed(824)  # fixed seed keeps the synthetic columns reproducible
n_rows = len(df)
# age is drawn before height, so the random stream is consumed in the same order on every run.
df = df.assign(
    age=(5*np.random.randn(n_rows) + 25).astype(int),
    height=(5*np.random.randn(n_rows) + 175).astype(int),
)
df.head()

Unnamed: 0,City,Edition,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal,age,height
0,Athens,1896,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100m freestyle,M,Gold,17,176
1,Athens,1896,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100m freestyle,M,Silver,22,172
2,Athens,1896,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100m freestyle for sailors,M,Bronze,21,180
3,Athens,1896,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100m freestyle for sailors,M,Gold,32,179
4,Athens,1896,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100m freestyle for sailors,M,Silver,25,173


### The Groupby object

In [47]:
df.groupby('Edition')  # returns a GroupBy object rather than a result table

<pandas.core.groupby.DataFrameGroupBy object at 0x110496c88>

In [48]:
type(df.groupby('Edition'))  # grouping a DataFrame gives a DataFrameGroupBy

pandas.core.groupby.DataFrameGroupBy

### Groupby computations

**size()**

In [49]:
df.groupby('Edition').size().head()  # number of rows in each Edition group, as a Series indexed by Edition

Edition
1896    151
1900    512
1904    470
1908    804
1912    885
dtype: int64

In [50]:
df.groupby(['Edition','NOC','Medal']).size().head()  # several keys produce a Series with a multi-level index

Edition  NOC  Medal 
1896     AUS  Gold      2
         AUT  Bronze    2
              Gold      2
              Silver    1
         DEN  Bronze    3
dtype: int64

**agg({ ... : [ ... ]})**

In [51]:
# The dict maps a column to the list of aggregations to compute for it, giving
# one result column per aggregation under an 'age' header.
# df1 keeps the complete aggregate; only the display is truncated.
df1 = df.groupby(['Edition','NOC','Medal']).agg({'age':['min','max','count','mean']})
df1.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,age,age,age,age
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,min,max,count,mean
Edition,NOC,Medal,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1896,AUS,Gold,17,18,2,17.5
1896,AUT,Bronze,28,30,2,29.0
1896,AUT,Gold,19,35,2,27.0
1896,AUT,Silver,22,22,1,22.0
1896,DEN,Bronze,24,35,3,28.666667


# 7 - Reshaping

In [52]:
df = pd.read_csv('olympics.csv', skiprows=4)  # start the section from a freshly loaded frame

np.random.seed(824)  # fixed seed keeps the synthetic columns reproducible
n_rows = len(df)
# age is drawn before height, so the random stream is consumed in the same order on every run.
df = df.assign(
    age=(5*np.random.randn(n_rows) + 25).astype(int),
    height=(5*np.random.randn(n_rows) + 175).astype(int),
)

df.head()

Unnamed: 0,City,Edition,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal,age,height
0,Athens,1896,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100m freestyle,M,Gold,17,176
1,Athens,1896,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100m freestyle,M,Silver,22,172
2,Athens,1896,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100m freestyle for sailors,M,Bronze,21,180
3,Athens,1896,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100m freestyle for sailors,M,Gold,32,179
4,Athens,1896,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100m freestyle for sailors,M,Silver,25,173


### stack() and unstack()

In [53]:
# Men's 100m and 200m rows from the 2008 Edition; isin() tests membership in the listed events.
df1 = df[
    (df.Edition == 2008)
    & (df.Gender == 'Men')
    & df.Event.isin(['100m', '200m'])
]
df1

Unnamed: 0,City,Edition,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal,age,height
27551,Beijing,2008,Athletics,Athletics,"DIX, Walter",USA,Men,100m,M,Bronze,26,174
27552,Beijing,2008,Athletics,Athletics,"BOLT, Usain",JAM,Men,100m,M,Gold,28,167
27553,Beijing,2008,Athletics,Athletics,"THOMPSON, Richard",TRI,Men,100m,M,Silver,31,179
27569,Beijing,2008,Athletics,Athletics,"DIX, Walter",USA,Men,200m,M,Bronze,22,172
27570,Beijing,2008,Athletics,Athletics,"BOLT, Usain",JAM,Men,200m,M,Gold,24,177
27571,Beijing,2008,Athletics,Athletics,"CRAWFORD, Shawn",USA,Men,200m,M,Silver,18,174


In [54]:
df2 = df1.groupby(['NOC','Gender','Event']).size()  # row count per (NOC, Gender, Event); the result is a Series despite the df2 name
df2

NOC  Gender  Event
JAM  Men     100m     1
             200m     1
TRI  Men     100m     1
USA  Men     100m     1
             200m     2
dtype: int64

In [55]:
df3 = df2.unstack(['Event'])  # moves the Event index level into the columns; missing combinations become NaN, so counts turn into floats
df3

Unnamed: 0_level_0,Event,100m,200m
NOC,Gender,Unnamed: 2_level_1,Unnamed: 3_level_1
JAM,Men,1.0,1.0
TRI,Men,1.0,
USA,Men,1.0,2.0


In [56]:
df3 = df2.unstack(['Event'], fill_value=0)  # fill_value=0 replaces the missing combinations, so the counts stay integers
df3

Unnamed: 0_level_0,Event,100m,200m
NOC,Gender,Unnamed: 2_level_1,Unnamed: 3_level_1
JAM,Men,1,1
TRI,Men,1,0
USA,Men,1,2


In [57]:
df3.index  # the remaining row levels (NOC, Gender) form a MultiIndex

MultiIndex(levels=[['JAM', 'TRI', 'USA'], ['Men']],
           labels=[[0, 1, 2], [0, 0, 0]],
           names=['NOC', 'Gender'])

In [58]:
df3.columns  # the columns are a plain Index that carries the name 'Event'

Index(['100m', '200m'], dtype='object', name='Event')

In [59]:
df4 = df3.copy()  # work on a copy so that the changes made to df4 below leave df3 untouched

In [60]:
df4.columns.name  # the label shown above the column headers

'Event'

In [61]:
df4.columns.name = None  # remove the 'Event' label from the column axis of the copy

In [62]:
df4  # same values as df3, without the 'Event' label above the columns

Unnamed: 0_level_0,Unnamed: 1_level_0,100m,200m
NOC,Gender,Unnamed: 2_level_1,Unnamed: 3_level_1
JAM,Men,1,1
TRI,Men,1,0
USA,Men,1,2


In [63]:
df3  # df3 still shows the 'Event' label: only the copy was changed

Unnamed: 0_level_0,Event,100m,200m
NOC,Gender,Unnamed: 2_level_1,Unnamed: 3_level_1
JAM,Men,1,1
TRI,Men,1,0
USA,Men,1,2


In [64]:
df3.stack()  # moves the Event columns back into the index; the filled-in zero now appears as an explicit row

NOC  Gender  Event
JAM  Men     100m     1
             200m     1
TRI  Men     100m     1
             200m     0
USA  Men     100m     1
             200m     2
dtype: int64