In [1]:
import os
from urllib.parse import quote_plus

import pandas as pd
import pymysql
from sqlalchemy import create_engine, inspect
pymysql.install_as_MySQLdb()

## Extract CSVs into DataFrames

In [2]:
# Location of the athlete events CSV, relative to the notebook's working directory.
olympic_history = "./athlete_events.csv"

# Load the raw file and preview the first five rows.
o_history_df = pd.read_csv(filepath_or_buffer=olympic_history)
o_history_df.head(n=5)

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


In [3]:
# Location of the NOC-to-region lookup CSV, relative to the working directory.
noc_regions = "./noc_regions.csv"

# Load the lookup table and preview the first five rows.
regions_df = pd.read_csv(filepath_or_buffer=noc_regions)
regions_df.head(n=5)

Unnamed: 0,NOC,region,notes
0,AFG,Afghanistan,
1,AHO,Curacao,Netherlands Antilles
2,ALB,Albania,
3,ALG,Algeria,
4,AND,Andorra,


In [4]:
# Keep only the NOC code and region name; the `notes` column is dropped.
reg_cols = ["NOC", "region"]
reg_cat_df = regions_df.loc[:, reg_cols]
reg_cat_df.head(n=5)

Unnamed: 0,NOC,region
0,AFG,Afghanistan
1,AHO,Curacao
2,ALB,Albania
3,ALG,Algeria
4,AND,Andorra


## Transform Regions DataFrame

In [5]:
# NOTE(review): this cell repeats the column selection made in the previous
# cell; it produces the same `reg_cols` / `reg_cat_df` values.
reg_cols = ["NOC", "region"]
reg_cat_df = regions_df.loc[:, reg_cols]
reg_cat_df.head(n=5)

Unnamed: 0,NOC,region
0,AFG,Afghanistan
1,AHO,Curacao
2,ALB,Albania
3,ALG,Algeria
4,AND,Andorra


## Count non-null values per column

In [5]:
# Number of non-null entries in each column of the region lookup.
reg_cat_df.notna().sum()

NOC       230
region    227
dtype: int64

## Drop any N/A rows

In [7]:
# Count non-null values after dropping incomplete rows. The null-free frame is
# computed inline so this cell does not rely on `no_null_reg_cat_df` already
# existing in the kernel when the notebook is run from top to bottom.
reg_cat_df.dropna(how='any').count()

NOC       227
region    227
dtype: int64

In [6]:
# Discard every row that has a missing value in any column.
no_null_reg_cat_df = reg_cat_df.dropna(axis=0, how="any")
no_null_reg_cat_df.head(n=5)

Unnamed: 0,NOC,region
0,AFG,Afghanistan
1,AHO,Curacao
2,ALB,Albania
3,ALG,Algeria
4,AND,Andorra


In [8]:
# `noc_df` is another name for the null-free region frame (same object, no copy).
noc_df = no_null_reg_cat_df
noc_df.head(n=5)

Unnamed: 0,NOC,region
0,AFG,Afghanistan
1,AHO,Curacao
2,ALB,Albania
3,ALG,Algeria
4,AND,Andorra


## Transform Olympic DataFrame

In [9]:
# Columns carried forward from the athlete events data.
cols = [
    "Team",
    "Games",
    "Season",
    "City",
    "Medal",
    "Sport",
    "Event",
]
olympic_cat_df = o_history_df.loc[:, cols]
olympic_cat_df.head(n=5)


Unnamed: 0,Team,Games,Season,City,Medal,Sport,Event
0,China,1992 Summer,Summer,Barcelona,,Basketball,Basketball Men's Basketball
1,China,2012 Summer,Summer,London,,Judo,Judo Men's Extra-Lightweight
2,Denmark,1920 Summer,Summer,Antwerpen,,Football,Football Men's Football
3,Denmark/Sweden,1900 Summer,Summer,Paris,Gold,Tug-Of-War,Tug-Of-War Men's Tug-Of-War
4,Netherlands,1988 Winter,Winter,Calgary,,Speed Skating,Speed Skating Women's 500 metres


In [10]:
# Number of non-null entries in each selected column.
olympic_cat_df.notna().sum()

Team      271116
Games     271116
Season    271116
City      271116
Medal      39783
Sport     271116
Event     271116
dtype: int64

## Drop any N/A rows

In [11]:
# Discard every row that has a missing value in any of the selected columns.
no_null_olympic_cat_df = olympic_cat_df.dropna(axis=0, how="any")
no_null_olympic_cat_df.head(n=5)

Unnamed: 0,Team,Games,Season,City,Medal,Sport,Event
3,Denmark/Sweden,1900 Summer,Summer,Paris,Gold,Tug-Of-War,Tug-Of-War Men's Tug-Of-War
37,Finland,1920 Summer,Summer,Antwerpen,Bronze,Swimming,Swimming Men's 200 metres Breaststroke
38,Finland,1920 Summer,Summer,Antwerpen,Bronze,Swimming,Swimming Men's 400 metres Breaststroke
40,Finland,2014 Winter,Winter,Sochi,Bronze,Ice Hockey,Ice Hockey Men's Ice Hockey
41,Finland,1948 Summer,Summer,London,Bronze,Gymnastics,Gymnastics Men's Individual All-Around


In [12]:
# Tally of rows per medal type in the null-free frame.
medal_column = no_null_olympic_cat_df.loc[:, "Medal"]
medal_column.value_counts()

Gold      13372
Bronze    13295
Silver    13116
Name: Medal, dtype: int64

In [13]:
# `new_data_df` is another name for the null-free Olympic frame (same object, no copy).
new_data_df = no_null_olympic_cat_df
new_data_df.head(n=5)

Unnamed: 0,Team,Games,Season,City,Medal,Sport,Event
3,Denmark/Sweden,1900 Summer,Summer,Paris,Gold,Tug-Of-War,Tug-Of-War Men's Tug-Of-War
37,Finland,1920 Summer,Summer,Antwerpen,Bronze,Swimming,Swimming Men's 200 metres Breaststroke
38,Finland,1920 Summer,Summer,Antwerpen,Bronze,Swimming,Swimming Men's 400 metres Breaststroke
40,Finland,2014 Winter,Winter,Sochi,Bronze,Ice Hockey,Ice Hockey Men's Ice Hockey
41,Finland,1948 Summer,Summer,London,Bronze,Gymnastics,Gymnastics Men's Individual All-Around


In [14]:
# Distinct sports present in the data, in order of first appearance.
sport_column = new_data_df["Sport"]
sport_column.unique()

array(['Tug-Of-War', 'Swimming', 'Ice Hockey', 'Gymnastics',
       'Alpine Skiing', 'Handball', 'Hockey', 'Rowing', 'Football',
       'Speed Skating', 'Sailing', 'Cycling', 'Fencing', 'Taekwondo',
       'Athletics', 'Canoeing', 'Water Polo', 'Wrestling',
       'Modern Pentathlon', 'Figure Skating', 'Golf', 'Softball',
       'Boxing', 'Basketball', 'Nordic Combined', 'Diving', 'Baseball',
       'Volleyball', 'Cross Country Skiing', 'Bobsleigh', 'Curling',
       'Shooting', 'Judo', 'Equestrianism', 'Tennis', 'Rugby Sevens',
       'Rhythmic Gymnastics', 'Weightlifting', 'Badminton',
       'Beach Volleyball', 'Ski Jumping', 'Rugby',
       'Short Track Speed Skating', 'Biathlon', 'Lacrosse',
       'Synchronized Swimming', 'Archery', 'Freestyle Skiing',
       'Triathlon', 'Polo', 'Luge', 'Table Tennis', 'Snowboarding',
       'Cricket', 'Skeleton', 'Racquets', 'Military Ski Patrol',
       'Croquet', 'Art Competitions', 'Roque', 'Alpinism', 'Trampolining',
       'Basque Pelota',

## Filter DataFrame for selected sports

In [15]:
# Show only the rows whose Sport is one of the selected sports.
# The filtered frame is displayed, not stored; `new_data_df` is unchanged.
sports = ["Swimming", "Speed Skating"]
is_selected_sport = new_data_df["Sport"].isin(sports)
new_data_df[is_selected_sport]

Unnamed: 0,Team,Games,Season,City,Medal,Sport,Event
37,Finland,1920 Summer,Summer,Antwerpen,Bronze,Swimming,Swimming Men's 200 metres Breaststroke
38,Finland,1920 Summer,Summer,Antwerpen,Bronze,Swimming,Swimming Men's 400 metres Breaststroke
110,Norway,1952 Winter,Winter,Oslo,Bronze,Speed Skating,"Speed Skating Men's 1,500 metres"
113,Norway,1960 Winter,Winter,Squaw Valley,Gold,Speed Skating,"Speed Skating Men's 1,500 metres"
210,Hungary,1936 Summer,Summer,Berlin,Bronze,Swimming,Swimming Men's 4 x 200 metres Freestyle Relay
476,Canada,1984 Summer,Summer,Los Angeles,Bronze,Swimming,Swimming Women's 4 x 100 metres Medley Relay
740,South Africa,1956 Summer,Summer,Melbourne,Bronze,Swimming,Swimming Women's 4 x 100 metres Freestyle Relay
803,Soviet Union,1972 Summer,Summer,Munich,Silver,Swimming,Swimming Men's 4 x 100 metres Freestyle Relay
804,Soviet Union,1972 Summer,Summer,Munich,Bronze,Swimming,Swimming Men's 4 x 200 metres Freestyle Relay
814,Australia,2016 Summer,Summer,Rio de Janeiro,Bronze,Swimming,Swimming Men's 4 x 100 metres Freestyle Relay


## Create Database Connection

In [16]:
# Read the MySQL password from the OLYMPIC_DB_PASSWORD environment variable
# instead of embedding it in the notebook; a missing variable raises KeyError.
# quote_plus escapes characters such as '#' or '@' that are special in a URL.
db_password = quote_plus(os.environ["OLYMPIC_DB_PASSWORD"])
conn_string = f"mysql://root:{db_password}@localhost:3306/olympic_db"
engine = create_engine(conn_string)

## Check for tables

In [17]:
# List the tables in the connected database. `Engine.table_names()` is
# deprecated in SQLAlchemy 1.4 and removed in 2.0; the inspector is the
# supported way to get the same list.
inspect(engine).get_table_names()

['region', 'team']

## Check the maximum length of `Event` values for the database column

In [18]:
# Character length of every Event string, in row order.
# Column position 6 of `new_data_df` is "Event"; the vectorised `.str.len()`
# replaces a Python-level loop of per-row `.iloc` lookups.
event_list = new_data_df["Event"].str.len().tolist()

In [19]:
# Longest Event string length found above.
longest_event_length = max(event_list)
longest_event_length

85

In [20]:
# Show the longest Event string. The row is located by computing where the
# maximum length occurs (first occurrence on ties) rather than by a hard-coded
# row number, so the cell stays correct if the input data changes.
event_lengths = new_data_df["Event"].str.len()
new_data_df.loc[event_lengths.idxmax(), "Event"]

"Shooting Men's Military Rifle, 1873-1874 Gras Model, Kneeling Or Standing, 200 metres"

## Use pandas to load the CSV-derived DataFrames into the database

In [21]:
# Append the medal rows to the `team` table; the DataFrame index is not written.
# NOTE(review): with 'append', re-running this cell inserts the same rows again.
team_load_options = {
    "name": "team",
    "con": engine,
    "if_exists": 'append',
    "index": False,
}
new_data_df.to_sql(**team_load_options)

In [22]:
# Append the NOC/region rows to the `region` table; the DataFrame index is not written.
# NOTE(review): with 'append', re-running this cell inserts the same rows again.
region_load_options = {
    "name": "region",
    "con": engine,
    "if_exists": 'append',
    "index": False,
}
noc_df.to_sql(**region_load_options)

## Confirm data has been added by querying the team table

In [23]:
# Preview the `team` table. LIMIT keeps the database from sending every row
# just to display the first five.
pd.read_sql_query('select * from team limit 5', con=engine)

Unnamed: 0,id,team,games,season,city,medal,sport,event
0,1,Denmark/Sweden,1900 Summer,Summer,Paris,Gold,Tug-Of-War,Tug-Of-War Men's Tug-Of-War
1,2,Finland,1920 Summer,Summer,Antwerpen,Bronze,Swimming,Swimming Men's 200 metres Breaststroke
2,3,Finland,1920 Summer,Summer,Antwerpen,Bronze,Swimming,Swimming Men's 400 metres Breaststroke
3,4,Finland,2014 Winter,Winter,Sochi,Bronze,Ice Hockey,Ice Hockey Men's Ice Hockey
4,5,Finland,1948 Summer,Summer,London,Bronze,Gymnastics,Gymnastics Men's Individual All-Around


## Confirm data has been added by querying the region table

In [6]:
# Number of non-null entries in each selected column.
# NOTE(review): same check as an earlier cell; looks like a leftover copy.
olympic_cat_df.notna().sum()

Team      271116
Games     271116
Season    271116
City      271116
Medal      39783
Sport     271116
Event     271116
dtype: int64

In [7]:
# Discard every row that has a missing value in any of the selected columns.
# NOTE(review): same assignment as an earlier cell; looks like a leftover copy.
no_null_olympic_cat_df = olympic_cat_df.dropna(axis=0, how="any")
no_null_olympic_cat_df.head(n=5)

Unnamed: 0,Team,Games,Season,City,Medal,Sport,Event
3,Denmark/Sweden,1900 Summer,Summer,Paris,Gold,Tug-Of-War,Tug-Of-War Men's Tug-Of-War
37,Finland,1920 Summer,Summer,Antwerpen,Bronze,Swimming,Swimming Men's 200 metres Breaststroke
38,Finland,1920 Summer,Summer,Antwerpen,Bronze,Swimming,Swimming Men's 400 metres Breaststroke
40,Finland,2014 Winter,Winter,Sochi,Bronze,Ice Hockey,Ice Hockey Men's Ice Hockey
41,Finland,1948 Summer,Summer,London,Bronze,Gymnastics,Gymnastics Men's Individual All-Around


In [8]:
# Tally of rows per medal type in the null-free frame.
medal_column = no_null_olympic_cat_df.loc[:, "Medal"]
medal_column.value_counts()

Gold      13372
Bronze    13295
Silver    13116
Name: Medal, dtype: int64

In [24]:
# Preview the `region` table. LIMIT keeps the database from sending every row
# just to display the first five.
pd.read_sql_query('select * from region limit 5', con=engine)

Unnamed: 0,id,noc,region
0,1,AFG,Afghanistan
1,2,AHO,Curacao
2,3,ALB,Albania
3,4,ALG,Algeria
4,5,AND,Andorra


In [9]:
# `new_data_df` is another name for the null-free Olympic frame (same object, no copy).
new_data_df = no_null_olympic_cat_df
new_data_df.head(n=5)

Unnamed: 0,Team,Games,Season,City,Medal,Sport,Event
3,Denmark/Sweden,1900 Summer,Summer,Paris,Gold,Tug-Of-War,Tug-Of-War Men's Tug-Of-War
37,Finland,1920 Summer,Summer,Antwerpen,Bronze,Swimming,Swimming Men's 200 metres Breaststroke
38,Finland,1920 Summer,Summer,Antwerpen,Bronze,Swimming,Swimming Men's 400 metres Breaststroke
40,Finland,2014 Winter,Winter,Sochi,Bronze,Ice Hockey,Ice Hockey Men's Ice Hockey
41,Finland,1948 Summer,Summer,London,Bronze,Gymnastics,Gymnastics Men's Individual All-Around


In [10]:
# Distinct sports present in the data, in order of first appearance.
sport_column = new_data_df["Sport"]
sport_column.unique()

array(['Tug-Of-War', 'Swimming', 'Ice Hockey', 'Gymnastics',
       'Alpine Skiing', 'Handball', 'Hockey', 'Rowing', 'Football',
       'Speed Skating', 'Sailing', 'Cycling', 'Fencing', 'Taekwondo',
       'Athletics', 'Canoeing', 'Water Polo', 'Wrestling',
       'Modern Pentathlon', 'Figure Skating', 'Golf', 'Softball',
       'Boxing', 'Basketball', 'Nordic Combined', 'Diving', 'Baseball',
       'Volleyball', 'Cross Country Skiing', 'Bobsleigh', 'Curling',
       'Shooting', 'Judo', 'Equestrianism', 'Tennis', 'Rugby Sevens',
       'Rhythmic Gymnastics', 'Weightlifting', 'Badminton',
       'Beach Volleyball', 'Ski Jumping', 'Rugby',
       'Short Track Speed Skating', 'Biathlon', 'Lacrosse',
       'Synchronized Swimming', 'Archery', 'Freestyle Skiing',
       'Triathlon', 'Polo', 'Luge', 'Table Tennis', 'Snowboarding',
       'Cricket', 'Skeleton', 'Racquets', 'Military Ski Patrol',
       'Croquet', 'Art Competitions', 'Roque', 'Alpinism', 'Trampolining',
       'Basque Pelota',

In [11]:
# Show only the rows whose Sport is one of the selected sports.
# The filtered frame is displayed, not stored; `new_data_df` is unchanged.
sports = ["Swimming", "Speed Skating"]
is_selected_sport = new_data_df["Sport"].isin(sports)
new_data_df[is_selected_sport]

Unnamed: 0,Team,Games,Season,City,Medal,Sport,Event
37,Finland,1920 Summer,Summer,Antwerpen,Bronze,Swimming,Swimming Men's 200 metres Breaststroke
38,Finland,1920 Summer,Summer,Antwerpen,Bronze,Swimming,Swimming Men's 400 metres Breaststroke
110,Norway,1952 Winter,Winter,Oslo,Bronze,Speed Skating,"Speed Skating Men's 1,500 metres"
113,Norway,1960 Winter,Winter,Squaw Valley,Gold,Speed Skating,"Speed Skating Men's 1,500 metres"
210,Hungary,1936 Summer,Summer,Berlin,Bronze,Swimming,Swimming Men's 4 x 200 metres Freestyle Relay
476,Canada,1984 Summer,Summer,Los Angeles,Bronze,Swimming,Swimming Women's 4 x 100 metres Medley Relay
740,South Africa,1956 Summer,Summer,Melbourne,Bronze,Swimming,Swimming Women's 4 x 100 metres Freestyle Relay
803,Soviet Union,1972 Summer,Summer,Munich,Silver,Swimming,Swimming Men's 4 x 100 metres Freestyle Relay
804,Soviet Union,1972 Summer,Summer,Munich,Bronze,Swimming,Swimming Men's 4 x 200 metres Freestyle Relay
814,Australia,2016 Summer,Summer,Rio de Janeiro,Bronze,Swimming,Swimming Men's 4 x 100 metres Freestyle Relay


## Create Database Connection

In [12]:
# Read the MySQL password from the OLYMPIC_DB_PASSWORD environment variable
# instead of embedding it in the notebook; a missing variable raises KeyError.
# quote_plus escapes characters such as '#' or '@' that are special in a URL.
db_password = quote_plus(os.environ["OLYMPIC_DB_PASSWORD"])
conn_string = f"mysql://root:{db_password}@localhost:3306/olympic_db"
engine = create_engine(conn_string)

## Check for tables

In [13]:
# List the tables in the connected database. `Engine.table_names()` is
# deprecated in SQLAlchemy 1.4 and removed in 2.0; the inspector is the
# supported way to get the same list.
inspect(engine).get_table_names()

['team']

In [14]:
# Character length of every Event string, in row order.
# Column position 6 of `new_data_df` is "Event"; the vectorised `.str.len()`
# replaces a Python-level loop of per-row `.iloc` lookups.
event_list = new_data_df["Event"].str.len().tolist()

In [15]:
# Longest Event string length found above.
longest_event_length = max(event_list)
longest_event_length

85

In [16]:
# Show the longest Event string. The row is located by computing where the
# maximum length occurs (first occurrence on ties) rather than by a hard-coded
# row number, so the cell stays correct if the input data changes.
event_lengths = new_data_df["Event"].str.len()
new_data_df.loc[event_lengths.idxmax(), "Event"]

"Shooting Men's Military Rifle, 1873-1874 Gras Model, Kneeling Or Standing, 200 metres"

In [17]:
# Append the medal rows to the `team` table; the DataFrame index is not written.
# NOTE(review): an earlier cell already appends `new_data_df` to `team`, so a
# top-to-bottom run inserts these rows twice — confirm whether this cell is a
# leftover that should be removed.
team_load_options = {
    "name": "team",
    "con": engine,
    "if_exists": 'append',
    "index": False,
}
new_data_df.to_sql(**team_load_options)