## Introduction 

The official site of this contest: http://iditarod.com/

## Setup

In [77]:
### Load required packages
import os

import numpy as np 
import pandas as pd 
import scipy as sp
import matplotlib.pyplot as plt 
%matplotlib inline 
import seaborn as sns

import plotly as py
import plotly.graph_objs as go
py.offline.init_notebook_mode(connected=True) # run at the start of every ipython notebook so that we can work offline
import warnings
# Ignore every warning for the rest of the session. Note that this is a blanket
# filter: it also hides deprecation warnings emitted by the libraries used below.
warnings.filterwarnings('ignore')

In [78]:
### Load the dataset
# NOTE(review): this is an absolute, machine-specific path -- adjust it when the
# notebook is run on another machine.
df = pd.read_csv('D:/Dataset/Side_project_Iditarod/report.csv', encoding='utf-8')

print('Shape:')
print(df.shape)

print('\nInformation:')
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() (that appended a stray "None" line).
df.info()

print('\nSome examples:')
print(df.head())

Shape:
(1146, 17)

Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1146 entries, 0 to 1145
Data columns (total 17 columns):
Number            1146 non-null int64
Name              1146 non-null object
Status            1146 non-null object
Country           1146 non-null object
Checkpoint        1146 non-null object
Latitude          1146 non-null float64
Longitude         1146 non-null float64
Distance          1074 non-null float64
Time              1146 non-null float64
Speed             1074 non-null float64
Arrival Date      1074 non-null object
Arrival Time      1074 non-null object
Arrival Dogs      1074 non-null float64
Elapsed Time      1145 non-null float64
Departure Date    1074 non-null object
Departure Time    1074 non-null object
Departure Dogs    1074 non-null float64
dtypes: float64(8), int64(1), object(8)
memory usage: 152.3+ KB
None

Some examples:
   Number             Name   Status         Country Checkpoint  Latitude  \
0       2   Ryan Redington  Ve

It looks like most of the NAs follow a pattern: for example, at the starting checkpoint there is no data for the arrival-related features. However, the column "Elapsed Time" contains only one NA, so let's check it out.

In [79]:
### Check the NAs
# Boolean mask selecting the rows whose "Elapsed Time" is missing.
missing_elapsed = df['Elapsed Time'].isnull()

print('Print out the contestant:')
# .loc replaces the .ix indexer, which was deprecated in pandas 0.20 and
# removed in pandas 1.0.
print(df.loc[missing_elapsed, :])

print('\nCount of this contestant:')
print(np.sum(df['Name'] == 'Otto Balogh'))

# Compare against the contestants with the fewest rows in the dataset.
print('\nCheck the count of other contestants:')
print(df['Name'].value_counts().sort_values(ascending=True).head(10))

Print out the contestant:
   Number         Name  Status  Country Checkpoint  Latitude  Longitude  \
1       3  Otto Balogh  Rookie  Hungary  Fairbanks   64.8321   -147.813   

   Distance  Time  Speed Arrival Date Arrival Time  Arrival Dogs  \
1       NaN   0.0    NaN          NaN          NaN           NaN   

   Elapsed Time Departure Date Departure Time  Departure Dogs  
1           NaN            NaN            NaN             NaN  

Count of this contestant:
1

Check the count of other contestants:
Otto Balogh         1
Mark May            4
Mark Selland        5
Ryan Anderson       6
Ellen Halverson     8
Dave Branholm      10
Monica Zappa       12
Roger Lee          12
Misha Wiljes       17
Jimmy Lebling      17
Name: Name, dtype: int64


From the output above, we can see that the contestant Otto Balogh has only one record, at the starting checkpoint, which may mean that he had to withdraw from the competition for some reason. I decided to remove this record from the dataset.

In [80]:
### Remove the contestant who cannot participate in the competition
# Keep only the rows that have an "Elapsed Time" value.
# .loc replaces the .ix indexer, which was removed in pandas 1.0.
df_rem = df.loc[df['Elapsed Time'].notnull(), :]

# info() prints its report directly and returns None, so it is not wrapped in
# print() (that would add a stray "None" line to the output).
df_rem.info() # for checking

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1145 entries, 0 to 1145
Data columns (total 17 columns):
Number            1145 non-null int64
Name              1145 non-null object
Status            1145 non-null object
Country           1145 non-null object
Checkpoint        1145 non-null object
Latitude          1145 non-null float64
Longitude         1145 non-null float64
Distance          1074 non-null float64
Time              1145 non-null float64
Speed             1074 non-null float64
Arrival Date      1074 non-null object
Arrival Time      1074 non-null object
Arrival Dogs      1074 non-null float64
Elapsed Time      1145 non-null float64
Departure Date    1074 non-null object
Departure Time    1074 non-null object
Departure Dogs    1074 non-null float64
dtypes: float64(8), int64(1), object(8)
memory usage: 161.0+ KB
None


## EDA

#### How many contestants are there for each country?

In [81]:
### Count of unique contestants
print('Number of unique contestants (excluding one that has been removed):')
print(df_rem['Name'].nunique())

### Group the contestant names by country
# con_country is reused by the plotting cell that follows.
con_country = df_rem.groupby('Country')['Name']
print('\nDistributioin of contestants by country:')
print(
    con_country
    .nunique()
    .sort_values(ascending=False)
)

Number of unique contestants (excluding one that has been removed):
71

Distributioin of contestants by country:
Country
United States     60
Norway             4
France             2
Canada             2
Sweden             1
England            1
Czech Republic     1
Name: Name, dtype: int64


In [89]:
### Visualize
# Number of unique contestants per country, largest group first.
tb = con_country.nunique().sort_values(ascending=False)

trace = go.Bar(
    x=list(tb.index),
    y=list(tb.values),
    marker=dict(color='#FF3333'),
)

# Axis titles are set explicitly so the chart can be understood on its own.
layout = go.Layout(
    title='Count of Contestants by Country',
    xaxis=dict(title='Country'),
    yaxis=dict(title='Number of contestants'),
)

data = [trace]

fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig)

#### How many contestants are veterans and rookies?

In [106]:
### Check the classes of status (counted per row, i.e. per checkpoint record)
status_per_row = df_rem['Status'].value_counts()
print(status_per_row)

### Keep a single row per contestant so that every person is counted once
ind_con = (
    df_rem
    .drop_duplicates(subset=['Name'])
    .loc[:, ['Name', 'Status']]
    .set_index('Name')
)
tb = ind_con['Status'].value_counts().sort_values(ascending=False)
print('\nTrue distribution of status:')
print(tb)

Veteran    889
Rookie     256
Name: Status, dtype: int64

True distribution of status:
Veteran    55
Rookie     16
Name: Status, dtype: int64


In [105]:
### Visualize
# Bar chart of the per-contestant status counts held in `tb`.
trace = go.Bar(
    x=list(tb.index),
    y=list(tb.values),
    width=0.3,
    marker=dict(color='#FF3333'),
)

# Axis titles are set explicitly so the chart can be understood on its own.
layout = go.Layout(
    title='Count of Status',
    xaxis=dict(title='Status'),
    yaxis=dict(title='Number of contestants'),
)

data = [trace]

fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig)

#### The toughest route

In [151]:
### Create the table
# Coordinates of every checkpoint, taken from its first occurrence in df_rem.
checkpoint_tb = (
    df_rem
    .drop_duplicates(subset=['Checkpoint'])
    .loc[:, ['Checkpoint', 'Latitude', 'Longitude']]
    .set_index('Checkpoint')
)

# Mean of the recorded "Speed" values per checkpoint, as a one-column frame.
group_checkpoint = df_rem.groupby(['Checkpoint'])
mean_speed = group_checkpoint['Speed'].mean().to_frame()

# Attach the mean speed to the coordinates, matching on the checkpoint index.
checkpoint_tb = checkpoint_tb.merge(
    mean_speed,
    left_index=True,
    right_index=True,
)

print(checkpoint_tb.head())  # for checking

            Latitude  Longitude      Speed
Checkpoint                                
Elim         64.6162  -162.2494   7.937344
Fairbanks    64.8321  -147.8130        NaN
Galena       64.7322  -156.9352   8.207246
Huslia       65.6568  -156.4550   5.959706
Kaltag       64.3138  -158.7257  11.056418


In [116]:
# FIXME: leftover scratch cell. The expression below divides a float by an empty
# tuple, which raises TypeError and stops "Restart & Run All". The recorded output
# shows that a numeric denominator was used originally, but it is not recoverable
# from the notebook, so the line is disabled until the intended value is restored.
# 968.0/()

4.947097796321192

In [183]:
### Get the level of toughness (measured by the average speed)
# Checkpoints without any recorded speed have a NaN mean. They receive a sentinel
# speed so that they can still be ranked; only the Speed column is filled so that
# no other column is touched by accident.
NO_SPEED_SENTINEL = 100
checkpoint_tb = checkpoint_tb.assign(
    Speed=checkpoint_tb['Speed'].fillna(NO_SPEED_SENTINEL)
)

### Rank the toughness
# Highest average speed -> level 1, lowest average speed -> highest level.
# method='min' keeps tied ranks as whole numbers, so the integer cast does not
# silently truncate the fractional ranks produced by the default 'average' method.
checkpoint_tb['Level'] = (
    checkpoint_tb['Speed']
    .rank(ascending=False, method='min')
    .astype(int)
)

### Sort the dataframe by the order of checkpoint
# The rows of one contestant are used as the reference sequence of checkpoints.
# The mask is built from df_rem itself (not df) so that it is aligned with the
# frame being indexed; .loc replaces the .ix indexer removed in pandas 1.0.
cps = df_rem.loc[df_rem['Name'] == 'Mitch Seavey', 'Checkpoint'].values
ind = np.arange(len(cps))

# Checkpoint name -> position along the reference contestant's rows.
cps_order = dict(zip(cps, ind))
checkpoint_tb['Checkpoint'] = checkpoint_tb.index
checkpoint_tb['Order'] = checkpoint_tb['Checkpoint'].map(cps_order)
checkpoint_tb = checkpoint_tb.sort_values(['Order'])
print(checkpoint_tb.head()) # for checking

            Latitude  Longitude       Speed  Level Checkpoint  Order
Checkpoint                                                          
Fairbanks    64.8321  -147.8130  100.000000      1  Fairbanks      0
Nenana       64.5952  -149.0897    9.833380      3     Nenana      1
Manley       65.0317  -150.6352    6.238169     13     Manley      2
Tanana       65.2010  -152.0816    6.499296     12     Tanana      3
Ruby         64.7386  -155.5011    5.350286     16       Ruby      4


In [213]:
### Visualize
# The Mapbox token can be supplied through the MAPBOX_ACCESS_TOKEN environment
# variable. The token that was embedded in the notebook is kept only as a
# fallback so that existing runs keep working.
# NOTE(review): prefer rotating this token and removing the literal from the notebook.
mapbox_access_token = os.environ.get(
    'MAPBOX_ACCESS_TOKEN',
    'pk.eyJ1IjoieG5pcGVyIiwiYSI6ImNqMDR6cXR0aDBoNm4ycWxzcTF2Z3ZxbGsifQ.dAlvq0ZttViD4l3HRbqeYw',
)

# Two-stop colour scale: light red at the low end, pure red at the high end.
scl = [[0, 'rgb(255, 200, 180)'], [1, 'rgb(255, 0, 0)']]

# Plain lists/dicts are used instead of the deprecated go.Data / go.Marker /
# go.Line / go.Margin wrappers; plotly accepts them for the same properties.
data = [
    go.Scattermapbox(
        lat=list(checkpoint_tb['Latitude'].values),
        lon=list(checkpoint_tb['Longitude'].values),
        # Markers are joined in row order, i.e. the order checkpoint_tb is sorted in.
        mode="lines+markers",
        marker=dict(
            size=15,
            # Marker colour encodes the toughness level of each checkpoint.
            color=list(checkpoint_tb['Level'].values),
            colorscale=scl,
            cmin=0,
            cmax=checkpoint_tb['Level'].values.max(),
            colorbar=dict(),
        ),
        line=dict(
            width=1.2,
            color='#444444',
        ),
        # Hover text: the checkpoint name (the index of checkpoint_tb).
        text=list(checkpoint_tb.index),
    ),
]

layout = go.Layout(
    showlegend=False,
    autosize=False,
    width=800,
    height=600,
    title='The Distribution of Toughness of Route in the Contest',
    hovermode='closest',
    margin=dict(
        l=50,
        r=10,
        b=50,
        t=100,
        pad=2,
    ),
    mapbox=dict(
        accesstoken=mapbox_access_token,
        bearing=0,
        # Centre the map on the mean coordinate of all checkpoints.
        center=dict(
            lat=np.mean(checkpoint_tb['Latitude']),
            lon=np.mean(checkpoint_tb['Longitude']),
        ),
        pitch=0,
        zoom=4.5,
    ),
)

# Built with go.Figure, like the other plotting cells in this notebook.
fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig)