## S06-L001 - wprowadzenie do grupowania

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from datetime import timedelta
import time

In [2]:
# Load only the columns used in the grouping examples, indexed by the 'Bib' column.
df = pd.read_csv(
    'course-files/marathon_results_2016.csv',
    index_col='Bib',
    usecols=['Bib', '40K', 'Half', 'Pace', 'Age', 'M/F', 'Country', 'State', 'City'],
)

# Convert the split-time strings to timedelta64 with one vectorized call per
# column instead of invoking pd.to_timedelta once per row through .apply().
df['40K'] = pd.to_timedelta(df['40K'])
df['Half'] = pd.to_timedelta(df['Half'])

# Float-seconds versions of the two splits, convenient for numeric aggregation.
# NOTE: 'TotalSeconds' is derived from the 40K split, not from a finish-time column.
df['TotalSeconds'] = df['40K'].dt.total_seconds()
df['HalfSeconds'] = df['Half'].dt.total_seconds()

In [3]:
# Overview of the loaded frame: dtype and non-null count per column.
# The output below shows that City and State contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26630 entries, 5 to 28487
Data columns (total 10 columns):
Age             26630 non-null int64
M/F             26630 non-null object
City            26629 non-null object
State           23783 non-null object
Country         26630 non-null object
Half            26630 non-null timedelta64[ns]
40K             26630 non-null timedelta64[ns]
Pace            26630 non-null object
TotalSeconds    26630 non-null float64
HalfSeconds     26630 non-null float64
dtypes: float64(2), int64(1), object(5), timedelta64[ns](2)
memory usage: 2.2+ MB


In [4]:
# Summary statistics for the numeric and timedelta columns.
# NOTE(review): the minimum of 0 in the split-time columns looks like a
# placeholder for a missing split rather than a real time -- TODO confirm.
df.describe()

Unnamed: 0,Age,Half,40K,TotalSeconds,HalfSeconds
count,26630.0,26630,26630,26630.0,26630.0
mean,42.514044,0 days 01:49:25.497070,0 days 03:41:49.984829,13309.984829,6565.497071
std,11.347955,0 days 00:18:06.943013,0 days 00:39:13.302840,2353.302841,1086.943013
min,18.0,0 days 00:00:00,0 days 00:00:00,0.0,0.0
25%,34.0,0 days 01:36:51,0 days 03:14:50.250000,11690.25,5811.0
50%,43.0,0 days 01:47:00,0 days 03:35:13,12913.0,6420.0
75%,51.0,0 days 01:58:32,0 days 04:02:20,14540.0,7112.0
max,83.0,0 days 04:47:17,0 days 07:59:12,28752.0,17237.0


In [5]:
# Print a frequency table for every column: an upper-cased column name,
# an underline, the value counts, and a dotted separator.
for name, values in df.items():
    print(name.upper(), '_____________________', sep='\n')
    print(values.value_counts(), '.............................................', sep='\n')

AGE
_____________________
45    1074
46    1030
40     904
47     869
41     851
      ... 
77       6
80       4
83       2
79       2
81       1
Name: Age, Length: 65, dtype: int64
.............................................
M/F
_____________________
M    14463
F    12167
Name: M/F, dtype: int64
.............................................
CITY
_____________________
Boston               722
New York             451
Chicago              261
Toronto              223
San Francisco        197
                    ... 
East Malvern, Vic      1
St Andrews             1
Hendricks              1
Northlake              1
Signal Mountai         1
Name: City, Length: 5832, dtype: int64
.............................................
STATE
_____________________
MA    4636
CA    1930
NY    1392
TX     974
PA     895
      ... 
YT       5
GU       4
AE       4
AP       1
VI       1
Name: State, Length: 67, dtype: int64
.............................................
COUNTRY
_____________________
USA

In [6]:
# Distinct city names in order of first appearance. One row has no City
# (see the df.info() output), which unique() would report as NaN; drop it so
# that only real city names are later used as group keys.
cities = df['City'].dropna().unique()

In [7]:
# Display the array of distinct city names.
cities

array(['Addis Ababa', 'Ambo', 'Kitale', ..., 'Justin', 'Rossmoor',
       'Reedsburg'], dtype=object)

In [8]:
# Maps a city name to the DataFrame holding that city's rows;
# it starts empty and is filled by the loop over the cities further down.
groups = {}

In [None]:
# `groups` is still empty at this point, so a plain groups[...] lookup raises
# KeyError and stops a "Restart & Run All". .get() returns None for a missing
# key instead of failing. The key spelling is also corrected ('San Francisco').
groups.get('San Francisco')

In [16]:
# Split the frame into per-city sub-frames in a single pass. The previous
# version built a boolean mask over all rows once per city, i.e. thousands of
# full scans. sort=False keeps the cities in order of first appearance, and
# each sub-frame has the same rows, index and columns as df.loc[df['City'] == city].
# Rows with a missing City are skipped by groupby, so no empty frame is stored
# under a NaN key.
for city, city_rows in df.groupby('City', sort=False):
    groups[city] = city_rows

In [18]:
# Summary statistics for the rows whose City is 'San Francisco'.
groups['San Francisco'].describe()

Unnamed: 0,Age,Half,40K,TotalSeconds,HalfSeconds
count,197.0,197,197,197.0,197.0
mean,36.309645,0 days 01:42:24.390862,0 days 03:25:35.335025,12335.335025,6144.390863
std,10.600373,0 days 00:15:31.836366,0 days 00:34:13.037100,2053.037101,931.836367
min,22.0,0 days 01:14:21,0 days 02:29:21,8961.0,4461.0
25%,28.0,0 days 01:30:11,0 days 03:02:06,10926.0,5411.0
50%,34.0,0 days 01:41:50,0 days 03:21:13,12073.0,6110.0
75%,42.0,0 days 01:48:48,0 days 03:39:35,13175.0,6528.0
max,80.0,0 days 02:45:21,0 days 05:39:13,20353.0,9921.0


In [19]:
# Summary statistics for the rows whose City is 'Addis Ababa'.
groups['Addis Ababa'].describe()

Unnamed: 0,Age,Half,40K,TotalSeconds,HalfSeconds
count,5.0,5,5,5.0,5.0
mean,25.0,0 days 01:08:30.200000,0 days 02:13:47.600000,8027.6,4110.2
std,4.84768,0 days 00:03:55.793765,0 days 00:10:07.503744,607.503745,235.793766
min,20.0,0 days 01:06:44,0 days 02:05:59,7559.0,4004.0
25%,21.0,0 days 01:06:45,0 days 02:06:47,7607.0,4005.0
50%,24.0,0 days 01:06:45,0 days 02:10:57,7857.0,4005.0
75%,29.0,0 days 01:06:45,0 days 02:14:23,8063.0,4005.0
max,31.0,0 days 01:15:32,0 days 02:30:52,9052.0,4532.0
