# Dunham's Data

## Setting up

In [68]:
from geopy.distance import geodesic
# import networkx as nx
import pandas as pd
# import seaborn as sns

In [69]:
# pd.set_option('display.max_rows', 999)

In [70]:
# sns.set()

In [71]:
# %matplotlib inline

## Loading data

In [72]:
dunham_df = pd.read_csv('data/out/nureyev_1955-92.csv')
dunham_df.fillna('', inplace=True)

In [73]:
dunham_df.shape

(22, 13)

In [74]:
dunham_df.head()

Unnamed: 0,DATE,CITY1,COUNTRY1,CITY2,COUNTRY2,WORKING,HOTEL_ADDRESS,VENUE1,VENUE_TYPE1,VENUE2,VENUE_TYPE2,NOTES,SOURCE
0,9/1/1975,Edinburgh,Scotland,,,y,,Edinburgh festival,,,,,
1,9/7/1975,Glasgow,Scotland,,,y,,Reharse scottish ballet glasgow,,,,,
2,9/16/1975,Madrid,Spain,,,y,,,,,,,
3,9/22/1975,Amsterdam,Holland,,,y,,Reharse dutch ballet amsterdam,,,,,
4,10/17/1975,Vienna,Austria,,,y,,,,,,,


In [75]:
dunham_df.tail(1)

Unnamed: 0,DATE,CITY1,COUNTRY1,CITY2,COUNTRY2,WORKING,HOTEL_ADDRESS,VENUE1,VENUE_TYPE1,VENUE2,VENUE_TYPE2,NOTES,SOURCE
21,2/15/1976,Los Angeles,California,,,y,,With american ballet theatre,,,,,


## Pre-processing data

In [76]:
geolocations = { # (lat, lon)
 'Edinburgh'        :(55.9533456,-3.1883749 ),
 'Glasgow'          :(55.861155, -4.2501687 ),
 'Madrid'           :(40.4167047,-3.7035825 ),
 'Amsterdam'        :(52.3730796,4.8924534 ),
 'Vienna'           :(48.2083537,16.3725042 ),
 'Zurich'           :(47.3744489,8.5410422 ),
 'Lugano'           :(46.0050102,8.9520281 ),
 'Lausanne'         :(46.5218269,6.6327025 ),
 'Basel'            :(47.5581077,7.5878261 ),
 'Bregenz'          :(47.5025779,9.7472924 ),
 'Gallen'           :(51.4312285,12.5495463 ),
 'Geneva'           :(46.2017559,6.1466014 ),
 'The Hague'        :(52.0799838,4.3113461 ),
 'New York'         :(40.7127281,-74.0060152 ),
 'Brooklin'         :(40.691392, -73.982469),
 'London'           :(51.5074456,-0.1277653 ),
 'Manchester'       :(53.4794892,-2.2451148 ),
 'London'           :(51.5074456,-0.1277653 ),
 'New York'         :(40.7127281,-74.0060152 ),
 'London'           :(51.5074456,-0.1277653 ),
 'Paris'            :(48.8588897,2.320041022 ),
 'Los Angeles'      :(34.0536909,-118.242766 ),
} 

In [77]:
dunham_df['CITY'] = dunham_df.apply(lambda row: row.CITY2 if row.CITY2 else row.CITY1, axis=1)
dunham_df['COUNTRY'] = dunham_df.apply(lambda row: row.COUNTRY2 if row.CITY2 else row.COUNTRY1, axis=1)

## Processing data

In [78]:
def is_valid_city(city):
    return bool(city) and city != 'in transit'

cols = [
    'START_DATE', 'END_DATE',
    'CITY', 'COUNTRY',
    'N_ROWS', 'LAST_MOMENT', 'MIN_NIGHTS', 'MAX_NIGHTS',
    'LATITUDE', 'LONGITUDE',
]
itinerary_df = pd.DataFrame(columns=cols)
row = dunham_df.iloc[0]
start_date = row.DATE
end_date = row.DATE
start_city = row.CITY
start_country = row.COUNTRY
n_rows = 1
for i, row in dunham_df.iloc[1:].iterrows():
    date = row.DATE
    city = row.CITY
    country = row.COUNTRY
    if is_valid_city(city) and city == start_city:
        n_rows += 1
        end_date = date
    else:
        if is_valid_city(start_city):
            itinerary_df.loc[len(itinerary_df)] = [
                start_date,
                end_date,
                start_city,
                start_country,
                n_rows,
                'The next morning' if row.CITY2 else 'Probably that night',
                n_rows + (0 if row.CITY2 else -1),
                n_rows,
                geolocations[start_city][0],
                geolocations[start_city][1],
            ]
#             display(pd.DataFrame(itinerary_df.iloc[-1]).T)
        start_date = date
        end_date = date
        start_city = city
        start_country = country
        n_rows = 1
if city == start_city:
    if is_valid_city(start_city):
            #just debug
        print(start_city)
        #print(geolocations)  
        itinerary_df.loc[len(itinerary_df)] = [
            start_date,
            end_date,
            start_city,
            start_country,
            n_rows,
            'The next morning' if row.CITY2 else 'Probably that night',
            n_rows + (0 if row.CITY2 else -1),
            n_rows,          
            geolocations[start_city][0],
            geolocations[start_city][1],
        ]
#         display(pd.DataFrame(itinerary_df.iloc[-1]).T)

Los Angeles


In [79]:
itinerary_df.shape

(22, 10)

In [80]:
itinerary_df

Unnamed: 0,START_DATE,END_DATE,CITY,COUNTRY,N_ROWS,LAST_MOMENT,MIN_NIGHTS,MAX_NIGHTS,LATITUDE,LONGITUDE
0,9/1/1975,9/1/1975,Edinburgh,Scotland,1,Probably that night,0,1,55.953346,-3.188375
1,9/7/1975,9/7/1975,Glasgow,Scotland,1,Probably that night,0,1,55.861155,-4.250169
2,9/16/1975,9/16/1975,Madrid,Spain,1,Probably that night,0,1,40.416705,-3.703582
3,9/22/1975,9/22/1975,Amsterdam,Holland,1,Probably that night,0,1,52.37308,4.892453
4,10/17/1975,10/17/1975,Vienna,Austria,1,Probably that night,0,1,48.208354,16.372504
5,10/20/1975,10/20/1975,Zurich,Switzerland,1,Probably that night,0,1,47.374449,8.541042
6,10/21/1975,10/21/1975,Lugano,Switzerland,1,Probably that night,0,1,46.00501,8.952028
7,10/23/1975,10/23/1975,Lausanne,Switzerland,1,Probably that night,0,1,46.521827,6.632702
8,10/24/1975,10/24/1975,Basel,Switzerland,1,Probably that night,0,1,47.558108,7.587826
9,10/25/1975,10/25/1975,Bregenz,Austria,1,Probably that night,0,1,47.502578,9.747292


## Saving data

In [81]:
itinerary_df.to_csv('data/out/itinerary_1955-92.csv', index=False)