# Dunham's Data

## Setting up

In [60]:
from geopy.distance import geodesic
# import networkx as nx
import pandas as pd
# import seaborn as sns

In [61]:
# pd.set_option('display.max_rows', 999)

In [62]:
# sns.set()

In [63]:
# %matplotlib inline

## Loading data

In [64]:
# Read the source table and replace every missing cell with an empty string,
# so that blank fields are falsy in the truthiness checks used further down.
dunham_df = pd.read_csv('data/out/nureyev_1955-92.csv').fillna('')

In [65]:
# Sanity check: (number of rows, number of columns) of the loaded table.
dunham_df.shape

(292, 13)

In [66]:
# Preview the first rows to confirm the column layout.
dunham_df.head()

Unnamed: 0,DATE,CITY1,COUNTRY1,CITY2,COUNTRY2,WORKING,HOTEL_ADDRESS,VENUE1,VENUE_TYPE1,VENUE2,VENUE_TYPE2,NOTES,SOURCE
0,1955-01-01 00:00:00,Moscow,Russia,,,y,,DEKADA - Bashkirian Decade of Literature & Art,,,,,
1,1955-08-24 00:00:00,Leningrad,Russia,,,y,,,,,,,
2,1956-01-01 00:00:00,Leningrad,Russia,,,y,,Kirov Theatre,,,,,
3,1956-02-01 00:00:00,Leningrad,Russia,,,y,,Kirov Theatre,,,,,
4,1956-03-01 00:00:00,Leningrad,Russia,,,y,,Kirov Theatre,,,,,


In [67]:
# Show the final row of the table.
# NOTE(review): in the recorded output this row's DATE ('31/8/1992') is
# formatted differently from the first rows ('1955-01-01 00:00:00'); DATE is
# never parsed in this notebook, so the values are carried through as-is.
dunham_df.tail(1)

Unnamed: 0,DATE,CITY1,COUNTRY1,CITY2,COUNTRY2,WORKING,HOTEL_ADDRESS,VENUE1,VENUE_TYPE1,VENUE2,VENUE_TYPE2,NOTES,SOURCE
291,31/8/1992,Paris,France,,,y,,Palais Garnier,,,,,


## Pre-processing data

In [68]:
# Lookup table: city name (exactly as spelled in the CITY columns) -> (lat, lon)
# in decimal degrees. Keys are case-sensitive and must match the data verbatim.
geolocations = {
    'New York City': (40.7127, -74.0059),
    'San Diego': (32.715, -117.1625),
    'San Francisco': (37.783333, -122.416667),
    'Stockton, CA': (37.975556, -121.300833),
    'Amsterdam': (52.3730796, 4.8924534),
    'Baltimore': (39.2908816, -76.610759),
    'Bath': (51.3813864, -2.3596963),
    'Berlin': (52.510885, 13.3989367),
    'Bologna': (44.4938203, 11.3426327),
    'Brussels': (50.8465573, 4.351697),
    'Budapest': (47.4978789, 19.0402383),
    'Cannes': (43.5515198, 7.0134418),
    'Chicago': (41.8755616, -87.6244212),
    'Copenhagen': (55.6867243, 12.5700724),
    'Deauville': (49.36, 0.0752778),
    'Dublin': (53.3493795, -6.2605593),
    'Florence': (43.7697955, 11.2556404),
    'Frankfurt': (50.1106444, 8.6820917),
    'Genoa': (44.40726, 8.9338624),
    'Hamburg': (53.550341, 10.000654),
    'Holmdel': (40.3451095, -74.1840322),
    # NOTE(review): lowercase key kept as-is; presumably it mirrors the
    # spelling used in the source table -- confirm before normalising.
    'jerusalem': (31.7788242, 35.2257626),
    'Leningrad': (59.938732, 30.316229),
    'Liverpool': (53.4071991, -2.99168),
    'London': (51.5074456, -0.1277653),
    'Madrid': (40.4167047, -3.7035825),
    'Mantua': (45.1692628, 10.67083652),
    'Marseille': (43.2961743, 5.3699525),
    'Melbourne': (-37.8142454, 144.9631732),
    # This key used to appear twice in the literal; only the later value was
    # ever in effect, so that is the one kept here.
    'Mexico City': (19.4326296, -99.1331785),
    'Milano': (45.4641943, 9.1896346),
    'Monte Carlo': (43.7402961, 7.426559),
    'Montreal': (45.5031824, -73.5698065),
    'Moscow': (55.7505412, 37.6174782),
    'Nervi': (44.3833121, 9.0391295),
    'New York': (40.7127281, -74.0060152),
    # FIXME(review): these coordinates (about 40°N, 84°W) lie in the United
    # States, not in Russia -- almost certainly a geocoder mismatch. The
    # intended location cannot be determined from this notebook, so the value
    # is left unchanged until the place is identified.
    'North Russia': (40.2347473, -84.4071014),
    'Palma de Mallorca': (39.5695818, 2.6500745),
    'Paris': (48.8588897, 2.320041022),
    'Rome': (41.8933203, 12.4829321),
    'Scheveningen': (52.1067449, 4.2736937),
    # Corrected: the previous value (52.663642, -1.580326) pointed to central
    # England; Spoleto is in Umbria, Italy (about 42°44'N, 12°44'E).
    'Spoleto': (42.7340, 12.7380),
    'Stockholm': (59.3251172, 18.0710935),
    'Stuttgart': (48.7784485, 9.1800132),
    'Toronto': (43.6534817, -79.3839347),
    'Turin': (45.0677551, 7.6824892),
    'Utrecht': (52.0809856, 5.127683969),
    'Venice': (45.4371908, 12.3345898),
    'Verona': (45.4424977, 10.98573769),
    'Vienna': (48.2083537, 16.3725042),
}

In [69]:
# Effective location of each row: when a second city is recorded, it (and the
# second country) takes precedence; otherwise the first city/country is used.
# Note that the country choice is keyed on CITY2, not on COUNTRY2.
dunham_df['CITY'] = [
    second_city if second_city else first_city
    for first_city, second_city in zip(dunham_df['CITY1'], dunham_df['CITY2'])
]
dunham_df['COUNTRY'] = [
    second_country if second_city else first_country
    for first_country, second_country, second_city in zip(
        dunham_df['COUNTRY1'], dunham_df['COUNTRY2'], dunham_df['CITY2']
    )
]

## Processing data

In [70]:
def is_valid_city(city):
    """Return True when `city` names a real place.

    Falsy values (such as the empty string used for blank cells) and the
    literal placeholder 'in transit' are both rejected.
    """
    if not city:
        return False
    return city != 'in transit'

# Column order of the itinerary table; every record built below follows it.
cols = [
    'START_DATE', 'END_DATE',
    'CITY', 'COUNTRY',
    'N_ROWS', 'LAST_MOMENT', 'MIN_NIGHTS', 'MAX_NIGHTS',
    'LATITUDE', 'LONGITUDE',
]


def _stay_record(stay, left_next_morning, locations):
    """Turn one finished stay into a list of values ordered like `cols`.

    Parameters
    ----------
    stay : dict
        Keys 'start_date', 'end_date', 'city', 'country', 'n_rows'.
    left_next_morning : bool
        True when the row that closed the stay has a CITY2 value.
    locations : mapping
        City name -> (latitude, longitude).

    Raises
    ------
    KeyError
        If the stay's city has no entry in `locations`.
    """
    city = stay['city']
    if city not in locations:
        raise KeyError(f'No coordinates known for city {city!r}')
    latitude, longitude = locations[city]
    n_rows = stay['n_rows']
    return [
        stay['start_date'],
        stay['end_date'],
        city,
        stay['country'],
        n_rows,
        'The next morning' if left_next_morning else 'Probably that night',
        # One night fewer when the departure is assumed to be the same night.
        n_rows if left_next_morning else n_rows - 1,
        n_rows,
        latitude,
        longitude,
    ]


def build_itinerary(visits_df, locations):
    """Collapse consecutive rows spent in the same city into one row per stay.

    Rows are scanned in table order. A row extends the current stay when its
    CITY is valid (see `is_valid_city`) and equals the stay's city; any other
    row closes the current stay and opens a new one. Stays whose city is not
    valid are dropped. LAST_MOMENT / MIN_NIGHTS of a stay depend on whether
    the row that closed it has a CITY2 value.

    Parameters
    ----------
    visits_df : pandas.DataFrame
        Needs the columns DATE, CITY, COUNTRY and CITY2.
    locations : mapping
        City name -> (latitude, longitude).

    Returns
    -------
    pandas.DataFrame
        One row per stay with the columns listed in `cols`; empty (but with
        those columns) when `visits_df` has no rows or no valid stays.
    """
    records = []
    stay = None  # the stay currently being accumulated, if any
    row = None
    for _, row in visits_df.iterrows():
        continues_stay = (
            stay is not None
            and is_valid_city(row.CITY)
            and row.CITY == stay['city']
        )
        if continues_stay:
            stay['n_rows'] += 1
            stay['end_date'] = row.DATE
            continue
        # `row` starts a new stay, so the previous one (if any) is complete.
        if stay is not None and is_valid_city(stay['city']):
            records.append(_stay_record(stay, bool(row.CITY2), locations))
        stay = {
            'start_date': row.DATE,
            'end_date': row.DATE,
            'city': row.CITY,
            'country': row.COUNTRY,
            'n_rows': 1,
        }
    # Flush the stay still open at the end of the table. There is no following
    # row here, so -- as before -- the last row's own CITY2 decides LAST_MOMENT.
    if stay is not None and is_valid_city(stay['city']):
        records.append(_stay_record(stay, bool(row.CITY2), locations))
    # Build the frame once instead of appending row by row.
    return pd.DataFrame(records, columns=cols)


itinerary_df = build_itinerary(dunham_df, geolocations)
#         display(pd.DataFrame(itinerary_df.iloc[-1]).T)

In [71]:
# Sanity check: (number of stays, number of columns) of the itinerary table.
itinerary_df.shape

(123, 10)

In [72]:
# Display the itinerary table for visual inspection.
itinerary_df

Unnamed: 0,START_DATE,END_DATE,CITY,COUNTRY,N_ROWS,LAST_MOMENT,MIN_NIGHTS,MAX_NIGHTS,LATITUDE,LONGITUDE
0,1955-01-01 00:00:00,1955-01-01 00:00:00,Moscow,Russia,1,Probably that night,0,1,55.750541,37.617478
1,1955-08-24 00:00:00,1958-03-02 00:00:00,Leningrad,Russia,13,Probably that night,12,13,59.938732,30.316229
2,1958-04-21 00:00:00,1958-05-22 00:00:00,Moscow,Russia,4,Probably that night,3,4,55.750541,37.617478
3,1958-06-19 00:00:00,1959-06-28 00:00:00,Leningrad,Russia,15,Probably that night,14,15,59.938732,30.316229
4,1959-07-26 00:00:00,1959-08-04 00:00:00,Vienna,Austria,2,Probably that night,1,2,48.208354,16.372504
...,...,...,...,...,...,...,...,...,...,...
118,1992-01-01 00:00:00,1992-01-01 00:00:00,Vienna,Austria,1,Probably that night,0,1,48.208354,16.372504
119,1992-02-28 00:00:00,1992-02-28 00:00:00,Budapest,Hungary,1,Probably that night,0,1,47.497879,19.040238
120,1992-02-29 00:00:00,1992-02-29 00:00:00,Berlin,Germany,1,Probably that night,0,1,52.510885,13.398937
121,1992-03-01 00:00:00,1992-03-01 00:00:00,Budapest,Hungary,1,Probably that night,0,1,47.497879,19.040238


## Saving data

In [59]:
# Write the itinerary to CSV; index=False leaves the row index out of the file.
itinerary_df.to_csv('data/out/itinerary_1955-92.csv', index=False)