#Initial data exploration and submissions

The sample submission estimates every endpoint as 41.146504,-8.61137 (Score:3.66)-- check if this is the mean of the train data.  Having problems here because some of the train data has an empty 'POLYLINE' list entry.  Cycle through manually for now.

## The Big Steps

1) Generate reatures

2) Split up data in CV folds

3) Fit and select models

4) Write predictions

Focus on 1 

In [3]:
import json
import zipfile
import pandas as pd
import numpy as np
import csv as csv
import matplotlib.pyplot as plt



#return a text parser for the large file, with chunk size "sz"
def load_train(sz=100):
    zf = zipfile.ZipFile("../data/train.csv.zip")
    return pd.read_csv(zf.open('train.csv'), chunksize=sz, iterator=True, converters={'POLYLINE': lambda x: json.loads(x)})



#remove rows with empty POLYLINE field
def rem_empty_polyline(X):
    empty_rows=[]
    for j in range(len(X['POLYLINE'])):      
        entry=X['POLYLINE'].values[j]
        if(entry==[]):
                  empty_rows.append(j)
    return X.drop(X.index[empty_rows])
    

#remove rows with incomplete GPS data (only ten cases)
def rem_missing(X):
    empty_rows=[]
    for j in range(len(X['MISSING'])):      
        entry=X['MISSING'].values[j]
        if(entry=="True"):
            empty_rows.append(j)
    return X.drop(X.index[empty_rows])    
    
    

#add the last latitude and longitude from the POLYLINE field to main dataframe X and return it 
def lat_long_last(X):

    latitudes=[]
    longitudes=[]
    
    for j in range(len(X['POLYLINE'])):      
        entry=X['POLYLINE'].values[j]
        if(len(entry)==0):
            latitudes.append(-999)
            longitudes.append(-999)
        else:
            last=entry[-1]           
            latitudes.append(last[0])
            longitudes.append(last[1])
            
    X['LAST_LAT']=longitudes 
    X['LAST_LON']=latitudes
    
    return X



#add the first latitude and longitude from the POLYLINE field to main dataframe X and return it 
def lat_long_first(X):
    
    latitudes=[]
    longitudes=[]
    
    for j in range(len(X['POLYLINE'])):      
        entry=X['POLYLINE'].values[j]
        if(len(entry)==0):
            latitudes.append(-999)
            longitudes.append(-999)
        else:
            last=entry[0]           
            latitudes.append(last[0])
            longitudes.append(last[1])
            
    X['FIRST_LAT']=longitudes 
    X['FIRST_LON']=latitudes
    
    return X



# add the second to last latitude and longitude from the POLYLINE field to main dataframe X and return it 
def lat_long_2ndToLast(X):
    
    latitudes=[]
    longitudes=[]
    
    for j in range(len(X['POLYLINE'])):      
        entry=X['POLYLINE'].values[j]
        if(len(entry)==0):
            latitudes.append(-999)
            longitudes.append(-999)
        elif(len(entry)==1):
            last=entry[-1]           
            latitudes.append(last[0])
            longitudes.append(last[1])            
        else:
            last=entry[-2]           
            latitudes.append(last[0])
            longitudes.append(last[1])
            
    X['S2L_LAT']=longitudes 
    X['S2L_LON']=latitudes
    
    return X



# The last latitude and last longitude averages are in fact different than the benchmark submission -- is benchmark the mean of the test data's last points?

In [4]:
# Find the average final latitude and longitude
train_parser=load_train()
last_latitudes=np.array([])
last_longitudes=np.array([])

for chunk in train_parser:
    chunk= rem_empty_polyline(chunk) #remove rows with empty gps data
    chunk= lat_long_last(chunk) #extract last gps coordinates
    last_latitudes=np.append(last_latitudes, chunk['LAST_LAT'].values)
    last_longitudes=np.append(last_longitudes, chunk['LAST_LON'].values)


In [5]:
last_latitudes.mean()

41.162326423197499

In [6]:
last_longitudes.mean()

-8.6200591303813017

# But how different?

Use the competition metric to determine this. With 
$$ a=\sin^2\left(\phi_2-\phi_1\right) + \cos\phi_1\cos\phi_2 \sin^2\left(\lambda_2 - \lambda_1\right) $$
where $\phi_i$ are latitudes and $\lambda_j$ are longitudes, the haversine distance is:

$$d = 2r\arctan\left(\sqrt{\frac{a}{1-a}}\right)$$

where $r=6371$.  The objective is to minimize the mean haversine distance.

In [21]:
d_2_rad=np.pi/180.0

#compute haversine distance between two coordinates (phi_1,lambda_1) and (phi_2,lambda_2)
def haversine(phi_1,lambda_1,phi_2,lambda_2):
    r=6371  #kilometers
    #r=3959 #miles
    a= np.sin(d_2_rad*(phi_2-phi_1))**2+np.cos(d_2_rad*phi_1)*np.cos(d_2_rad*phi_2)*np.sin(d_2_rad*(lambda_2-lambda_1))**2
    return 2*r*np.arctan(np.sqrt(a/(1-a)))



#compute the mean haversine distance between -- not safe, make sure all array dimensions are the same
def mean_haversine(phi_1s,lambda_1s,phi_2s,lambda_2s):

    total=0
    m=len(phi_1s)
    for j in range(m):
        #print haversine(phi_1s[j],lambda_1s[j],phi_2s[j],lambda_2s[j])
        total+=haversine(phi_1s[j],lambda_1s[j],phi_2s[j],lambda_2s[j])

    return total/m
        

In [16]:
n_points=len(last_latitudes)

mean_haversine(last_latitudes,last_longitudes,np.array([last_latitudes.mean()]*n_points), np.array([last_longitudes.mean()]*n_points))

6.3979855078247532

In [25]:
#compare with the sample submission values 41.146504,-8.61137 --seem to be off by a factor of ~2!?
mean_haversine(last_latitudes,last_longitudes,np.array([41.146504]*n_points), np.array([-8.61137]*n_points))


6.7883943411132854

# Try last point benchmark 
Score: 3.31766

In [125]:
import json
import zipfile
import pandas as pd
import numpy as np
import csv as csv
import matplotlib.pyplot as plt

In [126]:

zft = zipfile.ZipFile("../data/test.csv.zip")
#train = pd.read_csv(zf.open('train.csv'), converters={'POLYLINE': lambda x: json.loads(x)[-1:]})
#train = pd.read_csv(zf.open('train.csv'), chunksize=100, iterator=True, converters={'POLYLINE': lambda x: json.loads(x)})
test = pd.read_csv(zft.open('test.csv'), chunksize=100, iterator=True, converters={'POLYLINE': lambda x: json.loads(x)[-1]})

In [127]:
first_chunk=True
chunk['TRIP_ID']='"'+chunk['TRIP_ID']+'"'

for chunk in test:
    #chunk.to_csv('../data/1_lastPoint.csv', mode='a')
    submit_df=pd.DataFrame({'TRIP_ID':chunk['TRIP_ID'], 'LONGITUDE':np.vstack(chunk['POLYLINE'].values)[:,0], 
                            'LATITUDE':np.vstack(chunk['POLYLINE'].values)[:,1]})
    if first_chunk:
        submit_df.to_csv('../data/1_lastPoint.csv', mode='w', columns=['TRIP_ID','LATITUDE','LONGITUDE'], index=False)
        first_chunk=False
    else:
        submit_df.to_csv('../data/1_lastPoint.csv', mode='a', columns=['TRIP_ID','LATITUDE','LONGITUDE'], index=False, header=False)
    

In [46]:
test=pd.DataFrame({'TRIP_ID':chunk['TRIP_ID'], 'LONGITUDE':np.vstack(chunk['POLYLINE'].values)[:,0], 
                        'LATITUDE':np.vstack(chunk['POLYLINE'].values)[:,1]})

In [27]:
np.vstack(chunk['POLYLINE'].values)[:,0]


TypeError: string indices must be integers, not str

#Sandbox for testing commands:

In [5]:
rt=np.array([list([1,2]),list([4,5]),list([7,8])])

In [9]:
np.vstack(rt).shape

(3, 2)

In [57]:
(590.0+192)/32.0*3

73.3125

In [121]:
a=np.array(["q2","q4","q7"])

In [122]:
for i in range(len(a)):
    a[i]="\""+a[i]+"\""


In [123]:
a

array(['"q', '"q', '"q'], 
      dtype='|S2')

In [131]:
len([[3,4]])

2

In [1]:
def do_stuff():
    b=2

In [2]:
do_stuff()

In [3]:
b

NameError: name 'b' is not defined

In [2]:
import numpy as np
farts=pd.DataFrame()
farts['POLYLINE']=np.array([[[1,2],[4,5]],[],[[1.1,3.2]],[[3,4],[9,8],[1.1,4.3]]])
farts['MISSING']=np.array(["False","False","True","False"])

In [13]:
rem_empty_polyline(farts)

Unnamed: 0,POLYLINE,MISSING,S2L_LAT,S2L_LON
0,"[[1, 2], [4, 5]]",False,2.0,1.0
2,"[[1.1, 3.2]]",True,3.2,1.1
3,"[[3, 4], [9, 8], [1.1, 4.3]]",False,8.0,9.0


In [15]:
lat_long_2ndToLast(farts)

Unnamed: 0,POLYLINE,MISSING,S2L_LAT,S2L_LON
0,"[[1, 2], [4, 5]]",False,2.0,1.0
1,[],False,-999.0,-999.0
2,"[[1.1, 3.2]]",True,3.2,1.1
3,"[[3, 4], [9, 8], [1.1, 4.3]]",False,8.0,9.0


In [6]:
rem_missing(farts)

False
False
True
False


Unnamed: 0,POLYLINE,MISSING
0,"[[1, 2], [4, 5]]",False
1,[],False
3,"[[3, 4], [9, 8], [1.1, 4.3]]",False


In [16]:
farts

Unnamed: 0,POLYLINE,MISSING,S2L_LAT,S2L_LON
0,"[[1, 2], [4, 5]]",False,2.0,1.0
1,[],False,-999.0,-999.0
2,"[[1.1, 3.2]]",True,3.2,1.1
3,"[[3, 4], [9, 8], [1.1, 4.3]]",False,8.0,9.0


In [3]:
farts.MISSING.values[2]=="True"

True

In [15]:
np.arctan(1)*4

3.1415926535897931

In [16]:
np.sqrt(2)

1.4142135623730951

True

In [26]:
[1]*3

[1, 1, 1]

In [12]:
np.pi

3.141592653589793