In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import warnings 
warnings.filterwarnings(action='ignore')
%matplotlib inline

import sys
import gc
import os
import math
import time
from geopy.distance import vincenty

In [2]:
# Add ../Utils/ (relative to the notebook's working directory) to the module search path.
# NOTE(review): presumably for shared helper modules; no visible cell imports from it — confirm it is still needed.
sys.path.append('../Utils/')

In [3]:
# Load the training set and discard the 'Unnamed: 0' column
# (presumably a row index that was written out with the CSV — TODO confirm).
data_train = pd.read_csv('../Data/Train/Train_dataset.csv').drop(columns='Unnamed: 0')

In [4]:
# Load the validation/test set and discard the 'Unnamed: 0' column
# (presumably a row index that was written out with the CSV — TODO confirm).
data_validation = pd.read_csv('../Data/Test/test_dataset.csv').drop(columns='Unnamed: 0')

In [5]:
# Base-station coordinates. Rows are consumed positionally later in the notebook:
# bs.values[k][0] is used as the latitude and bs.values[k][1] as the longitude of the
# station at position k of the sorted station-id list (list_bsid).
bs = pd.read_csv('bs.csv') # bs contains the latitude and longitude of each base station

In [6]:
# Sorted unique base-station ids seen in the training data.
# The position of an id in this list is used as its row index into bs.
unique_bsid = data_train['bsid'].unique()
list_bsid = list(np.sort(unique_bsid))

In [7]:
# Ground-truth device positions as a 2-column array: column 0 = latitude, column 1 = longitude.
y = data_train[['latitude','longitude']].values

In [8]:
# For every row of data_train, find the position of its base station in list_bsid
# (and therefore its row in bs). A dict gives constant-time lookups; calling
# list_bsid.index() for each row rescans the whole station list every time.
bsid_to_index = {station_id: position for position, station_id in enumerate(list_bsid)}
bsid = data_train['bsid'].values
bs_index = [bsid_to_index[station_id] for station_id in bsid]

In [9]:
from numba import njit

In [10]:
@njit
def calculate_each_distance(y,bs_index, base):
    """Haversine distance between each point in ``y`` and its associated base station.

    Parameters
    ----------
    y : 2-D array
        One row per observation; column 0 is read as latitude and column 1 as
        longitude, both in degrees (they are converted to radians here).
    bs_index : sequence of int
        ``bs_index[i]`` is the row of ``base`` paired with ``y[i]``.
    base : 2-D array
        Station coordinates, laid out like ``y`` (latitude, longitude in degrees).

    Returns
    -------
    1-D float array with one distance per row of ``y``. The sphere radius used is
    6371, i.e. Earth's mean radius in kilometres, so distances are in km.
    """
    n_rows = len(y)
    distance = np.zeros(n_rows)

    for row in range(n_rows):
        station = bs_index[row]

        # Degrees -> radians for both end points.
        lat_point = (y[row][0] * math.pi/180.0)
        lng_point = (y[row][1] * math.pi/180.0)
        lat_station = (base[station][0] * math.pi/180.0)
        lng_station = (base[station][1] * math.pi/180.0)

        delta_lat = lat_point - lat_station
        delta_lng = lng_point - lng_station

        # Haversine term, then the great-circle arc length on a sphere of radius 6371.
        haversine = (math.pow(math.sin(delta_lat/2),2)
                     +math.cos(lat_point)*math.cos(lat_station)*math.pow(math.sin(delta_lng/2),2))
        distance[row] = 2*6371*math.asin(math.sqrt(haversine))
    return distance

In [11]:
# Distance between each training device position and the base station that received the message.
distance = calculate_each_distance(y, bs_index, bs.values)
data_train['distance'] = pd.Series(distance, name='distance')

In [12]:
@njit
def calculate_each_angle(y,bs_index, base):
    """Direction angle, in radians within [0, 2*pi), from each base station to its point.

    The angle is computed on the raw coordinate differences: column 0 of ``y`` and
    ``base`` is treated as the x axis and column 1 as the y axis (latitude and
    longitude respectively at the call site in this notebook). When both
    differences are zero the result is pi/2.

    Parameters
    ----------
    y : 2-D array
        One row per observation (two coordinate columns).
    bs_index : sequence of int
        ``bs_index[i]`` is the row of ``base`` paired with ``y[i]``.
    base : 2-D array
        Station coordinates, laid out like ``y``.

    Returns
    -------
    1-D float array with one angle per row of ``y``.
    """
    n_rows = len(y)
    angle = np.zeros(n_rows)

    for row in range(n_rows):
        station = bs_index[row]
        dx = y[row][0] - base[station][0]
        dy = y[row][1] - base[station][1]

        # Reference angle in [0, pi/2]; a vertical direction avoids dividing by zero.
        if dx == 0:
            theta = math.pi / 2.0
        else:
            theta = math.atan(math.fabs(dy/dx))

        # Move the reference angle into the quadrant indicated by the signs of dx and dy.
        if dx < 0:
            if dy >= 0:
                theta = math.pi - theta
            elif dy < 0:
                theta = math.pi + theta
        elif dx >= 0 and dy < 0:
            theta = math.pi * 2.0 - theta

        angle[row] = theta

    return angle

In [13]:
# Direction angle from the receiving base station to each training device position.
angle = calculate_each_angle(y, bs_index, bs.values)
data_train['angle'] = pd.Series(angle, name='angle')

In [14]:
def _rssi_to_rssi2(rssi):
    """Return 10 ** ((|rssi| - 47) / 38.75), the non-linear RSSI feature.

    NOTE(review): the constants 47 and 38.75 look like the parameters of a
    log-distance path-loss model — confirm their origin.
    """
    return 10 ** ((np.abs(rssi)-47)/38.75)


# Same transform applied to both data sets so the feature is comparable.
data_train['rssi2'] = _rssi_to_rssi2(data_train['rssi'])
data_validation['rssi2'] = _rssi_to_rssi2(data_validation['rssi'])

In [15]:
# Some 'data_type' values are written as 'Wifi' instead of 'wifi';
# lower-case the column in both data sets so they encode identically.
for frame in (data_train, data_validation):
    frame['data_type'] = frame['data_type'].str.lower()

In [16]:
# One-hot encode 'data_type'. The first level is dropped from the training dummies.
# The validation dummies are built for every level and then re-indexed onto exactly the
# training columns, so both feature matrices keep the same columns in the same order
# even when a level is missing from (or only present in) the validation data.
# When both sets contain the same levels the result is identical to drop_first=True.
data_type_train = pd.get_dummies(data_train['data_type'], drop_first=True)
data_type_validation = (
    pd.get_dummies(data_validation['data_type'])
    .reindex(columns=data_type_train.columns, fill_value=0)
)

In [17]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the 'dtid' column. The encoder is fitted on the training values only;
# with handle_unknown='ignore', a value seen only in validation encodes as an all-zero row.
ohe = OneHotEncoder(handle_unknown='ignore', dtype='uint8').fit(data_train['dtid'].values.reshape(-1, 1))

# String column labels keep the final feature frame's column names homogeneous
# (newer scikit-learn versions reject frames that mix int and str column names).
dtid_columns = ['dtid_{}'.format(category) for category in ohe.categories_[0]]

# .toarray() is the portable way to densify; the `.A` shorthand is gone from newer SciPy releases.
dtid_train = pd.DataFrame(
    ohe.transform(data_train['dtid'].values.reshape(-1, 1)).toarray(),
    columns=dtid_columns,
)
dtid_validation = pd.DataFrame(
    ohe.transform(data_validation['dtid'].values.reshape(-1, 1)).toarray(),
    columns=dtid_columns,
)

In [18]:
# Training features: raw signal columns, then the data_type dummies, then the one-hot dtid columns.
X_train = pd.concat(
    [data_train[['rssi', 'rssi2', 'snr']], data_type_train, dtid_train],
    axis=1,
)

In [19]:
# Test features, in the same column order as X_train: rssi, rssi2, snr, data_type dummies, dtid one-hot.
# The third column must be 'snr' (it was previously a second copy of 'rssi', so the model
# was being fed rssi where it had been trained on snr).
X_test = pd.concat([data_validation['rssi'],data_validation['rssi2'],data_validation['snr'],data_type_validation, dtid_validation],axis=1)

In [20]:
# Regression target: the device-to-station distance stored in data_train['distance'].
y_train = data_train['distance']

In [21]:
from sklearn.linear_model import LinearRegression
# Linear model mapping the signal features in X_train to the device-to-station distance.
reg = LinearRegression()
reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

I tried linear regression, random forest and XGBoost. The random forest (n_estimators = 100, max_depth = 10) and XGBoost (n_estimators = 100, learning_rate = 0.08) each took nearly one hour to fit on the training set and predict on the test set. In contrast, linear regression fits quickly and even gives better predictions. So in the end I chose the linear regression model.

In [22]:
# Predicted device-to-station distance for every validation row, as a one-column frame.
y_test = pd.DataFrame({'distance': reg.predict(X_test)})

In [23]:
# Map each messageid to the index labels of its rows in data_validation
# (one row per base station that received the message).
g = data_validation.groupby('messageid').groups

In [24]:
import scipy.sparse

In [25]:
# Build a (number of messages) x (number of known base stations) matrix.
# Each row is one unique messageid; column k holds the predicted distance to station
# list_bsid[k]. Entries stay 0 for stations that did not receive the message.
dict_index={}
message_id = []
array_distance = scipy.sparse.lil_matrix((len(g), len(list_bsid)))

# Constant-time station-id -> column lookup instead of scanning list_bsid for every row.
# A station id that is absent from list_bsid raises KeyError here.
bsid_to_column = {station_id: column for column, station_id in enumerate(list_bsid)}
validation_bsid = data_validation['bsid']
predicted_distance = y_test['distance']

j = 0
for message, row_labels in g.items():
    # The group key is the messageid shared by all rows of the group.
    message_id.append(message)
    row_labels = list(row_labels)
    for message_index in row_labels:
        bsid_index = bsid_to_column[validation_bsid[message_index]]
        array_distance[j, bsid_index] = predicted_distance[message_index]
    # Remember which data_validation rows contributed to matrix row j.
    dict_index[j] = row_labels
    j += 1

In [26]:
# Wrap the collected message ids in a one-column frame.
# Note: this rebinds message_id from a list to a DataFrame.
message_id = pd.DataFrame(message_id, columns=['messageid'])

In [27]:
# Densify the sparse distance matrix. .toarray() is used instead of the `.A` shorthand,
# which newer SciPy releases no longer provide on sparse matrices.
X = pd.DataFrame(array_distance.toarray())

In [28]:
# Plain Python/NumPy implementation; the numba @njit decorator is left disabled.
def calculate_lat_lng(X, bs):
    """Estimate one (latitude, longitude) per message from predicted station distances.

    Parameters
    ----------
    X : 2-D array, shape (n_messages, n_stations)
        ``X[i, j]`` is the predicted distance between message ``i`` and station ``j``.
        A value of 0 means station ``j`` did not receive the message. The number of
        stations is taken from ``X`` itself rather than being fixed.
    bs : 2-D array
        ``bs[j, 0]`` and ``bs[j, 1]`` are the latitude and longitude of station ``j``.

    Returns
    -------
    numpy.ndarray, shape (n_messages, 2)
        Estimated (latitude, longitude) per message:

        * no receiving station  -> (0, 0);
        * one station           -> that station's coordinates;
        * two stations          -> the point between them weighted so that the station
          with the smaller predicted distance gets the larger weight
          (``d2 / (d1 + d2)`` and ``d1 / (d1 + d2)``);
        * more than two         -> weighted centroid with inverse-square-distance
          weights, so nearer stations again dominate. (Weighting directly by the
          squared distance would pull the estimate toward the farthest stations,
          contradicting the two-station rule.)
    """
    X = np.asarray(X)
    y = np.zeros((len(X), 2))
    for i in range(len(X)):
        # Columns with a non-zero entry are the stations that received message i.
        point_index = np.flatnonzero(X[i])
        num_point = len(point_index)
        if num_point == 0:
            continue
        point_distance = X[i, point_index]

        if num_point == 1:
            weights = np.ones(1)
        elif num_point == 2:
            # Each station is weighted by the distance to the *other* station.
            weights = point_distance[::-1] / (point_distance[0] + point_distance[1])
        else:
            weights = 1.0 / point_distance ** 2
            weights = weights / weights.sum()

        y[i, 0] = np.dot(weights, bs[point_index, 0])
        y[i, 1] = np.dot(weights, bs[point_index, 1])
    return y

For a message received by only one base station, I have no better method than to use that station's location as the device location. For a message received by two base stations, I pick a point between the two stations, in proportion to the predicted distances to them. For a message received by more than two base stations, I am not sure how best to handle it, so I just use a rough weighting of the station locations to compute the position.

In [29]:
# Estimated (latitude, longitude) per message.
# Note: this rebinds y, which earlier in the notebook held the training positions.
y = calculate_lat_lng(X.values, bs.values)

In [30]:
# Label the two estimate columns: column 0 is latitude, column 1 is longitude.
df_y = pd.DataFrame({'latitude': y[:, 0], 'longitude': y[:, 1]})

In [31]:
# Put each messageid next to its estimated position; both frames are in the same row order.
df_y_test = pd.concat([message_id,df_y],axis=1)

In [32]:
# Write the predictions (messageid, latitude, longitude) to disk without the row index.
df_y_test.to_csv('WANG_Yuqing.csv', index=False)