## Parte 1 - Carregamento e limpeza dos dados

In [92]:
import pandas as pd
import numpy as np

df = pd.read_csv('data/LocTreino_Equipe_4.csv')
bts = pd.read_csv('data/Dados_BTSs.csv')

df.head()

Unnamed: 0,pontoId,lat,lon,pathBTS1,pathBTS2,pathBTS3,pathBTS4,pathBTS5,pathBTS6,taBTS1,taBTS2,taBTS3,taBTS4,taBTS5,taBTS6
0,2818,-8.062.787,-34.897.289,131.79,105.09,125.656.666.666.667,105.756.666.666.667,128.49,115.99,1,2,3,2,1,0
1,212,-8.076.408,-34.899.723,120.59,112.123.333.333.333,124.856.666.666.667,132.023.333.333.333,135.523.333.333.333,141.19,2,1,1,1,2,2
2,500,-8.074.771,-34.890.739,118.723.333.333.333,128.556.666.666.667,130.09,989.566.666.666.667,128.59,146.423.333.333.333,1,0,3,0,1,2
3,2094,-806.803,-34.896.683,111.49,109.723.333.333.333,108.056.666.666.667,973.566.666.666.667,118.456.666.666.667,997.233.333.333.333,0,1,2,1,1,0
4,1790,-8.069.077,-34.900.505,123.29,114.34,121.79,126.09,130.09,109.04,1,1,2,1,2,1


In [89]:
bts

Unnamed: 0,name,lat,lon,band,bcch,eirp,bts
0,136_2,-8.068361,-34.892722,GSM1800,662,55.59,BTS-1
1,137_3,-8.075917,-34.894611,GSM1800,806,55.59,BTS-2
2,197_1,-8.076361,-34.908,GSM1800,666,55.59,BTS-3
3,137_1,-8.075917,-34.894611,GSM1800,690,55.59,BTS-4
4,138_2,-8.066,-34.889444,GSM1800,673,55.59,BTS-5
5,245_2,-8.064583,-34.894583,GSM1800,682,55.59,BTS-6


### Observacoes

- As colunas de "lat" e "lon" em ambos os Dataframes estão com valores desformatados.
- As colunas de "pathBTSn" precisam de uma formatacao igual as colunas de "lat" e "lon".


In [93]:
def format_cols(x):
    if 'nan' in str(x).lower():
        return np.nan
    value = str(x)
    value = value.split('.')
    value = value[0] + '.' + ''.join(value[1:])
    value = float(value)
    return value

def correct_lats(x):
    if np.abs(x) > 90:
        x = x/100
    elif np.abs(x) > 10:
        x = x/10
    return x

def correct_lons(x):
    if np.abs(x) > 180:
        x = x/100
    if np.abs(x) < 10:
        x = x*10
    return x

def format_taBTS1(x):
    value = str(x)
    if '.' in value:
        value = value.split('.')
        value = value[0] + '.' + ''.join(value[1:])
        value = float(value)
    else:
        value = float(value)
    
    if value > 10:
        # outliers
        value = np.nan

    return value

df['lat'] = df['lat'].apply(format_cols)
df['lat'] = df['lat'].apply(correct_lats)
df['lon'] = df['lon'].apply(format_cols)
df['lon'] = df['lon'].apply(correct_lons)

bts['lat'] = bts['lat'].apply(format_cols)
bts['lat'] = bts['lat'].apply(correct_lats)
bts['lon'] = bts['lon'].apply(format_cols)
bts['lon'] = bts['lon'].apply(correct_lons)

for i in range(1, 7):
    df[f'pathBTS{i}'] = df[f'pathBTS{i}'].apply(format_cols)
    df[f'pathBTS{i}'] = df[f'pathBTS{i}'].apply(lambda x: np.round(x, 6))

df['taBTS1'] = df['taBTS1'].apply(format_taBTS1)

df_nan = df[df['taBTS1'].isna()]
df_nan = df_nan.append(df[df['pathBTS1'].isna()])
df_nan

Unnamed: 0,pontoId,lat,lon,pathBTS1,pathBTS2,pathBTS3,pathBTS4,pathBTS5,pathBTS6,taBTS1,taBTS2,taBTS3,taBTS4,taBTS5,taBTS6
153,595,-8.074224,-34.890263,103.223333,142.323333,124.39,121.89,122.556667,136.223333,,0,3,0,1,2
702,1977,-8.06858,-34.897083,118.54,120.84,118.74,112.19,123.74,102.49,,1,2,1,1,0
1294,1663,-8.069399,-34.891121,104.523333,135.09,124.19,119.823333,105.39,128.856667,,1,3,1,0,1
1316,1597,-8.069582,-34.896412,110.423333,999.233333,114.59,899.566667,113.156667,117.223333,,1,2,1,1,1
1499,2241,-8.06687,-34.894665,122.023333,127.69,127.056667,125.99,125.356667,954.233333,,1,3,1,1,0
269,2442,-8.065791,-34.893631,,120.94,125.64,125.29,120.74,115.84,0.0,2,3,2,0,0


Decidimos optar por dropar as linhas com valores NaN

In [94]:
df = df.drop(df_nan.index)
df.describe()

Unnamed: 0,pontoId,lat,lon,pathBTS1,pathBTS2,pathBTS3,pathBTS4,pathBTS5,pathBTS6,taBTS1,taBTS2,taBTS3,taBTS4,taBTS5,taBTS6
count,1494.0,1494.0,1494.0,1494.0,1494.0,1494.0,1494.0,1494.0,1494.0,1494.0,1494.0,1494.0,1494.0,1494.0,1494.0
mean,1468.635877,-8.070097,-34.895126,142.914764,153.502501,135.817031,156.207344,140.815007,142.008707,0.676037,1.016734,2.47925,1.016734,1.139224,1.063588
std,856.69958,0.004288,0.004308,144.435262,160.821598,98.801912,175.798372,120.517006,116.903139,0.702689,0.835651,0.957523,0.835651,0.835836,0.681277
min,2.0,-8.07757,-34.90683,85.59,13.328756,88.34,85.34,1.260525,1.084025,0.0,0.0,1.0,0.0,0.0,0.0
25%,721.5,-8.073285,-34.898086,107.99,113.044167,119.165,108.744167,115.14,115.498333,0.0,0.0,2.0,0.0,1.0,1.0
50%,1438.0,-8.069998,-34.895716,118.406667,121.206666,124.219167,118.173333,124.223333,129.044167,1.0,1.0,2.0,1.0,1.0,1.0
75%,2233.75,-8.066894,-34.891978,126.223333,129.3025,130.262917,127.656667,131.39,136.348333,1.0,1.0,3.0,1.0,2.0,2.0
max,2956.0,-8.059338,-34.885067,999.233333,999.636842,980.717204,998.566667,999.566667,997.233333,3.0,4.0,5.0,4.0,3.0,2.0


In [95]:
bts.describe()

Unnamed: 0,lat,lon,bcch,eirp
count,6.0,6.0,6.0,6.0
mean,-8.07119,-34.895662,696.5,55.59
std,0.005477,0.00637,54.617763,7.783606e-15
min,-8.076361,-34.908,662.0,55.59
25%,-8.075917,-34.894611,667.75,55.59
50%,-8.072139,-34.894597,677.5,55.59
75%,-8.06659,-34.893187,688.0,55.59
max,-8.064583,-34.889444,806.0,55.59


## Parte 2 - Preprocessamento dos dados

In [96]:
def calculate_distance(lat1, lon1, lat2, lon2):
    """Calcula distance entre duas coordenadas em metros."""
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    m = 6367 * c * 1000
    return m

distance = pd.DataFrame(columns=['pontoId', 'BTS1', 'BTS2', 'BTS3', 'BTS4', 'BTS5', 'BTS6'])

for i, row in df.iterrows():
    lat, lon = row['lat'], row['lon']
    pontoId = row['pontoId']
    
    # calcula distancia para bts1
    lat1, lon1 = bts['lat'][0], bts['lon'][0]
    bts1 = calculate_distance(lat, lon, lat1, lon1)

    # calcula distancia para bts2
    lat2, lon2 = bts['lat'][1], bts['lon'][1]
    bts2 = calculate_distance(lat, lon, lat2, lon2)

    # calcula distancia para bts3
    lat3, lon3 = bts['lat'][2], bts['lon'][2]
    bts3 = calculate_distance(lat, lon, lat3, lon3)

    # calcula distancia para bts4
    lat4, lon4 = bts['lat'][3], bts['lon'][3]
    bts4 = calculate_distance(lat, lon, lat4, lon4)

    # calcula distancia para bts5
    lat5, lon5 = bts['lat'][4], bts['lon'][4]
    bts5 = calculate_distance(lat, lon, lat5, lon5)

    # calcula distancia para bts6
    lat6, lon6 = bts['lat'][5], bts['lon'][5]
    bts6 = calculate_distance(lat, lon, lat6, lon6)

    distance.loc[i] = [pontoId, bts1, bts2, bts3, bts4, bts5, bts6]

distance.describe()

Unnamed: 0,pontoId,BTS1,BTS2,BTS3,BTS4,BTS5,BTS6
count,1494.0,1494.0,1494.0,1494.0,1494.0,1494.0,1494.0
mean,1468.635877,665.232846,825.940913,1644.888518,825.940905,921.526199,838.180627
std,856.69958,340.849074,436.795117,486.453827,436.795124,447.771958,357.776039
min,2.0,54.400367,14.389653,868.810231,14.38968,26.30091,29.071721
25%,721.5,385.68848,538.732883,1308.89294,538.732821,565.547572,587.393182
50%,1438.0,610.147924,735.748535,1636.622061,735.748523,897.804877,814.337814
75%,2233.75,912.65302,1031.546589,1968.234782,1031.546568,1303.567851,1136.586361
max,2956.0,1847.919184,2280.67297,2834.056778,2280.672931,2051.13553,1533.465918


In [123]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

X_train_arr = []
X_test_arr = []
y_train_arr = []
y_test_arr = []

lb = LabelEncoder()

for i in range(6):
    # select cols from df
    cols = [f'pathBTS{i+1}', f'taBTS{i+1}']
    X = df.copy()[cols]
    X[f'pathBTS{i+1}'] = lb.fit_transform(X[f'pathBTS{i+1}'])
    X[f'taBTS{i+1}'] = lb.fit_transform(X[f'taBTS{i+1}'])
    y = distance[f'BTS{i+1}']
    y = lb.fit_transform(y)
    
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train_arr.append(x_train)
    X_test_arr.append(x_test)
    y_train_arr.append(y_train)
    y_test_arr.append(y_test)


In [124]:
from sklearn.neighbors import KNeighborsRegressor

knns = []
for i in range(6):
    model = KNeighborsRegressor()
    knns.append(model)


In [131]:
# achar melhores parametros para cada knn
from sklearn.model_selection import GridSearchCV, RepeatedKFold

parameters = {'n_neighbors': list(range(1, 41))}

grids = []
for i, model in enumerate(knns):
    grid = GridSearchCV(
        model,
        parameters,
        cv=RepeatedKFold(n_splits=5, n_repeats=10),
        scoring='neg_mean_squared_error',
        return_train_score=True,
        refit=True
    )
    grid.fit(X_train_arr[i], y_train_arr[i])
    knns[i] = grid.best_estimator_
    print(f'BTS{i+1} model best params')
    print(grid.best_params_)

BTS1 model best params
{'n_neighbors': 40}


In [130]:
from sklearn.metrics import mean_squared_error

for i, model in enumerate(knns):
    pred = model.predict(X_test_arr[i])
    sqrd_error = np.sqrt(mean_squared_error(y_test_arr[i], pred))
    print(f'BTS{i+1} erro medio: {sqrd_error} m')


BTS1 erro medio: 396.9983853839857
BTS2 erro medio: 365.24315559505027
BTS3 erro medio: 291.16129349457054
BTS4 erro medio: 386.91781379217747
BTS5 erro medio: 326.7993807893169
BTS6 erro medio: 283.5413251641063
