# Signals Prediction

## Imports

In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics.pairwise import haversine_distances
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer

## Process the data

In [7]:
signals_data = pd.read_csv("../Data/cellular_signals.csv")
antennas_data = pd.read_csv("../Data/antennas.csv")

In [9]:
# Select on ly the coordinates
signals_locations = signals_data[['Latitude', 'Longitude']]
antennas_locations = antennas_data[['Latitude', 'Longitude']]

# Calculate the geodesic pairwise distances between cellphone and antennas
distances = haversine_distances(signals_locations.values, antennas_locations.values)

# For each type on antenna 
# slice the distances matrix to keep only columns which correspond to that type
# Then find the minimal distance of each signal to that antenna type
types = antennas_data.type.unique()

for type in types:
    type_distances = distances[:,(antennas_data['type']==type).to_numpy()]
    distance_to_closest_antenna_of_type =  np.min(type_distances, axis=1)*1000
    signals_data[f'closest_{type}'] =  distance_to_closest_antenna_of_type

In [10]:
# We only use those four "distance to each type" columns 
# as they encapulate all the relevant knowledge on each siganl
X = signals_data[['closest_D','closest_A','closest_C','closest_B']]
y = signals_data['signal']

In [11]:
# Split the data into test and validation sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [12]:
# Scale the distances
scaler = MinMaxScaler()
scaler.fit(X_train)
scaler.transform(X_train)
scaler.transform(X_test)

array([[0.63333242, 0.62166719, 0.21646657, 0.95381082],
       [0.19699074, 0.19076119, 0.68962301, 0.27895352],
       [0.11608457, 0.32147197, 0.33565451, 0.16631385],
       ...,
       [0.14073098, 0.36164434, 0.47449235, 0.31147218],
       [0.40820885, 0.50750139, 0.55930726, 0.2387912 ],
       [0.10819288, 0.49983612, 0.4875328 , 0.06214457]])

## Train the model

In [13]:
# Create the parameter grid for a random search 
random_grid = {
    'bootstrap': [True],
    'max_depth': range(1,25),
    'max_features': [1.0, 'sqrt'],
    'min_samples_leaf': range(1,5),
    'min_samples_split': [2, 4, 8],
    'n_estimators': [50, 100, 200, 400, 800]
}

In [14]:
# Initiate a random search to tune the hyperparameters
rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator = rf, 
                               param_distributions = random_grid, n_iter = 10, 
                               verbose=2, random_state=42, n_jobs = -1)
# Fit the model to the data
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [15]:
# Calculate the model's preformence
test_score = rf_random.score(X_test, y_test)
predictions = rf_random.predict(X_test)

print("Test score: ", test_score.round(3))
print("RMSE: ", mean_squared_error(y_test, predictions, squared=False).round(3))
print("MAE: ", mean_absolute_error(y_test, predictions).round(3))

Test score:  0.64
RMSE:  3.946
MAE:  3.173
