# Wireless Indoor Localization

Dataset is from https://archive.ics.uci.edu/dataset/422/wireless+indoor+localization

n = 2000 samples

7 features (columns 0–6), plus a class label in the last column (column 7)

In [32]:
import pandas as pd

In [33]:
# The file is tab-separated and has no header row, hence header=None
url = "https://raw.githubusercontent.com/maxxxxc/SIR-Summer-2023/main/dataset/wifi_localization.txt"
df = pd.read_csv(url, delimiter="\t", header=None)

In [34]:
# Sanity check on the loaded frame: (number of rows, number of columns)
df.shape

(2000, 8)

In [35]:
# Preview the first rows; leaving the frame as the cell's last expression lets
# the notebook render it as a table instead of plain printed text
df.head()

    0   1   2   3   4   5   6  7
0 -64 -56 -61 -66 -71 -82 -81  1
1 -68 -57 -61 -65 -71 -85 -85  1
2 -63 -60 -60 -67 -76 -85 -84  1
3 -61 -60 -68 -62 -77 -90 -80  1
4 -63 -65 -60 -63 -77 -81 -87  1


In [36]:
from sklearn import svm
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [37]:
# Column 7 is the class label; every other column is a feature
y = df[7]
X = df.drop(columns=7)

In [38]:
# Preview the feature frame (label column 7 has been dropped)
X.head()

Unnamed: 0,0,1,2,3,4,5,6
0,-64,-56,-61,-66,-71,-82,-81
1,-68,-57,-61,-65,-71,-85,-85
2,-63,-60,-60,-67,-76,-85,-84
3,-61,-60,-68,-62,-77,-90,-80
4,-63,-65,-60,-63,-77,-81,-87


In [39]:
# Collapse the four original classes into a binary target: 1, 2 -> 0 and 3, 4 -> 1.
# The mapping is applied to the raw label column df[7] rather than to `y` itself so
# that re-running this cell is idempotent: re-mapping an already-binary `y` would
# send every 1 to 0 and leave a single class.
y = df[7].replace({2 : 0, 1 : 0, 3: 1, 4 : 1})
y.value_counts()

0    1000
1    1000
Name: 7, dtype: int64

## Logistic Regression

In [40]:
# Hold out 40% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Create and train the logistic regression model
# (max_iter=500 replaced an earlier default-constructed model; presumably the
# default iteration limit did not converge on the unscaled features - TODO confirm)
model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)

# Make predictions on both the held-out and the training rows
y_test_pred = model.predict(X_test)
y_train_pred = model.predict(X_train)

# Calculate accuracy; report the training score under its own label so it is
# not mistaken for a second test score
train_accuracy = accuracy_score(y_train, y_train_pred)
print("Train Accuracy:", train_accuracy)
accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.9216666666666666
Test Accuracy: 0.93125


# Divide and Conquer

Divide the training data into 11 batches, train a logistic regression model on each batch, and then combine the 11 sets of predictions. Consider the following three ensemble methods:
- majority voting
- average (or sum) of the logit outputs, then decide based on its sign
- average of the predicted probabilities, then decide by thresholding at 0.5

In [53]:
def iterate_process(X, y, num_batches=11, test_size=0.4):
    """Run one divide-and-conquer experiment and return four test accuracies.

    The data are split into train and test sets, the training rows are shuffled
    and cut into ``num_batches`` equally sized batches, and a logistic
    regression is fitted on each batch after standardizing that batch.  The
    per-batch predictions on the test set are combined in three ways and
    compared with a single model fitted on the whole training set.

    Randomness comes from ``train_test_split`` (called without a
    ``random_state``) and from ``np.random.permutation``; seed NumPy's global
    RNG before calling if a repeatable result is needed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; training rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Binary labels coded as 0 and 1 (the ensembles count and emit label 1).
    num_batches : int, default 11
        Number of batches the training rows are divided into.
    test_size : float or int, default 0.4
        Passed unchanged to ``train_test_split``.

    Returns
    -------
    numpy.ndarray of shape (4,)
        Test accuracies in the order: majority voting, summed logits,
        averaged probabilities, single model trained on all training rows.

    Raises
    ------
    ValueError
        If ``num_batches`` is smaller than 1 or larger than the number of
        training rows (which would leave every batch empty).
    """
    if num_batches < 1:
        raise ValueError(f"num_batches must be at least 1, got {num_batches}")

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

    # Randomly shuffle the positions of the training rows
    indices = np.random.permutation(len(X_train))

    # Integer division keeps all batches the same size; up to num_batches - 1
    # shuffled rows at the end of `indices` are therefore not used by any batch
    batch_size = len(X_train) // num_batches
    if batch_size == 0:
        raise ValueError(
            f"num_batches={num_batches} exceeds the {len(X_train)} training rows"
        )

    n_test = len(y_test)
    # Number of batch models that predicted class 1 for each test row
    preds_voting = np.zeros(n_test)
    # Sum over batch models of the decision-function (logit) output
    preds_logit = np.zeros(n_test)
    # Sum over batch models of the predicted probability of class 1
    preds_prob = np.zeros(n_test)

    # Fit one logistic regression per batch and accumulate its test predictions
    for i in range(num_batches):
        start_index = i * batch_size
        end_index = (i + 1) * batch_size
        batch_rows = indices[start_index:end_index]

        X_batch = X_train.iloc[batch_rows]
        y_batch = y_train.iloc[batch_rows]

        # The scaler is fitted on this batch only, and the test set is
        # transformed with the same batch-specific statistics
        scaler = StandardScaler()
        X_batch_scaled = scaler.fit_transform(X_batch)
        X_test_scaled = scaler.transform(X_test)

        model = LogisticRegression(max_iter=500)
        model.fit(X_batch_scaled, y_batch)

        # Majority voting: count one vote whenever this model predicts class 1
        batch_labels = model.predict(X_test_scaled)
        preds_voting += (batch_labels == 1)

        # Logit averaging: accumulate the raw decision-function values
        batch_logits = model.decision_function(X_test_scaled)
        preds_logit += batch_logits

        # Probability averaging: accumulate the class-1 probabilities
        batch_probs = model.predict_proba(X_test_scaled)
        preds_prob += batch_probs[:, 1]

    accuracy = np.zeros(4)

    # Majority voting: class 1 needs strictly more than half of the votes
    final_predictions = np.where(preds_voting > num_batches / 2, 1, 0)
    accuracy[0] = accuracy_score(y_test, final_predictions)
    # Average of logits: the sign of the sum equals the sign of the mean
    final_predictions = np.where(preds_logit > 0, 1, 0)
    accuracy[1] = accuracy_score(y_test, final_predictions)
    # Average of probabilities, thresholded at 0.5
    final_predictions = np.where(preds_prob / num_batches > 0.5, 1, 0)
    accuracy[2] = accuracy_score(y_test, final_predictions)

    # Reference: a single model trained on the complete training set
    model = LogisticRegression(max_iter=500)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    accuracy[3] = accuracy_score(y_test, y_pred)

    return accuracy

Repeat the divide-and-conquer experiment 50 times (each with a different random seed) and report the following accuracy matrix.
- 50-by-4 (one row per repetition)
- col_1: majority voting
- col_2: average the logit
- col_3: average probabilities
- col_4: using the model trained on all the training data

In [54]:
# How many independent repetitions of the experiment to run
num_repeats = 50

# One row per repetition and one column per method
# (num_repeats-by-4; filled in row by row below)
accuracies = np.zeros(shape=(num_repeats, 4))

# Re-seed NumPy's global RNG before every repetition (42, 44, 46, ...) so that
# the shuffling inside iterate_process is repeatable but differs between runs
seed = 42
for i in range(num_repeats):
    np.random.seed(seed)
    accuracies[i, :] = iterate_process(X, y)
    seed = seed + 2

# Print the accuracies
print("Accuracies:", accuracies)

Accuracies: [[0.9325  0.9325  0.9325  0.93375]
 [0.91    0.9125  0.9125  0.90625]
 [0.9325  0.93125 0.93    0.9325 ]
 [0.9225  0.92125 0.92125 0.9175 ]
 [0.92375 0.92375 0.92375 0.92   ]
 [0.92125 0.92125 0.92125 0.915  ]
 [0.92    0.92375 0.9225  0.9225 ]
 [0.925   0.92625 0.92625 0.9225 ]
 [0.94125 0.94    0.94    0.94   ]
 [0.9275  0.92625 0.92625 0.92125]
 [0.92625 0.9225  0.92125 0.9225 ]
 [0.9225  0.92    0.92    0.9175 ]
 [0.9225  0.9275  0.9275  0.9275 ]
 [0.935   0.935   0.935   0.93   ]
 [0.92875 0.9275  0.9275  0.9275 ]
 [0.9075  0.91    0.91    0.91375]
 [0.92375 0.9275  0.9275  0.92125]
 [0.9175  0.9175  0.9175  0.91625]
 [0.93375 0.93    0.93    0.92625]
 [0.92625 0.93125 0.93125 0.92875]
 [0.92875 0.92625 0.92625 0.93125]
 [0.9275  0.92875 0.93    0.93   ]
 [0.91625 0.92    0.92    0.92   ]
 [0.93    0.92875 0.93    0.93   ]
 [0.9225  0.9175  0.9175  0.92125]
 [0.93    0.92875 0.92875 0.92875]
 [0.9275  0.92625 0.92625 0.93   ]
 [0.93125 0.93125 0.93125 0.92875]
 [0.9237

In [55]:
# Column-wise mean: average accuracy of each method over all repetitions
accuracies.mean(axis=0)

array([0.924325, 0.924525, 0.924475, 0.9231  ])

In [56]:
# Column-wise standard deviation of each method's accuracy over all repetitions
accuracies.std(axis=0)

array([0.00771852, 0.00666094, 0.00665249, 0.00688858])