# Create data set

We generate one dataset for each number of antennas at the base station (BS).

## Imports

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.signal as sig
import pandas as pd
import utils
from typing import Optional
from pathlib import Path
from joblib import Parallel, delayed
from time import time
from IPython.display import display, clear_output

## Functions

### Generate Channels

In [2]:
def gen_channels(n_antennas, n_users, n_eve):
    """Draw random complex channels for the legitimate users and the eavesdropper.

    Every entry is ``sqrt(0.5) * (a + 1j*b)`` where ``a`` and ``b`` are
    independent standard-normal samples, so each entry has unit average power.

    Parameters
    ----------
    n_antennas : int
        Number of rows (base-station antennas) of both matrices.
    n_users : int
        Number of columns of the users' channel matrix.
    n_eve : int
        Number of columns of the eavesdropper channel matrix.

    Returns
    -------
    H : ndarray, shape (n_antennas, n_users)
        Authentic users' channels.
    g : ndarray, shape (n_antennas, n_eve)
        Eavesdropper channel(s).
    """
    scale = np.sqrt(0.5)

    def _complex_gaussian(n_cols):
        # Real part is drawn before the imaginary part; keeping this order
        # makes the output reproducible for a given global NumPy seed.
        shape = (n_antennas, n_cols)
        real_part = np.random.normal(0, 1, size=shape)
        imag_part = np.random.normal(0, 1, size=shape)
        return scale * (real_part + 1j * imag_part)

    users_channel = _complex_gaussian(n_users)
    eve_channel = _complex_gaussian(n_eve)
    return users_channel, eve_channel

### Simulate Uplink

In [3]:
def simulate_uplink(n_pilot, n_antennas, n_users, n_eve, Pe, snr):
    """Simulate one uplink pilot transmission with a pilot-copying eavesdropper.

    The users send QPSK pilots; every eavesdropper re-transmits the pilot
    sequence of the first user scaled by ``sqrt(Pe)``. The received signal is
    the channel matrix times the stacked pilots, plus noise.

    Parameters
    ----------
    n_pilot : int
        Number of pilot symbols per user.
    n_antennas, n_users, n_eve : int-like
        Number of base-station antennas, users and eavesdroppers. They are
        cast to ``int`` once so NumPy scalars are accepted as well.
    Pe : float
        Eavesdropper transmit power (``0`` means no attack).
    snr : float
        Value forwarded to ``utils.awgn`` as ``SNR``.
        # NOTE(review): presumably in dB, as in ``channel_energy`` -- confirm.

    Returns
    -------
    Y : ndarray
        Received signal after fading and noise.
    xp : ndarray, shape (n_users, n_pilot)
        Pilot symbols; row k holds the pilots of the k-th user.
    h : ndarray, shape (n_antennas,)
        True channel of the first user (first column of the channel matrix).
    """
    n_antennas, n_users, n_eve = int(n_antennas), int(n_users), int(n_eve)

    # Generate channels
    Haut, g = gen_channels(n_antennas, n_users, n_eve)

    # Generate QPSK pilot symbols at the users (a QPSK symbol carries 2 bits)
    bits = np.random.choice([0, 1], 2*n_pilot*n_users)
    symbols = utils.qpskmodulator(bits)
    xp = symbols.reshape(n_users, n_pilot)  # Row k: symbols of the k-th user

    # Pilot signal at the eavesdropper(s): a scaled copy of the first user's
    # pilots. One row per eavesdropper, so the stacked pilot matrix always has
    # as many rows as the channel matrix has columns (the original stacked a
    # single row and therefore only worked for n_eve == 1).
    xpe = np.tile(np.sqrt(Pe)*xp[0, :], (n_eve, 1))

    # Concatenate signals and channels to simulate transmission
    xptx = np.concatenate((xp, xpe))
    H = np.concatenate((Haut, g), axis=1)

    # Transmission
    Y = np.dot(H, xptx)         # Fading
    Y = utils.awgn(Y, SNR=snr)  # Additive white Gaussian noise

    return Y, xp, H[:, 0]

### Channel Estimation

In [4]:
def channel_estimation(Y, xp):
    """Estimate the first user's channel from the received pilot signal.

    Computes ``Y @ xp^H @ inv(xp @ xp^H)``, where the inverse of the pilot
    Gram matrix is obtained by solving it against the identity with
    ``np.linalg.lstsq``.

    Parameters
    ----------
    Y : ndarray, shape (n_antennas, n_pilot)
        Received pilot signal.
    xp : ndarray, shape (n_users, n_pilot)
        Known pilot symbols, one row per user.

    Returns
    -------
    ndarray, shape (n_antennas,)
        First column of the estimated channel matrix.
    """
    xp_h = np.conjugate(xp).T

    # The Gram matrix is needed for both the solve and the identity size;
    # compute it once instead of three times.
    gram = np.matmul(xp, xp_h)

    # rcond=None selects NumPy's current default cutoff explicitly and avoids
    # the FutureWarning that NumPy 1.x raises when rcond is omitted.
    gram_inv = np.linalg.lstsq(gram, np.eye(gram.shape[0]), rcond=None)[0]

    Hest = np.matmul(np.matmul(Y, xp_h), gram_inv)
    return Hest[:, 0]

### Channel Estimate Energy and Threshold

In [5]:
def channel_energy(h, snr, n_antennas, n_pilot):
    """Return the normalised energy of a channel estimate and its threshold.

    Parameters
    ----------
    h : array-like
        Channel estimate (one complex entry per antenna).
    snr : float
        Signal-to-noise ratio in dB; the noise power is ``1 / 10**(snr/10)``.
    n_antennas : int
        Number of antennas, used to normalise both quantities.
    n_pilot : int
        Number of pilot symbols.

    Returns
    -------
    E : float
        Real part of ``h^H h / n_antennas``.
    eta : float
        Threshold ``(1 + s) * (2 + s) * ln((2 + s) / (1 + s))`` with
        ``s = n_antennas * N0 / n_pilot``.
    """
    noise_power = 1 / (10 ** (snr / 10))
    noise_ratio = n_antennas * noise_power / n_pilot

    one_plus = 1 + noise_ratio
    two_plus = 2 + noise_ratio
    log_term = np.log(two_plus / one_plus)
    threshold = (one_plus * two_plus * log_term).real

    inner = np.conjugate(h).T @ h
    energy = (inner / n_antennas).real

    return energy, threshold

### Generate Sample

In [6]:
def generate_sample(n_pilot, n_antennas, n_users, Pe, snr, n_eve, csv_path):
    """Simulate one uplink realisation and append it as a row to a CSV file.

    The appended row is ``[n_users, snr, E, eta, Pe, target]`` where ``E`` and
    ``eta`` come from ``channel_energy`` and ``target`` is ``True`` when the
    eavesdropper power ``Pe`` is non-zero.

    Parameters
    ----------
    n_pilot, n_antennas, n_users, n_eve : int
        Simulation dimensions forwarded to ``simulate_uplink``.
    Pe : float
        Eavesdropper power; ``0`` means no attack.
    snr : float
        Signal-to-noise ratio used for the noise and the threshold.
    csv_path : path-like
        File the row is appended to (no header, no index).

    Returns
    -------
    None

    Notes
    -----
    NOTE(review): this function is run from parallel workers that all append
    to the same file; nothing here coordinates those writes.
    """
    # Uplink (the true channel is not needed for the sample)
    Y, xp, _ = simulate_uplink(n_pilot, n_antennas, n_users, n_eve, Pe, snr)

    # Channel estimation
    hest = channel_estimation(Y, xp)

    # Energy and threshold for the hypothesis test
    E, eta = channel_energy(hest, snr, n_antennas, n_pilot)

    # Label: an attack is present whenever the eavesdropper transmits
    target = bool(Pe)

    # Append the new row to the CSV; to_csv returns None when given a path,
    # so its result is not kept.
    row = pd.DataFrame([[n_users, snr, E, eta, Pe, target]])
    row.to_csv(csv_path, mode="a", header=False, index=False)

## Generate Training Dataset

### Fixed Parameters

In [7]:
n_pilot = 300   # Pilot symbols per user
n_eve = 1       # Number of eavesdroppers
n_trials = 100  # Samples generated per parameter combination
P = 1           # Users power
dirDatasets = Path("Data-Sets")  # Output folder for the generated CSV files
nJobs = 6       # Number of joblib workers

# The CSV files are written straight into this folder, so make sure it exists
# before the first write instead of failing on a fresh checkout.
dirDatasets.mkdir(parents=True, exist_ok=True)

### Variable Parameters

In [8]:
# Antenna counts: 64, 68, ..., 256
range_antennas = np.arange(64, 256 + 1, 4)

# User counts: the single-user case followed by 16, 32, ..., 256
range_users = np.hstack(([1], np.arange(16, 256 + 1, 16)))

# Eavesdropper powers: 0, 0.5, ..., 2.5 (the stop value sits just past 2.5
# so the end point is included)
range_Pe = np.arange(0.0, 2.51, 0.5)

# SNR values: -10, -5, ..., 30
range_snr = np.arange(-10, 30 + 1, 5)

### Balancing the cases without pilot contamination

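The dataset should contain the same number of cases with and without pilot contamination (PC), so extra zero-power entries are prepended to the eavesdropper power range.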

In [9]:
# Prepend as many extra zero-power entries as needed so that the number of
# attack-free values equals the number of values with Pe > 0.
n_contaminated = np.count_nonzero(range_Pe > 0)
n_clean = np.count_nonzero(range_Pe == 0)
range_Pe = np.concatenate((np.zeros(n_contaminated - n_clean), range_Pe))

### Generating samples

In [10]:
# Header of the training CSVs and progress bookkeeping
columns = ["n_users", "snr", "E", "eta", "Pe", "target"]
current_iteration = 1
total_iterations  = len(range_antennas) * len(range_users) * len(range_snr) * len(range_Pe)

for n_antennas in range_antennas:

    # Create one dataset for each number of antennas; writing the empty frame
    # (re)creates the file with only the header row.
    df_training = dirDatasets.joinpath("train_"+ str(n_antennas)+"_antennas.csv")
    pd.DataFrame(columns=columns).to_csv(df_training, index=False)

    for n_users in range_users:
        for Pe in range_Pe:
            for snr in range_snr:

                # Run the n_trials independent realisations in parallel;
                # each call appends one row to the CSV.
                Parallel(n_jobs=nJobs, verbose=0)(
                    delayed(generate_sample)(
                        n_pilot,
                        n_antennas,
                        n_users,
                        Pe,
                        snr,
                        n_eve,
                        df_training) for _ in range(n_trials))

                # Print information about current iteration
                progress = str(100*(current_iteration/total_iterations))[:7]
                printStr = (
                    f"Number of antennas:    {n_antennas!s}\n"
                    f"Number of users:       {n_users!s}\n"
                    f"Eve power, Pe:         {Pe!s}\n"
                    f"SNR:                   {snr!s}\n"
                    f"Progress:    {progress}%"
                )
                clear_output(wait=True)
                print(printStr)
                current_iteration += 1
                

Number of antennas:    256
Number of users:       256
Eve power, Pe:         2.5
SNR:                   30
Progress:    100.0%


## Generate Test Dataset

In [11]:
# Same pilot length and eavesdropper count as for the training set
n_pilot: int = 300  # Pilot symbols per user
n_eve: int = 1      # Number of eavesdroppers

In [12]:
# Parameter grid of the test set (finer than the training grid in users and SNR)
range_antennas = np.arange(64, 257, 4)
# This must be named range_users: the generation loop iterates over
# range_users, and n_users is its loop variable. Assigning the grid to n_users
# left the loop silently reusing the training user grid.
range_users    = np.concatenate(([1], np.arange(4, 257, 4)))
range_Pe       = np.arange(0, 2.51, 0.5)
range_snr      = np.arange(-10, 31, 1)

### Balancing the cases without pilot contamination

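The dataset should contain the same number of cases with and without pilot contamination (PC), so extra zero-power entries are prepended to the eavesdropper power range.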

In [13]:
# Prepend as many extra zero-power entries as needed so that the number of
# attack-free values equals the number of values with Pe > 0.
n_contaminated = np.count_nonzero(range_Pe > 0)
n_clean = np.count_nonzero(range_Pe == 0)
range_Pe = np.concatenate((np.zeros(n_contaminated - n_clean), range_Pe))

### Generating Samples

In [15]:
# Header of the test CSVs and progress bookkeeping
# NOTE(review): these column names differ from the training header
# ("n_users", "snr", "E", "eta", "Pe", "target") although the rows have the
# same layout; left unchanged because readers of the files may rely on them.
columns = ["qtdUsuarios", "SNR", "E", "eta", "potenciaEspiao", "ataquePresente"]
current_iteration = 1
total_iterations  = len(range_antennas) * len(range_users) * len(range_snr) * len(range_Pe)

for n_antennas in range_antennas:

    # Create one dataset for each number of antennas; writing the empty frame
    # (re)creates the file with only the header row.
    df_test = dirDatasets.joinpath("test_"+ str(n_antennas)+"_antennas.csv")
    pd.DataFrame(columns=columns).to_csv(df_test, index=False)

    for n_users in range_users:
        for Pe in range_Pe:
            for snr in range_snr:

                # Run the n_trials independent realisations in parallel;
                # each call appends one row to the CSV.
                Parallel(n_jobs=nJobs, verbose=0)(
                    delayed(generate_sample)(
                        n_pilot,
                        n_antennas,
                        n_users,
                        Pe,
                        snr,
                        n_eve,
                        df_test) for _ in range(n_trials))

                # Print information about current iteration
                progress = str(100*(current_iteration/total_iterations))[:7]
                printStr = (
                    f"Number of antennas:    {n_antennas!s}\n"
                    f"Number of users:       {n_users!s}\n"
                    f"Eve power, Pe:         {Pe!s}\n"
                    f"SNR:                   {snr!s}\n"
                    f"Progress:    {progress}%"
                )
                clear_output(wait=True)
                print(printStr)
                current_iteration += 1

Number of antennas:    256
Number of users:       256
Eve power, Pe:         2.5
SNR:                   30
Progress:    100.0%
