In [None]:
import os
from IPython.display import clear_output
import numpy as np
import numpy.matlib
import matplotlib.pylab as plt
from mpl_toolkits.axes_grid1.inset_locator import zoomed_inset_axes
from mpl_toolkits.axes_grid1.inset_locator import mark_inset
import copy
import seaborn as sns
from tqdm import tqdm
import pandas as pd


# This class creates the environment and the agent can observe the subsequent states and rewards from it
class env:
    """Simulated link-adaptation environment observed by the agent.

    The agent's state is a modulation scheme, its action is a frequency band,
    and a channel condition evolves as a Markov chain with transition matrix
    ``q``. Typical use: ``make_env()`` to build rewards and transition tables,
    ``reset(...)`` to choose the starting point, then ``observe(...)`` to
    sample one step at a time.
    """

    def __init__(self):
        # Table returned by env_dynamic(); filled in by make_env().
        self.dynamic = None

        # Modulation schemes (the agent's state space)
        self.modulation_schemes = ['BPSK', 'QPSK', '8-PSK', '16-QAM', '32-QAM', '64-QAM', '128-QAM', '256-QAM', '512-QAM', '1024-QAM', '2048-QAM']

        # P_success_k holds the probabilities of successful transmission in
        # frequency band k: one row per modulation scheme, one column per
        # channel condition.

        # Frequency band 1
        self.P_success_1 = pd.DataFrame({
            'Excellent': [0.83, 0.99, 0.91, 0.79, 0.88, 0.92, 0.87, 0.91, 0.93, 0.85, 0.89],
            'Good': [0.84, 0.78, 0.81, 0.78, 0.81, 0.85, 0.80, 0.82, 0.86, 0.79, 0.83],
            'Fair': [0.89, 0.80, 0.87, 0.91, 0.88, 0.84, 0.83, 0.86, 0.90, 0.81, 0.84],
            'Poor': [0.86, 0.79, 0.81, 0.78, 0.75, 0.72, 0.74, 0.70, 0.68, 0.71, 0.69]
        }, index=self.modulation_schemes)

        # Frequency band 2
        self.P_success_2 = pd.DataFrame({
            'Excellent': [0.72, 0.94, 0.78, 0.74, 0.79, 0.81, 0.82, 0.85, 0.83, 0.88, 0.86],
            'Good': [0.84, 0.87, 0.79, 0.71, 0.75, 0.77, 0.78, 0.80, 0.81, 0.83, 0.85],
            'Fair': [0.89, 0.67, 0.72, 0.93, 0.87, 0.85, 0.86, 0.88, 0.84, 0.82, 0.80],
            'Poor': [0.83, 0.66, 0.72, 0.73, 0.71, 0.70, 0.69, 0.68, 0.67, 0.65, 0.64]
        }, index=self.modulation_schemes)

        # Frequency band 3
        self.P_success_3 = pd.DataFrame({
            'Excellent': [0.56, 0.82, 0.83, 0.63, 0.68, 0.72, 0.74, 0.76, 0.78, 0.80, 0.82],
            'Good': [0.61, 0.81, 0.81, 0.86, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88],
            'Fair': [0.83, 0.88, 0.61, 0.59, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.70],
            'Poor': [0.68, 0.65, 0.61, 0.89, 0.71, 0.73, 0.75, 0.77, 0.79, 0.81, 0.83]
        }, index=self.modulation_schemes)

        # Frequency band 4
        self.P_success_4 = pd.DataFrame({
            'Excellent': [0.088, 0.089, 0.094, 0.086, 0.091, 0.092, 0.093, 0.094, 0.095, 0.096, 0.097],
            'Good': [0.088, 0.094, 0.091, 0.084, 0.087, 0.089, 0.090, 0.091, 0.092, 0.093, 0.094],
            'Fair': [0.091, 0.083, 0.096, 0.084, 0.088, 0.089, 0.090, 0.091, 0.092, 0.093, 0.094],
            'Poor': [0.081, 0.096, 0.096, 0.085, 0.086, 0.087, 0.088, 0.089, 0.090, 0.091, 0.092]
        }, index=self.modulation_schemes)

        # Frequency band 5
        self.P_success_5 = pd.DataFrame({
            'Excellent': [0.007, 0.0075, 0.008, 0.0082, 0.0089, 0.0091, 0.0090, 0.0093, 0.0092, 0.0095, 0.0096],
            'Good':      [0.007, 0.0073, 0.0079, 0.0081, 0.0082, 0.0084, 0.0086, 0.0088, 0.0087, 0.0089, 0.0091],
            'Fair':      [0.006, 0.0065, 0.0067, 0.0076, 0.0078, 0.0080, 0.0082, 0.0083, 0.0084, 0.0085, 0.0086],
            'Poor':      [0.001, 0.002, 0.004, 0.0064, 0.0063, 0.0062, 0.0061, 0.0060, 0.0059, 0.0058, 0.0057]
        }, index=self.modulation_schemes)

        # Frequency band 6
        self.P_success_6 = pd.DataFrame({
            'Excellent': [0.79, 0.88, 0.85, 0.90, 0.92, 0.93, 0.95, 0.94, 0.96, 0.97, 0.98],
            'Good': [0.81, 0.82, 0.84, 0.85, 0.87, 0.88, 0.89, 0.90, 0.91, 0.92, 0.93],
            'Fair': [0.76, 0.78, 0.79, 0.80, 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87],
            'Poor': [0.67, 0.66, 0.65, 0.64, 0.63, 0.62, 0.61, 0.60, 0.59, 0.58, 0.57]
        }, index=self.modulation_schemes)

        # Frequency band 7
        # NOTE(review): the last 'Poor' entry (0.0056) breaks the 0.001 step
        # of the rest of that row (0.056 expected?) -- confirm before changing.
        self.P_success_7 = pd.DataFrame({
            'Excellent': [0.82, 0.87, 0.89, 0.91, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99],
            'Good': [0.80, 0.82, 0.84, 0.85, 0.87, 0.88, 0.89, 0.90, 0.91, 0.92, 0.93],
            'Fair': [0.74, 0.76, 0.77, 0.78, 0.79, 0.80, 0.81, 0.82, 0.83, 0.84, 0.85],
            'Poor': [0.066, 0.065, 0.064, 0.063, 0.062, 0.061, 0.060, 0.059, 0.058, 0.057, 0.0056]
        }, index=self.modulation_schemes)

        # Frequency band 8
        self.P_success_8 = pd.DataFrame({
            'Excellent': [0.85, 0.89, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99, 1.00],
            'Good': [0.82, 0.84, 0.86, 0.87, 0.88, 0.89, 0.90, 0.91, 0.92, 0.93, 0.94],
            'Fair': [0.78, 0.79, 0.80, 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88],
            'Poor': [0.65, 0.64, 0.63, 0.62, 0.61, 0.60, 0.59, 0.58, 0.57, 0.56, 0.55]
        }, index=self.modulation_schemes)

        # Frequency band 9
        self.P_success_9 = pd.DataFrame({
            'Excellent': [0.88, 0.92, 0.93, 0.95, 0.96, 0.97, 0.98, 0.99, 1.00, 0.99, 0.98],
            'Good': [0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.90, 0.91, 0.92, 0.93, 0.94],
            'Fair': [0.80, 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.90],
            'Poor': [0.64, 0.63, 0.62, 0.61, 0.60, 0.59, 0.58, 0.57, 0.56, 0.55, 0.54]
        }, index=self.modulation_schemes)

        # Frequency band 10
        self.P_success_10 = pd.DataFrame({
            'Excellent': [0.90, 0.93, 0.94, 0.96, 0.97, 0.98, 0.99, 1.00, 0.99, 0.98, 0.97],
            'Good': [0.85, 0.86, 0.87, 0.88, 0.89, 0.90, 0.91, 0.92, 0.93, 0.94, 0.95],
            'Fair': [0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.90, 0.91, 0.92],
            'Poor': [0.63, 0.62, 0.61, 0.60, 0.59, 0.58, 0.57, 0.56, 0.55, 0.54, 0.53]
        }, index=self.modulation_schemes)

        # Frequency band 11
        self.P_success_11 = pd.DataFrame({
            'Excellent': [0.91, 0.94, 0.95, 0.97, 0.98, 0.99, 1.00, 0.99, 0.98, 0.97, 0.96],
            'Good': [0.87, 0.88, 0.89, 0.90, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97],
            'Fair': [0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.90, 0.91, 0.92, 0.93, 0.94],
            'Poor': [0.62, 0.61, 0.60, 0.59, 0.58, 0.57, 0.56, 0.55, 0.54, 0.53, 0.52]
        }, index=self.modulation_schemes)

        # Channel conditions
        self.Channel_condition = ['Excellent', 'Good', 'Fair', 'Poor']

        # Channel-condition transition table used for sampling in observe().
        # Each dict entry is a *column* (the "to" condition), so row
        # `.loc[c]` is the distribution of the next condition given `c`;
        # those rows hold the same numbers as the rows of `self.q` below.
        env_setting = {
            'Excellent': [0.44, 0.2, 0.66, 0.18],
            'Good': [0.11, 0.1, 0.11, 0.22],
            'Fair': [0.12, 0.3, 0.09, 0.40],
            'Poor': [0.33, 0.4, 0.14, 0.20]
        }

        self.env_setting = pd.DataFrame(env_setting, index=self.Channel_condition)
        self.q = np.array([
            # From Excellent state
            [0.44, 0.11, 0.12, 0.33],  # Excellent to [Excellent, Good, Fair, Poor]
            # From Good state
            [0.2,  0.1,  0.3,  0.4],   # Good to [Excellent, Good, Fair, Poor]
            # From Fair state
            [0.66, 0.11, 0.09, 0.14],  # Fair to [Excellent, Good, Fair, Poor]
            # From Poor state
            [0.18, 0.22, 0.40, 0.20]    # Poor to [Excellent, Good, Fair, Poor]
        ])

        # Action space: entry k (1-based) is paired with table P_success_k.
        self.Action_space = [
            'Freq_Band_1', 'Freq_Band_2', 'Freq_Band_3', 'Freq_Band_4', 'Freq_Band_5', 'Freq_Band_6', 'Freq_Band_7', 'Freq_Band_8', 'Freq_Band_9', 'Freq_Band_10', 'Freq_Band_11']

        # Data rate of each modulation scheme
        self.DataRates = pd.DataFrame({
            'Data Rates': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110]
        }, index=self.modulation_schemes)

        # Decay rate of each channel condition
        self.DecayRates = pd.DataFrame({
            'Decay Rates': [0.99, 0.7, 0.5, 0.3]
        }, index=self.Channel_condition)

    def env_dynamic(self):
        """Return the dictionary describing the environment dynamics.

        The dictionary is the one assembled by ``make_env``. If ``make_env``
        has not been called yet it is called here first, instead of failing
        with an ``AttributeError``.
        """
        if getattr(self, "dynamic", None) is None:
            self.make_env()
        return self.dynamic

    def generate_harmonic_numbers(self, N):
        """Return the weights ``(1/i) / H_N`` for ``i = 1..N`` as a float64 array.

        ``H_N`` is the N-th harmonic number, so the weights sum to 1. They are
        used to spread the failure probability over the other states.
        """
        # Sequential float64 accumulation of the harmonic sum 1 + 1/2 + ... + 1/N.
        harmonic_sum = sum(np.float64(1 / k) for k in range(1, N + 1))
        return np.array([np.float64(np.float64(1 / i) / harmonic_sum) for i in range(1, N + 1)])

    def _transition_matrix(self, p_success, weights):
        """Build P(s' | s) for one frequency band and one channel condition.

        Row ``s`` keeps probability ``p_success[s]`` on staying in ``s`` and
        distributes ``1 - p_success[s]`` over the remaining states, in state
        order, proportionally to ``weights``.

        Returns a float64 DataFrame indexed by modulation scheme on both axes.
        """
        stay = p_success.loc[self.modulation_schemes].to_numpy(dtype='float64')
        matrix = np.empty((len(stay), len(stay)))
        for row, p in enumerate(stay):
            # Off-diagonal entries in column order, with p slotted in at `row`.
            matrix[row] = np.insert((1 - p) * weights, row, p)
        return pd.DataFrame(matrix, index=self.modulation_schemes, columns=self.modulation_schemes)

    def make_env(self):
        """Build the reward matrix, all transition tables and the initial policy.

        Sets ``n_states``, ``n_env_states``, ``pi_env``, ``alpha``, ``beta``,
        ``reward_matrix``, ``reward_matrix_ave``, ``transition_probs``,
        ``Policy_init`` and ``dynamic``.

        ``transition_probs`` is a dict with three kinds of keys:

        * ``"transition_prob_A:<band>_E:<condition>"`` -- DataFrame P(s' | s)
          for that band and channel condition;
        * ``"transition_prob_A:<band>"`` -- ndarray, the matrices above
          averaged over channel conditions with weights ``pi_env``;
        * ``"transition_prob_E:<condition>"`` -- DataFrame P(s' | s) under
          ``Policy_init`` (see ``TransitionFunction_fixed_policy``).
        """
        self.n_states = len(self.modulation_schemes)
        self.n_env_states = len(self.Channel_condition)

        # Stationary distribution of the environmental transition matrix
        self.pi_env = np.array([0.38484959, 0.13775343, 0.21257006, 0.26482693], dtype=np.float64)

        self.alpha = 10
        self.beta = 2

        # Reward r(s, e) = alpha * rate(s) * decay(e) - beta * decay(e)
        rates = self.DataRates.loc[self.modulation_schemes, 'Data Rates'].to_numpy()
        decay = self.DecayRates.loc[self.Channel_condition, 'Decay Rates'].to_numpy(dtype='float64')
        rewards = (self.alpha * rates)[:, None] * decay[None, :] - self.beta * decay[None, :]
        self.reward_matrix = pd.DataFrame(
            rewards.astype('float64'), index=self.modulation_schemes, columns=self.Channel_condition)

        # Expected reward per state under the stationary channel distribution
        self.reward_matrix_ave = np.matmul(self.reward_matrix.to_numpy(dtype='float64'), self.pi_env)

        # The failure-spreading weights are the same for every band, channel
        # condition and state, so compute them once.
        weights = self.generate_harmonic_numbers(self.n_states - 1)

        self.transition_probs = {}
        for band_no, action in enumerate(self.Action_space, start=1):
            p_success = getattr(self, "P_success_" + str(band_no))

            # P(s' | s, a, e) for every channel condition e, and their
            # pi_env-weighted average P(s' | s, a).
            transition_probs_ave = np.zeros((self.n_states, self.n_states))
            for idx, e in enumerate(self.Channel_condition):
                # Stored on self as well because earlier versions exposed the
                # most recently built matrix under this attribute.
                self.transition_prob_e = self._transition_matrix(p_success[e], weights)
                self.transition_probs["transition_prob_A:" + action + "_E:" + e] = self.transition_prob_e
                transition_probs_ave = transition_probs_ave + self.pi_env[idx] * self.transition_prob_e.to_numpy(dtype='float64')
            self.transition_probs["transition_prob_A:" + action] = transition_probs_ave

        # Initial policy: every state deterministically picks the first band.
        self.Policy_init = pd.DataFrame(0.0, index=self.modulation_schemes, columns=self.Action_space)
        self.Policy_init[self.Action_space[0]] = 1.0

        # Creating P(s' | s, e) given a fixed policy
        self.TransitionFunction_fixed_policy(self.Policy_init)

        self.dynamic = {
            "q": self.q,
            "reward_matrix": self.reward_matrix,
            "reward_matrix_ave": self.reward_matrix_ave,
            "transition_probs": self.transition_probs,
            "action_space": self.Action_space,
            "pi_env": self.pi_env,
            "n_states": self.n_states,
            "n_env_states": self.n_env_states,
            "modulation_schemes": self.modulation_schemes,
            "Channel_condition": self.Channel_condition,
        }

    def observe(self, state, action):
        """Sample one step and return ``(next_state, reward)``.

        The reward is read for ``state`` under the *current* channel
        condition. The channel condition is then advanced, and the next
        modulation scheme is drawn from the transition row of ``state`` for
        ``action`` under the *new* condition. Requires ``make_env`` and
        ``reset`` to have been called.
        """
        reward = self.reward_matrix.loc[state, self.state_env]

        # Advance the channel condition.
        env_probs = self.env_setting.loc[self.state_env].to_numpy()
        self.state_env = np.random.choice(self.Channel_condition, 1, p=env_probs)[0]

        # Draw the next modulation scheme under the new channel condition.
        key = "transition_prob_A:" + action + "_E:" + self.state_env
        prob = self.transition_probs[key].loc[state, :].to_numpy(dtype='float64')
        self.state_ = np.random.choice(self.modulation_schemes, 1, p=prob)[0]

        return self.state_, reward

    def reset(self, init_state, init_state_env):
        """Set the starting modulation scheme and channel condition."""
        self.state = init_state
        self.state_env = init_state_env

    def TransitionFunction_fixed_policy(self, policy):
        """Compute P(s' | s, e) under ``policy`` for every channel condition.

        ``policy`` is a DataFrame with one row per modulation scheme and one
        column per action holding action probabilities. For each condition
        ``e`` the result, row ``s`` = sum over actions of
        ``policy[s, a] * P(. | s, a, e)``, is stored in
        ``transition_probs["transition_prob_E:" + e]`` as a DataFrame.
        """
        schemes = self.modulation_schemes
        action_probs = policy.loc[schemes, self.Action_space].to_numpy(dtype='float64')
        for e in self.Channel_condition:
            mixed = np.zeros((len(schemes), len(schemes)))
            for idx_a, action in enumerate(self.Action_space):
                band = self.transition_probs["transition_prob_A:" + action + "_E:" + e]
                # Column idx_a of action_probs scales each row s by policy[s, a].
                mixed = mixed + action_probs[:, [idx_a]] * band.loc[schemes, schemes].to_numpy(dtype='float64')
            self.transition_probs["transition_prob_E:" + e] = pd.DataFrame(mixed, index=schemes, columns=schemes)

# Actor class
class actor:
    """Draws actions at random according to a tabular policy."""

    def __init__(self, policy, action_space):
        # `policy` is indexed by state with `.loc`; each row is used as the
        # probability vector over `action_space`.
        self.action_space = action_space
        self.policy = policy

    def action(self, state):
        """Sample an action for `state`, store it in `action_` and return it."""
        probabilities = self.policy.loc[state].to_numpy(dtype='float64')
        drawn = np.random.choice(self.action_space, 1, p=probabilities)
        self.action_ = drawn[0]
        return self.action_

# Policy improvement class
class Policy_Improvement:
    """Greedy policy-improvement step over tabular transition models.

    Parameters
    ----------
    action_space : sequence of str
        Action labels; they become the columns of the Q table.
    state_space : sequence of str
        State labels; they become the rows of the Q table.
    reward_matrix_ave : array-like
        One reward per state (any shape holding `len(state_space)` values).
    Policy : pandas.DataFrame
        Policy table indexed by state. It is updated IN PLACE by
        `Improvement`. Rows are written positionally, so its columns are
        assumed to follow the order of `action_space` (verify at the caller).
    Probs : mapping
        Looked up with the key "transition_prob_A:<action>"; each entry is the
        state-to-state transition matrix for that action (ndarray or
        DataFrame).
    """

    def __init__(self, action_space, state_space, reward_matrix_ave, Policy, Probs):
        self.action_space = action_space
        self.state_space = state_space
        self.Policy = Policy
        self.Probs = Probs
        self.reward_matrix_ave = reward_matrix_ave

    def Improvement(self, ValueFunction):
        """Make the stored policy greedy with respect to `ValueFunction`.

        For every state the action with the largest Q value gets probability
        1 and all other actions get 0 (ties go to the first maximum, as with
        `np.argmax`).

        Returns
        -------
        pandas.DataFrame
            The same `Policy` object that was passed to the constructor,
            modified in place.
        """
        Q_table = self.Q_function(ValueFunction)

        for state in self.state_space:
            q_values = Q_table.loc[state, :].to_numpy(dtype='float64')

            # One-hot row on the greedy action.
            policy_tmp = np.zeros(len(q_values))
            policy_tmp[np.argmax(q_values)] = 1
            self.Policy.loc[state] = policy_tmp

        return self.Policy

    # Calculating the Q Function
    def Q_function(self, ValueFunction):
        """Return the table Q[state, action] = reward[state] + (P_action @ V)[state].

        NOTE: no discount factor is applied here. The reward term is the same
        for every action of a state, so the greedy action chosen from this
        table is the one maximising (P_action @ V)[state].

        Parameters
        ----------
        ValueFunction : object with `to_numpy`
            One value per state (e.g. a single-column DataFrame).

        Returns
        -------
        pandas.DataFrame
            float64 table indexed by `state_space` with one column per action.
        """
        n_states = len(self.state_space)
        # Flatten to 1-D so every column assignment is an unambiguous vector.
        values = ValueFunction.to_numpy(dtype='float64').reshape(n_states)
        rewards = np.asarray(self.reward_matrix_ave, dtype='float64').reshape(n_states)

        Q_table = pd.DataFrame(index=self.state_space, columns=self.action_space, dtype='float64')
        for action in self.action_space:
            P_sprime_s = np.asarray(self.Probs["transition_prob_A:"+action], dtype='float64')
            Q_table[action] = rewards + np.matmul(P_sprime_s, values)
        return Q_table





In [None]:
# Settings of example 1
# Policy iteration with exact (closed-form) policy evaluation.
gamma = 0.97
Rep_max = 10
itr_max = 1e+6
init_state = 'BPSK'
init_state_env = 'Excellent'


# Generating the environment
environment = env()
environment.make_env()
environment_dynamic = environment.env_dynamic()
pi_env = environment_dynamic["pi_env"]
reward_matrix = environment_dynamic["reward_matrix"].to_numpy(dtype='float64')
reward_matrix_ave = environment_dynamic["reward_matrix_ave"]
transition_probs = environment_dynamic["transition_probs"]
action_space = environment_dynamic["action_space"]
n_states = environment_dynamic["n_states"]
n_env_states = environment_dynamic["n_env_states"]
Channel_conditions = environment_dynamic["Channel_condition"]
modulation_schemes = environment_dynamic["modulation_schemes"]


# Initial deterministic policy: every state picks 'Freq_Band_1' with
# probability 1 and each of 'Freq_Band_2' .. 'Freq_Band_11' with probability 0.
data = {
    'Freq_Band_'+str(band): [1.0 if band == 1 else 0.0] * len(modulation_schemes)
    for band in range(1, 12)
}
Policy_init = pd.DataFrame(data, index=modulation_schemes)

# Policy_Improvement.Improvement rewrites the policy table in place, so work
# on a copy and keep Policy_init as the true initial policy.
Policy = Policy_init.copy()
Policy_estimator = Policy_Improvement(action_space, modulation_schemes, reward_matrix_ave, Policy, transition_probs)

print(Policy)
ValueFunction_opt_hist = []
ValueFunction_opt_mean_hist = []
for Rep in range(Rep_max):
    clear_output()
    print("Repetition "+str(int(Rep+1))+" of "+str(Rep_max))

    # Policy evaluation: build P(s' | s, e) for the current policy, average it
    # over the channel conditions with weights pi_env, then solve
    # v = (I - gamma * P)^-1 * R * pi_env.
    environment.TransitionFunction_fixed_policy(Policy)
    transition_probs = environment_dynamic["transition_probs"]
    transition_probs_ave = np.zeros((n_states,n_states))
    for idx, e in enumerate(Channel_conditions):
        transition_probs_ave = transition_probs_ave + pi_env[idx] * transition_probs["transition_prob_E:"+e].to_numpy(dtype='float64')
    ValueFunction_opt = np.matmul(np.matmul(np.linalg.inv(np.eye(n_states) - gamma*transition_probs_ave), reward_matrix), pi_env)
    ValueFunction_opt = pd.DataFrame(ValueFunction_opt, columns=['Value'])

    # Policy improvement: greedy with respect to the value just computed.
    Policy = Policy_estimator.Improvement(ValueFunction_opt)
    print(Policy)

    # The recorded mean belongs to the policy evaluated above, i.e. the one
    # in effect BEFORE this repetition's improvement step.
    ValueFunction_opt_mean_hist.append(np.mean(ValueFunction_opt.to_numpy(dtype='float64')))

# Horizontal reference line at the mean value of the last evaluated policy.
ValueFunction_opt_mean_vector = np.ones(len(ValueFunction_opt_mean_hist))*np.mean(ValueFunction_opt.to_numpy(dtype='float64'))

iterations = np.arange(len(ValueFunction_opt_mean_hist))


# Set the theme for the plot
sns.set_theme()
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 1.5})

# Plot the mean value after each policy update against the final reference level
plt.figure(figsize=(10, 6))
sns.lineplot(x=iterations, y=ValueFunction_opt_mean_hist, label=r'$\frac{1}{|\mathcal{S}|}\sum_{s \in \mathcal{S}} v^{SNS, \mu^{k}}(s)$', color='red', marker='o')
sns.lineplot(x=iterations, y=ValueFunction_opt_mean_vector, label=r'$\frac{1}{|\mathcal{S}|}\sum_{s \in \mathcal{S}}v^{SNS}(s)$', color='black')
plt.xlabel('Number of policy updates')
plt.ylabel('Value Function')
plt.legend()
plt.show()


In [None]:
# Settings of example 1
# Policy iteration where each policy is evaluated by simulated TD(0) updates.
gamma = 0.97
Rep_max = 10
itr_max = 1e+6
init_state = 'BPSK'
init_state_env = 'Excellent'
alpha_k = 0.005  # constant TD step size (loop-invariant)


# Generating the environment
environment = env()
environment.make_env()
environment_dynamic = environment.env_dynamic()
pi_env = environment_dynamic["pi_env"]
reward_matrix = environment_dynamic["reward_matrix"].to_numpy(dtype='float64')
reward_matrix_ave = environment_dynamic["reward_matrix_ave"]
transition_probs = environment_dynamic["transition_probs"]
action_space = environment_dynamic["action_space"]
n_states = environment_dynamic["n_states"]
n_env_states = environment_dynamic["n_env_states"]
Channel_conditions = environment_dynamic["Channel_condition"]
modulation_schemes = environment_dynamic["modulation_schemes"]


# Initial deterministic policy: every state picks 'Freq_Band_1' with
# probability 1 and each of 'Freq_Band_2' .. 'Freq_Band_11' with probability 0.
data = {
    'Freq_Band_'+str(band): [1.0 if band == 1 else 0.0] * len(modulation_schemes)
    for band in range(1, 12)
}
Policy_init = pd.DataFrame(data, index=modulation_schemes)

# Policy_Improvement.Improvement rewrites the policy table in place, so work
# on a copy and keep Policy_init as the true initial policy.
Policy = Policy_init.copy()
Policy_estimator = Policy_Improvement(action_space, modulation_schemes, reward_matrix_ave, Policy, transition_probs)

environment.TransitionFunction_fixed_policy(Policy)

# Start from float zeros: the TD update writes floats into this column, and
# writing floats into an integer column relies on implicit upcasting.
ValueFunction = pd.DataFrame(0.0, index=modulation_schemes, columns=['Value'])
print(Policy)

n_iterations = int(itr_max)
# History of the whole value table after every TD update, for every
# repetition: shape (Rep_max, n_iterations, number of states). Preallocated
# instead of appending one Python list per iteration.
ValueFunction_hist_Reps = np.empty((Rep_max, n_iterations, len(modulation_schemes)))
ValueFunction_mean_hist = []
for Rep in range(Rep_max):
    clear_output()
    print("Repetition "+str(int(Rep+1))+" of "+str(Rep_max))
    # The value estimate is restarted from zero for every repetition.
    ValueFunction = pd.DataFrame(0.0, index=modulation_schemes, columns=['Value'])
    ValueFunction_hist = ValueFunction_hist_Reps[Rep]  # view into the 3-D history
    state = init_state
    Actor = actor(Policy, action_space)
    environment.reset(init_state, init_state_env)
    for itr in tqdm(range(n_iterations), leave=False):
        action = Actor.action(state)
        state_, reward = environment.observe(state, action)

        # TD(0): V(s) <- V(s) + alpha * (r + gamma * V(s') - V(s))
        current_value = ValueFunction.at[state, 'Value']
        next_value = ValueFunction.at[state_, 'Value']
        ValueFunction.at[state, 'Value'] = current_value + alpha_k * (reward + gamma * next_value - current_value)

        state = state_  # Update state for the next iteration
        ValueFunction_hist[itr] = ValueFunction['Value'].to_numpy()  # Record value history
    Policy = Policy_estimator.Improvement(ValueFunction)
    environment.TransitionFunction_fixed_policy(Policy)


# Mean over states (axis 2), then mean over repetitions (axis 0).
ValueFunction_hist_Reps_mean = np.mean(ValueFunction_hist_Reps, axis=2)
ValueFunction_hist_Reps_mean_mean = np.mean(ValueFunction_hist_Reps_mean, axis=0)


# Reference level. ValueFunction_opt is NOT defined in this cell: it must
# already exist from the exact policy-iteration cell. Convert it explicitly to
# a float array so the mean is a scalar whether it is a DataFrame or ndarray.
ValueFunction_opt_mean_vector = np.ones(len(ValueFunction_hist_Reps_mean_mean))*np.mean(np.asarray(ValueFunction_opt, dtype='float64'))

iterations = np.arange(len(ValueFunction_hist_Reps_mean_mean))


# Set the theme for the plot
sns.set_theme()
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 1.5})

# Plot the repetition-averaged mean value per iteration against the reference level
plt.figure(figsize=(10, 6))
sns.lineplot(x=iterations, y=ValueFunction_hist_Reps_mean_mean, label=r'$\frac{1}{|\mathcal{S}|}\sum_{s \in \mathcal{S}}v^{SNS, \mu^{k}}(s)$', color='red')
sns.lineplot(x=iterations, y=ValueFunction_opt_mean_vector, label=r'$\frac{1}{|\mathcal{S}|}\sum_{s \in \mathcal{S}}v^{SNS}(s)$', color='black')
plt.xlabel('Iterations')
plt.ylabel('Value Function')
plt.legend()
plt.show()