# 1. Install and import the game and other dependencies

In [None]:
# Install the gym_super_mario_bros game and nes_py, which is an emulator for the game
!pip install gym_super_mario_bros==7.3.0 nes_py

In [None]:
#Install the OpenAI gym environment
pip install gym==0.17.2

In [None]:
#Import the super mario game in the notebook
import gym_super_mario_bros

#Import the Joypad wrapper in the notebook
from nes_py.wrappers import JoypadSpace

#Import the simple controls so that the model just needs to control some movements of our agent (here Mario)
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT


In [None]:
#Install Pytorch (check the version suitable for your system)
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.4.2

In [None]:
#Install stable-baselines3 library which contains many RL algorithms which we need to train our model
!pip install stable-baselines3[extra]

In [None]:
#Changes the game from colour image (RGB) to grayscale so that our processing becomes faster as we need to deal with less data 
from gym.wrappers import GrayScaleObservation

# VecFrameStack stacks consecutive frames of a vectorized environment so the model also sees information from previous frames. DummyVecEnv wraps our environment in a vectorized environment so that we can pass it to our AI model.
from stable_baselines3.common.vec_env import VecFrameStack, DummyVecEnv


# 2. Preprocessing the environment

In [None]:
# 1. Make the base game environment
env = gym_super_mario_bros.make('SuperMarioBros-v0')

# 2. Wrap it with JoypadSpace and the simplified controller (SIMPLE_MOVEMENT) so that the agent only has a few actions to choose from
env = JoypadSpace(env, SIMPLE_MOVEMENT)

# 3. Grayscale the observations to make processing faster (less data per frame);
#    keep_dim=True is requested so the observation presumably keeps a channel axis -- confirm against the gym wrapper docs
env = GrayScaleObservation(env, keep_dim=True)

# 4. Wrap inside a DummyVecEnv (a vectorized environment holding this single environment).
#    NOTE: the lambda looks up the name `env` only when it is called; this relies on
#    DummyVecEnv calling it before the assignment below rebinds `env` -- presumably during construction, TODO confirm
env = DummyVecEnv([lambda:env])

# 5. Stack 4 frames of the environment; channels_order="last" stacks them along the last dimension
env = VecFrameStack(env, 4, channels_order="last")


# 3. Build and Train the RL Model

##### To train our RL model (our AI), we are going to use the PPO (Proximal Policy Optimization) algorithm.

In [None]:
# Import os for file path management
import os

# Import PPO algorithm to train our model
from stable_baselines3 import PPO

# Import Base Callback for saving models and to continue from there
from stable_baselines3.common.callbacks import BaseCallback


In [None]:
# Directory where model checkpoints are saved (passed to the callback as save_path)
CHECKPOINT_DIR = './train'
# Directory handed to PPO as tensorboard_log for the training logs
LOG_DIR = './logs'

In [None]:
#Set the clipping range
def custom_clip_range(a):
    """Return the constant clip range 0.2.

    Used as the 'clip_range' entry of ``custom_objects`` when loading a PPO
    model. The argument ``a`` is accepted but ignored: the result is always
    the same value.
    """
    return 0.2

#Set the learning rate
def custom_lr_schedule(lr):
    """Return the constant learning rate 3e-6.

    Passed to PPO as ``learning_rate`` (and as the 'learning_rate' entry of
    ``custom_objects`` when loading). The argument ``lr`` is accepted but
    ignored: the result is always the same value.
    """
    return 3e-6

In [None]:
# Callback that periodically saves the model being trained into the given directory
class TrainAndLoggingCallback(BaseCallback):
    """Training callback that periodically saves the current model.

    Whenever ``num_timesteps`` is a multiple of ``check_freq`` the model is
    saved inside ``save_path`` under the name ``best_model_<num_timesteps>``.
    """

    def __init__(self, check_freq, save_path, verbose=1):
        """Remember the checkpoint interval and the target directory.

        Args:
            check_freq: Save a checkpoint when the timestep count is a
                multiple of this value.
            save_path: Directory the checkpoints are written to.
            verbose: Verbosity level forwarded to ``BaseCallback``.
        """
        super().__init__(verbose)
        self.save_path = save_path
        self.check_freq = check_freq

    def _init_callback(self):
        """Create the checkpoint directory when a save path was given."""
        if self.save_path is None:
            return
        os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        """Save a checkpoint on every ``check_freq``-th timestep.

        Returns:
            Always ``True``.
        """
        is_checkpoint_step = self.num_timesteps % self.check_freq == 0
        if is_checkpoint_step:
            file_name = 'best_model_{}'.format(self.num_timesteps)
            self.model.save(os.path.join(self.save_path, file_name))

        return True

# Check if a previously trained model exists.
# NOTE(review): the checkpoint callback writes files named 'best_model_<timesteps>';
# nothing in the visible code writes 'best_model.zip', so presumably a checkpoint is
# copied/renamed to that name by hand before resuming -- confirm.
best_model_path = os.path.join(CHECKPOINT_DIR, 'best_model.zip')

if os.path.exists(best_model_path):
    # Load the pre-trained model ONCE (loading the same checkpoint twice only costs
    # time and memory). The custom objects override the stored clip range and
    # learning-rate schedule with the constant ones defined in this notebook.
    model = PPO.load(
        best_model_path,
        env,
        tensorboard_log=LOG_DIR,
        custom_objects={'clip_range': custom_clip_range, 'learning_rate': custom_lr_schedule},
    )
    # Alias kept so that any later cell still referring to `model_start` keeps working
    model_start = model

    # Total number of steps completed during the previous training
    total_steps_completed = model.num_timesteps

    # First step of the resumed run (informational)
    starting_step = total_steps_completed + 1

    # Number of ADDITIONAL steps to train. learn() is called with
    # reset_num_timesteps=False, in which case stable-baselines3 itself adds the
    # model's existing num_timesteps to total_timesteps; adding the starting step
    # here as well would train far longer than the intended 100,000 steps.
    total_training_steps = 100000  # Resume training for 100,000 steps
else:
    # Create a new model if no pre-trained model exists
    model = PPO('CnnPolicy', env, verbose=1, tensorboard_log=LOG_DIR, learning_rate=custom_lr_schedule, n_steps=512)

    # Set the starting step count and the total number of training steps
    starting_step = 1
    total_training_steps = 100000  # Train for 100,000 steps


In [None]:
# Build the checkpoint callback: save the model into CHECKPOINT_DIR every 5000 timesteps.
# (Author's note: one run's logged data for 5k steps takes about 150 MB of disk space.)
callback = TrainAndLoggingCallback(check_freq=5000, save_path=CHECKPOINT_DIR)


In [None]:
# Train the AI model; this is where the agent actually starts to learn.
# The callback saves periodic checkpoints. reset_num_timesteps=False is passed so that a
# resumed model continues from its existing timestep counter instead of restarting at
# zero (see the stable-baselines3 `learn` documentation).
model.learn(total_timesteps=total_training_steps, callback=callback, reset_num_timesteps=False)


# 4. Combining the Model (AI)

In [None]:
# Blend weights: how much each model contributes to the combined policy
weight_model1 = 0.6  # Weight for model 1
weight_model2 = 0.4  # Weight for model 2

# Load the two trained models that will be combined, overriding the stored
# clip range and learning-rate schedule with the constant ones from this notebook
model1 = PPO.load(
    './train/best_model.zip',
    env,
    custom_objects={'clip_range': custom_clip_range, 'learning_rate': custom_lr_schedule},
)
model2 = PPO.load(
    './train/best_model_500000.zip',
    env,
    custom_objects={'clip_range': custom_clip_range, 'learning_rate': custom_lr_schedule},
)

# Policy parameters (state dicts) of both models
policy_params1 = model1.policy.state_dict()
policy_params2 = model2.policy.state_dict()



In [None]:
# Blend the two policies: for every parameter name of model 1, take the weighted sum
# of the corresponding entries of both models (model 2 must provide the same names)
combined_policy_params = {
    name: weight_model1 * value1 + weight_model2 * policy_params2[name]
    for name, value1 in policy_params1.items()
}


In [None]:
# Create a fresh PPO model on model 1's environment, then overwrite its policy
# weights with the combined parameters
combined_model = PPO('CnnPolicy', env=model1.env) #model1.policy
combined_model.policy.load_state_dict(combined_policy_params)

# Save the new combined model in the train directory
# NOTE(review): the file name contains '*', which is not a valid file-name character on
# Windows -- confirm the target platform (the load in the testing section uses the same name)
combined_model.save("./train/combined_model_best*500000.zip")


# 5. Testing the model (AI)

In [None]:
# Load the new combined model from the train directory, overriding the stored clip range
# and learning-rate schedule with the constant ones from this notebook.
# The path is given without the '.zip' suffix used when saving -- presumably
# stable-baselines3 resolves it; confirm. No environment is passed to load() here.
combined_model = PPO.load('./train/combined_model_best*500000', custom_objects={'clip_range': custom_clip_range, 'learning_rate': custom_lr_schedule})

In [None]:
# Start the game and get the first observation
state = env.reset()

# Loop through the game until the kernel is interrupted
while True:
    # predict() returns two values of which only the action is needed,
    # so the second one is assigned to an underscore and ignored
    action, _ = combined_model.predict(state)
    # Step the environment and keep the NEW observation in `state`; without this the
    # model would keep predicting from the very first frame on every iteration
    state, reward, done, info = env.step(action)
    env.render()

To stop the loop, that is the game, press the "interrupt the kernel" button shown by a black square next to "Run"

In [None]:
# Close the game environment once the game loop has been interrupted
env.close()