# **Solve the CartPole Environment Problem Using Q-Learning**

## **Problem Statement**

Solve the control problem in the CartPole environment using the Q-learning method. While solving the CartPole environment, run it for 500 episodes (i.e., epochs), where the epsilon is a fraction of 1/sqrt(n+1) and the discount factor is 0.99. Also, plot all the rewards and their running averages for all the episodes.

### **Environment**

It is a CartPole environment where the task is to train the agent to keep the pole upright for as long as possible.

Environment looks like this:
![CartPole](https://drive.google.com/uc?id=1GGcPDgYpAuNRGdUtZprNu1AZaT2VFP30)

**States in the environment are:**
* Position of the cart (can be random)
* Velocity of the cart
* Angle of the pole
* Angular velocity of the pole
    

**Actions performed by the agent:**<br>
* Move the cart left
* Move the cart right




## **Solution:**

### **Import Libraries and Environment**

In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.kernel_approximation import RBFSampler

## **Preprocessing Steps**

In [2]:
# Minimizing a regularized empirical loss, pre-processing, and optimizing the environment using sklearn.
class SGDRegressor:
  def __init__(self, D):
    self.w = np.random.randn(D) / np.sqrt(D)
    self.lr = 0.1

  def partial_fit(self, X, Y):
    self.w += self.lr*(Y - X.dot(self.w)).dot(X)

  def predict(self, X):
    return X.dot(self.w)


class FeatureTransformer:
  def __init__(self, env):
  
    observation_examples = np.random.random((20000, 4))*2 - 1
    scaler = StandardScaler()
    scaler.fit(observation_examples)

    # Converting a state to a feature represenation by using RBF kernels with different variances to cover different parts of the space.
    featurizer = FeatureUnion([
            ("rbf1", RBFSampler(gamma=0.05, n_components=1000)),
            ("rbf2", RBFSampler(gamma=1.0, n_components=1000)),
            ("rbf3", RBFSampler(gamma=0.5, n_components=1000)),
            ("rbf4", RBFSampler(gamma=0.1, n_components=1000))
            ])
    feature_examples = featurizer.fit_transform(scaler.transform(observation_examples))

    self.dimensions = feature_examples.shape[1]
    self.scaler = scaler
    self.featurizer = featurizer

  def transform(self, observations):
    scaled = self.scaler.transform(observations)
    return self.featurizer.transform(scaled)


# Holding one SGDRegressor for each action
class Model:
  def __init__(self, env, feature_transformer):
    self.env = env
    self.models = []
    self.feature_transformer = feature_transformer
    for i in range(env.action_space.n):
      model = SGDRegressor(feature_transformer.dimensions)
      self.models.append(model)

  def predict(self, s):
    X = self.feature_transformer.transform(np.atleast_2d(s))
    result = np.stack([m.predict(X) for m in self.models]).T
    return result

  def update(self, s, a, G):
    X = self.feature_transformer.transform(np.atleast_2d(s))
    self.models[a].partial_fit(X, [G])

  def sample_action(self, s, eps):
    if np.random.random() < eps:
      return self.env.action_space.sample()
    else:
      return np.argmax(self.predict(s))


## **Find total rewards**

In [3]:
def play_one(env, model, eps, gamma):
  observation = env.reset()
  done = False
  totalreward = 0
  iters = 0
  while not done and iters < 2000:
    # Quiting after reaching 2000 steps 
    action = model.sample_action(observation, eps)
    prev_observation = observation
    observation, reward, done, info = env.step(action)

    if done:
      reward = -200

    # Updating the model
    next = model.predict(observation)
    assert(next.shape == (1, env.action_space.n))
    G = reward + gamma*np.max(next)
    model.update(prev_observation, action, G)

    if reward == 1: # if we changed the reward to -200
      totalreward += reward
    iters += 1

  return totalreward




## **Functions to Plot**

In [4]:
#Plot function
def plot_running_avg(totalrewards):
  N = len(totalrewards)
  running_avg = np.empty(N)
  for t in range(N):
    running_avg[t] = totalrewards[max(0, t-100):(t+1)].mean()
  plt.plot(running_avg)
  plt.title("Running Average")
  plt.show()

## **Apply model on the environment**

In [None]:
#Running the environment for 500 episodes and where epsilon is a fraction of 1/sqrt(n+1) and discount factor is 0.99
def main():
  env = gym.make('CartPole-v1')
  ft = FeatureTransformer(env)
  model = Model(env, ft)
  gamma = 0.99

  N = 500
  totalrewards = np.empty(N)
  costs = np.empty(N)
  for n in range(N):
    eps = 1.0/np.sqrt(n+1)
    totalreward = play_one(env, model, eps, gamma)
    totalrewards[n] = totalreward
    if n % 100 == 0:
      print("episode:", n, "total reward:", totalreward, "eps:", eps, "avg reward (last 100):", totalrewards[max(0, n-100):(n+1)].mean())

  print("avg reward for last 100 episodes:", totalrewards[-100:].mean())
  print("total steps:", totalrewards.sum())

  plt.plot(totalrewards)
  plt.title("Rewards")
  plt.show()

  plot_running_avg(totalrewards)


if __name__ == '__main__':
  main()

  logger.warn(


episode: 0 total reward: 10.0 eps: 1.0 avg reward (last 100): 10.0
episode: 100 total reward: 180.0 eps: 0.09950371902099892 avg reward (last 100): 121.7128712871287
