diff --git a/docs/component/estimator.rst b/docs/component/estimator.rst
index 39284448ce..917d73c138 100644
--- a/docs/component/estimator.rst
+++ b/docs/component/estimator.rst
@@ -574,10 +574,24 @@ The `qlib_data` field describes the parameters of qlib initialization.
         region: "cn"
 
 - `provider_uri`
-    The local directory where the data loaded by 'get_data.py' is stored.
+    Type: str. The URI of the Qlib data. For example, it could be the location where the data loaded by ``get_data.py`` are stored.
 - `region`
-    - If region == ``qlib.config.REG_CN``, 'qlib' will be initialized in US-stock mode.
-    - If region == ``qlib.config.REG_US``, 'qlib' will be initialized in china-stock mode.
+    - If `region` == "us", ``Qlib`` will be initialized in US-stock mode.
+    - If `region` == "cn", ``Qlib`` will be initialized in china-stock mode.
+- `redis_host`
+    Type: str, optional parameter(default: "127.0.0.1"), host of `redis`
+    The lock and cache mechanism relies on redis.
+- `redis_port`
+    Type: int, optional parameter(default: 6379), port of `redis`
+
+.. note::
+
+    The value of `region` should be aligned with the data stored in `provider_uri`. Currently, ``scripts/get_data.py`` only provides China stock market data. If users want to use the US stock market data, they should prepare their own US-stock data in `provider_uri` and switch to US-stock mode.
+
+.. note::
+
+    If Qlib fails to connect redis via `redis_host` and `redis_port`, cache mechanism will not be used! Please refer to `Cache `_ for details.
+
 
 Please refer to `Initialization <../start/initialization.html>`_.
 
diff --git a/docs/start/initialization.rst b/docs/start/initialization.rst
index 00d5dd4775..e34ab82fed 100644
--- a/docs/start/initialization.rst
+++ b/docs/start/initialization.rst
@@ -44,7 +44,7 @@ Besides `provider_uri` and `region`, `qlib.init` has other parameters. The follo
     - ``qlib.config.REG_US``: US stock market.
    - ``qlib.config.REG_CN``: China stock market.
 
-    Different modse will result in different trading limitations and costs.
+    Different modes will result in different trading limitations and costs.
 - `redis_host`
     Type: str, optional parameter(default: "127.0.0.1"), host of `redis`
     The lock and cache mechanism relies on redis.
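
Note: the fields documented above are passed straight to ``qlib.init``. A minimal initialization sketch (the data directory is a placeholder; the redis settings only matter when the cache mechanism is enabled):

    import qlib
    from qlib.config import REG_CN

    qlib.init(
        provider_uri="~/.qlib/qlib_data/cn_data",  # placeholder: directory produced by scripts/get_data.py
        region=REG_CN,           # "cn"; use qlib.config.REG_US ("us") for US-stock mode
        redis_host="127.0.0.1",  # optional, default shown
        redis_port=6379,         # optional, default shown
    )
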
diff --git a/examples/estimator/estimator_config_dnn.yaml b/examples/estimator/estimator_config_dnn.yaml
index 9c0e859438..a4a9d18ffb 100644
--- a/examples/estimator/estimator_config_dnn.yaml
+++ b/examples/estimator/estimator_config_dnn.yaml
@@ -48,8 +48,6 @@ backtest:
         open_cost: 0.0005
         close_cost: 0.0015
         min_cost: 5
-    long_short_backtest_args:
-        topk: 50
 
 qlib_data:
     # when testing, please modify the following parameters according to the specific environment
diff --git a/qlib/config.py b/qlib/config.py
index ce905dc832..72a03bf6d9 100644
--- a/qlib/config.py
+++ b/qlib/config.py
@@ -61,7 +61,7 @@ def update(self, *args, **kwargs):
 
 # REGION CONST
 REG_CN = "cn"
-REG_US = "US"
+REG_US = "us"
 
 _default_config = {
     # data provider config
diff --git a/qlib/contrib/model/pytorch_nn.py b/qlib/contrib/model/pytorch_nn.py
index 6bf74e148a..e66212b361 100644
--- a/qlib/contrib/model/pytorch_nn.py
+++ b/qlib/contrib/model/pytorch_nn.py
@@ -47,7 +47,7 @@ def __init__(
         self,
         input_dim,
         output_dim,
-        layers=(256, 256, 128),
+        layers=(256, 512, 768, 1024, 768, 512, 256, 128, 64),
         lr=0.001,
         max_steps=300,
         batch_size=2000,
@@ -76,6 +76,7 @@ def __init__(
         self.optimizer = optimizer.lower()
         self.loss_type = loss
         self.visible_GPU = GPU
+        self.use_gpu = torch.cuda.is_available()
 
         self.logger.info(
             "DNN parameters setting:"
@@ -90,7 +91,8 @@ def __init__(
             "\noptimizer : {}"
             "\nloss_type : {}"
             "\neval_steps : {}"
-            "\nvisible_GPU : {}".format(
+            "\nvisible_GPU : {}"
+            "\nuse_GPU : {}".format(
                 layers,
                 lr,
                 max_steps,
@@ -103,6 +105,7 @@ def __init__(
                 loss,
                 eval_steps,
                 GPU,
+                self.use_gpu,
             )
         )
 
@@ -133,11 +136,11 @@ def __init__(
         )
         self._fitted = False
 
-        self.dnn_model.cuda()
-
-        # set the visible GPU
-        if self.visible_GPU:
-            os.environ["CUDA_VISIBLE_DEVICES"] = self.visible_GPU
+        if self.use_gpu:
+            self.dnn_model.cuda()
+            # set the visible GPU
+            if self.visible_GPU:
+                os.environ["CUDA_VISIBLE_DEVICES"] = self.visible_GPU
 
     def fit(
         self,
@@ -175,13 +178,14 @@ def fit(
         train_num = y_train_values.shape[0]
 
         # prepare validation data
-        x_val_cuda = torch.from_numpy(x_valid.values).float()
-        y_val_cuda = torch.from_numpy(y_valid.values).float()
-        w_val_cuda = torch.from_numpy(w_valid.values).float()
+        x_val_auto = torch.from_numpy(x_valid.values).float()
+        y_val_auto = torch.from_numpy(y_valid.values).float()
+        w_val_auto = torch.from_numpy(w_valid.values).float()
 
-        x_val_cuda = x_val_cuda.cuda()
-        y_val_cuda = y_val_cuda.cuda()
-        w_val_cuda = w_val_cuda.cuda()
+        if self.use_gpu:
+            x_val_auto = x_val_auto.cuda()
+            y_val_auto = y_val_auto.cuda()
+            w_val_auto = w_val_auto.cuda()
 
         for step in range(self.max_steps):
             if stop_steps >= self.early_stop_rounds:
@@ -193,17 +197,18 @@ def fit(
             self.train_optimizer.zero_grad()
 
             choice = np.random.choice(train_num, self.batch_size)
-            x_batch = x_train_values[choice]
-            y_batch = y_train_values[choice]
-            w_batch = w_train_values[choice]
+            x_batch_auto = x_train_values[choice]
+            y_batch_auto = y_train_values[choice]
+            w_batch_auto = w_train_values[choice]
 
-            x_batch_cuda = x_batch.float().cuda()
-            y_batch_cuda = y_batch.float().cuda()
-            w_batch_cuda = w_batch.float().cuda()
+            if self.use_gpu:
+                x_batch_auto = x_batch_auto.float().cuda()
+                y_batch_auto = y_batch_auto.float().cuda()
+                w_batch_auto = w_batch_auto.float().cuda()
 
             # forward
-            preds = self.dnn_model(x_batch_cuda)
-            cur_loss = self.get_loss(preds, w_batch_cuda, y_batch_cuda, self.loss_type)
+            preds = self.dnn_model(x_batch_auto)
+            cur_loss = self.get_loss(preds, w_batch_auto, y_batch_auto, self.loss_type)
             cur_loss.backward()
             self.train_optimizer.step()
             loss.update(cur_loss.item())
@@ -220,8 +225,8 @@ def fit(
                     loss_val = AverageMeter()
 
                     # forward
-                    preds = self.dnn_model(x_val_cuda)
-                    cur_loss_val = self.get_loss(preds, w_val_cuda, y_val_cuda, self.loss_type)
+                    preds = self.dnn_model(x_val_auto)
+                    cur_loss_val = self.get_loss(preds, w_val_auto, y_val_auto, self.loss_type)
                     loss_val.update(cur_loss_val.item())
                 if verbose:
                     self.logger.info(
@@ -245,7 +250,8 @@ def fit(
 
         # restore the optimal parameters after training ??
        self.dnn_model.load_state_dict(torch.load(save_path))
-        torch.cuda.empty_cache()
+        if self.use_gpu:
+            torch.cuda.empty_cache()
 
     def get_loss(self, pred, w, target, loss_type):
         if loss_type == "mse":
@@ -261,11 +267,16 @@ def get_loss(self, pred, w, target, loss_type):
     def predict(self, x_test):
         if not self._fitted:
             raise ValueError("model is not fitted yet!")
-        x_test = torch.from_numpy(x_test.values).float().cuda()
+        x_test = torch.from_numpy(x_test.values).float()
+        if self.use_gpu:
+            x_test = x_test.cuda()
 
         self.dnn_model.eval()
         with torch.no_grad():
-            preds = self.dnn_model(x_test).detach().cpu().numpy()
+            if self.use_gpu:
+                preds = self.dnn_model(x_test).detach().cpu().numpy()
+            else:
+                preds = self.dnn_model(x_test).detach().numpy()
         return preds
 
     def score(self, x_test, y_test, w_test=None):
@@ -316,20 +327,20 @@ def update(self, val, n=1):
 
 
 class Net(nn.Module):
-    def __init__(self, input_dim, output_dim, layers=(256, 256, 256), loss="mse"):
+    def __init__(self, input_dim, output_dim, layers=(256, 512, 768, 512, 256, 128, 64), loss="mse"):
         super(Net, self).__init__()
         layers = [input_dim] + list(layers)
         dnn_layers = []
-        drop_input = nn.Dropout(0.1)
+        drop_input = nn.Dropout(0.05)
         dnn_layers.append(drop_input)
         for i, (input_dim, hidden_units) in enumerate(zip(layers[:-1], layers[1:])):
             fc = nn.Linear(input_dim, hidden_units)
             activation = nn.ReLU()
             bn = nn.BatchNorm1d(hidden_units)
-            drop = nn.Dropout(0.1)
-            seq = nn.Sequential(fc, bn, activation, drop)
+            seq = nn.Sequential(fc, bn, activation)
             dnn_layers.append(seq)
-
+        drop_input = nn.Dropout(0.05)
+        dnn_layers.append(drop_input)
         if loss == "mse":
             fc = nn.Linear(hidden_units, output_dim)
             dnn_layers.append(fc)
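
Note: the ``pytorch_nn.py`` changes above replace the unconditional ``.cuda()`` calls with a device-conditional pattern keyed on ``torch.cuda.is_available()``. A standalone sketch of that pattern (the toy model and tensor shapes are hypothetical, not Qlib code):

    import numpy as np
    import torch
    import torch.nn as nn

    use_gpu = torch.cuda.is_available()

    model = nn.Linear(16, 1)  # toy stand-in for the DNN
    if use_gpu:
        model.cuda()          # move parameters only when a GPU is present

    x = torch.from_numpy(np.random.rand(8, 16)).float()
    if use_gpu:
        x = x.cuda()          # inputs must live on the same device as the model

    with torch.no_grad():
        preds = model(x).detach().cpu().numpy()  # .cpu() is a no-op for tensors already on the CPU

The patch reaches the same result by branching on ``self.use_gpu`` before calling ``.cpu()``; either form yields a NumPy array regardless of where the forward pass ran.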