forked from vitessio/vitess
-
Notifications
You must be signed in to change notification settings - Fork 0
/
restore.go
122 lines (107 loc) · 4.42 KB
/
restore.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
// Copyright 2015, Google Inc. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package tabletmanager
import (
"flag"
"fmt"
log "github.com/golang/glog"
"github.com/youtube/vitess/go/vt/mysqlctl"
"github.com/youtube/vitess/go/vt/mysqlctl/replication"
"golang.org/x/net/context"
topodatapb "github.com/youtube/vitess/go/vt/proto/topodata"
)
// This file handles the initial backup restore upon startup.
// It is only enabled if restore_from_backup is set.
var (
restoreFromBackup = flag.Bool("restore_from_backup", false, "(init restore parameter) will check BackupStorage for a recent backup at startup and start there")
restoreConcurrency = flag.Int("restore_concurrency", 4, "(init restore parameter) how many concurrent files to restore at once")
)
// RestoreFromBackup is the main entry point for backup restore.
// It will either work, fail gracefully, or return
// an error in case of a non-recoverable error.
// It takes the action lock so no RPC interferes.
func (agent *ActionAgent) RestoreFromBackup(ctx context.Context) error {
agent.actionMutex.Lock()
defer agent.actionMutex.Unlock()
// change type to RESTORE (using UpdateTabletFields so it's
// always authorized)
tablet := agent.Tablet()
originalType := tablet.Type
if _, err := agent.TopoServer.UpdateTabletFields(ctx, tablet.Alias, func(tablet *topodatapb.Tablet) error {
tablet.Type = topodatapb.TabletType_RESTORE
return nil
}); err != nil {
return fmt.Errorf("Cannot change type to RESTORE: %v", err)
}
// Try to restore. Depending on the reason for failure, we may be ok.
// If we're not ok, return an error and the agent will log.Fatalf,
// causing the process to be restarted and the restore retried.
dir := fmt.Sprintf("%v/%v", tablet.Keyspace, tablet.Shard)
pos, err := mysqlctl.Restore(ctx, agent.MysqlDaemon, dir, *restoreConcurrency, agent.hookExtraEnv())
switch err {
case nil:
// Reconnect to master.
if err := agent.startReplication(ctx, pos); err != nil {
return err
}
case mysqlctl.ErrNoBackup:
log.Infof("Auto-restore is enabled, but no backups were found. Starting up empty.")
case mysqlctl.ErrExistingDB:
log.Infof("Auto-restore is enabled, but mysqld already contains data. Assuming vttablet was just restarted.")
default:
return fmt.Errorf("Can't restore backup: %v", err)
}
// Change type back to original type if we're ok to serve.
if _, err := agent.TopoServer.UpdateTabletFields(ctx, tablet.Alias, func(tablet *topodatapb.Tablet) error {
tablet.Type = originalType
return nil
}); err != nil {
return fmt.Errorf("Cannot change type back to %v: %v", originalType, err)
}
return nil
}
func (agent *ActionAgent) startReplication(ctx context.Context, pos replication.Position) error {
// Set the position at which to resume from the master.
cmds, err := agent.MysqlDaemon.SetSlavePositionCommands(pos)
if err != nil {
return err
}
if err := agent.MysqlDaemon.ExecuteSuperQueryList(cmds); err != nil {
return fmt.Errorf("failed to set slave position: %v", err)
}
// Read the shard to find the current master, and its location.
tablet := agent.Tablet()
si, err := agent.TopoServer.GetShard(ctx, tablet.Keyspace, tablet.Shard)
if err != nil {
return fmt.Errorf("can't read shard: %v", err)
}
if si.MasterAlias == nil {
// We've restored, but there's no master. This is fine, since we've
// already set the position at which to resume when we're later reparented.
// If we had instead considered this fatal, all tablets would crash-loop
// until a master appears, which would make it impossible to elect a master.
log.Warningf("Can't start replication after restore: shard %v/%v has no master.", tablet.Keyspace, tablet.Shard)
return nil
}
ti, err := agent.TopoServer.GetTablet(ctx, si.MasterAlias)
if err != nil {
return fmt.Errorf("Cannot read master tablet %v: %v", si.MasterAlias, err)
}
// If using semi-sync, we need to enable it before connecting to master.
if *enableSemiSync {
if err := agent.enableSemiSync(false); err != nil {
return err
}
}
// Set master and start slave.
cmds, err = agent.MysqlDaemon.SetMasterCommands(ti.Hostname, int(ti.PortMap["mysql"]))
if err != nil {
return fmt.Errorf("MysqlDaemon.SetMasterCommands failed: %v", err)
}
cmds = append(cmds, "START SLAVE")
if err := agent.MysqlDaemon.ExecuteSuperQueryList(cmds); err != nil {
return fmt.Errorf("failed to start replication: %v", err)
}
return nil
}