package kafka.consumer;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * An {@link InputFormat} that reads messages from Kafka. One input split is
 * created per Kafka partition, so each map task consumes exactly one
 * broker/partition pair, resuming from the last offset committed in ZooKeeper.
 */
public class KafkaInputFormat extends InputFormat<LongWritable, BytesWritable> {

    private static final Logger LOG = LoggerFactory.getLogger(KafkaInputFormat.class);

    @Override
    public RecordReader<LongWritable, BytesWritable> createRecordReader(
            InputSplit split, TaskAttemptContext context) throws IOException,
            InterruptedException {
        return new KafkaRecordReader();
    }
    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException,
            InterruptedException {
        Configuration conf = context.getConfiguration();
        ZkUtils zk = new ZkUtils(conf);
        String topic = conf.get("kafka.topic");
        String group = conf.get("kafka.groupid");
        List<InputSplit> splits = new ArrayList<InputSplit>();
        try {
            // Partition names registered in ZooKeeper have the form "<brokerId>-<partitionId>".
            List<String> partitions = zk.getPartitions(topic);
            for (String partition : partitions) {
                String[] sp = partition.split("-");
                // Resume from the offset this consumer group last committed in ZooKeeper.
                long last = zk.getLastCommit(group, topic, partition);
                InputSplit split = new KafkaSplit(sp[0], zk.getBroker(sp[0]), topic,
                        Integer.parseInt(sp[1]), last);
                splits.add(split);
            }
        } finally {
            zk.close();
        }
        return splits;
    }
    /**
     * Describes one Kafka partition to consume: broker id, broker address,
     * topic, partition number, and the last committed offset to resume from.
     */
    public static class KafkaSplit extends InputSplit implements Writable {

        private String brokerId;
        private String broker;
        private int partition;
        private String topic;
        private long lastCommit;

        public KafkaSplit() {}

        public KafkaSplit(String brokerId, String broker, String topic, int partition, long lastCommit) {
            this.brokerId = brokerId;
            this.broker = broker;
            this.partition = partition;
            this.topic = topic;
            this.lastCommit = lastCommit;
        }

        @Override
        public void readFields(DataInput in) throws IOException {
            brokerId = Text.readString(in);
            broker = Text.readString(in);
            topic = Text.readString(in);
            partition = in.readInt();
            lastCommit = in.readLong();
        }

        @Override
        public void write(DataOutput out) throws IOException {
            Text.writeString(out, brokerId);
            Text.writeString(out, broker);
            Text.writeString(out, topic);
            out.writeInt(partition);
            out.writeLong(lastCommit);
        }

        @Override
        public long getLength() throws IOException, InterruptedException {
            // The number of pending bytes is unknown up front; report a large
            // length so the scheduler never treats the split as empty.
            return Long.MAX_VALUE;
        }

        @Override
        public String[] getLocations() throws IOException, InterruptedException {
            return new String[] {broker};
        }

        public String getBrokerId() {
            return brokerId;
        }

        public String getBroker() {
            return broker;
        }

        public int getPartition() {
            return partition;
        }

        public String getTopic() {
            return topic;
        }

        public long getLastCommit() {
            return lastCommit;
        }

        @Override
        public String toString() {
            return broker + "-" + topic + "-" + partition + "-" + lastCommit;
        }
    }
    public static class KafkaRecordReader extends RecordReader<LongWritable, BytesWritable> {

        private KafkaContext kcontext;
        private KafkaSplit ksplit;
        private TaskAttemptContext context;
        private int limit;           // max messages per split; negative means unlimited
        private LongWritable key;    // message offset
        private BytesWritable value; // message payload
        private long start;
        private long end;
        private long pos;            // offset of the most recently read message
        private long count = 0L;     // messages read so far

        @Override
        public void initialize(InputSplit split, TaskAttemptContext context)
                throws IOException, InterruptedException {
            this.context = context;
            ksplit = (KafkaSplit) split;

            Configuration conf = context.getConfiguration();
            limit = conf.getInt("kafka.limit", -1);
            int timeout = conf.getInt("kafka.socket.timeout.ms", 30000);
            int bsize = conf.getInt("kafka.socket.buffersize", 64 * 1024);
            int fsize = conf.getInt("kafka.fetch.size", 1024 * 1024);
            String reset = conf.get("kafka.autooffset.reset");

            kcontext = new KafkaContext(ksplit.getBrokerId() + ":" + ksplit.getBroker(),
                    ksplit.getTopic(),
                    ksplit.getPartition(),
                    ksplit.getLastCommit(),
                    fsize, timeout, bsize, reset);

            start = kcontext.getStartOffset();
            end = kcontext.getLastOffset();
            LOG.info("JobId {} {} Start: {} End: {}",
                    new Object[]{context.getJobID(), ksplit, start, end});
        }
        @Override
        public void close() throws IOException {
            kcontext.close();
            commit();
        }

        private void commit() throws IOException {
            if (count == 0L) return; // nothing consumed, nothing to commit
            Configuration conf = context.getConfiguration();
            ZkUtils zk = new ZkUtils(conf);
            String group = conf.get("kafka.groupid");
            String partition = ksplit.getBrokerId() + "-" + ksplit.getPartition();
            try {
                // Record the offset of the last message read so the next run resumes there.
                zk.setLastCommit(group, ksplit.getTopic(), partition, pos, true);
            } finally {
                zk.close();
            }
        }
        @Override
        public LongWritable getCurrentKey() throws IOException,
                InterruptedException {
            return key;
        }

        @Override
        public BytesWritable getCurrentValue() throws IOException,
                InterruptedException {
            return value;
        }

        @Override
        public float getProgress() throws IOException, InterruptedException {
            if (pos >= end || start == end) {
                return 1.0f;
            }
            if (limit < 0) {
                // No message limit: progress is the fraction of the offset range consumed.
                return Math.min(1.0f, (pos - start) / (float) (end - start));
            } else {
                // With a limit: progress is the fraction of the allowed messages read.
                return Math.min(1.0f, count / (float) limit);
            }
        }
        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            if (key == null) {
                key = new LongWritable();
            }
            if (value == null) {
                value = new BytesWritable();
            }
            if (limit < 0 || count < limit) {
                // getNext fills key/value and returns the message offset;
                // a negative return is treated here as "no more messages".
                long next = kcontext.getNext(key, value);
                if (next >= 0) {
                    pos = next;
                    count++;
                    return true;
                }
            }
            LOG.info("Next Offset " + pos);
            return false;
        }
    }
}
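
// ---------------------------------------------------------------------------
// Usage sketch (illustrative, not part of the original file): a minimal
// map-only driver wiring KafkaInputFormat into a job. The driver and mapper
// class names and the output handling are hypothetical; the "kafka.*"
// configuration keys are the ones read by getSplits() and
// KafkaRecordReader.initialize() above. Kept as a comment so this file stays
// a valid compilation unit; move it to its own file to try it.
// ---------------------------------------------------------------------------
/*
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

class KafkaLoaderDriver {

    // Hypothetical mapper: emits each message's offset and its payload as UTF-8 text.
    public static class DumpMapper
            extends Mapper<LongWritable, BytesWritable, LongWritable, Text> {
        @Override
        protected void map(LongWritable offset, BytesWritable message, Context ctx)
                throws IOException, InterruptedException {
            ctx.write(offset, new Text(new String(message.getBytes(), 0, message.getLength(), "UTF-8")));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("kafka.topic", args[0]);               // read by getSplits()
        conf.set("kafka.groupid", args[1]);             // consumer group for ZooKeeper offset commits
        conf.setInt("kafka.limit", -1);                 // -1 = no per-split message cap
        conf.set("kafka.autooffset.reset", "smallest"); // passed through to KafkaContext

        Job job = new Job(conf, "kafka-hadoop-loader"); // Job.getInstance(conf) on newer Hadoop
        job.setJarByClass(KafkaLoaderDriver.class);
        job.setInputFormatClass(KafkaInputFormat.class);
        job.setMapperClass(DumpMapper.class);
        job.setNumReduceTasks(0);                       // map-only: one map task per partition split
        job.setOutputFormatClass(TextOutputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path(args[2]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
*/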