Permalink
Browse files

Hash utility implemented.

  • Loading branch information...
1 parent 7304a76 commit 6bb4cca00f5b26b3ff8e340fa5f6f5bb1ca093ef @pcalcado committed Apr 8, 2012
View
@@ -77,14 +77,32 @@
</target>
<target name="functional-test"
- depends="pack, functional-test-sedish, functional-test-cat, functional-multiple-reducers" />
+ depends="pack, functional-test-sedish, functional-test-cat, functional-test-hash" />
+
+ <target name="functional-test-hash">
+ <run-bananised-tool utility="Hash"
+ description="Hash Text->Text"
+ input-file="${test.src.dir}/testdata.log"
+ output-file="${test.bin.dir}/test-hash-output1"
+ other="--hash HTTP.1.0 --hash binary" />
+ <no-diff-between description="Text==Text"
+ one="${test.src.dir}/testdata.hash.log"
+ another="${test.bin.dir}/test-hash-output1/part-r-00000" />
- <target name="functional-multiple-reducers">
<run-bananised-tool utility="Cat"
- description="Multiple reducers"
+ description="Text->Seq"
input-file="${test.src.dir}/testdata.log"
- output-file="${test.bin.dir}/test-cat-reducers-output/"
- other="-D mapred.reduce.tasks=666" />
+ output-file="${test.bin.dir}/test-hash-output2"
+ compressed="--compressedOutput" />
+ <run-bananised-tool utility="Hash"
+ description="Hash>Text"
+ input-file="${test.bin.dir}/test-hash-output2/part-r-00000"
+ output-file="${test.bin.dir}/test-hash-output3"
+ compressed="--compressedInput"
+ other="--hash HTTP.1.0 --hash binary" />
+ <no-diff-between description="Text==Text"
+ one="${test.src.dir}/testdata.hash.log"
+ another="${test.bin.dir}/test-hash-output3/part-r-00000" />
</target>
<target name="functional-test-cat">
@@ -111,12 +129,12 @@
output-file="${test.bin.dir}/test-cat-output4"
compressed="--compressedInput" />
- <diff description="Text==Text"
- one="${test.src.dir}/testdata.log"
- another="${test.bin.dir}/test-cat-output1/part-r-00000" />
- <diff description="Text==Text"
- one="${test.src.dir}/testdata.log"
- another="${test.bin.dir}/test-cat-output4/part-r-00000" />
+ <no-diff-between description="Text==Text"
+ one="${test.src.dir}/testdata.log"
+ another="${test.bin.dir}/test-cat-output1/part-r-00000" />
+ <no-diff-between description="Text==Text"
+ one="${test.src.dir}/testdata.log"
+ another="${test.bin.dir}/test-cat-output4/part-r-00000" />
</target>
<target name="functional-test-sedish">
@@ -125,9 +143,9 @@
input-file="${test.src.dir}/testdata.log"
output-file="${test.bin.dir}/test-sedish-output1"
other="--replace biglogo --with notsoBIGlogo" />
- <diff description="Text==Text"
- one="${test.src.dir}/testdata.sedish.log"
- another="${test.bin.dir}/test-sedish-output1/part-r-00000" />
+ <no-diff-between description="Text==Text"
+ one="${test.src.dir}/testdata.sedish.log"
+ another="${test.bin.dir}/test-sedish-output1/part-r-00000" />
<run-bananised-tool utility="Cat"
description="Text->Seq"
@@ -140,9 +158,12 @@
output-file="${test.bin.dir}/test-sedish-output3"
compressed="--compressedInput"
other="--replace biglogo --with notsoBIGlogo" />
- <diff description="Text==Text"
- one="${test.src.dir}/testdata.sedish.log"
- another="${test.bin.dir}/test-sedish-output3/part-r-00000" />
+ <no-diff-between description="Text==Text"
+ one="${test.bin.dir}/test-sedish-output1/part-r-00000"
+ another="${test.bin.dir}/test-sedish-output3/part-r-00000" />
+ <no-diff-between description="Text==Text"
+ one="${test.src.dir}/testdata.sedish.log"
+ another="${test.bin.dir}/test-sedish-output3/part-r-00000" />
</target>
<macrodef name="run-bananised-tool">
@@ -161,7 +182,7 @@
</sequential>
</macrodef>
- <macrodef name="diff">
+ <macrodef name="no-diff-between">
<attribute name="description" />
<attribute name="one" />
<attribute name="another" />
@@ -12,6 +12,11 @@
Mapper<LongWritable, TInput, LongWritable, Text> {
private static final Charset UTF8 = Charset.forName("UTF-8");
    // NOTE(review): vacuous override -- it only delegates to super.setup().
    // Presumably added as an explicit extension seam for subclasses (e.g.
    // HashMapper overrides setup); confirm intent before removing.
    protected void setup(Context context) throws IOException,
            InterruptedException {
        super.setup(context);
    }
+
public void map(LongWritable key, TInput value, Context context)
throws IOException, InterruptedException {
@@ -23,6 +23,7 @@
import com.soundcloud.bananiser.mr.NoOpMapper;
public abstract class BananaUtility {
+ private static final String PARAM_SEPARATOR = "::,::";
@Parameter(names = "-fs", required = false)
@SuppressWarnings("unused")
@@ -155,4 +156,20 @@ protected final boolean isCompressedOutput() {
protected void configure(Job job) {
}
    /**
     * Splits a serialized parameter list (as produced by
     * {@code toParameterListString}) back into its elements.
     * Note: {@code String.split} discards trailing empty tokens, so the
     * trailing separator emitted by the serializer is harmless here.
     *
     * @param parameterListString the PARAM_SEPARATOR-joined string; must not
     *            be null
     * @return the individual parameters, in order
     */
    public static String[] asParameterList(String parameterListString) {
        return parameterListString.split(PARAM_SEPARATOR);
    }
+
+ public static String toParameterListString(String... paramList) {
+ StringBuilder replaces = new StringBuilder();
+ for (String r : paramList) {
+ replaces.append(r).append(PARAM_SEPARATOR);
+ }
+ return replaces.toString();
+ }
+
+ public static String toParameterListString(List<String> paramList) {
+ return toParameterListString(paramList.toArray(new String[] {}));
+ }
+
}
@@ -0,0 +1,81 @@
+package com.soundcloud.bananiser.utilities.hash;
+
+import static com.soundcloud.bananiser.utilities.BananaUtility.asParameterList;
+
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+
+import com.soundcloud.bananiser.mr.SeqFileCompatibleMapper;
+
+public class HashMapper extends SeqFileCompatibleMapper<Text> {
+ public static final String TO_REPLACE_PARAMETER = HashMapper.class
+ .getName() + ".replace";
+ private List<Pattern> patterns = new ArrayList<Pattern>();
+ private MessageDigest hasher;
+
+ protected void setup(Context context) throws IOException,
+ InterruptedException {
+ super.setup(context);
+ Configuration configuration = context.getConfiguration();
+ String[] patternStrings = asParameterList(configuration
+ .get(TO_REPLACE_PARAMETER));
+ for (String patternString : patternStrings) {
+ patterns.add(Pattern.compile(patternString));
+ }
+ hasher = newHasher();
+ };
+
+ @Override
+ public void mapText(LongWritable key, Text value, Context context)
+ throws IOException, InterruptedException {
+ String valueAsString = value.toString();
+ for (Pattern pattern : patterns) {
+ Matcher matcher = pattern.matcher(valueAsString);
+ StringBuffer buffer = new StringBuffer();
+ while (matcher.find()) {
+ String hashed = hashNextMatch(matcher);
+ matcher.appendReplacement(buffer, hashed);
+ }
+ matcher.appendTail(buffer);
+ valueAsString = buffer.toString();
+ }
+ context.write(key, new Text(valueAsString));
+ }
+
+ private String hashNextMatch(Matcher matcher) {
+ hasher.reset();
+ String found = matcher.toMatchResult().group();
+ try {
+ hasher.update(found.getBytes("UTF-8"));
+ } catch (UnsupportedEncodingException e) {
+ throw new RuntimeException(e);
+ }
+
+ StringBuffer sb = new StringBuffer("");
+ for (byte b : hasher.digest()) {
+ sb.append(Integer.toString((b & 0xff) + 0x100, 32).substring(1));
+ }
+ String hashed = sb.toString();
+ return hashed;
+ }
+
+ private MessageDigest newHasher() {
+ MessageDigest hasher;
+ try {
+ hasher = MessageDigest.getInstance("SHA1");
+ } catch (NoSuchAlgorithmException e) {
+ throw new RuntimeException(e);
+ }
+ return hasher;
+ }
+}
@@ -0,0 +1,34 @@
+package com.soundcloud.bananiser.utilities.hash;
+
+import static com.soundcloud.bananiser.utilities.hash.HashMapper.TO_REPLACE_PARAMETER;
+
+import java.util.List;
+
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+
+import com.beust.jcommander.Parameter;
+import com.soundcloud.bananiser.utilities.BananaUtility;
+
/**
 * Command-line utility that anonymises input by replacing every match of the
 * user-supplied regexes with a hash of the matched text; the actual rewriting
 * is done by HashMapper.
 */
public class HashUtility extends BananaUtility {

    // Populated by JCommander; "--hash" may be given multiple times, one
    // regex per occurrence.
    @Parameter(names = "--hash", description = "regexp to replace with its hash", required = true)
    private List<String> replaceList;

    public HashUtility(String[] args) {
        super(args);
    }

    /** Serializes the regex list into the job configuration for the mapper. */
    @Override
    protected void configure(Job job) {
        super.configure(job);
        job.getConfiguration().set(TO_REPLACE_PARAMETER,
                toParameterListString(replaceList));
    }

    @SuppressWarnings("rawtypes")
    @Override
    protected Class<? extends Mapper> getMapperToUse() {
        return HashMapper.class;
    }
}
@@ -7,15 +7,14 @@
import org.apache.hadoop.io.Text;
import com.soundcloud.bananiser.mr.SeqFileCompatibleMapper;
+import com.soundcloud.bananiser.utilities.BananaUtility;
public class SedishMapper extends SeqFileCompatibleMapper<Text> {
public static final String REPLACE_WITH_PARAMETER = SedishMapper.class
.getName() + ".replaceWith";
public static final String TO_REPLACE_PARAMETER = SedishMapper.class
.getName() + ".toReplace";
- public static final String REGEXP_SEPARATOR = "::,::";
-
private String replaceWith;
private String[] patterns;
@@ -24,8 +23,8 @@ protected void setup(Context context) throws IOException,
InterruptedException {
super.setup(context);
Configuration configuration = context.getConfiguration();
- this.patterns = configuration.get(TO_REPLACE_PARAMETER).split(
- REGEXP_SEPARATOR);
+ this.patterns = BananaUtility.asParameterList(configuration
+ .get(TO_REPLACE_PARAMETER));
this.replaceWith = configuration.get(REPLACE_WITH_PARAMETER);
}
@@ -13,8 +13,6 @@
public class SedishUtility extends BananaUtility {
- public static final int DEFAULT_LINES = 1000;
-
@Parameter(names = "--replace", description = "regexp to replace", required = true)
private List<String> replaceList;
@@ -28,11 +26,8 @@ public SedishUtility(String[] args) {
@Override
protected void configure(Job job) {
super.configure(job);
- StringBuilder replaces = new StringBuilder();
- for (String r : replaceList) {
- replaces.append(r).append(SedishMapper.REGEXP_SEPARATOR);
- }
- job.getConfiguration().set(TO_REPLACE_PARAMETER, replaces.toString());
+ job.getConfiguration().set(TO_REPLACE_PARAMETER,
+ toParameterListString(replaceList));
job.getConfiguration().set(REPLACE_WITH_PARAMETER, with);
}
@@ -0,0 +1,56 @@
+package com.soundcloud.bananiser.mr;
+
+import static com.soundcloud.bananiser.test.BananaMatchers.sameAs;
+import static org.mockito.Matchers.argThat;
+import static org.mockito.Matchers.eq;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper.Context;
+
+import com.soundcloud.bananiser.mr.SeqFileCompatibleMapper;
+
+public class MapperTestCase {
+
+ protected static final LongWritable SOME_IRRELEVANT_KEY = new LongWritable(
+ 666);
+
+ public MapperTestCase() {
+ super();
+ }
+
+ @SuppressWarnings("rawtypes")
+ protected Context setupContext(Configuration configuration) {
+ Context context = mock(Context.class);
+ when(context.getConfiguration()).thenReturn(configuration);
+ return context;
+ }
+
+ @SuppressWarnings({ "rawtypes", "unchecked" })
+ protected void invokeMapOperation(SeqFileCompatibleMapper<Text> mapper,
+ String input, Context context) {
+ try {
+ mapper.setup(context);
+ mapper.mapText(SOME_IRRELEVANT_KEY, new Text(input), context);
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ @SuppressWarnings({ "rawtypes", "unchecked" })
+ protected void verifyWroteTo(Context context, LongWritable keyUsed,
+ String modifiedSentence) {
+ try {
+ verify(context, times(1)).write(eq(keyUsed),
+ argThat(sameAs(new Text(modifiedSentence))));
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+}
Oops, something went wrong.

0 comments on commit 6bb4cca

Please sign in to comment.