Permalink
Browse files

Minor bugs fixed for the back end, interface added

  • Loading branch information...
neumino committed Mar 11, 2012
1 parent b424c0f commit 6c28fd52962e68b17a5142db5bc5a7dc4b00cdc2
Showing with 24,018 additions and 40 deletions.
  1. BIN PDF-to-unusual-HTML/bin/com/neumino/pdftounusualhtml/ConvertPdf.class
  2. BIN PDF-to-unusual-HTML/bin/com/neumino/pdftounusualhtml/Line.class
  3. BIN PDF-to-unusual-HTML/bin/com/neumino/pdftounusualhtml/Pdf2Json.class
  4. BIN PDF-to-unusual-HTML/bin/com/neumino/pdftounusualhtml/Structure.class
  5. BIN PDF-to-unusual-HTML/bin/com/neumino/pdftounusualhtml/Word.class
  6. +0 −2 PDF-to-unusual-HTML/src/com/neumino/pdftounusualhtml/ConvertPdf.java
  7. +4 −4 PDF-to-unusual-HTML/src/com/neumino/pdftounusualhtml/Line.java
  8. +19 −20 PDF-to-unusual-HTML/src/com/neumino/pdftounusualhtml/Pdf2Json.java
  9. +5 −5 PDF-to-unusual-HTML/src/com/neumino/pdftounusualhtml/Structure.java
  10. +2 −2 PDF-to-unusual-HTML/src/com/neumino/pdftounusualhtml/Word.java
  11. +17 −7 README
  12. +77 −0 interface/css/main.css
  13. +62 −0 interface/css/main.css~
  14. BIN interface/data/test-0.png
  15. BIN interface/data/test-1.png
  16. BIN interface/data/test-2.png
  17. BIN interface/data/test-3.png
  18. BIN interface/data/test-4.png
  19. BIN interface/data/test-5.png
  20. BIN interface/data/test.pdf
  21. +1 −0 interface/data/test_words.txt
  22. +14 −0 interface/data/words.txt~
  23. +22 −0 interface/index.html
  24. +22 −0 interface/index.html~
  25. +322 −0 interface/js.old/ZeroClipboard.js
  26. BIN interface/js.old/ZeroClipboard.swf
  27. BIN interface/js.old/ZeroClipboard10.swf
  28. +542 −0 interface/js.old/addComment.js
  29. +107 −0 interface/js.old/byRelatedAuthors.js
  30. +106 −0 interface/js.old/citedBy.js
  31. +153 −0 interface/js.old/comment.js
  32. +37 −0 interface/js.old/general.js
  33. +37 −0 interface/js.old/general.js~
  34. +172 −0 interface/js.old/graphSuggestion.js
  35. +725 −0 interface/js.old/highlight.js
  36. +7 −0 interface/js.old/jcanvas.min.js
  37. +414 −0 interface/js.old/jquery-ui.min.js
  38. +9,044 −0 interface/js.old/jquery.js
  39. +49 −0 interface/js.old/loadComment.js
  40. +145 −0 interface/js.old/loadPaper.js
  41. +156 −0 interface/js.old/loadSuggestion.js
  42. +66 −0 interface/js.old/loadWord.js
  43. +12 −0 interface/js.old/paper.js
  44. +106 −0 interface/js.old/reference.js
  45. +71 −0 interface/js.old/selection.js
  46. +197 −0 interface/js.old/side.js
  47. +428 −0 interface/js.old/suggestion.js
  48. +29 −0 interface/js.old/word.js
  49. +96 −0 interface/js/general.js
  50. +98 −0 interface/js/general.js~
  51. +414 −0 interface/js/jquery-ui.min.js
  52. +9,044 −0 interface/js/jquery.js
  53. +181 −0 interface/js/line.js
  54. +184 −0 interface/js/line.js~
  55. +207 −0 interface/js/page.js
  56. +208 −0 interface/js/page.js~
  57. +58 −0 interface/js/pdf.js
  58. +61 −0 interface/js/pdf.js~
  59. +113 −0 interface/js/selection.js
  60. +113 −0 interface/js/selection.js~
  61. +35 −0 interface/js/word.js
  62. +36 −0 interface/js/word.js~
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -5,8 +5,6 @@
import java.sql.SQLException;
public class ConvertPdf {
static String inputDirectory;
static char charDelimiter = '/';
static String pathToImagemagick = "convert";
@@ -17,11 +17,11 @@
/**
* Top left corner's x value of the selection
*/
private int startX;
private int x;
/**
* Top left corner's y value of the selection
*/
private int startY;
private int y;
/**
* Width of the line
*/
@@ -47,8 +47,8 @@
*/
public Line(int idLine, int startX, int startY, int width, int height) {
this.idLine = idLine;
this.startX = startX;
this.startY = startY;
this.x = startX;
this.y = startY;
this.width = width;
this.height = height;
}
@@ -120,7 +120,7 @@ public void convert(String pathToPdf) throws Exception {
int nbPage = allPages.size();
int exitVal = 0;
/*
int density = (int) (resolution*zoom);
String imageName = fileName+".png";
if (nbPage == 1) {
@@ -137,7 +137,7 @@ public void convert(String pathToPdf) throws Exception {
exitVal = 1;
//System.exit(1);
}
*/
if (exitVal == -2) {
System.err.println("Time out");
}
@@ -146,30 +146,27 @@ else if (exitVal != 0) {
}
else if (exitVal == 0) {
structure = new Structure();
idLine = 0;
lineMarginTop = 0;
lineMarginLeft = 0;
lineCurrentWidth = 0;
lineHeight = 0;
wordMarginLeft = 0;
wordCurrentWidth = 0;
currentFontSizePx = 0;
PDPage firstPage = (PDPage)allPages.get( 0 );
System.out.print( " Processing page: ");
for( int i=0; i<nbPage; i++ ) {
System.out.print(i+" ");
idLine = 0;
lineMarginTop = 0;
lineMarginLeft = 0;
lineCurrentWidth = 0;
lineHeight = 0;
wordMarginLeft = 0;
wordCurrentWidth = 0;
currentFontSizePx = 0;
PDPage page = (PDPage)allPages.get( i );
BufferedImage image = page.convertToImage(BufferedImage.TYPE_INT_RGB, resolution);
Page newPage = new Page((int) zoom*image.getWidth(), (int) zoom*image.getHeight(), marginTopBackground);
marginTopBackground += zoom*image.getHeight();
Page newPage = new Page((int) (zoom*image.getWidth()), (int) (zoom*image.getHeight()), marginTopBackground);
marginTopBackground += (int) (zoom*image.getHeight());
structure.add(newPage);
PDStream contents = page.getContents();
@@ -184,19 +181,21 @@ else if (exitVal == 0) {
}
}
Word newWord = new Word(StringEscapeUtils.escapeJava(currentLine.toString()), wordMarginLeft, wordCurrentWidth);
structure.addWordToLastPage(newWord);
structure.updateLastLine(lineCurrentWidth);
}
System.out.print("\n");
//Save the structure in the file
Gson gson = new Gson();
String json = gson.toJson(structure);
try{
FileWriter fstream = new FileWriter(pathToDirectory+"words.txt");
FileWriter fstream = new FileWriter(pathToDirectory+fileName+"_words.txt");
BufferedWriter out = new BufferedWriter(fstream);
out.write(json);
out.close();
@@ -11,26 +11,26 @@
*/
public class Structure {
private ArrayList<Page> structure = new ArrayList<Page>();
private ArrayList<Page> pages = new ArrayList<Page>();
public Structure() {
}
public void add(Page page){
structure.add(page);
pages.add(page);
}
public void add(Line line){
structure.get(structure.size()-1).add(line);
pages.get(pages.size()-1).add(line);
}
public void addWordToLastPage(Word word) {
Page lastPage = this.structure.get(structure.size() - 1);
Page lastPage = this.pages.get(pages.size() - 1);
lastPage.addWordToLastLine(word);
}
public void updateLastLine(int width) {
Page lastPage = this.structure.get(structure.size() - 1);
Page lastPage = this.pages.get(pages.size() - 1);
lastPage.updateLastLine(width);
}
@@ -9,13 +9,13 @@
public class Word {
private String word;
private int startX;
private int x;
private int width;
public Word(String word, int startX, int width) {
this.word = word;
this.startX = startX;
this.x = startX;
this.width = width;
}
View
24 README
@@ -6,24 +6,34 @@ I have used Apache License v2.0, so the same license probably apply.
/************** About the code **************/
Don't expect great and elegant code.
There are two parts in this project.
Everything here was coded in a rush (less than one day for this one too).
1/ The first one is the back-end processing. It's written in Java; it takes a PDF and does two things:
- Extract all the words with their positions and save them in a .txt file in JSON format.
- Convert the background (the PDF without the text) into images with ImageMagick (because it's faster than with PDFBox).
It's almost the original source code.
This project used to crawl through directories, find PDFs, convert them, and save the converted publications in a MySQL database. I have removed all the code related to this.
I have removed all the unused classes (things I tried).
This project used to crawl through directories, find PDFs, convert them, and save the converted publications in a MySQL database. I have removed all the code related to this. I have removed all the unused classes (things I tried). If you are interested in them, just email me (it's free).
I'm saving everything in a file, but it would be better to store it in a proper database.
Forget about saving it in a MySQL database with one word per entry, your database won't scale.
2/ The second part is the interface. I have rewritten the core of the interface to display the images and let the user make selections.
Compared to the original version, the tree of HTML elements is better balanced, the user experience is slightly better (I have spotted some bugs that I have corrected) and the sources are much cleaner.
There is just the core, so it's easier if you want to adapt it. I have removed all the things like export to the clipboard, add comments, etc. If you are interested in the old sources, again, email me (it's free).
/************** To come **************/
I plan to fork this project and try something more scalable.
The idea is to use something like Node.js to convert PDF files on the fly.
The idea is to convert PDF files on the fly. For that I plan to use something like Node.js, and convert one page at a time. One of the reasons I have rewritten the interface is that it's going to be easier to do it now.
If you are interested/need it, contact me, I might change/have changed my mind.
If you are interested/need it, contact me, it could be interesting.
/************** Contact **************/
Michel T.
Michel Tu
orphee@gmail.com
http://www.neumino.com
View
@@ -0,0 +1,77 @@
/* Global tags */

/* Root element fills the viewport and never shrinks below 1000px wide. */
html {
  height: 100%;
  width: 100%;
  min-width: 1000px;
  margin: 0px;
  padding: 0px;
}

/* Links: bold black text, underlined only while hovered. */
a {
  color: #000;
  text-decoration: none;
  font-weight: bold;
}

a:hover {
  color: #000;
  text-decoration: underline;
}

body {
  height: 100%;
  width: 100%;
  margin: 0px;
  padding: 0px;
  font-family: Tahoma, Arial, Helvetica, sans-serif;
}

/* Container elements declared in index.html. */
#main {
  margin: 0px;
  padding: 0px;
}

/* Pinned to the top-left corner of the viewport. */
#test {
  position: fixed;
  top: 0px;
  left: 0px;
}

/*
 * Page layer: absolutely positioned against the left edge.
 * Native text selection and dragging are disabled on it; the vendor-prefixed
 * declarations are kept and the -ms- and standard properties are added so
 * that engines ignoring the -moz-/-webkit- prefixes behave the same way.
 */
.page {
  position: absolute;
  left: 0px;
  z-index: 1;
  -moz-user-select: none;
  -webkit-user-select: none;
  -ms-user-select: none;
  user-select: none;
  /* this will work for QtWebKit in future */
  -webkit-user-drag: none;
}

/*
 * Selection layer: higher z-index than .page, shows a text cursor, and
 * disables native text selection and dragging in the same way.
 */
.selection {
  position: absolute;
  left: 0px;
  z-index: 2;
  cursor: text;
  -moz-user-select: none;
  -webkit-user-select: none;
  -ms-user-select: none;
  user-select: none;
  /* this will work for QtWebKit in future */
  -webkit-user-drag: none;
}

/* Highlight rectangle: half-transparent blue box, not natively selectable. */
.highlight_text {
  position: absolute;
  top: 0px;
  left: 0px;
  background: #63B7F8;
  opacity: 0.5;
  z-index: 1;
  -moz-user-select: none;
  -webkit-user-select: none;
  -ms-user-select: none;
  user-select: none;
  /* this will work for QtWebKit in future */
  -webkit-user-drag: none;
}
View
@@ -0,0 +1,62 @@
/* Global tags */

/* Root element: full-size, at least 1000px wide, no outer spacing. */
html {
  height: 100%;
  width: 100%;
  min-width: 1000px;
  margin: 0;
  padding: 0;
}

/* Links are bold and black; the underline appears on hover only. */
a {
  color: #000;
  text-decoration: none;
  font-weight: bold;
}

a:hover {
  color: #000;
  text-decoration: underline;
}

body {
  height: 100%;
  width: 100%;
  margin: 0;
  padding: 0;
  font-family: Tahoma, Arial, Helvetica, sans-serif;
}

/* Containers */
#main {
  margin: 0;
  padding: 0;
}

/* Fixed at the top-left corner of the viewport. */
#test {
  position: fixed;
  top: 0;
  left: 0;
}

/* Layers: .selection has a higher z-index than .page. */
.page {
  position: absolute;
  left: 0;
  z-index: 1;
}

.selection {
  position: absolute;
  left: 0;
  z-index: 2;
  cursor: text;
}

/* Half-transparent blue highlight box. */
.highlight_text {
  position: absolute;
  top: 0;
  left: 0;
  background: #63B7F8;
  opacity: 0.5;
  z-index: 1;
}
View
Binary file not shown.
View
Binary file not shown.
View
Binary file not shown.
View
Binary file not shown.
View
Binary file not shown.
View
Binary file not shown.
View
Binary file not shown.

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View
@@ -0,0 +1,22 @@
<!DOCTYPE html>
<!-- The doctype above keeps browsers out of quirks mode. -->
<html lang="en-US">
<head>
  <meta charset="utf-8" />
  <link rel="stylesheet" href="css/main.css" />
  <title>PDF to HTML</title>
  <!-- NOTE(review): the scripts presumably rely on this load order (jQuery first, general.js last); confirm before reordering. -->
  <script type='text/javascript' src='js/jquery.js'></script>
  <script type='text/javascript' src='js/selection.js'></script>
  <script type='text/javascript' src='js/pdf.js'></script>
  <script type='text/javascript' src='js/word.js'></script>
  <script type='text/javascript' src='js/line.js'></script>
  <script type='text/javascript' src='js/page.js'></script>
  <script type='text/javascript' src='js/general.js'></script>
</head>
<body>
  <!-- #main and #test are the two containers styled in css/main.css. -->
  <div id="main"><div id="test"></div>
  </div>
</body>
</html>
View
@@ -0,0 +1,22 @@
<!DOCTYPE html>
<!-- The doctype above keeps browsers out of quirks mode. -->
<html lang="en-US">
<head>
  <meta charset="utf-8" />
  <link rel="stylesheet" href="css/main.css" />
  <title>PDF to HTML</title>
  <!-- NOTE(review): the scripts presumably rely on this load order (jQuery first, general.js last); confirm before reordering. -->
  <script type='text/javascript' src='js/jquery.js'></script>
  <script type='text/javascript' src='js/pdf.js'></script>
  <script type='text/javascript' src='js/word.js'></script>
  <script type='text/javascript' src='js/line.js'></script>
  <script type='text/javascript' src='js/page.js'></script>
  <script type='text/javascript' src='js/general.js'></script>
</head>
<body>
  <!-- #main and #test are the two containers styled in css/main.css. -->
  <div id="main"><div id="test"></div>
  </div>
</body>
</html>
Oops, something went wrong.

0 comments on commit 6c28fd5

Please sign in to comment.