Skip to content

Commit

Permalink
Process and display transcripts back to 1993
Browse files Browse the repository at this point in the history
  • Loading branch information
KevinCaseiras committed May 21, 2014
1 parent c6d9d07 commit 976a066
Show file tree
Hide file tree
Showing 5 changed files with 344 additions and 76 deletions.
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package gov.nysenate.openleg.processors;

import gov.nysenate.openleg.util.TranscriptLine;
import gov.nysenate.openleg.model.Transcript;
import gov.nysenate.openleg.util.ChangeLogger;
import gov.nysenate.openleg.util.Storage;
Expand All @@ -17,7 +18,7 @@
public class TranscriptProcessor {
private final Logger logger;

public SimpleDateFormat TRANSCRIPT_DATE_PARSER = new SimpleDateFormat("MMM dd, yyyy hhmmaa");
public SimpleDateFormat TRANSCRIPT_DATE_PARSER = new SimpleDateFormat("MMM dd yyyy hhmmaa");

public TranscriptProcessor() {
this.logger = Logger.getLogger(this.getClass());
Expand All @@ -29,43 +30,39 @@ public void process(File file, Storage storage) throws IOException {
StringBuffer fullTextProcessed = new StringBuffer();

BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), "latin1"));
String line = null;
String lineText;
TranscriptLine line;
String date = null;
String time = null;
Integer contentLineNumber = 0;
while ((line = reader.readLine()) != null) {
if (line.trim().length() > 4) {
contentLineNumber += 1;
String content = line.trim().substring(2).trim();

switch (contentLineNumber) {
case 3: transcript.setLocation(content); break;
case 4: date = content; break;
case 5:
// e.g. transcripts/032611v1.TXT
time = content.replace(".", "").replace(":", "").replace(" ", "");
if (time.length() == 5) {
time = "0"+time;
}
break;
case 6: transcript.setType(content); break;
case 1:
// e.g. transcripts/061310v1.TXT
if (content.contains("STATE SENATE")) {
break; // NEW YORK STATE SENATE
}
else {
// e.g. transcripts/012109v1.TXT
contentLineNumber+=1;
}
case 2: // THE STENOGRAPHIC RECORD, sometimes split on 2 lines
if (content.equals("THE")) contentLineNumber--;
default: break;
}
fullTextProcessed.append(line.substring(2).trim()).append("\n");
boolean firstPageParsed = false;

while ((lineText = reader.readLine()) != null) {
line = new TranscriptLine(lineText);

if (!firstPageParsed) {
if (line.isLocation())
transcript.setLocation(line.textTrimmed());

if (line.isDate())
date = line.getDateString();

if (line.isTime())
time = line.getTimeString();

if (line.isSession())
transcript.setType(line.textTrimmed());

if (transcript.getLocation() != null && date != null && time != null && transcript.getType() != null)
firstPageParsed = true;
}

fullText.append(line.fullText()).append("\n");

if (line.textTrimmed().length() > 0) {
fullTextProcessed.append(line.textTrimmed()).append("\n");
}
fullText.append(line).append("\n");
}

reader.close();

try {
Expand Down
129 changes: 129 additions & 0 deletions src/main/java/gov/nysenate/openleg/util/TranscriptLine.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
package gov.nysenate.openleg.util;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Locale;

/**
 * Wraps one raw line of a transcript file and answers questions about it:
 * whether it carries a line number or the transcript (page) number, and
 * whether it looks like the location, date, time or session line.
 *
 * <p>Instances are immutable. Date and time checks always use English month
 * names and am/pm markers, independent of the JVM's default locale.
 */
public class TranscriptLine {

    /** A bare number that starts after this column is reported as the transcript number. */
    private static final int PAGE_NUM_INDEX = 10;

    /** A bare number greater than this is reported as the transcript number. */
    private static final int MAX_PAGE_NUM = 27;

    /**
     * Separator expected between a line number and the text that follows it.
     * Two spaces are required so that a mistyped time such as "10 30 a.m."
     * is not mistaken for line number 10.
     */
    private static final String LINE_NUMBER_SEPARATOR = "  ";

    /** Pattern a normalised date string (see {@link #getDateString()}) must start with. */
    private static final String DATE_PATTERN = "MMM dd yyyy";

    /** Pattern a normalised time string (see {@link #getTimeString()}) must start with. */
    private static final String TIME_PATTERN = "hhmma";

    private final String line;

    /**
     * @param line the raw line exactly as read from the transcript file
     */
    public TranscriptLine(String line) {
        this.line = line;
    }

    /**
     * @return the raw, unmodified line
     */
    public String fullText() {
        return line;
    }

    /**
     * @return the line stripped of line number and whitespace
     */
    public String textTrimmed() {
        if (!isTranscriptNumber() && hasLineNumber()) {
            return removeLineNumber().trim();
        }
        return line.trim();
    }

    /**
     * Transcript number is usually right aligned at the top of each page.
     * However, sometimes it's left aligned on the next line instead.
     * e.g. 082895.v1, 011299.v1
     *
     * @return {@code true} if the line consists solely of a number that either
     *         starts after column {@code PAGE_NUM_INDEX} or is greater than
     *         {@code MAX_PAGE_NUM}; {@code false} otherwise.
     */
    public boolean isTranscriptNumber() {
        String trimmed = line.trim();
        if (!isNumber(trimmed)) {
            return false;
        }
        int startIndex = line.indexOf(trimmed);
        return startIndex > PAGE_NUM_INDEX || Integer.parseInt(trimmed) > MAX_PAGE_NUM;
    }

    /**
     * @return {@code true} if the trimmed line starts with an integer that is
     *         either the whole line or followed by at least two spaces.
     */
    public boolean hasLineNumber() {
        // Split on two spaces so time typos don't get treated as line numbers.
        return isNumber(line.trim().split(LINE_NUMBER_SEPARATOR)[0]);
    }

    /**
     * Drops the first two characters of the trimmed line, which is where the
     * line number sits. This method does not check that a line number is
     * actually present; callers are expected to test {@link #hasLineNumber()}
     * first.
     *
     * @return the trimmed line without its first two characters, or the empty
     *         string when the trimmed line is shorter than two characters
     */
    public String removeLineNumber() {
        String trimmed = line.trim();
        if (trimmed.length() < 2) {
            // Nothing can remain; also avoids substring() failing on an empty line.
            return "";
        }
        return trimmed.substring(2);
    }

    /**
     * @return {@code true} if the line contains "ALBANY", "NEW" and "YORK"
     */
    public boolean isLocation() {
        return line.contains("ALBANY") && line.contains("NEW") && line.contains("YORK");
    }

    /**
     * @return {@code true} if {@link #getDateString()} starts with a date in
     *         the form "MMM dd yyyy", e.g. "March 26 2011"
     */
    public boolean isDate() {
        // Validate the same normalised string that getDateString() hands out,
        // so a line accepted here always yields a parseable date string.
        return parses(DATE_PATTERN, getDateString());
    }

    /**
     * @return the trimmed text with commas replaced by single spaces and
     *         periods removed, e.g. "March 26, 2011" becomes "March 26 2011"
     */
    public String getDateString() {
        return textTrimmed().replace(" , ", " ").replace(", ", " ").replace(",", " ").replace(".", "");
    }

    /**
     * @return {@code true} if {@link #getTimeString()} starts with a time in
     *         the form "hhmma", e.g. "1005am"
     */
    public boolean isTime() {
        return parses(TIME_PATTERN, getTimeString());
    }

    /**
     * Normalises the trimmed text into a compact time string: colons, periods
     * and spaces are removed, a five character result is left-padded with
     * "0", and "Noon" is replaced by "pm".
     *
     * @return the compacted time text, e.g. "9:05 a.m." becomes "0905am"
     */
    public String getTimeString() {
        String time = textTrimmed().replace(":", "").replace(".", "").replace(" ", "");

        // Single digit hour, e.g. "905am": pad so the hour is two digits.
        if (time.length() == 5) {
            time = "0" + time;
        }

        if (time.contains("Noon")) {
            time = time.replace("Noon", "pm");
        }

        return time;
    }

    /**
     * @return {@code true} if the line contains "SESSION"
     */
    public boolean isSession() {
        return line.contains("SESSION");
    }

    /**
     * @return {@code true} if the line contains no ASCII letters or digits
     */
    public boolean isEmpty() {
        return line.replaceAll("[^a-zA-Z0-9]+", "").isEmpty();
    }

    /**
     * @return {@code true} if the trimmed text parses as an {@code int}
     */
    private boolean isNumber(String text) {
        try {
            Integer.parseInt(text.trim());
        } catch (NumberFormatException e) {
            return false;
        }
        return true;
    }

    /**
     * Checks whether the start of {@code text} matches {@code pattern}.
     * A new formatter is created per call because SimpleDateFormat is not
     * thread-safe; Locale.ENGLISH keeps month names and am/pm markers
     * independent of the host's default locale.
     */
    private static boolean parses(String pattern, String text) {
        try {
            new SimpleDateFormat(pattern, Locale.ENGLISH).parse(text);
        } catch (ParseException e) {
            return false;
        }
        return true;
    }
}
91 changes: 49 additions & 42 deletions src/main/webapp/templates/transcript.jsp
Original file line number Diff line number Diff line change
@@ -1,46 +1,54 @@
<%@ page language="java" import="java.util.*, java.util.regex.*, java.text.*,java.util.*,gov.nysenate.openleg.*,gov.nysenate.openleg.model.*,gov.nysenate.openleg.util.*" contentType="text/html" pageEncoding="utf-8"%>
<%!
public final static String TRANSCRIPT_INDENT = " ";
public final static String TRANSCRIPT_INDENT_REPLACE = "<br/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;";
public static String removeLineNumbers (String input) {
StringBuffer resp = new StringBuffer();
StringTokenizer st = new StringTokenizer (input,"\n");
String line = null;
int breakIdx = -1;
while (st.hasMoreTokens()) {
line = st.nextToken().trim();
breakIdx = line.indexOf(' ');
if (breakIdx != -1) {
line = line.substring(breakIdx+1);
if (line.startsWith("Transcription Service, Inc."))
continue;
if (line.startsWith("371-8910"))
continue;
if (line.startsWith(TRANSCRIPT_INDENT))
resp.append(TRANSCRIPT_INDENT_REPLACE);
line = line.trim();
resp.append(' ');
resp.append(line);
}
}
// Big indents ended Jan 1st 2005
public static long BIG_INDENT_END = 1104555600000L;
// Big indents started Jan 1st 1999
public static long BIG_INDENT_START = 915166800000L;
public static String removeLineNumbers (String fullText, long date) {
String htmlText = "";
String TRANSCRIPT_INDENT = " ";
String BIG_TRANSCRIPT_INDENT = " ";
String TRANSCRIPT_INDENT_REPLACE = "<br/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;";
for (String aLine: fullText.split("\n")) {
gov.nysenate.openleg.util.TranscriptLine line = new TranscriptLine(aLine);
String tmp = line.fullText();
if (line.isTranscriptNumber())
continue;
if (tmp.trim().contains("Transcription Service, Inc."))
continue;
if (tmp.trim().contains("(518)"))
continue;
if (line.hasLineNumber())
tmp = line.removeLineNumber();
// Skip blank lines.
if (tmp.trim().length() < 1)
continue;
String indent = TRANSCRIPT_INDENT;
if (date > BIG_INDENT_START && date < BIG_INDENT_END) {
indent = BIG_TRANSCRIPT_INDENT;
}
if (tmp.startsWith(indent))
htmlText += TRANSCRIPT_INDENT_REPLACE;
htmlText += " " + tmp.trim();
}
String output = resp.toString();
output = output.replace("SENATOR", "<br/>SENATOR");
output = output.replace("REVEREND", "<br/>REVEREND");
output = output.replace("ACTING", "<br/>ACTING");
output = output.replace("REGULAR SESSION", "REGULAR SESSION<br/><br/>");
htmlText = htmlText.replace("SENATOR", "<br/>SENATOR");
htmlText = htmlText.replace("REVEREND", "<br/>REVEREND");
htmlText = htmlText.replace("ACTING", "<br/>ACTING");
htmlText = htmlText.replace("REGULAR SESSION", "REGULAR SESSION<br/><br/>");
return output;
return htmlText;
}
public static String addHyperlinks (String input) {
Expand Down Expand Up @@ -100,11 +108,10 @@

<h3 class="section" ><a id="Transcript" href="#Transcript" class="anchor ui-icon ui-icon-link"></a> Transcript</h3>
<pre class='memo'> <%
String fullText = transcript.getTranscriptText().trim();
String fullText = transcript.getTranscriptText();
try {
int number = Integer.parseInt(fullText.substring(0,1));
fullText = removeLineNumbers(fullText);
fullText = removeLineNumbers(fullText, transcript.getTimeStamp().getTime());
fullText = addHyperlinks(fullText);
if (query != null && query.length()>0) {
Expand Down
14 changes: 14 additions & 0 deletions src/main/webapp/views/transcripts.jsp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,20 @@ $(document).ready(function() {
<option value="2007">2007</option>
<option value="2006">2006</option>
<option value="2005">2005</option>
<option value="2004">2004</option>
<option value="2003">2003</option>
<option value="2002">2002</option>
<option value="2001">2001</option>
<option value="2000">2000</option>
<option value="1999">1999</option>
<option value="1998">1998</option>
<option value="1997">1997</option>
<option value="1996">1996</option>
<option value="1995">1995</option>
<option value="1994">1994</option>
<option value="1993">1993</option>


</select>
<select name="month">
<option value="">All</option>
Expand Down
Loading

0 comments on commit 976a066

Please sign in to comment.