Permalink
Browse files

Handle de-duplication of records within a single insert.

  • Loading branch information...
1 parent 872192e commit 173ad85e733c09a39152de9536892e4e130eb246 @shoe shoe committed Sep 27, 2011
Showing 2 changed files with 8 additions and 0 deletions.
  1. +4 −0 EPFIngester.py
  2. +4 −0 EPFParser.py
View
@@ -358,7 +358,11 @@ def _escapeRecords(self, recordList, connection=None):
conn = (connection if connection else self.connect())
escapedRecords = []
cur = conn.cursor()
+ keys = {}
for aRec in recordList:
+ marker = tuple([aRec[i] for i in self.parser.primaryKeyIndexes])
+ if marker in keys: continue
+ keys[marker] = 1
if self.isMysql:
escRec = [conn.literal(aField) for aField in aRec]
else:
View
@@ -74,6 +74,7 @@ def __init__(self, filePath, typeMap={"CLOB":"LONGTEXT"}, recordDelim='\x02\n',
self.dateTypes = ["DATE", "DATETIME", "TIME", "TIMESTAMP"]
self.columnNames = []
self.primaryKey = []
+ self.primaryKeyIndexes = []
self.dataTypes = []
self.exportMode = None
self.dateColumns = [] #fields containing dates need special treatment; we'll cache the indexes here
@@ -120,6 +121,9 @@ def __init__(self, filePath, typeMap={"CLOB":"LONGTEXT"}, recordDelim='\x02\n',
self.exportMode = self.splitRow(aRow, requiredPrefix=exStart)[0]
self.eFile.seek(0, os.SEEK_SET) #seek back to the beginning
+ for pk in self.primaryKey:
+ self.primaryKeyIndexes.append(self.columnNames.index(pk))
+
#Convert any datatypes to mapped counterparts, and cache indexes of date/time types and number types
for j in range(len(self.dataTypes)):
dType = self.dataTypes[j]

0 comments on commit 173ad85

Please sign in to comment.