Skip to content

Commit 4480577

Browse files
committed
Optimize type detection
1 parent 1c2db6f commit 4480577

File tree

2 files changed

+730
-35
lines changed

2 files changed

+730
-35
lines changed

types.go

Lines changed: 246 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -319,12 +319,15 @@ func newColumnInfoListFromValues(header header, columnValues [][]string) columnI
319319
return columnInfos
320320
}
321321

322-
// Common datetime patterns to detect
323-
var datetimePatterns = []struct {
322+
// datetimePattern pairs a precompiled regular expression with the time layouts
// that isDatetime iterates over for values matching that expression.
323+
type datetimePattern struct {
324324
pattern *regexp.Regexp
325325
formats []string // Layouts tried in slice order for a value that matches pattern (e.g. time.RFC3339)
326-
}{
327-
// ISO8601 formats with timezone
326+
}
327+
328+
// Cached datetime patterns for better performance
329+
var cachedDatetimePatterns = []datetimePattern{
330+
// ISO8601 formats with timezone (most common first for early termination)
328331
{
329332
regexp.MustCompile(`^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d+)?(Z|[+-]\d{2}:\d{2})$`),
330333
[]string{time.RFC3339, time.RFC3339Nano},
@@ -373,14 +376,56 @@ var datetimePatterns = []struct {
373376
},
374377
}
375378

376-
// isDatetime checks if a string value represents a datetime
379+
// Type inference constants: sampling limits and per-type decision thresholds.
380+
const (
381+
// MaxSampleSize is the maximum number of values getSampleValues returns for type inference
382+
MaxSampleSize = 1000
383+
// MinConfidenceThreshold is the minimum fraction (0-1, not a percentage) of non-empty values that must match a type
384+
MinConfidenceThreshold = 0.8
385+
// EarlyTerminationThreshold is the fraction (0-1) of text among the non-empty values seen so far above which inference stops and returns text
386+
EarlyTerminationThreshold = 0.5
387+
// MinDatetimeLength is the minimum length in bytes (after trimming) of a value isDatetime will test
388+
MinDatetimeLength = 4
389+
// MaxDatetimeLength is the maximum length in bytes (after trimming) of a value isDatetime will test
390+
MaxDatetimeLength = 35
391+
// SamplingStratificationFactor is the number of sections used for stratified sampling; inputs shorter than MaxSampleSize times this factor are sampled in a single pass instead
392+
SamplingStratificationFactor = 3
393+
// MinRealThreshold is the minimum fraction (0-1) of real values required to classify a numeric column as REAL
394+
MinRealThreshold = 0.1
395+
)
396+
397+
// isDatetime checks if a string value represents a datetime with optimized pattern matching
377398
func isDatetime(value string) bool {
378399
value = strings.TrimSpace(value)
379400
if value == "" {
380401
return false
381402
}
382403

383-
for _, dp := range datetimePatterns {
404+
// Quick length-based filtering to avoid regex on obviously non-datetime values
405+
valueLen := len(value)
406+
if valueLen < MinDatetimeLength || valueLen > MaxDatetimeLength {
407+
return false
408+
}
409+
410+
// Quick character check - datetime must contain at least one digit and separator
411+
hasDigit := false
412+
hasSeparator := false
413+
for _, r := range value {
414+
if r >= '0' && r <= '9' {
415+
hasDigit = true
416+
} else if r == '-' || r == '/' || r == '.' || r == ':' || r == 'T' || r == ' ' {
417+
hasSeparator = true
418+
}
419+
if hasDigit && hasSeparator {
420+
break
421+
}
422+
}
423+
if !hasDigit || !hasSeparator {
424+
return false
425+
}
426+
427+
// Test patterns with early termination
428+
for _, dp := range cachedDatetimePatterns {
384429
if dp.pattern.MatchString(value) {
385430
// Try each format for this pattern
386431
for _, format := range dp.formats {
@@ -394,63 +439,230 @@ func isDatetime(value string) bool {
394439
return false
395440
}
396441

397-
// inferColumnType infers the SQL column type from a slice of string values
442+
// inferColumnType infers the SQL column type from a slice of string values with optimized sampling
398443
func inferColumnType(values []string) columnType {
399444
if len(values) == 0 {
400445
return columnTypeText
401446
}
402447

403-
hasDatetime := false
404-
hasReal := false
405-
hasInteger := false
406-
hasText := false
448+
// Use sampling for large datasets to improve performance
449+
sampleValues := getSampleValues(values)
450+
451+
// Track type counts for confidence-based inference
452+
typeCounts := map[columnType]int{
453+
columnTypeText: 0,
454+
columnTypeDatetime: 0,
455+
columnTypeReal: 0,
456+
columnTypeInteger: 0,
457+
}
458+
459+
nonEmptyCount := 0
407460

408-
for _, value := range values {
461+
for _, value := range sampleValues {
409462
// Skip empty values for type inference
410463
value = strings.TrimSpace(value)
411464
if value == "" {
412465
continue
413466
}
467+
nonEmptyCount++
414468

415-
// Check if it's a datetime first (before checking numbers)
416-
if isDatetime(value) {
417-
hasDatetime = true
418-
continue
469+
// Determine the type of this value
470+
valueType := classifyValue(value)
471+
typeCounts[valueType]++
472+
473+
// Early termination: if too many text values, it's definitely text
474+
if typeCounts[columnTypeText] > 0 && float64(typeCounts[columnTypeText])/float64(nonEmptyCount) > EarlyTerminationThreshold {
475+
return columnTypeText
419476
}
477+
}
420478

421-
// Try to parse as integer
422-
if _, err := strconv.ParseInt(value, 10, 64); err == nil {
423-
hasInteger = true
424-
continue
479+
if nonEmptyCount == 0 {
480+
return columnTypeText
481+
}
482+
483+
// Determine the most appropriate type based on confidence thresholds
484+
return selectColumnType(typeCounts, nonEmptyCount)
485+
}
486+
487+
// getSampleValues returns a sample of values for type inference to improve performance
488+
// Uses stratified sampling to ensure better representation across the dataset
489+
func getSampleValues(values []string) []string {
490+
if len(values) <= MaxSampleSize {
491+
return values
492+
}
493+
494+
sampleSize := MaxSampleSize
495+
samples := make([]string, 0, sampleSize)
496+
497+
// For very small datasets relative to sample size, fall back to simple sampling
498+
if len(values) < sampleSize*SamplingStratificationFactor {
499+
step := max(1, len(values)/sampleSize)
500+
for i := 0; i < len(values) && len(samples) < sampleSize; i += step {
501+
samples = append(samples, values[i])
425502
}
503+
return samples
504+
}
426505

427-
// Try to parse as float
428-
if _, err := strconv.ParseFloat(value, 64); err == nil {
429-
hasReal = true
430-
continue
506+
// Stratified sampling: divide into 3 sections for better representation
507+
sectionSize := len(values) / SamplingStratificationFactor
508+
if sectionSize == 0 {
509+
// If section size is 0, fall back to simple sampling
510+
step := max(1, len(values)/sampleSize)
511+
for i := 0; i < len(values) && len(samples) < sampleSize; i += step {
512+
samples = append(samples, values[i])
431513
}
514+
return samples
515+
}
516+
517+
samplesPerSection := sampleSize / SamplingStratificationFactor
518+
remainder := sampleSize % SamplingStratificationFactor
432519

433-
// If it's not a number or datetime, it's text
434-
hasText = true
435-
break // If any value is text, the whole column is text
520+
// Ensure each section gets at least one sample if possible
521+
if samplesPerSection == 0 {
522+
samplesPerSection = 1
523+
remainder = max(0, sampleSize-SamplingStratificationFactor)
436524
}
437525

438-
// Determine the most appropriate type
439-
// Priority: TEXT > DATETIME > REAL > INTEGER
440-
if hasText {
526+
// Sample from beginning section with bounds checking
527+
beginSamples := samplesPerSection
528+
if remainder > 0 {
529+
beginSamples++
530+
remainder--
531+
}
532+
if beginSamples > 0 {
533+
step := max(1, sectionSize/beginSamples)
534+
for i := 0; i < sectionSize && len(samples) < beginSamples && i < len(values); i += step {
535+
samples = append(samples, values[i])
536+
}
537+
}
538+
539+
// Sample from middle section with bounds checking
540+
middleSamples := samplesPerSection
541+
if remainder > 0 {
542+
middleSamples++
543+
}
544+
if middleSamples > 0 {
545+
startMiddle := sectionSize
546+
step := max(1, sectionSize/middleSamples)
547+
targetSamples := len(samples) + middleSamples
548+
for i := 0; i < sectionSize && len(samples) < targetSamples; i += step {
549+
idx := startMiddle + i
550+
if idx < len(values) {
551+
samples = append(samples, values[idx])
552+
}
553+
}
554+
}
555+
556+
// Sample from end section with bounds checking
557+
endSamples := sampleSize - len(samples)
558+
if endSamples > 0 {
559+
startEnd := 2 * sectionSize
560+
if startEnd < len(values) {
561+
endSectionSize := len(values) - startEnd
562+
step := max(1, endSectionSize/endSamples)
563+
for i := 0; i < endSectionSize && len(samples) < sampleSize; i += step {
564+
idx := startEnd + i
565+
if idx < len(values) {
566+
samples = append(samples, values[idx])
567+
}
568+
}
569+
}
570+
}
571+
572+
return samples
573+
}
574+
575+
// classifyValue determines the type of a single value
576+
func classifyValue(value string) columnType {
577+
// Check if it's a datetime first (before checking numbers)
578+
if isDatetime(value) {
579+
return columnTypeDatetime
580+
}
581+
582+
// Check for integer first to avoid redundant parsing
583+
if isInteger(value) {
584+
return columnTypeInteger
585+
}
586+
587+
// Then check for float (covers non-integer numbers)
588+
if isFloat(value) {
589+
return columnTypeReal
590+
}
591+
592+
return columnTypeText
593+
}
594+
595+
// isInteger reports whether value is a base-10 integer that fits in a signed
// 64-bit integer, as decided by strconv.ParseInt.
//
// The first byte is inspected before parsing so that values which cannot
// possibly be integers are rejected without calling the parser.
func isInteger(value string) bool {
	if value == "" {
		return false
	}

	lead := value[0]
	signed := lead == '+' || lead == '-'
	digit := lead >= '0' && lead <= '9'
	if !signed && !digit {
		return false
	}

	if _, err := strconv.ParseInt(value, 10, 64); err != nil {
		return false
	}
	return true
}
609+
610+
// isFloat reports whether value contains at least one ASCII digit and is
// accepted by strconv.ParseFloat as a 64-bit floating-point number.
//
// The digit scan runs first so that digit-free input is rejected without
// calling the parser.
func isFloat(value string) bool {
	digitAt := -1
	for i := 0; i < len(value); i++ {
		if c := value[i]; c >= '0' && c <= '9' {
			digitAt = i
			break
		}
	}
	if digitAt < 0 {
		return false
	}

	if _, err := strconv.ParseFloat(value, 64); err != nil {
		return false
	}
	return true
}
627+
628+
// selectColumnType selects the best column type based on confidence analysis
629+
func selectColumnType(typeCounts map[columnType]int, totalCount int) columnType {
630+
// If any text values exist with reasonable confidence, choose text
631+
if typeCounts[columnTypeText] > 0 {
441632
return columnTypeText
442633
}
443-
if hasDatetime {
634+
635+
// Calculate confidence for each type
636+
datetimeConfidence := float64(typeCounts[columnTypeDatetime]) / float64(totalCount)
637+
realConfidence := float64(typeCounts[columnTypeReal]) / float64(totalCount)
638+
integerConfidence := float64(typeCounts[columnTypeInteger]) / float64(totalCount)
639+
640+
// Choose type with highest confidence above threshold
641+
if datetimeConfidence >= MinConfidenceThreshold {
444642
return columnTypeDatetime
445643
}
446-
if hasReal {
644+
// For mixed numeric types, prefer REAL if there are significant real values
645+
// Only classify as REAL if real values make up a reasonable portion
646+
if realConfidence >= MinRealThreshold && (realConfidence+integerConfidence) >= MinConfidenceThreshold {
647+
return columnTypeReal
648+
}
649+
650+
if integerConfidence >= MinConfidenceThreshold {
651+
return columnTypeInteger
652+
}
653+
654+
// If no type has sufficient confidence, choose the most appropriate numeric type
655+
if realConfidence > 0 {
447656
return columnTypeReal
448657
}
449-
if hasInteger {
658+
if integerConfidence > 0 {
450659
return columnTypeInteger
451660
}
661+
if datetimeConfidence > 0 {
662+
return columnTypeDatetime
663+
}
452664

453-
// Default to TEXT if no values were found
665+
// Default to text if nothing else matches
454666
return columnTypeText
455667
}
456668

0 commit comments

Comments
 (0)