@@ -319,12 +319,15 @@ func newColumnInfoListFromValues(header header, columnValues [][]string) columnI
319
319
return columnInfos
320
320
}
321
321
322
- // Common datetime patterns to detect
323
- var datetimePatterns = [] struct {
322
// datetimePattern couples a compiled regular expression with the time
// layouts that are tried for values the expression matches.
//
// The field order is significant: the pattern table is written with
// positional composite literals (expression first, layouts second).
type datetimePattern struct {
	pattern *regexp.Regexp // shape check applied to a value before any layout is tried
	formats []string       // one or more time layouts associated with the same shape
}
327
+
328
+ // Cached datetime patterns for better performance
329
+ var cachedDatetimePatterns = []datetimePattern {
330
+ // ISO8601 formats with timezone (most common first for early termination)
328
331
{
329
332
regexp .MustCompile (`^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d+)?(Z|[+-]\d{2}:\d{2})$` ),
330
333
[]string {time .RFC3339 , time .RFC3339Nano },
@@ -373,14 +376,56 @@ var datetimePatterns = []struct {
373
376
},
374
377
}
375
378
376
- // isDatetime checks if a string value represents a datetime
379
// Tuning parameters for column type inference.
const (
	// MaxSampleSize is the largest number of values inspected per column;
	// longer columns are reduced to this many samples before classification.
	MaxSampleSize = 1000
	// MinConfidenceThreshold is the share (0-1) of non-empty values that has
	// to agree on a type for that type to be selected outright.
	MinConfidenceThreshold = 0.8
	// EarlyTerminationThreshold is the share (0-1) of text values above which
	// a column may be declared TEXT before every sample has been examined.
	EarlyTerminationThreshold = 0.5
	// MinDatetimeLength is the shortest string, in bytes, that is still
	// considered a datetime candidate.
	MinDatetimeLength = 4
	// MaxDatetimeLength is the longest string, in bytes, that is still
	// considered a datetime candidate.
	MaxDatetimeLength = 35
	// SamplingStratificationFactor is the multiple of MaxSampleSize a column
	// must reach before stratified sampling replaces simple sampling.
	SamplingStratificationFactor = 3
	// MinRealThreshold is the share (0-1) of real values a numeric column
	// needs before the confidence rule classifies it as REAL.
	MinRealThreshold = 0.1
)
396
+
397
+ // isDatetime checks if a string value represents a datetime with optimized pattern matching
377
398
func isDatetime (value string ) bool {
378
399
value = strings .TrimSpace (value )
379
400
if value == "" {
380
401
return false
381
402
}
382
403
383
- for _ , dp := range datetimePatterns {
404
+ // Quick length-based filtering to avoid regex on obviously non-datetime values
405
+ valueLen := len (value )
406
+ if valueLen < MinDatetimeLength || valueLen > MaxDatetimeLength {
407
+ return false
408
+ }
409
+
410
+ // Quick character check - datetime must contain at least one digit and separator
411
+ hasDigit := false
412
+ hasSeparator := false
413
+ for _ , r := range value {
414
+ if r >= '0' && r <= '9' {
415
+ hasDigit = true
416
+ } else if r == '-' || r == '/' || r == '.' || r == ':' || r == 'T' || r == ' ' {
417
+ hasSeparator = true
418
+ }
419
+ if hasDigit && hasSeparator {
420
+ break
421
+ }
422
+ }
423
+ if ! hasDigit || ! hasSeparator {
424
+ return false
425
+ }
426
+
427
+ // Test patterns with early termination
428
+ for _ , dp := range cachedDatetimePatterns {
384
429
if dp .pattern .MatchString (value ) {
385
430
// Try each format for this pattern
386
431
for _ , format := range dp .formats {
@@ -394,63 +439,230 @@ func isDatetime(value string) bool {
394
439
return false
395
440
}
396
441
397
- // inferColumnType infers the SQL column type from a slice of string values
442
+ // inferColumnType infers the SQL column type from a slice of string values with optimized sampling
398
443
func inferColumnType (values []string ) columnType {
399
444
if len (values ) == 0 {
400
445
return columnTypeText
401
446
}
402
447
403
- hasDatetime := false
404
- hasReal := false
405
- hasInteger := false
406
- hasText := false
448
+ // Use sampling for large datasets to improve performance
449
+ sampleValues := getSampleValues (values )
450
+
451
+ // Track type counts for confidence-based inference
452
+ typeCounts := map [columnType ]int {
453
+ columnTypeText : 0 ,
454
+ columnTypeDatetime : 0 ,
455
+ columnTypeReal : 0 ,
456
+ columnTypeInteger : 0 ,
457
+ }
458
+
459
+ nonEmptyCount := 0
407
460
408
- for _ , value := range values {
461
+ for _ , value := range sampleValues {
409
462
// Skip empty values for type inference
410
463
value = strings .TrimSpace (value )
411
464
if value == "" {
412
465
continue
413
466
}
467
+ nonEmptyCount ++
414
468
415
- // Check if it's a datetime first (before checking numbers)
416
- if isDatetime (value ) {
417
- hasDatetime = true
418
- continue
469
+ // Determine the type of this value
470
+ valueType := classifyValue (value )
471
+ typeCounts [valueType ]++
472
+
473
+ // Early termination: if too many text values, it's definitely text
474
+ if typeCounts [columnTypeText ] > 0 && float64 (typeCounts [columnTypeText ])/ float64 (nonEmptyCount ) > EarlyTerminationThreshold {
475
+ return columnTypeText
419
476
}
477
+ }
420
478
421
- // Try to parse as integer
422
- if _ , err := strconv .ParseInt (value , 10 , 64 ); err == nil {
423
- hasInteger = true
424
- continue
479
+ if nonEmptyCount == 0 {
480
+ return columnTypeText
481
+ }
482
+
483
+ // Determine the most appropriate type based on confidence thresholds
484
+ return selectColumnType (typeCounts , nonEmptyCount )
485
+ }
486
+
487
+ // getSampleValues returns a sample of values for type inference to improve performance
488
+ // Uses stratified sampling to ensure better representation across the dataset
489
+ func getSampleValues (values []string ) []string {
490
+ if len (values ) <= MaxSampleSize {
491
+ return values
492
+ }
493
+
494
+ sampleSize := MaxSampleSize
495
+ samples := make ([]string , 0 , sampleSize )
496
+
497
+ // For very small datasets relative to sample size, fall back to simple sampling
498
+ if len (values ) < sampleSize * SamplingStratificationFactor {
499
+ step := max (1 , len (values )/ sampleSize )
500
+ for i := 0 ; i < len (values ) && len (samples ) < sampleSize ; i += step {
501
+ samples = append (samples , values [i ])
425
502
}
503
+ return samples
504
+ }
426
505
427
- // Try to parse as float
428
- if _ , err := strconv .ParseFloat (value , 64 ); err == nil {
429
- hasReal = true
430
- continue
506
+ // Stratified sampling: divide into 3 sections for better representation
507
+ sectionSize := len (values ) / SamplingStratificationFactor
508
+ if sectionSize == 0 {
509
+ // If section size is 0, fall back to simple sampling
510
+ step := max (1 , len (values )/ sampleSize )
511
+ for i := 0 ; i < len (values ) && len (samples ) < sampleSize ; i += step {
512
+ samples = append (samples , values [i ])
431
513
}
514
+ return samples
515
+ }
516
+
517
+ samplesPerSection := sampleSize / SamplingStratificationFactor
518
+ remainder := sampleSize % SamplingStratificationFactor
432
519
433
- // If it's not a number or datetime, it's text
434
- hasText = true
435
- break // If any value is text, the whole column is text
520
+ // Ensure each section gets at least one sample if possible
521
+ if samplesPerSection == 0 {
522
+ samplesPerSection = 1
523
+ remainder = max (0 , sampleSize - SamplingStratificationFactor )
436
524
}
437
525
438
- // Determine the most appropriate type
439
- // Priority: TEXT > DATETIME > REAL > INTEGER
440
- if hasText {
526
+ // Sample from beginning section with bounds checking
527
+ beginSamples := samplesPerSection
528
+ if remainder > 0 {
529
+ beginSamples ++
530
+ remainder --
531
+ }
532
+ if beginSamples > 0 {
533
+ step := max (1 , sectionSize / beginSamples )
534
+ for i := 0 ; i < sectionSize && len (samples ) < beginSamples && i < len (values ); i += step {
535
+ samples = append (samples , values [i ])
536
+ }
537
+ }
538
+
539
+ // Sample from middle section with bounds checking
540
+ middleSamples := samplesPerSection
541
+ if remainder > 0 {
542
+ middleSamples ++
543
+ }
544
+ if middleSamples > 0 {
545
+ startMiddle := sectionSize
546
+ step := max (1 , sectionSize / middleSamples )
547
+ targetSamples := len (samples ) + middleSamples
548
+ for i := 0 ; i < sectionSize && len (samples ) < targetSamples ; i += step {
549
+ idx := startMiddle + i
550
+ if idx < len (values ) {
551
+ samples = append (samples , values [idx ])
552
+ }
553
+ }
554
+ }
555
+
556
+ // Sample from end section with bounds checking
557
+ endSamples := sampleSize - len (samples )
558
+ if endSamples > 0 {
559
+ startEnd := 2 * sectionSize
560
+ if startEnd < len (values ) {
561
+ endSectionSize := len (values ) - startEnd
562
+ step := max (1 , endSectionSize / endSamples )
563
+ for i := 0 ; i < endSectionSize && len (samples ) < sampleSize ; i += step {
564
+ idx := startEnd + i
565
+ if idx < len (values ) {
566
+ samples = append (samples , values [idx ])
567
+ }
568
+ }
569
+ }
570
+ }
571
+
572
+ return samples
573
+ }
574
+
575
+ // classifyValue determines the type of a single value
576
+ func classifyValue (value string ) columnType {
577
+ // Check if it's a datetime first (before checking numbers)
578
+ if isDatetime (value ) {
579
+ return columnTypeDatetime
580
+ }
581
+
582
+ // Check for integer first to avoid redundant parsing
583
+ if isInteger (value ) {
584
+ return columnTypeInteger
585
+ }
586
+
587
+ // Then check for float (covers non-integer numbers)
588
+ if isFloat (value ) {
589
+ return columnTypeReal
590
+ }
591
+
592
+ return columnTypeText
593
+ }
594
+
595
// isInteger reports whether value parses as a base-10 signed 64-bit integer.
//
// The value is used as given (no trimming). Before the full parse, the
// leading byte is checked so that values which cannot be integers are
// rejected cheaply.
func isInteger(value string) bool {
	if value == "" {
		return false
	}

	// Only a sign or an ASCII digit may open an integer.
	switch lead := value[0]; {
	case lead == '+' || lead == '-':
	case lead >= '0' && lead <= '9':
	default:
		return false
	}

	if _, err := strconv.ParseInt(value, 10, 64); err != nil {
		return false
	}
	return true
}
609
+
610
// isFloat reports whether value parses as a 64-bit floating-point number.
//
// A value without any ASCII digit is rejected before parsing. Besides being
// a cheap filter, this excludes the digit-free spellings that
// strconv.ParseFloat would otherwise accept, such as "NaN" and "Inf".
func isFloat(value string) bool {
	if !strings.ContainsAny(value, "0123456789") {
		return false
	}

	if _, err := strconv.ParseFloat(value, 64); err != nil {
		return false
	}
	return true
}
627
+
628
+ // selectColumnType selects the best column type based on confidence analysis
629
+ func selectColumnType (typeCounts map [columnType ]int , totalCount int ) columnType {
630
+ // If any text values exist with reasonable confidence, choose text
631
+ if typeCounts [columnTypeText ] > 0 {
441
632
return columnTypeText
442
633
}
443
- if hasDatetime {
634
+
635
+ // Calculate confidence for each type
636
+ datetimeConfidence := float64 (typeCounts [columnTypeDatetime ]) / float64 (totalCount )
637
+ realConfidence := float64 (typeCounts [columnTypeReal ]) / float64 (totalCount )
638
+ integerConfidence := float64 (typeCounts [columnTypeInteger ]) / float64 (totalCount )
639
+
640
+ // Choose type with highest confidence above threshold
641
+ if datetimeConfidence >= MinConfidenceThreshold {
444
642
return columnTypeDatetime
445
643
}
446
- if hasReal {
644
+ // For mixed numeric types, prefer REAL if there are significant real values
645
+ // Only classify as REAL if real values make up a reasonable portion
646
+ if realConfidence >= MinRealThreshold && (realConfidence + integerConfidence ) >= MinConfidenceThreshold {
647
+ return columnTypeReal
648
+ }
649
+
650
+ if integerConfidence >= MinConfidenceThreshold {
651
+ return columnTypeInteger
652
+ }
653
+
654
+ // If no type has sufficient confidence, choose the most appropriate numeric type
655
+ if realConfidence > 0 {
447
656
return columnTypeReal
448
657
}
449
- if hasInteger {
658
+ if integerConfidence > 0 {
450
659
return columnTypeInteger
451
660
}
661
+ if datetimeConfidence > 0 {
662
+ return columnTypeDatetime
663
+ }
452
664
453
- // Default to TEXT if no values were found
665
+ // Default to text if nothing else matches
454
666
return columnTypeText
455
667
}
456
668
0 commit comments