Skip to content

Commit

Permalink
Reindented and handled old Ruby syntax
Browse files Browse the repository at this point in the history
  • Loading branch information
mikel committed Feb 7, 2010
1 parent 16b6d42 commit 1bf79f1
Show file tree
Hide file tree
Showing 16 changed files with 443 additions and 446 deletions.
Expand Up @@ -6,7 +6,8 @@
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Contributor(s)

# Jeff Hodges
# Mark Pilgrim - port to Python
#
Expand Down Expand Up @@ -49,34 +50,34 @@ def reset
def feed(aStr, aCharLen)
# # """feed a character with known length"""
if aCharLen == 2
# we only care about 2-bytes character in our distribution analysis
order = get_order(aStr)
# we only care about 2-bytes character in our distribution analysis
order = get_order(aStr)
else
order = -1
order = -1
end
if order >= 0
@_mTotalChars += 1
# order is valid
if order < @_mTableSize:
if 512 > @_mCharToFreqOrder[order]:
@_mFreqChars += 1
end
end
@_mTotalChars += 1
# order is valid
if order < @_mTableSize
if 512 > @_mCharToFreqOrder[order]
@_mFreqChars += 1
end
end
end
end

def get_confidence
# """return confidence based on existing data"""
# if we didn't receive any character in our consideration range, return negative answer
if @_mTotalChars <= 0
return SURE_NO
return SURE_NO
end

if @_mTotalChars != @_mFreqChars:
r = @_mFreqChars / ((@_mTotalChars - @_mFreqChars) * @_mTypicalDistributionRatio)
if r < SURE_YES
return r
end
if @_mTotalChars != @_mFreqChars
r = @_mFreqChars / ((@_mTotalChars - @_mFreqChars) * @_mTypicalDistributionRatio)
if r < SURE_YES
return r
end
end

# normalize confidence (we don't want to be 100% sure)
Expand Down Expand Up @@ -111,9 +112,9 @@ def get_order(aStr)
# second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that
if aStr[0..0] >= "\xC4"
return 94 * (aStr[0] - 0xC4) + aStr[1] - 0xA1
return 94 * (aStr[0] - 0xC4) + aStr[1] - 0xA1
else
return -1
return -1
end
end
end
Expand All @@ -132,9 +133,9 @@ def get_order(aStr)
# second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that
if aStr[0..0] >= "\xB0"
return 94 * (aStr[0] - 0xB0) + aStr[1] - 0xA1
return 94 * (aStr[0] - 0xB0) + aStr[1] - 0xA1
else
return -1
return -1
end
end
end
Expand All @@ -153,9 +154,9 @@ def get_order(aStr)
# second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that
if (aStr[0..0] >= "\xB0") and (aStr[1..1] >= "\xA1")
return 94 * (aStr[0] - 0xB0) + aStr[1] - 0xA1
return 94 * (aStr[0] - 0xB0) + aStr[1] - 0xA1
else
return -1
return -1
end
end
end
Expand All @@ -174,13 +175,13 @@ def get_order(aStr)
# second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
# no validation needed here. State machine has done that
if aStr[0..0] >= "\xA4"
if aStr[1..1] >= "\xA1"
return 157 * (aStr[0] - 0xA4) + aStr[1] - 0xA1 + 63
else
return 157 * (aStr[0] - 0xA4) + aStr[1] - 0x40
end
if aStr[1..1] >= "\xA1"
return 157 * (aStr[0] - 0xA4) + aStr[1] - 0xA1 + 63
else
return 157 * (aStr[0] - 0xA4) + aStr[1] - 0x40
end
else
return -1
return -1
end
end
end
Expand All @@ -200,15 +201,15 @@ def get_order(aStr)
# no validation needed here. State machine has done that
aStr = aStr[0..1].join if aStr.class == Array
if (aStr[0..0] >= "\x81") and (aStr[0..0] <= "\x9F")
order = 188 * (aStr[0] - 0x81)
order = 188 * (aStr[0] - 0x81)
elsif (aStr[0..0] >= "\xE0") and (aStr[0..0] <= "\xEF")
order = 188 * (aStr[0] - 0xE0 + 31)
order = 188 * (aStr[0] - 0xE0 + 31)
else
return -1
return -1
end
order = order + aStr[1] - 0x40
if aStr[1..1] > "\x7F"
order =- 1
order =- 1
end
return order
end
Expand All @@ -227,10 +228,10 @@ def get_order(aStr)
# first byte range: 0xa0 -- 0xfe
# second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that
if aStr[0..0] >= "\xA0":
return 94 * (aStr[0] - 0xA1) + aStr[1] - 0xa1
if aStr[0..0] >= "\xA0"
return 94 * (aStr[0] - 0xA1) + aStr[1] - 0xa1
else
return -1
return -1
end
end
end
Expand Down
Expand Up @@ -6,7 +6,7 @@
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Contributor(s)
# Jeff Hodges - port to Ruby
# Mark Pilgrim - port to Python
#
Expand Down Expand Up @@ -40,71 +40,71 @@ def reset
super
@_mActiveNum = 0

for prober in @_mProbers:
if prober
prober.reset()
prober.active = true
@_mActiveNum += 1
end
for prober in @_mProbers
if prober
prober.reset()
prober.active = true
@_mActiveNum += 1
end
end
@_mBestGuessProber = nil
end

def get_charset_name
if not @_mBestGuessProber
get_confidence()
return nil unless @_mBestGuessProber
# self._mBestGuessProber = self._mProbers[0]
get_confidence()
return nil unless @_mBestGuessProber
# self._mBestGuessProber = self._mProbers[0]
end
return @_mBestGuessProber.get_charset_name()
end

def feed(aBuf)
for prober in @_mProbers
next unless prober
next unless prober.active
st = prober.feed(aBuf)
next unless st
if st == EFoundIt
@_mBestGuessProber = prober
return get_state()
elsif st == ENotMe
prober.active = false
@_mActiveNum -= 1
if @_mActiveNum <= 0
@_mState = ENotMe
return get_state()
end
end
next unless prober
next unless prober.active
st = prober.feed(aBuf)
next unless st
if st == EFoundIt
@_mBestGuessProber = prober
return get_state()
elsif st == ENotMe
prober.active = false
@_mActiveNum -= 1
if @_mActiveNum <= 0
@_mState = ENotMe
return get_state()
end
end
end
return get_state()
end

def get_confidence()
st = get_state()
if st == EFoundIt
return 0.99
return 0.99
elsif st == ENotMe
return 0.01
return 0.01
end
bestConf = 0.0
@_mBestGuessProber = nil
for prober in @_mProbers
next unless prober
unless prober.active
$stderr << "#{prober.get_charset_name()} not active\n" if $debug
next
end
cf = prober.get_confidence()
$stderr << "#{prober.get_charset_name} confidence = #{cf}\n" if $debug
if bestConf < cf
bestConf = cf
@_mBestGuessProber = prober
end
next unless prober
unless prober.active
$stderr << "#{prober.get_charset_name()} not active\n" if $debug
next
end
cf = prober.get_confidence()
$stderr << "#{prober.get_charset_name} confidence = #{cf}\n" if $debug
if bestConf < cf
bestConf = cf
@_mBestGuessProber = prober
end
end
return 0.0 unless @_mBestGuessProber
return bestConf
# else:
# else
# self._mBestGuessProber = self._mProbers[0]
# return self._mBestGuessProber.get_confidence()
end
Expand Down
Expand Up @@ -44,8 +44,8 @@ def next_state(c)
# if it is first byte, we also get byte length
byteCls = @_mModel['classTable'][c[0]]
if @_mCurrentState == EStart
@_mCurrentBytePos = 0
@_mCurrentCharLen = @_mModel['charLenTable'][byteCls]
@_mCurrentBytePos = 0
@_mCurrentCharLen = @_mModel['charLenTable'][byteCls]
end
# from byte's class and stateTable, we get its next state
@_mCurrentState = @_mModel['stateTable'][@_mCurrentState * @_mModel['classFactor'] + byteCls]
Expand Down
Expand Up @@ -30,21 +30,19 @@ module CharDet
class EscCharSetProber < CharSetProber
def initialize
super()
@_mCodingSM = [
CodingStateMachine.new(HZSMModel),
CodingStateMachine.new(ISO2022CNSMModel),
CodingStateMachine.new(ISO2022JPSMModel),
CodingStateMachine.new(ISO2022KRSMModel)
]
@_mCodingSM = [ CodingStateMachine.new(HZSMModel),
CodingStateMachine.new(ISO2022CNSMModel),
CodingStateMachine.new(ISO2022JPSMModel),
CodingStateMachine.new(ISO2022KRSMModel) ]
reset()
end

def reset
super()
for codingSM in @_mCodingSM:
next if not codingSM
codingSM.active = true
codingSM.reset()
for codingSM in @_mCodingSM
next if not codingSM
codingSM.active = true
codingSM.reset()
end
@_mActiveSM = @_mCodingSM.length
@_mDetectedCharset = nil
Expand All @@ -56,35 +54,36 @@ def get_charset_name

def get_confidence
if @_mDetectedCharset
return 0.99
return 0.99
else
return 0.00
return 0.00
end
end

def feed(aBuf)
aBuf.each_byte do |b|
c = b.chr
for codingSM in @_mCodingSM
next unless codingSM
next unless codingSM.active
codingState = codingSM.next_state(c)
if codingState == EError
codingSM.active = false
@_mActiveSM -= 1
if @_mActiveSM <= 0
@_mState = ENotMe
return get_state()
end
elsif codingState == EItsMe
@_mState = EFoundIt
@_mDetectedCharset = codingSM.get_coding_state_machine()
return get_state()
end
end
c = b.chr
for codingSM in @_mCodingSM
next unless codingSM
next unless codingSM.active
codingState = codingSM.next_state(c)
if codingState == EError
codingSM.active = false
@_mActiveSM -= 1
if @_mActiveSM <= 0
@_mState = ENotMe
return get_state()
end
elsif codingState == EItsMe
@_mState = EFoundIt
@_mDetectedCharset = codingSM.get_coding_state_machine()
return get_state()
end
end
end

return get_state()

end

end
end

0 comments on commit 1bf79f1

Please sign in to comment.