In [1]:
import nltk
from nltk.corpus import stopwords
import string
import hashlib
import random
import numpy as np

# Default pair of input documents used by the demo calls further down
# (tokenize_file(FILE1) and apply_w_shingling(FILE1, FILE2)).
FILE1 = "file1.txt"
FILE2 = "file2.txt"

In [2]:
def get_shingles(words, shingle_size):
    """Build every contiguous run of ``shingle_size`` words as one string.

    Args:
        words: Sequence of string tokens in document order.
        shingle_size: Number of consecutive tokens per shingle.

    Returns:
        A list of shingles, each being the window's tokens joined by a
        single space, ordered by the window's starting position. The list
        is empty when ``words`` has fewer than ``shingle_size`` tokens.
    """
    window_count = len(words) - shingle_size + 1
    return [
        " ".join(words[start:start + shingle_size])
        for start in range(window_count)
    ]

# Sanity check: four tokens with a window of 3 yield two overlapping shingles.
print get_shingles(["1", "2", "3", "4"], 3)

['1 2 3', '2 3 4']


In [3]:
def tokenize_file(filename):
    """Read a UTF-8 text file and return its lowercased alphanumeric tokens.

    Each line is decoded and stripped of surrounding whitespace, the lines
    are joined with single spaces into one string, and that string is split
    with ``nltk.word_tokenize``. Tokens that are not purely alphanumeric
    (punctuation, for example) are dropped; stop words are kept.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of lowercased unicode tokens in document order.

    Raises:
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # Read raw bytes so the explicit UTF-8 decode below behaves the same on
    # Python 2 and Python 3 (text-mode lines have no .decode() on Python 3).
    with open(filename, "rb") as f:
        stripped_lines = [line.decode("utf-8").strip() for line in f]
    complete_text = " ".join(stripped_lines)
    tokens = nltk.word_tokenize(complete_text)
    return [token.lower() for token in tokens if token.isalnum()]
        
# Preview the first 20 tokens of FILE1 to check tokenization and lowercasing.
tokenize_file(FILE1)[:20]

[u'text',
 u'mining',
 u'also',
 u'referred',
 u'to',
 u'as',
 u'text',
 u'data',
 u'mining',
 u'roughly',
 u'equivalent',
 u'to',
 u'text',
 u'analytics',
 u'is',
 u'the',
 u'process',
 u'of',
 u'deriving',
 u'information']

In [4]:
def compute_hash(word):
    """Return the MD5 hex digest of ``word``.

    Args:
        word: Text to hash. Byte strings are hashed as-is; unicode text is
            encoded as UTF-8 first.

    Returns:
        The 32-character lowercase hexadecimal MD5 digest.
    """
    # hashlib only digests bytes. Encoding explicitly avoids the implicit
    # ASCII conversion on Python 2 (which fails on non-ASCII text) and the
    # TypeError raised for str on Python 3.
    if not isinstance(word, bytes):
        word = word.encode("utf-8")
    return hashlib.md5(word).hexdigest()

# Example: hex MD5 digest of a short ASCII string.
compute_hash("one")

'f97c5d29941bfb1b2fdab0874906ab82'

In [5]:
def apply_w_shingling(file1, file2, shingle_size=4, num_samples=1000,
                      verbose=True, seed=None):
    """Estimate how much two text files overlap using hashed word shingles.

    Both files are tokenized and cut into shingles of ``shingle_size`` words,
    and every shingle is hashed. Shingle positions are then drawn uniformly
    at random, with replacement, from the concatenation of both files'
    shingle lists; a draw counts as a hit when the shingle's hash occurs in
    both files.

    Args:
        file1: Path of the first text file.
        file2: Path of the second text file.
        shingle_size: Number of words per shingle (default 4).
        num_samples: Number of random draws (default 1000).
        verbose: When true (default), print the shingle of every hit.
        seed: Optional seed for a private random generator, making the
            estimate reproducible. When None, numpy's global random state
            is used.

    Returns:
        The fraction of draws that were hits, as a float in [0, 1].
        Returns 0.0 when neither file yields any shingle.

    Raises:
        ValueError: If ``num_samples`` is less than 1.
    """
    if num_samples < 1:
        raise ValueError("num_samples must be at least 1")

    shingles1 = get_shingles(tokenize_file(file1), shingle_size)
    shingles2 = get_shingles(tokenize_file(file2), shingle_size)

    # One combined list lets a single index address either file's shingles:
    # positions below len(shingles1) belong to file1, the rest to file2.
    all_shingles = shingles1 + shingles2
    if not all_shingles:
        # Both files are shorter than one shingle; there is nothing to
        # sample from (np.random.choice would raise on an empty population).
        return 0.0

    all_hashes = [compute_hash(shingle) for shingle in all_shingles]
    split = len(shingles1)
    shared_hashes = set(all_hashes[:split]) & set(all_hashes[split:])

    rng = np.random if seed is None else np.random.RandomState(seed)
    hits = 0
    for idx in rng.choice(len(all_hashes), num_samples):
        if all_hashes[idx] in shared_hashes:
            hits += 1
            if verbose:
                print(all_shingles[idx])

    return float(hits) / num_samples
    
# Baseline run on the two default documents (FILE1, FILE2).
apply_w_shingling(FILE1, FILE2)

0.0

In [6]:
apply_w_shingling("plag1.txt", "plag2.txt")

the folly of excessive
on to protect himself
show similarly when he
nature of the choice
folly of excessive melodramatic
is acting out for
side the pictures of
up side by side
out the heart of
of the two kings
pretense of madness the
and proceeds to describe
laertes and perhaps for
kings old hamlet and
when hamlet enters his
he is acting out
claudius and proceeds to
leaps into the open
into the open grave
the choice she has
funeral ranting in high
and prevent his antagonists
antagonists from plucking out
for himself as well
made presenting truth by
on to protect himself
out the heart of
plucking out the heart
of the choice she
acting out for laertes
prevent his antagonists from
to protect himself and
himself as well the
two kings old hamlet
leaps into the open
antagonists from plucking out
heroic terms he is
on to protect himself
grave at funeral ranting
pretense of madness the
when hamlet enters his
himself and prevent his
protect himself and prevent
choice she has made
and prevent hi

0.713

In [7]:
apply_w_shingling("trump.txt", "obama.txt")

you work hard for
in life that your
your bond and you
willingness to work for
fell in love with
because we want our
all the way to
thank you god bless
that you work hard
and your willingness to
that your word is
willingness to work for
life that your word
your willingness to work
your achievements is the
your bond and you
word is your bond
and god bless america
in this nation to
dreams and your willingness
that you treat people
your word is your
and god bless america
hard for what you
your word is your
that you treat people
do what you say
that your word is
fell in love with
you do what you
to work for them
to work for them
that the only limit
of the united states
your word is your
that the only limit
this country so much
of the united states
to know that the
of the united states
thank you god bless
and you do what
you want in life
work hard for what


0.044

In [12]:
apply_w_shingling("trump.txt", "bush.txt")

god bless you and
am so proud of
am so proud of
i am so proud


0.004

In [8]:
apply_w_shingling("clinton.txt", "obama.txt")

men and women who
president of the united
in the united states
to the white house


0.004

In [9]:
apply_w_shingling("obama.txt", "bush.txt")

in the united states
united states of america
and god bless america
you and god bless
bless you and god
president of the united
the united states of
president of the united
united states of america


0.009

In [10]:
apply_w_shingling("text-mining-wikipedia.txt", "text-mining-berklee.txt")

is the use of
in order to find


0.002

In [11]:
apply_w_shingling("text-mining-isr.txt", "text-mining-berklee.txt")

what is text mining


0.001

In [13]:
apply_w_shingling("poem1.txt", "poem2.txt")

more supple than the
to god we never
they must have been
have been closer than
out to where the
rose for water in
did more supple than
stepped between the trees
those ribs that ragged
to where the forest
the forest lapped the
i rose for water
where the forest lapped
i rose for water
out to where the
back towards whatever followed
must have been closer
ones who stepped between
forest lapped the edge
in the middle of
that ragged fur their
the holidays and they
were brighter every time
we never saw the
looking out to where
and they must have
before because i have
no memory of those
closer than before because
ragged fur their eyes
who stepped between the
saw them stealing through
then five years on
those ribs that ragged
in the holidays and
the trees on coloured
saw the ones who
brought them up each
trees on coloured hooves
teatime in the holidays
forest lapped the edge
hooves i brought them
the same house i
up each teatime in
ones who stepped between
trees on coloured hooves
looking out t

0.692

In [14]:
apply_w_shingling("poem3.txt", "poem4.txt")

either in bed or
you see them home
smell of cooking they
hear their footsteps the
when you see them
see their faces too
see their faces too
to see their faces
the smell of cooking
they are too tired
smell of cooking they
home early from work
their faces too sad
they quicken their step
them home early from
the smell of cooking
smell of cooking they
home early from work
but when you see
to see their faces
you see them home
they quicken their step
see them home early
home early from work
at the smell of
them home early from
hear their footsteps the
smell of cooking they
at the smell of
to see their faces
at the smell of
hear their footsteps the
smell of cooking they
see their faces too
them home early from
the smell of cooking
but when you see
but when you see
they are too tired
see their faces too
this masculine invisibility makes
smell of cooking they
you see them home
see them home early
but when you see
the smell of cooking
this masculine invisibility makes
see their faces too
they qu

0.205

In [15]:
apply_w_shingling("poem1.txt", "poem4.txt")

0.0