In [85]:
import pickle
import pandas as pd

# Load the raw paragraphs. Later cells index this object by '<sheet name>.md'
# and expect each value to be a sequence of (paragraph index, paragraph) tuples.
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only load pickle files that come from a trusted source.
with open('labeledData.pickle', 'rb') as handle:
    data = pickle.load(handle)

# Load the labels. sheet_name=None reads every sheet of the workbook into a
# dict of DataFrames keyed by sheet name.
labels = pd.read_excel('labels.xls', sheet_name=None)

# Drop the template sheet. pop() with a default (instead of `del`) does not
# raise KeyError when the workbook has no sheet with that name.
labels.pop('TEMPLATE SHEET', None)

In [113]:
# For every label sheet that has a matching '<sheet name>.md' entry in `data`,
# copy the paragraph text into a 'PARAGRAPH' column at the row given by the
# paragraph index. Sheets without a matching entry are left untouched.
for sheet_name, sheet in labels.items():
    print(f'Loading {sheet_name}')
    source_key = f'{sheet_name}.md'
    if source_key not in data:
        continue
    print(f'---- {source_key} found in data source')
    for entry in data[source_key]:
        para_index, paragraph = entry[0], entry[1]
        # Drop bare newline items and join what remains with single spaces.
        kept_pieces = [piece for piece in paragraph if piece != '\n']
        sheet.loc[para_index, 'PARAGRAPH'] = " ".join(kept_pieces)
    # Rows that received no paragraph hold NaN; store an empty string instead.
    sheet['PARAGRAPH'] = sheet['PARAGRAPH'].fillna('')

Loading megaresistencia.com
---- megaresistencia.com.md found in data source
Loading themcrookedvultures.com
---- themcrookedvultures.com.md found in data source
Loading icanhascheezburger.com
---- icanhascheezburger.com.md found in data source
Loading perezitos.com
---- perezitos.com.md found in data source
Loading statdx.com
---- statdx.com.md found in data source


In [132]:
# Serialize the label sheets (including the added PARAGRAPH column) to
# DB.pickle using the highest pickle protocol available.
with open('DB.pickle', mode='wb') as db_file:
    pickle.dump(labels, db_file, protocol=pickle.HIGHEST_PROTOCOL)

In [87]:
# Sample paragraph text; presumably used as the `example` query argument of
# calc_bestfit below — TODO confirm against the calling cell.
example = "How HugeDomains Collects this Information HugeDomains utilizes a variety of security measures to maintain the safety of customers' personal information. All supplied financial information (including credit card data) is transmitted via Secure Socket Layer (SSL) technology and then encrypted into the company's payment gateway provider's database. This database and the sensitive information contained within are only accessible by those authorized with special access rights to such systems and are required to keep the information confidential. "

In [97]:
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer

# Make sure the Punkt tokenizer data needed by nltk.word_tokenize is available.
nltk.download('punkt')

# Porter stemmer instance shared by stem_tokens.
stemmer = nltk.stem.porter.PorterStemmer()

# Table for str.translate: every character of string.punctuation maps to None,
# which deletes it from the translated text.
remove_punctuation_map = dict.fromkeys(map(ord, string.punctuation))

def stem_tokens(tokens):
    """Return a list with the stem of every token in ``tokens``, in order.

    Stemming is delegated to the module-level ``stemmer``.
    """
    return list(map(stemmer.stem, tokens))

def normalize(text):
    """Lowercase ``text``, remove punctuation, tokenize it and stem each token.

    Returns the list of stemmed tokens.
    """
    cleaned = text.lower().translate(remove_punctuation_map)
    tokens = nltk.word_tokenize(cleaned)
    return stem_tokens(tokens)

# TF-IDF vectorizer shared by cosine_sim; it tokenizes with `normalize` and
# drops English stop words. It is re-fitted on every cosine_sim call.
vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')

def cosine_sim(text1, text2):
    """Return the cosine similarity of the TF-IDF vectors of two texts.

    The shared ``vectorizer`` is fitted on just these two texts. With
    TfidfVectorizer's default L2 normalisation the dot product of the two
    rows is their cosine similarity.

    Parameters
    ----------
    text1, text2 : str
        The texts to compare.

    Returns
    -------
    float
        Entry [0, 1] of the 2x2 similarity matrix.
    """
    tfidf = vectorizer.fit_transform([text1, text2])
    # `@` is always a matrix product (`*` is elementwise on sparse arrays), and
    # .toarray() replaces the `.A` shorthand that SciPy removed in 1.14.
    return (tfidf @ tfidf.T).toarray()[0, 1]

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/patrickzoechbauer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [130]:
def calc_bestfit(example, labels):
    """Find the labeled paragraph most similar to ``example``.

    Parameters
    ----------
    example : str
        The query text.
    labels : dict
        Mapping of file name to a DataFrame with 'PARAGRAPH' and 'LABEL'
        columns.

    Returns
    -------
    dict
        Keys 'similarity', 'file', 'label' and 'PARAGRAPH' describing the best
        match. When no paragraph has a similarity above 0, 'similarity' is 0
        and the other three keys are None.
    """
    # Pre-populate every key so callers can read them even without a match.
    bestfit = {'similarity': 0, 'file': None, 'label': None, 'PARAGRAPH': None}
    for file, frame in labels.items():
        # Sheets that never received paragraph text have no PARAGRAPH column.
        if 'PARAGRAPH' not in frame.columns:
            continue
        # Iterate by position: the index is not guaranteed to be 0..n-1.
        for position, paragraph in enumerate(frame['PARAGRAPH']):
            # Missing (NaN) or blank paragraphs cannot score above 0; skip them.
            if not isinstance(paragraph, str) or not paragraph.strip():
                continue
            similarity = cosine_sim(example, paragraph)
            # Strict comparison keeps the first paragraph among ties.
            if bestfit['similarity'] < similarity:
                bestfit['similarity'] = similarity
                bestfit['file'] = file
                bestfit['label'] = frame['LABEL'].iloc[position]
                bestfit['PARAGRAPH'] = paragraph
    return bestfit