In [1]:
import pandas as pd
import re
import numpy as np
import csv
import spacy
import pickle


This script is a rule-based classifier that detects quotes and lyrics based on the names and content of the website. The evaluation of the classifier is in MLClassifier.

In [232]:
# Load the search-result table and preview its first three rows.
file = pd.read_csv('./Data/all.csv')
file[0:3]

Unnamed: 0,text,link,name,description
0,A morning filled with the goodness of God's pe...,https://www.pinterest.co.uk/pin/12434163968566...,May your day be filled with the blessings of l...,Wishing you a day filled with the deepest bles...
1,A morning filled with the goodness of God's pe...,https://www.pinterest.co.uk/barbarajstone3/mor...,386 best Morning Coffee with God! images on Pi...,"Sep 28, 2018- Explore Barbara Stone's board ""M..."
2,A morning filled with the goodness of God's pe...,http://www.quotegarden.com/coffee.html,"Coffee Quotes, Sayings about Caffeine - The Qu...",16 Feb 2018 - The morning cup of coffee has an...


In [233]:
# Number of rows in the loaded table.
print(len(file))

39041


Preprocess data: the preprocessing here just lower-cases the text and strips digits.

In [253]:
# normalise a website name: lower-case it and strip digits
def preprocess(sent):
    """Return ``sent`` lower-cased with every run of digits removed.

    The input is coerced with ``str()``, split on whitespace, and the tokens
    are re-joined with single spaces. A token made only of digits becomes an
    empty string, so it still leaves a gap (two adjacent spaces) in the result.
    """
    tokens = str(sent).lower().split()
    return ' '.join(re.sub(r'[0-9]+', '', tok) for tok in tokens)


In [None]:
#load spacy model
# NOTE(review): 'en' is a shortcut model name; newer spaCy releases expect the
# full package name instead - confirm against the installed spaCy version.
nlp = spacy.load('en')

1. Here we create a class to store each post, together with its search results, as an object.
2. Count the keywords 'lyric', 'lyrics', 'quote' and 'quotes' in the name of each website, because the name of the website contains most of the information we need.
3. Store the keyword counts as `scores`: a vector that contains the keyword count for each search result.
4. Compute the cosine similarity between the post and the retrieved website content.

In [239]:
class myQuote:
    """One post plus everything collected about its search results.

    The per-result lists (``link``, ``name``, ``description``, ``scores``,
    ``cos``) start empty and are filled by the code that reads the CSV.
    """

    def __init__(self, text):
        self.quoteText = text
        # hash() of a str is only stable within one interpreter session
        # unless PYTHONHASHSEED is fixed.
        self.quoteID = hash(text)
        self.link, self.name, self.description = [], [], []
        self.scores, self.cos = [], []

    def __hash__(self):
        return self.quoteID

    def __str__(self):
        parts = [
            "Object text: " + self.quoteText,
            "Links: " + str(self.link),
            "Names: " + str(self.name),
            "Description " + str(self.description),
            "Scores: " + str(self.scores),
        ]
        return '\n'.join(parts)
        

# Group the CSV rows by post text: one myQuote per distinct post, with one
# entry per search result in each of its lists.
objects = {}
with open('./Data/all.csv', 'r') as csvfile:
    for row in csv.DictReader(csvfile):
        key = hash(row['text'])
        quote = objects.get(key)
        if quote is None:
            quote = objects[key] = myQuote(row['text'])
        quote.link.append(row['link'])
        quote.name.append(row['name'])
        quote.description.append(row['description'])
        # Keyword score of the website name. The substring counts overlap, so
        # every "lyrics"/"quotes" hit is also counted as "lyric"/"quote".
        title = preprocess(row['name'])
        quote.scores.append(
            sum(title.count(kw) for kw in ("lyric", "lyrics", "quote", "quotes"))
        )
   
  
        #add extra fields in the class if I need to add more fields        
#print(len(objects.keys()))
#for item in objects.keys():
#    if sum(objects[item].scores) > 0:
#        print(objects[item])
        

    

In [255]:
# text normalisation applied before the cosine-similarity comparison:
# strip punctuation and lower-case
def preprocess2(sent):
    """Drop punctuation from ``sent``, lower-case it and collapse whitespace.

    Every character that is neither a word character nor whitespace is
    deleted outright (not replaced by a space), so words separated only by
    punctuation are fused together.
    """
    stripped = re.sub(r'[^\w\s]', '', sent)
    return ' '.join(token.lower() for token in stripped.split())

# Quick check on a sample post: the '//' separators are deleted without a
# replacement space, so the neighbouring words are fused.
s = '? And now I realise//I should\'ve kissed you in LA//But I drove home all alone//As if I had a choice anyway~ ?'
preprocess2(s)




'and now i realisei shouldve kissed you in labut i drove home all aloneas if i had a choice anyway'

In [None]:
def cosineSim(results):
    """Append one similarity score to ``cos`` for each key in ``results``.

    For each key, all descriptions of the matching entry in the global
    ``objects`` are concatenated (with no separator), cleaned with
    ``preprocess2`` and compared to the cleaned post text through
    ``similarity`` on the documents returned by ``nlp``.
    """
    for key in results:
        entry = objects[key]
        combined = ''.join(entry.description)
        site_doc = nlp(preprocess2(combined))
        post_doc = nlp(preprocess2(entry.quoteText))
        entry.cos.append(post_doc.similarity(site_doc))

# Iterate the keys of `objects` directly: `results` is only assigned in a later
# cell, so it is undefined here on a fresh top-to-bottom run.
cosineSim(objects)

In [206]:
#here we save the file as csv, the csv file is just for your reference
def saveAsCSV(dictz,filename):
    """Write one row per entry of ``dictz`` to ``filename`` as CSV.

    Args:
        dictz: mapping whose values expose ``quoteID``, ``quoteText``,
            ``scores`` and ``cos`` attributes.
        filename: path of the CSV file to create (overwritten if present).

    The columns are ``hash``, ``text``, ``count`` and ``cosineSim``; the two
    list-valued fields are written using their ``str()`` form.
    """
    # newline='' lets the csv module control line endings itself; the
    # with-block closes the file even if a row fails to serialise.
    with open(filename, 'w', newline='') as f:
        writer = csv.writer(f, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(["hash", "text", "count", "cosineSim"])
        # Use the mapping that was passed in, not the module-level `objects`.
        for entry in dictz.values():
            writer.writerow([entry.quoteID, entry.quoteText, entry.scores, entry.cos])
    
# Export the per-post keyword scores and similarities as a reference CSV.
saveAsCSV(objects, 'myprocessedquotes2.csv')

1. Compute the cosine similarity between each post and the content (description) of its search results. Posts with a similarity below 92% are labeled as NotQuote.
2. Cosine similarity >= 96%: label it as quote.
3. Cosine similarity > 92% but < 96%:
    1. Count keywords in the search results (link names); if the results contain keywords in at least two documents, label it as quote.
    2. Otherwise label it as suspect.

Model:
In the evaluation set:
Among 200 posts, 24 are labeled suspect and 25 are labeled quote.
4 of the 24 suspects are actual quotes.
2 of the remaining posts are actual quotes.

Precision: 25/25 = 1
Recall: 25/31 = 0.81
$F_1 = \frac{2 \cdot 1 \cdot 0.81}{1 + 0.81} = 0.895$

In [226]:
# use cosine similarity to compare text with search result
# (loads the same 'en' spaCy model as the earlier "load spacy model" cell)
nlp = spacy.load('en')
def SearchResult2(results, filename):
    """Label every post in ``results`` and write the labels to a CSV file.

    Args:
        results: iterable of keys into the module-level ``objects`` mapping.
        filename: path of the CSV file to create (overwritten if present).

    Labelling rule, based on the first cosine similarity ``cos[0]``:
        * below 0.92            -> ``NotQuote``
        * from 0.92 up to 0.96  -> ``quote`` when at least two search results
          have a keyword score above 1, otherwise ``suspect``
        * 0.96 and above        -> ``quote``

    Raises:
        IndexError: if an entry has an empty ``cos`` list.
    """
    # newline='' lets the csv module control line endings itself; the
    # with-block closes the file even if a row fails to serialise.
    with open(filename, 'w', newline='') as f:
        writer = csv.writer(f, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(["hash", "text", "count", "cosineSim", "label"])
        for item in results:
            entry = objects[item]
            similarity = entry.cos[0]
            if similarity < 0.92:
                label = 'NotQuote'
            elif similarity < 0.96:
                # A similarity of exactly 0.92 is handled in this middle band
                # instead of falling through to the unconditional 'quote'.
                # NOTE(review): the `> 1` threshold treats 1 as "no keyword",
                # i.e. it assumes the scores were already smoothed by +1 -
                # confirm that align_scores has run before this function.
                hits = sum(1 for score in entry.scores if score > 1)
                label = 'quote' if hits >= 2 else 'suspect'
            else:
                label = 'quote'
            writer.writerow([entry.quoteID, entry.quoteText, entry.scores, similarity, label])
        
# `results` is a live view of the keys of `objects`, not a copy.
results = objects.keys()
SearchResult2(results, 'QuotesDetected_all3.csv')

Preparing features for ML models:
Not all the queries have the same number of results, so we align every result vector to length 10 by padding it with 0. 10 is the maximum number of results per entry.
Before we align the vectors, we smooth the scores by adding 1, so that an empty result can be represented by 0.

In [271]:
def align_scores(objects, size=10):
    """Smooth and pad the ``scores`` list of every entry in ``objects``, in place.

    Args:
        objects: mapping of key -> entry exposing a ``scores`` list of numbers.
        size: minimum length of each list after padding (default 10).

    Every existing score is incremented by 1 so that 0 can stand for "no
    search result". Lists shorter than ``size`` are then padded with 0;
    longer lists are left at their length.

    Not idempotent: each call adds 1 again, including to the padding zeros
    appended by an earlier call.
    """
    for key in objects:
        scores = objects[key].scores
        # Assign through the slice so the existing list object is updated;
        # rebinding a loop variable would leave the list unchanged.
        scores[:] = [score + 1 for score in scores]
        scores.extend([0] * (size - len(scores)))

# align_scores indexes its argument by key, so it needs the mapping itself;
# the dict_keys view stored in `results` is not subscriptable.
align_scores(objects)

In [246]:
#now let's see the aligned vectors
# Prints the scores of the first four entries, then stops.
# NOTE(review): the output saved below this cell shows float values rather than
# integer keyword counts, so it was presumably produced in a different kernel state.
count = 0
for item in results:
    print(objects[item].scores)
    count = count + 1
    if count > 3:
        break


[0.7931760405938567, 0.8392104624625699, 0.8795039632197936, 0.888068735773229, 0.892343830554656, 0.8931492717445956, 0.9015202272173928, 0.8983867934509157, 0.8956870011771089]
[0.9477222853011377, 0.9521458047100916, 0.9683161510072209, 0.9696274644433669, 0.9701125256993208, 0.971699710442211, 0.9724184061024539, 0.9674229419573442, 0.9670316158573239]
[0.9642679429616342, 0.9500276671597974, 0.9355095071756162, 0.9348746336805109, 0.9380975214108219, 0.9258982356962796, 0.9369188294523213, 0.9412716534469636, 0.9433237704986305]
[0.951290419812904, 0.9434646085106638, 0.9439693019234673, 0.9515647217395145, 0.9485525749001547, 0.9497238215336847, 0.9503412892497236, 0.9495074453693616, 0.9491427003586426, 0.9408861415235144]


In [None]:
# dump object to pickles (pickle is already imported in the first cell)
# NOTE: a later cell writes `objects2` to this same filename, replacing this dump.
filename = 'searchResults_Multicos'
with open(filename, 'wb') as outfile:  # closed even if pickling fails
    pickle.dump(objects, outfile)

In [222]:
#this function writes the model labels for `results` to a csv file
def printFeatures(results, filename):
    """Write the rule-based labels for ``results`` to ``filename``.

    Thin wrapper around ``SearchResult2``; the output path is taken from
    ``filename`` rather than a fixed 'temp.csv'.

    Args:
        results: iterable of keys, passed through to ``SearchResult2``.
        filename: path of the CSV file to create.
    """
    SearchResult2(results, filename)
        
# Export the labelled results through SearchResult2.
printFeatures(results, 'features.csv')

In [257]:
###rehash object so that each post compare with each piece of text from the website
class myQuote:
    """Container for one post and its search results.

    Same fields and behaviour as the ``myQuote`` class declared in the
    earlier cell; it is declared again here before ``objects2`` is built.
    """

    def __init__(self, text):
        self.quoteText = text
        self.quoteID = hash(text)  # hash of the post text
        # one empty list per per-result field
        for field in ('link', 'name', 'description', 'scores', 'cos'):
            setattr(self, field, [])

    def __hash__(self):
        return self.quoteID

    def __str__(self):
        labelled = (
            ("Links: ", self.link),
            ("Names: ", self.name),
            ("Description ", self.description),
            ("Scores: ", self.scores),
        )
        text = "Object text: " + self.quoteText
        for label, value in labelled:
            text += '\n' + label + str(value)
        return text
        

# Second pass over the CSV: same grouping and keyword scoring as `objects`,
# plus one similarity value appended to `cos` for every row.
# NOTE(review): each similarity is computed against the concatenation (no
# separator) of all descriptions seen so far for that post, not against the
# current row's description alone - confirm this cumulative form is intended.
objects2 = {}
quote_docs = {}  # parsed post text per post, so each post is parsed only once
with open('./Data/all.csv', 'r') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        texthash = hash(row['text'])
        if texthash not in objects2:
            objects2[texthash] = myQuote(row['text'])
        entry = objects2[texthash]
        entry.link.append(row['link'])
        entry.name.append(row['name'])
        entry.description.append(row['description'])
        line = preprocess(row['name'])
        #count keywords (the substring counts overlap: "lyrics" also matches "lyric")
        count = line.count("lyric") + line.count("lyrics") + line.count("quote") + line.count("quotes")
        entry.scores.append(count)
        str1 = ''.join (entry.description)
        doc2 = nlp(preprocess2(str1))
        # The post text never changes between rows, so reuse its parsed form.
        doc1 = quote_docs.get(texthash)
        if doc1 is None:
            doc1 = quote_docs[texthash] = nlp(preprocess2(entry.quoteText))
        entry.cos.append(doc1.similarity(doc2))
  
        #add extra fields in the class if I need to add more fields        
#print(len(objects.keys()))
#for item in objects.keys():
#    if sum(objects[item].scores) > 0:
#        print(objects[item])
        

    

In [272]:
#pad the cosine similarity vectors to a fixed length
def align_cosSim(objects, size=10):
    """Pad the ``cos`` list of every entry in ``objects`` with 0, in place.

    Args:
        objects: mapping of key -> entry exposing a ``cos`` list.
        size: minimum length of each list after padding (default 10).

    Existing values are left untouched; lists that already hold ``size`` or
    more values are not changed.
    """
    for key in objects:
        cos = objects[key].cos
        # a non-positive repeat count yields an empty list, so nothing is added
        cos.extend([0] * (size - len(cos)))
            
# Smooth and pad the keyword scores of objects2.
# NOTE(review): align_scores adds 1 on every call, so re-running this cell
# shifts the scores again.
align_scores(objects2)
#align_cosSim(objects2)

In [273]:
#now let's see the aligned vectors
# Prints the scores and similarities of the first four entries, then stops.
count = 0
for item in objects2:
    print(objects2[item].scores, objects2[item].cos)
    count = count + 1
    if count > 3:
        break

[1, 1, 5, 7, 5, 7, 1, 1, 6, 0] [0.8121557512130518, 0.8444628731297248, 0.883238498061439, 0.895255415804191, 0.8973190794659642, 0.9020793257695144, 0.9103472948899834, 0.9065849742168149, 0.9014258540138601, 0]
[5, 2, 3, 1, 1, 1, 1, 1, 3, 0] [0.9661760927270792, 0.9684345513806265, 0.9793349898739949, 0.9772083674557045, 0.9790977629789119, 0.9809415528730745, 0.981144944743423, 0.9778870115542392, 0.9769993154201844, 0]
[5, 7, 3, 5, 9, 9, 5, 1, 1, 0] [0.9697538260187835, 0.968014513297105, 0.9508086657467949, 0.9540593281024805, 0.9596373683815872, 0.9656340173608626, 0.9688343753683547, 0.971085906620613, 0.9686464488537065, 0]
[1, 1, 1, 1, 3, 1, 1, 1, 1, 3] [0.9741191505979864, 0.9702187708300353, 0.9585112444665423, 0.9671731587604064, 0.9668505532633145, 0.9619517474723219, 0.9628368766147921, 0.9623696406210958, 0.9611158616321533, 0.9562258392623101]


In [274]:
# dump object to pickles
# NOTE: this uses the same filename as the earlier dump of `objects`, so that
# file is overwritten with `objects2`.
filename = 'searchResults_Multicos'
with open(filename, 'wb') as outfile:  # closed even if pickling fails
    pickle.dump(objects2, outfile)