In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.options.mode.chained_assignment = None
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import string
from nltk.corpus import stopwords
import math
from collections import Counter
nltk.download('stopwords')
import numpy as np

# Single Porter stemmer instance shared by the text-processing helpers in this cell.
stemmer = PorterStemmer()

# Tokenize, stem a document
def tokenize(text):
    """Strip punctuation from ``text``, tokenize it, and stem every token.

    Characters listed in ``string.punctuation`` are removed first, the rest is
    split with ``nltk.word_tokenize``, and each token is lowercased and passed
    through the module-level ``stemmer``.

    Returns:
        A single string with the stemmed tokens separated by one space
        (callers split it again when they need a list).
    """
    without_punctuation = "".join(
        filter(lambda ch: ch not in string.punctuation, text)
    )
    stems = []
    for token in nltk.word_tokenize(without_punctuation):
        stems.append(stemmer.stem(token.lower()))
    return " ".join(stems)


# compute IDF, storing idf values in a dictionary
def idf_values(vocabulary, documents):
    """Compute the inverse document frequency of every vocabulary term.

    Args:
        vocabulary: iterable of terms to score.
        documents: sequence of tokenized documents (each an iterable of
            hashable tokens, e.g. a list of stemmed words).

    Returns:
        dict mapping each term to ``ln(N / df)``, where ``N`` is the number of
        documents and ``df`` the number of documents containing the term.
        A term that occurs in no document (including the empty-corpus case)
        gets ``0.0`` instead of triggering a division by zero.
    """
    num_documents = len(documents)

    # Count document frequencies in one pass; de-duplicating each document
    # with set() makes a term count at most once per document and avoids a
    # linear scan of every document for every vocabulary term.
    document_frequency = Counter()
    for document in documents:
        document_frequency.update(set(document))

    idf = {}
    for term in vocabulary:
        df = document_frequency[term]
        idf[term] = math.log(num_documents / df) if df else 0.0
    return idf

# Function to generate the vector for a document (with normalisation)
def vectorize(document, vocabulary, idf):
    """Build the normalised TF-IDF vector of a tokenized document.

    Args:
        document: iterable of tokens (e.g. a list of stemmed words).
        vocabulary: ordered sequence of terms; position ``i`` of the result
            corresponds to ``vocabulary[i]``.
        idf: mapping from each vocabulary term to its IDF weight.

    Returns:
        list with one weight per vocabulary term, computed as
        ``idf[term] * count(term) / max_count`` where ``max_count`` is the
        count of the most frequent token in the document. An empty document
        yields an all-zero vector.

    Raises:
        KeyError: if a vocabulary term is missing from ``idf``.
    """
    counts = Counter(document)
    if not counts:
        # No tokens at all: there is no most-common term to normalise by, so
        # return the zero vector instead of failing on most_common(1)[0].
        return [0.0] * len(vocabulary)

    max_count = counts.most_common(1)[0][1]
    return [idf[term] * counts[term] / max_count for term in vocabulary]

# Function to compute cosine similarity
def cosine_similarity(v1, v2):
    """Return the cosine of the angle between two numeric vectors.

    Iterates over the positions of ``v1`` and reads the matching position of
    ``v2``, accumulating the dot product and both squared norms.

    Returns:
        ``0`` when the dot product is zero (this also covers all-zero and
        empty vectors), otherwise ``dot / sqrt(|v1|^2 * |v2|^2)``.
    """
    dot = 0
    norm1_squared = 0
    norm2_squared = 0
    for position, a in enumerate(v1):
        b = v2[position]
        norm1_squared += a * a
        norm2_squared += b * b
        dot += a * b

    # A zero dot product short-circuits before the division below.
    if dot == 0:
        return 0
    return dot / math.sqrt(norm1_squared * norm2_squared)

def vectorize_query(query, vocabulary, idf):
    """Turn a free-text query into a TF-IDF vector over ``vocabulary``.

    The query goes through ``tokenize`` (punctuation removal, word
    tokenization, lowercasing, stemming) so that its terms are produced the
    same way as terms of any document preprocessed with ``tokenize``;
    splitting on whitespace alone would leave punctuation attached to words.

    Args:
        query: raw query string.
        vocabulary: ordered sequence of terms defining the vector positions.
        idf: mapping from each vocabulary term to its IDF weight.

    Returns:
        list of weights, one per vocabulary term. A query with no tokens
        (empty or punctuation-only) yields an all-zero vector.
    """
    terms = tokenize(query).split()
    if not terms:
        # Nothing to weigh; skip vectorize(), which normalises by the count
        # of the most frequent term.
        return [0.0] * len(vocabulary)
    return vectorize(terms, vocabulary, idf)
    
def search_vec(query, k):
    """Rank documents by cosine similarity to ``query`` and return the top ``k``.

    Reads the notebook-level names ``vocabulary``, ``idf``,
    ``document_vectors``, ``documents`` and ``original_documents``.
    NOTE(review): presumably ``documents``, ``original_documents`` and
    ``document_vectors`` are index-aligned -- confirm where they are defined.

    Args:
        query: raw query string.
        k: maximum number of results to return.

    Returns:
        Tuple ``(ans, indices, query_vector)``: the matching entries of
        ``original_documents``, their positions, and the query's vector.
    """
    query_vector = vectorize_query(query, vocabulary, idf)

    # Score every document, then order by descending similarity; the sort is
    # stable, so ties keep their original (ascending index) order.
    ranked = sorted(
        (
            [cosine_similarity(query_vector, document_vectors[doc_id]), doc_id]
            for doc_id in range(len(documents))
        ),
        key=lambda scored: -scored[0],
    )

    limit = min(k, len(original_documents))
    indices = [ranked[rank][1] for rank in range(limit)]
    ans = [original_documents[doc_id] for doc_id in indices]
    return ans, indices, query_vector

[nltk_data] Downloading package stopwords to /Users/mike/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Load the labelled snippets, indexed by the site's displayLink.
websites_filtered = pd.read_csv('../Data/labelled.csv', index_col='displayLink')

# Keep only rows whose `first` value is 'purchase' and whose `drug` is cocaine or mdma.
coke_mdma = websites_filtered[
    websites_filtered['first'].isin(['purchase'])
    & websites_filtered['drug'].isin(['cocaine', 'mdma'])
]

In [6]:
# Preview the first rows of the filtered frame.
coke_mdma.head()

Unnamed: 0_level_0,snippet,count,first,drug,label
displayLink,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
drugs-center.biz,"Buy drugs online from Drugs-Center, the bigges...",6316,purchase,cocaine,1.0
buycocaineforsale.com,Buy Bolivian Cocaine | Bolivian Cocaine for sale,2219,purchase,cocaine,2.0
www.rchemicals4us.com,MDMA Molly Online Buy Online| MDMA For Sale On...,1627,purchase,mdma,2.0
ps3.wonderhowto.com,"Oct 9, 2013 ... If you're trying to get rich q...",1416,purchase,cocaine,0.0
gta.wikia.com,"In order to get access, the player must first ...",644,purchase,cocaine,0.0


In [38]:
# Tokenize and stem every snippet into a list of terms; str() coerces any
# non-string snippet value before it is stripped and tokenized.
d = [tokenize(str(x).strip()).split() for x in coke_mdma.snippet.values]

# Build the stop-word set once: stopwords.words() rebuilds its list on every
# call, so evaluating it inside the filter repeats that work per word.
english_stopwords = set(stopwords.words('english'))

# Sorting gives the vocabulary (and therefore the vector columns) a stable
# order; iterating a set of strings can differ from one kernel to the next.
# NOTE(review): the terms are already stemmed while the stop-word list is not,
# so stop words whose stem differs from their surface form may survive this
# filter -- confirm whether filtering should happen before stemming.
vocabulary = sorted({term for terms in d for term in terms} - english_stopwords)

idf = idf_values(vocabulary, d)
document_vectors = [vectorize(s, vocabulary, idf) for s in d]

In [39]:
# Sanity check: (number of documents, vocabulary size).
np.shape(document_vectors)

(521, 2272)