# Youtube Conversation Search
## CS/INFO 4300 Language and Information

In [2]:
from __future__ import print_function
from __future__ import division
import numpy as np
import json

import os
import sys
import math

%matplotlib inline
import matplotlib.pyplot as plt

from nltk.stem import WordNetLemmatizer, PorterStemmer, LancasterStemmer
from nltk import word_tokenize, sent_tokenize
from data_collection.util import *

import string

## 1. Load the data from the JSON file (Lemmatize, Flatten).

In [3]:
#male siri

#Method to check for numbers
def has_numbers(inputString):
    """Return True when at least one character of ``inputString`` is a digit.

    An empty string yields False.
    """
    for ch in inputString:
        if ch.isdigit():
            return True
    return False

In [4]:
# Define a stemmer and lemmatizer for use with our captions
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
my_punct = set(string.punctuation)
data_path = "data/raw_comments_v2"

# Build one "document" per video: every comment of the video is tokenized,
# filtered, lower-cased, lemmatized and joined into one space-separated string.
flat_comments_list = []  # position i -> flattened comment text of one video
flat_comments_map = {}   # position i -> vid_id, used to map search hits back to videos
idx = 0
# presumably get_filenames yields file names without the ".json" extension -- verify in data_collection.util
for vid_id in get_filenames(data_path):
    # Context manager so each file handle is closed right after parsing
    # (the bare open() previously leaked one handle per video).
    with open(data_path+"/"+vid_id+".json") as comments_file:
        video_data = json.load(comments_file)
    if video_data is not None and 'comments' in video_data and video_data['comments'] is not None:
        # Collect tokens in a list and join once; repeated string += is quadratic.
        tokens = []
        for comment in video_data['comments']:
            for word in word_tokenize(comment["text"]):
                # Drop "+"-prefixed tokens (presumably reply mentions), bare
                # punctuation marks, and any token containing a digit.
                if not word.startswith("+") and word not in my_punct and not has_numbers(word):
                    tokens.append(lemmatizer.lemmatize(word.lower()))
        # Videos whose comments yield no usable token are skipped entirely, so
        # list positions and flat_comments_map keys stay aligned.
        if tokens:
            flat_comments_list.append(" ".join(tokens))  #add to list
            flat_comments_map[idx] = vid_id              #remember which video this row is
            idx += 1                                     #advance to the next row index

## 3. Build the document-term matrices

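Use `sklearn.feature_extraction.text.TfidfVectorizer` with unigrams and bigrams, English stop-word removal, idf weighting enabled, and `l1` normalization. Terms that appear in fewer than 2 documents or in more than 90% of the documents are dropped.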

The resulting matrix is `comments_vectors` (one row per video, one column per term).

**Note:** Remember to just `fit` on the training data. If a word occurs only in the test documents, our model should **not** be aware that the word exists, as we are trying to evaluate the performance on completely unseen data.

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
# TF-IDF over unigrams and bigrams of the flattened comments.
tfv = TfidfVectorizer(
    ngram_range=(1,2),        # single words and two-word phrases
    lowercase=True,
    strip_accents="unicode",
    stop_words='english',
    use_idf=True,
    norm='l1',                # each document row is L1-normalized
    min_df=2,                 # ignore terms seen in fewer than 2 documents
    max_df=.9,                # ignore terms seen in more than 90% of documents
)
# Learn the vocabulary/idf weights and vectorize every video's comments in one pass.
comments_vectors = tfv.fit_transform(flat_comments_list)

In [7]:
# Sanity check of the document-term matrix: (number of documents, number of terms).
print(comments_vectors.shape)

(5968, 740395)


### Save Data

In [13]:
# Flip to True to persist the fitted vectorizer and the document-term matrix.
save_data = False
if save_data:
    # `sklearn.externals.joblib` no longer exists in newer scikit-learn
    # releases; fall back to the standalone `joblib` package there so the
    # cell does not die with an ImportError.
    try:
        from sklearn.externals import joblib
    except ImportError:
        import joblib
    from scipy import sparse, io
    # Fitted vectorizer (vocabulary + idf weights) goes to a pickle file.
    joblib.dump(tfv, r'trained_models/comments_tfv_v2.pkl')
    # Sparse matrix goes to Matrix Market format in the working directory.
    io.mmwrite("commets_vectors_v2.mtx", comments_vectors)

['trained_models/comments_tfv.pkl',
 'trained_models/comments_tfv.pkl_01.npy',
 'trained_models/comments_tfv.pkl_02.npy']

### Load Data

In [16]:
# Flip to True to restore the vectorizer and matrix from disk instead of
# recomputing them.
load_data = False
if load_data:
    # Import here so the cell also works on a fresh kernel where the save
    # branch (the only other place these names are imported) never ran.
    from scipy import io
    try:
        from sklearn.externals import joblib
    except ImportError:
        # Newer scikit-learn releases dropped the vendored copy.
        import joblib
    # Read the same file the save cell writes ("commets_vectors_v2.mtx", typo
    # included) so the matrix matches the v2 vectorizer loaded below, and
    # convert to CSR (the original referenced an undefined name `newm` here).
    comments_vectors = io.mmread("commets_vectors_v2.mtx").tocsr()
    # NOTE: joblib.load unpickles arbitrary objects -- only load trusted files.
    tfv = joblib.load('trained_models/comments_tfv_v2.pkl')

## Cosine Similarity Search

In [8]:
def cosine_similarity(X, Y):
    """Score a query vector against a document vector.

    !!!! THE DOCUMENT MUST BE THE SECOND ARGUMENT (Y) !!!!
    Only Y's norm is divided out, so the value is ``X . Y / ||Y||``, i.e. the
    true cosine scaled by ``||X||``. The arguments are therefore not
    interchangeable.

    =Inputs=
    X : The question vector (list / numpy array, or sparse matrix)
    Y : The passage vector (list / numpy array, or sparse matrix)
        The dense formula is used only when BOTH inputs are lists or numpy
        arrays; otherwise the sparse formula is used, which reads just the
        first stored entry of ``X.dot(Y.T)``.
    =Outputs=
    The similarity score (number); 0 when Y has zero norm.
    """
    dense_types = (list, np.ndarray, np.generic)

    # Dense path: both operands are plain arrays / lists.
    if isinstance(X, dense_types) and isinstance(Y, dense_types):
        y_length = np.linalg.norm(Y)
        if y_length == 0:
            return 0
        return np.dot(X, Y) / y_length

    # Sparse path.
    def _inner(A, B):
        # The product stores no entry when the vectors share no non-zero
        # term; treat that as a dot product of 0.
        stored = A.dot(B.T).data
        if len(stored) > 0:
            return stored[0]
        return 0

    y_length = math.sqrt(_inner(Y, Y))
    if y_length == 0:
        return 0
    return _inner(X, Y) / y_length

In [9]:
def search(query, doc_vectors, vectorizer, k=10, idx_map=None):
    """Return the k documents that score highest against ``query``.

    =Inputs=
    query: string
        Free-text query to match against the documents
    doc_vectors: iterable of document vectors
        Vectors to score; the position of each vector is its document index
    vectorizer: fitted vectorizer exposing ``transform``
        The vectorizer that produced ``doc_vectors``
    k: int
        How many top-scoring documents to return
    idx_map: dict or None
        Optional map from document index to vid_id
    =Outputs=
    List of (score, doc_idx) tuples, best first, or (score, vid_id) tuples
    when ``idx_map`` is given. Equal scores are ordered by descending index.
    """
    query_vector = vectorizer.transform([query])

    # Score every document, keeping its index next to the score.
    scored = []
    for doc_idx, doc_vector in enumerate(doc_vectors):
        scored.append((cosine_similarity(query_vector, doc_vector), doc_idx))

    # Highest score first, then keep only the leading k entries.
    scored.sort(reverse=True)
    top_hits = scored[:k]

    if idx_map is not None:
        return [(score, idx_map[hit_idx]) for score, hit_idx in top_hits]
    return top_hits

In [30]:
# Example query: the ten best-scoring videos for "thanks obama", reported as
# (score, vid_id) pairs because flat_comments_map is supplied as idx_map.
search("thanks obama", comments_vectors, tfv, k=10, idx_map=flat_comments_map)

[(0.20982795772914034, 'ZyU213nhrh0'),
 (0.19621449999957852, '7eJpWOY3r18'),
 (0.16761168677345412, 'lce5gWKgMXI'),
 (0.16701550267293958, 'geyAFbSDPVk'),
 (0.15621381645353544, 'hX1YVzdnpEc'),
 (0.15454440509127793, 'ZJfUB_GRzEk'),
 (0.13923551930924907, '95KTrtzOY-g'),
 (0.13569190060186481, 'ej_H8wYo2s4'),
 (0.13129561210298837, 'ehYoIKTsiV0'),
 (0.12794194766063388, 'AijEQN6AuRs')]