In [2]:
# import libraries and set random seed for reproducibility
import pandas as pd
import numpy as np
import sklearn
import scipy
from sklearn import *
import os
import pickle  # to save model
from utils import *

RANDOM_SEED = 123  # taken from task description
from sklearn.metrics import roc_auc_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [5]:
# Training split: use this to train and VALIDATE your solution.
# Test split: use this to provide the expected generalization results.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("./quora_train_data.csv", "./quora_test_data.csv")
)
test_df

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,123842,200121,106133,What is the Burj Khalifa damper system?,What are some mind blowing unseen images of Bu...,0
1,385214,517332,205076,How can one open a demat account?,How do I open a demat account?,1
2,291190,412570,412571,In what ways would you want to contribute some...,Is it pragmatic to wish for a world free of vi...,0
3,319338,329701,157107,What are the best earphones within Rs 3000?,Which is the best earphone under Rs 3000?,1
4,377054,290027,508282,Which book is the best for GRE preparation?,Which are the best books for the IELTS and the...,0
...,...,...,...,...,...,...
80853,376451,507598,507599,Are A grades enough to get into Cambridge?,How do I get into Cambridge as a math major?,0
80854,332528,459522,459523,"As of July 2015, how is Quip doing?",How exactly does Quip work in layman's term?,0
80855,147020,232119,17978,What are some things new employees should know...,What are some things new employees should know...,0
80856,29775,55057,55058,Which is best ISP in indore?,Which is the best ISP in Chennai?,0


In [23]:
# Extract both question columns (documents) from the training frame and cast them to strings
# with the cast_list_as_strings helper.
q1_train, q2_train = (
    cast_list_as_strings(list(train_df[column_name]))
    for column_name in ("question1", "question2")
)
# Single corpus holding every question1 followed by every question2.
all_questions = q1_train + q2_train

# Unigram count vectorizer fitted on the combined corpus.
count_vectorizer_v1 = CountVectorizer(ngram_range=(1, 1))
count_vectorizer_v1.fit(all_questions)

# Lower-cased, whitespace-split tokens of the first question pair.
a, b = (
    first_question.lower().split()
    for first_question in (q1_train[0], q2_train[0])
)


<8x78275 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [27]:
# Vectorize only the first training question with the fitted count vectorizer
# (the question is wrapped in a list because transform is given a collection of documents).
X_q1 = count_vectorizer_v1.transform([q1_train[0]])
# Convert that single result to a dense array for inspection.
X_q1.toarray()

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [19]:
# Distinct tokens present in both token lists a and b.
# The list comes from a set, so its order carries no meaning.
list(set(a)&set(b))

['get', 'bored', 'with', 'i', 'do', 'why']

In [20]:
# Absolute difference between the number of tokens in a and in b.
diff = abs(len(a) - len(b))
diff

3

In [29]:
def jaccard_similarity(a, b):
    """
    Computes the Jaccard similarity between two collections of hashable items.

    The similarity is the number of distinct items shared by both collections
    divided by the number of distinct items present in either of them.

    Arguments
    ---------
    a: iterable of hashable items (e.g. a list of tokens).
    b: iterable of hashable items (e.g. a list of tokens).

    Returns
    -------
    float
        A value between 0.0 and 1.0. Returns 0.0 when both collections are
        empty, because the ratio is undefined for an empty union.
    """
    set_a, set_b = set(a), set(b)
    union = set_a | set_b
    # Guard against 0/0 when neither collection contains any item.
    if not union:
        return 0.0
    return len(set_a & set_b) / len(union)

In [30]:
# Jaccard similarity between the token lists a and b.
jaccard_similarity(a,b)

0.46153846153846156

In [34]:
def term_frequency(sentence, ignore_tokens=["<SOS>","<EOS>"], lower_case = False):
    """
    Counts the occurrences of every whitespace-separated token in one sentence.
    Helper used by the TF-IDF computations.
    Arguments:
    ----------
    sentence: str.
        The sentence (a single "document") whose tokens are counted.
    ignore_tokens: list of str.
        Tokens left out of the count. They are compared against each token
        exactly as written in the sentence, before any lower-casing.
    lower_case: boolean.
        When True, tokens are lower-cased before being counted.
        Defaults to False (case sensitive).
    Returns:
    --------
    dict mapping each counted token to its number of occurrences,
    in order of first appearance.
    """
    counts = {}
    for raw_token in sentence.split():
        if raw_token in ignore_tokens:
            continue
        key = raw_token.lower() if lower_case else raw_token
        counts[key] = counts.get(key, 0) + 1
    return counts

import numpy as np
import math

class TFIDF:
    """
    Minimal TF-IDF featurizer over whitespace-tokenized sentences.

    Every sentence is treated as one document. `fit` learns the vocabulary and
    the inverse document frequency of each word; `transform` maps sentences to
    dense vectors with one position per vocabulary word.
    """

    def __init__(self, ignore_tokens=["<SOS>","<EOS>"], ignore_punctuation=True, lower_case = False):
        """
        Arguments
        ---------
        ignore_tokens: list of str.
            Tokens excluded from the vocabulary and from all counts.
        ignore_punctuation: boolean.
            When True, every character of string.punctuation is added to the
            ignored tokens (only tokens consisting of exactly that character match).
        lower_case: boolean.
            When True, tokens are lower-cased. Defaults to False (case sensitive).
        """
        # Imported locally so the class works even if the notebook never imported `string`.
        import string

        # Work on a copy so that neither the caller's list nor the shared default
        # list is modified when punctuation is appended below.
        self.ignore_tokens = list(ignore_tokens)
        if ignore_punctuation:
            self.ignore_tokens += list(string.punctuation)
        self.lower_case = lower_case
        self.word_indexes = {}
        self.index_to_word = {}
        # idf_dict stores log((1 + D) / (1 + df(word))) for each word, where D is the
        # number of fitted documents and df the number of documents containing the word.
        self.idf_dict = {}
        self.num_documents = 0

    def fit(self, data):
        """
        Fits the data into the Featurizer.
        Arguments
        ---------
        data: list of string.
            The data to fit the featurizer.
        Exceptions
        ----------
        TypeError
            Related to data type. Expects list of strings; a single string is
            rejected because it would be iterated character by character.
        """
        if isinstance(data, str):
            raise TypeError("fit expects a list of strings, not a single string")
        num_documents = len(data)

        # Count, for every word, the number of sentences it appears in.
        # term_frequency returns one key per distinct word, so each word is
        # counted at most once per sentence.
        document_counts = {}
        for sentence in data:
            for word in term_frequency(sentence, self.ignore_tokens, self.lower_case):
                document_counts[word] = document_counts.get(word, 0) + 1

        # Start from a clean state so that fitting again does not keep words
        # learned during a previous fit.
        self.num_documents = num_documents
        self.idf_dict = {}
        self.word_indexes = {}
        self.index_to_word = {}
        for word_position, (word, frequency) in enumerate(document_counts.items()):
            # Add 1 to numerator and denominator to avoid division by zero.
            self.idf_dict[word] = math.log(float(1 + self.num_documents) / (1 + frequency))
            # Vocabulary positions follow the order in which words were first seen.
            self.word_indexes[word] = word_position
            self.index_to_word[word_position] = word

    def transform(self, data):
        """
        Transforms the data passed as input into a tf-idf vector/matrix, depending on the input.
        Arguments
        ---------
        data: list of string or string.
            A list is transformed into a dense np.matrix with one row per
            sentence; a single string into a 1-D np.ndarray.
            If the featurizer has not been fitted the vocabulary is empty and
            the resulting vectors have length zero.
        Exceptions
        ----------
        TypeError
            If data is neither a list nor a string.
        """
        if isinstance(data, list):
            return self._transform_document(data)
        if isinstance(data, str):
            return self._transform_sentence(data)
        raise TypeError(
            "transform expects a list of strings or a string, got %s" % type(data).__name__
        )

    def _transform_document(self, data):
        """ This method is just used for simple batch transforming. """
        # The result is dense: one row per sentence, one column per vocabulary word.
        return np.matrix([self._transform_sentence(sentence) for sentence in data])

    def _transform_sentence(self, data):
        """
        Builds the tf-idf vector of a single sentence, with one position per vocabulary word.
        """
        # Initializes array with the size of vocabulary.
        word_array = np.zeros(len(self.word_indexes))
        # Iterate over the computed terms instead of re-tokenizing the sentence, so
        # that ignored tokens can never be looked up in the tf-idf dictionary.
        for term, value in self._compute_sentence_tf_idf(data).items():
            token_index = self.word_indexes.get(term)
            # Out of vocabulary words have no position and are dropped.
            if token_index is not None:
                word_array[token_index] = value
        return word_array

    def _compute_sentence_tf_idf(self, sentence):
        """
        Computes the tf_idf for a single sentence(document).
        Returns a dict mapping each counted term to tf * idf, where tf is the
        term count divided by the number of counted tokens in the sentence.
        """
        # Gets the document frequency by using the helper method
        document_frequency = term_frequency(sentence, self.ignore_tokens, self.lower_case)
        # Gets the total number of words in sentence
        total_words = sum(document_frequency.values())
        # Out of vocabulary words get an idf of 0, hence a tf-idf of 0.
        # No division happens for an empty sentence: there are no terms to iterate.
        return {
            term: (float(count) / total_words) * self.idf_dict.get(term, 0)
            for term, count in document_frequency.items()
        }

In [38]:
# Small toy corpus; it is not used by the statements in this cell.
sentences = ['this is a list of sentences', 'second sentence in list of sentences', 'a word for complexity']
# Build the featurizer with its default settings and fit it on the combined
# training questions (all_questions = q1_train + q2_train).
featurizer = TFIDF()
featurizer.fit(all_questions)


matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# TF-IDF representation of each side of the training question pairs.
# NOTE(review): TFIDF.transform on a list builds a dense np.matrix with one row per
# question and one column per vocabulary word; for the full training set this is
# likely too large to hold in memory. Confirm on a small slice before running.
q1_tfidf = featurizer.transform(q1_train)
q2_tfidf = featurizer.transform(q2_train)