In [2]:
"""
Quora pairs Kaggle competition
Creation of a dictionary
@author: Luis Duque
"""

import os
import time
import pandas as pd
import numpy as np
import csv
import string
import re, math
from string import punctuation
from difflib import SequenceMatcher
from collections import Counter
from operator import xor
from IPython.display import clear_output
import gensim
from gensim.models import Word2Vec ## We load a pretrained Word2Vect model
## Module-level word-vector model, read once from a binary word2vec-format file.
## PossibleNewWord queries this global through `word in model` and `model.similarity(...)`.
model = gensim.models.KeyedVectors.load_word2vec_format('./data/GoogleNews-vectors-negative300.bin', binary=True)

In [None]:
################################################################################################################
############    CREATION OF THE DICTIONARY   ###################################################################
################################################################################################################
# The main function in this cell is CreateDictionary(Df1, Df2):
# The goal of this function is to find the relevance of each word in Df1 (the training data set)
## Very simple comparison between strings

def similar(a, b):
    """Return the difflib similarity ratio of the sequences *a* and *b*.

    The value is a float in [0, 1]; 1.0 means the sequences are identical.
    No junk heuristic is used (the ``isjunk`` argument is None).
    """
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()


# Note: many of the "fuzzy" functions below have equivalents already implemented in Python's fuzzywuzzy module.
def FuzzyWordInList(Word, List):
    """Return 1 if *Word* is in *List* exactly or fuzzily, otherwise 0.

    An exact membership test is tried first; only when it fails is every
    element of *List* compared against *Word* with FuzzySame.
    """
    exact_hit = Word in List
    if exact_hit or any(FuzzySame(Word, candidate) for candidate in List):
        return 1
    return 0


def FuzzyFindRepresentative(Word, List):
    """Return *Word* when it is a member of *List*, otherwise the empty string.

    Despite the name, only an exact membership test is performed here.
    """
    return Word if Word in List else ""

    
def FuzzyIntersection(Bag1, Bag2):
    """Return the words of *Bag1* that FuzzyWordInList finds in *Bag2*.

    The order (and any repetition) of *Bag1* is preserved in the result.
    """
    return [candidate for candidate in Bag1 if FuzzyWordInList(candidate, Bag2)]


def FuzzyUnion(Bag1, Bag2):
    """Return a list holding the fuzzy union of the two bags.

    Words of *Bag1* that fuzzily match something in *Bag2* are dropped, and
    what remains is united with the distinct words of *Bag2*. The result
    comes from a set, so its order is not meaningful.
    """
    first_words = set(Bag1)
    matched_in_second = set(FuzzyIntersection(Bag1, Bag2))
    only_in_first = first_words - matched_in_second
    second_words = set(Bag2)
    return list(only_in_first.union(second_words))


def FuzzyWordsInOneList(Bag1, Bag2):
    """Return the words that appear (fuzzily) in only one of the two bags.

    Words of *Bag1* without a fuzzy match in *Bag2* come first, followed by
    the words of *Bag2* without a fuzzy match in *Bag1*.
    """
    missing_from_second = [w for w in Bag1 if not FuzzyWordInList(w, Bag2)]
    missing_from_first = [w for w in Bag2 if not FuzzyWordInList(w, Bag1)]
    return missing_from_second + missing_from_first


def FuzzySame(Word1, Word2):
    """Return 1 when the two words are considered the same, otherwise 0.

    Two words count as the same when their `similar` ratio is strictly
    greater than 0.8.
    """
    return 1 if similar(Word1, Word2) > 0.8 else 0


def strip_punctuation_nums(s):
    """Lower-case *s* and remove punctuation, digits and a few extra characters.

    The characters removed are those of ``string.punctuation`` plus the
    characters of the extra literal below (which includes the ten digits).
    Whitespace is kept, so callers can still split the result into tokens.
    """
    # Build the excluded-character string once instead of once per character.
    # NOTE(review): the leading characters of the literal look like a mis-encoded
    # right single quotation mark -- confirm against the original notebook before
    # changing it; it is reproduced here unchanged.
    excluded = punctuation + "â€™0123456789"
    lowered = s.lower()
    return ''.join(c for c in lowered if c not in excluded)


def StringToList_Nonums(Doc):
    """Convert the string *Doc* into a bag of words (a list of tokens).

    The text is first cleaned with strip_punctuation_nums and then split on
    whitespace.
    """
    return strip_punctuation_nums(Doc).split()


def DictionaryToCSV(Dictionary, filename):
    """Save the hash table *Dictionary* to the CSV target *filename*.

    *Dictionary* maps each word to a pair ``[Quotient1, Quotient2]``. The CSV
    has one row per word with the columns 'Quotient1', 'Quotient2' and
    'Words', plus the data frame index and a header row.
    """
    words = list(Dictionary)
    first_quotients = [Dictionary[entry][0] for entry in words]
    second_quotients = [Dictionary[entry][1] for entry in words]

    # Data frame holding the whole dictionary, one row per word.
    columns = {
        'Quotient1': first_quotients,
        'Quotient2': second_quotients,
        'Words': words,
    }
    frame = pd.DataFrame(columns)

    # Write the dictionary out as a csv file.
    frame.to_csv(filename, header=True)
    return


def RepeatedWordsToCSV(RepeatedWords, filename):
    """Save the hash table *RepeatedWords* to the CSV target *filename*.

    This file is only for exploratory purposes: each row lists a word
    ('Words') next to the value stored for it in *RepeatedWords* ('Similar'),
    to give an idea of which words were related to each other.
    """
    words = list(RepeatedWords)
    related = [RepeatedWords[entry] for entry in words]

    # Data frame listing the related words.
    frame = pd.DataFrame({'Similar': related, 'Words': words})

    # Write the related words out as a csv file.
    frame.to_csv(filename, header=True)
    return


# The following function uses Google's pretrained word2vec model. Given a Dictionary, it extends it using word2vec.
# EXAMPLE: say we run PossibleNewWord("colombia", ...) and "colombia" is not in the Dictionary. This function finds the
# word in our Dictionary that is closest to "colombia" according to word2vec (say the closest one is "india") and
# assigns Dictionary2["colombia"] = Dictionary["india"].
def PossibleNewWord(myWord, Dictionary, Dictionary2, RepeatedWords):
    """Rate *myWord* by copying the rating of its closest word in *Dictionary*.

    Nothing happens when *myWord* is already rated (present in *Dictionary*
    or *Dictionary2*) or when *Dictionary* is empty, since there is then no
    rating to copy. Otherwise the closest Dictionary word is searched for:

    * with ``model.similarity`` when *myWord* is in the global word-vector
      ``model`` (only Dictionary words that are also in the model compete);
    * with the string ratio ``similar`` when *myWord* is not in the model,
      or when no Dictionary word is in the model.

    Side effects: ``Dictionary2[myWord]`` is set to the rating of the closest
    word, and *myWord* is appended to ``RepeatedWords[closest word]``.

    Returns None.
    """
    if (myWord in Dictionary) or (myWord in Dictionary2):
        return

    # Without any rated word there is nothing to inherit; previously this
    # case ended in a KeyError on Dictionary[myWord].
    if not Dictionary:
        return

    closestMatch = None

    if myWord in model:
        maxcomparison = -1
        for Word in Dictionary:
            if Word in model:
                comparison = model.similarity(myWord, Word)
                if comparison > maxcomparison:
                    closestMatch = Word
                    maxcomparison = comparison

    # Fallback on plain string similarity. It is also used when the model
    # knows myWord but none of the Dictionary words (previously a KeyError).
    if closestMatch is None:
        maxcomparison = -1
        for Word in Dictionary:
            comparison = similar(myWord, Word)
            if comparison > maxcomparison:
                closestMatch = Word
                maxcomparison = comparison

    Dictionary2[myWord] = Dictionary[closestMatch]
    RepeatedWords.setdefault(closestMatch, []).append(myWord)
    return


# Feeds the initialized Dictionary with every possible word
def FeedDictionary(Df1, Df2, Dictionary, Dictionary2, RepeatedWords):
    """Offer every word of both data frames to PossibleNewWord.

    For each row of *Df1* and then of *Df2*, the 'question1' and 'question2'
    cells are tokenized with StringToList_Nonums and every distinct token of
    the row is passed to PossibleNewWord together with the three hash tables.
    """
    for frame in (Df1, Df2):
        for _, row in frame.iterrows():
            tokens = StringToList_Nonums(str(row['question1']))
            tokens = tokens + StringToList_Nonums(str(row['question2']))
            for token in set(tokens):
                PossibleNewWord(token, Dictionary, Dictionary2, RepeatedWords)


            
## IMPORTANT: Explanation of the variables and quotients in the Dictionaries (hash tables).
# 'Words': each word of the dictionary
# 'Frequency': number of rows of the original document in which 'word' appears
# 'Frequency1': number of rows in which 'word' appears in exactly one of the two questions
# 'NonDuplicate': number of rows in which 'word' appears in exactly one of the two questions AND the questions are not duplicates
# 'Quotient1': NonDuplicate/Frequency1
# 'Frequency2': number of rows in which 'word' appears in both questions
# 'Quotient2': Duplicate/Frequency2 ('Duplicate' is defined on the next line)
# 'Duplicate': number of rows in which 'word' appears in both questions AND the questions are duplicates
def CreateDictionary(Df1, Df2, min_count=20):
    """Create a Dictionary (hash table) rating the words of *Df1* and *Df2*.

    Parameters:
        Df1: labelled data frame; the columns 'question1', 'question2' and
            'is_duplicate' are read, and the row index is used for the
            progress message (it is taken modulo 1000, so it must be numeric).
        Df2: data frame whose 'question1' and 'question2' columns are read
            (through FeedDictionary) to collect additional words.
        min_count: a word of Df1 is rated directly only when both its
            Frequency1 and its Frequency2 counters are strictly greater than
            this value (default 20, the original hard-coded threshold).

    Returns:
        The dictionary mapping each word to ``[Quotient1, Quotient2]``.

    Side effects: writes "MainWords.csv" (directly rated words),
    "Dictionary.csv" (the enlarged dictionary) and "RepeatedWords.csv", and
    clears/prints to the notebook output while running.
    """
    Dictionary = {}
    Dictionary2 = {}
    RepeatedWords = {}

    Frequency1 = {}
    Frequency2 = {}
    Duplicate = {}
    NonDuplicate = {}

    total_rows = Df1.shape[0]

    for index, row in Df1.iterrows():
        isduplicate = int(row['is_duplicate'])
        Bag1 = StringToList_Nonums(str(row['question1']))
        Bag2 = StringToList_Nonums(str(row['question2']))
        MergedSet = set(FuzzyUnion(Bag1, Bag2))

        if index % 1000 == 1:  ## Just to have an idea of the time left
            clear_output()
            # A single pre-formatted string prints the same text under
            # Python 2 and Python 3.
            print("Row :  %s / %s" % (index - 1, total_rows))

        for word in MergedSet:
            representative = FuzzyFindRepresentative(word, Frequency1)

            if representative == "":  ## We do not have a similar word in Frequency1
                representative = word
                Frequency1[representative] = 0
                Frequency2[representative] = 0
                Duplicate[representative] = 0
                NonDuplicate[representative] = 0

            # Each fuzzy membership test scans a whole bag, so run it once.
            in_bag1 = bool(FuzzyWordInList(word, Bag1))
            in_bag2 = bool(FuzzyWordInList(word, Bag2))

            if in_bag1 != in_bag2:  ## The word is in exactly one of the two bags
                Frequency1[representative] += 1
                if isduplicate == 0:
                    NonDuplicate[representative] += 1
            elif in_bag1 and in_bag2:  ## The word is in both bags
                Frequency2[representative] += 1
                if isduplicate == 1:  ## Does sharing this word go together with the questions matching?
                    Duplicate[representative] += 1

    ## We now filter out the words for which we do not have a lot of information and create our Dictionary
    for word in Frequency1:
        if (Frequency1[word] > min_count) and (Frequency2[word] > min_count):
            Quotient1 = float(NonDuplicate[word]) / Frequency1[word]
            Quotient2 = float(Duplicate[word]) / Frequency2[word]
            # Keep three decimals, as stored in the csv files.
            Quotient1 = float("%.3f" % Quotient1)
            Quotient2 = float("%.3f" % Quotient2)
            Dictionary[word] = [Quotient1, Quotient2]

    # Save the directly rated words of Df1.
    DictionaryToCSV(Dictionary, "MainWords.csv")
    # Dictionary2 receives a rating for every word that is not in Dictionary.
    FeedDictionary(Df1, Df2, Dictionary, Dictionary2, RepeatedWords)

    # Merge the two dictionaries.
    Dictionary.update(Dictionary2)

    # Save the enlarged Dictionary.
    DictionaryToCSV(Dictionary, "Dictionary.csv")
    # For each rated word, the new words that inherited its rating.
    RepeatedWordsToCSV(RepeatedWords, "RepeatedWords.csv")
    return Dictionary  ## returns the actual hash table



In [None]:
#################################################################################################################
#################################################################################################################
###############################             MAIN                 ################################################
#################################################################################################################
#################################################################################################################

In [None]:
## Loading data
## pd.DataFrame.from_csv is deprecated and was removed in pandas 1.0; read_csv with
## index_col=0 keeps the first csv column as the row index, which CreateDictionary
## uses for its progress message.
## NOTE(review): these paths are absolute ("/data/...") while the word2vec model is
## loaded from the relative "./data/..." -- confirm which location is intended.
dftrain = pd.read_csv("/data/train_tiny.csv", index_col=0)
dftest = pd.read_csv("/data/test_tiny.csv", index_col=0)
Dictionary = CreateDictionary(dftrain, dftest)  ## Creates MainWords.csv, Dictionary.csv and RepeatedWords.csv

In [None]:
## Creating the dictionary 
#Dictionary = LoadDictionary() ## If Dictionary.csv already exists we just load it.