In [1]:
##########################################
# Load Required Python Libraries
##########################################
import pandas as pd
import numpy as np
import scipy
import xgboost as xgb
from tqdm import tqdm
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from pylev import levenshtein
import re
import nltk
# nltk.download('punkt')
import chardet
import itertools
from sklearn.base import BaseEstimator, TransformerMixin
from gensim.models import word2vec, KeyedVectors
from scipy.stats import kurtosis
from sklearn.externals import joblib
from gensim.corpora import Dictionary
from gensim import corpora, models
import operator
##########################################



In [2]:
##########################################
# Loads in Quora Dataset
##########################################
#Training Dataset
data = pd.read_csv('train.csv')
# Blank out missing questions BEFORE casting to str: astype(str) on its own
# turns NaN into the literal string 'nan', which would later be tokenized as
# a real word. The test set gets the same NaN -> '' treatment when it is loaded.
data['question1'] = data['question1'].fillna('').astype(str)
data['question2'] = data['question2'].fillna('').astype(str)
# Target labels (duplicate / not duplicate) for every training pair.
y = data['is_duplicate']
# NOTE: df_train is a second name for the same DataFrame object, not a copy.
df_train = data
##########################################

In [3]:
##########################################
# Loads in Quora Test Dataset
##########################################
#Test Dataset: read the CSV and, in the same expression, replace every
#np.nan with '' so the frame is built once instead of being re-assigned.
df_test = pd.read_csv('test.csv').replace(np.nan, '', regex=True)

#Saves the cleaned test.csv
# df_test.to_csv('cleaned_test.csv')
##########################################

In [4]:
##########################################
# Initializes variables for Feature Creation
##########################################
# Pre-trained word-vector model, loaded from a file saved on disk; it is
# passed to the topic re-sorting helpers further down.
model = KeyedVectors.load("300features_10minwords_5context")
# English stop words as a set, consulted by question_to_wordlist when
# remove_stopwords is requested.
stop_words = set(stopwords.words("english"))

def question_to_wordlist(text, remove_stopwords = False):
    """Split a question into lowercase, letters-only tokens.

    Every character outside a-z / A-Z is replaced by a space, the result is
    lowercased and split on whitespace.

    text             -- the question as a string
    remove_stopwords -- when true, tokens present in the module-level
                        stop_words set are dropped

    Returns the list of tokens in their original order.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    tokens = letters_only.lower().split()

    if not remove_stopwords:
        return tokens
    return [token for token in tokens if token not in stop_words]
##########################################

In [5]:
##########################################
# Creates 50 LDA topics
##########################################
q1_list = df_train.question1
q2_list = df_train.question2
# Both question columns are pooled into a single list of documents.
total_questions = list(q1_list) + list(q2_list)

#Tokenize each question
questions = [question_to_wordlist(question, remove_stopwords = True) for question in total_questions]

#Create a Gensim dictionary from the questions
dictionary = Dictionary(questions)
dictionary.filter_extremes(no_below=1, no_above=0.8)

#Convert the dictionary to a Bag of Words corpus for reference
corpus = [dictionary.doc2bow(question) for question in questions]

#Train LDA model -- or reuse the one already saved on disk.
#Previously the training call was commented out, so `lda` did not exist on a
#fresh kernel and the following "Saves LDA Model" cell failed with NameError.
#Now the cached model is loaded when present and training only runs when the
#file is missing (IOError also covers FileNotFoundError on Python 3).
topics=50
try:
    # joblib.load unpickles the file: only load model files you created yourself.
    lda = joblib.load('lda_50topics.pkl')
except IOError:
    # NOTE(review): workers=150 is the value from the original (commented-out)
    # training call; confirm it suits the machine this runs on.
    lda = models.LdaMulticore(corpus, id2word=dictionary, num_topics=topics, workers=150)
##########################################

In [None]:
##########################################
# Saves LDA Model
##########################################
# Persists the trained LDA model to 'lda_50topics.pkl' in the working directory.
# Requires `lda` to already exist in the kernel; this cell does not create it.
joblib.dump(lda, 'lda_50topics.pkl')
##########################################

In [6]:
##########################################
# Loads LDA Model
##########################################
# (Re)binds `lda` to the model stored in 'lda_50topics.pkl'.
# joblib.load unpickles the file, so only load model files you produced yourself.
lda = joblib.load('lda_50topics.pkl')
##########################################

In [7]:
##########################################
# Functions for Re-sorting Topic Words
##########################################
def createTopics(num_topics, num_words, lda_model, vocabulary=None):
    """Collect the top words of each LDA topic and the vocabulary left over.

    num_topics -- number of topics to read; topic ids 0 .. num_topics-1 are used
    num_words  -- how many top words to request per topic (show_topic's topn)
    lda_model  -- object exposing show_topic(topic_id, topn=...), whose entries
                  carry the word at index 0
    vocabulary -- optional mapping whose .values() are the candidate words;
                  defaults to the module-level `dictionary` so existing
                  three-argument calls behave as before

    Returns (topics, words_resort):
      topics       -- list with one list of top words per topic
      words_resort -- vocabulary words that appear in none of the topic lists,
                      in vocabulary order
    """
    if vocabulary is None:
        vocabulary = dictionary

    topics = []
    # A set makes the "already in some topic" test O(1); the list used before
    # made the final filter scan every collected word for every vocabulary word.
    assigned_words = set()

    # range (not the Python-2-only xrange) keeps this runnable on Python 2 and 3.
    for topic_id in tqdm(range(num_topics)):
        topic_words = [entry[0] for entry in lda_model.show_topic(topic_id, topn=num_words)]
        topics.append(topic_words)
        assigned_words.update(topic_words)

    words_resort = [word for word in vocabulary.values() if word not in assigned_words]
    return topics, words_resort

def calcMeanSim(word2vec_model, word, topicWords):
    """Mean word2vec similarity between `word` and every word of one topic.

    word2vec_model -- object exposing .wv.similarity(word_a, word_b)
    word           -- the word being scored
    topicWords     -- sequence of words belonging to one topic

    A pair for which the lookup raises KeyError (a word missing from the
    model's vocabulary) contributes 0 to the mean. Any other exception --
    including KeyboardInterrupt -- now propagates instead of being silently
    turned into a 0 by a bare `except:`.

    Returns the mean similarity, or 0.0 for an empty topic (np.mean of an
    empty list would be NaN and poison later comparisons).
    """
    if len(topicWords) == 0:
        return 0.0

    values = []
    for topic_word in topicWords:
        try:
            values.append(word2vec_model.wv.similarity(word, topic_word))
        except KeyError:
            # Out-of-vocabulary word: treat as "no similarity".
            values.append(0)
    return np.mean(values)

def categorizeWords(word2vec_model, wordList, topics):
    """Append each word of `wordList` to the topic it is most similar to.

    word2vec_model -- passed through to calcMeanSim
    wordList       -- words to distribute over the topics
    topics         -- list of word lists; MUTATED IN PLACE

    For every word, calcMeanSim is evaluated against each topic and the word
    is appended to the topic with the highest mean (the first one on ties).
    Because the topic lists grow as words are assigned, words placed earlier
    take part in the similarity means computed for later words.

    Returns the same `topics` object that was passed in.
    """
    # Iterate directly instead of via xrange indices: xrange is Python 2 only,
    # while the sibling calcMeanSim already sticks to constructs valid on both.
    for word in tqdm(wordList):
        mean_vals = [calcMeanSim(word2vec_model, word, topic_words) for topic_words in topics]
        best_index, _ = max(enumerate(mean_vals), key=operator.itemgetter(1))
        topics[best_index].append(word)
    return topics
##########################################

In [11]:
##########################################
# Recreates Topics List using LDA & Word2Vec
##########################################
# Step 1: take the top 100 words of each of the 50 LDA topics and collect
# the dictionary words that did not make it into any of them.
topics, words_resort = createTopics(num_topics=50, num_words=100, lda_model=lda)
# Step 2: hand every left-over word to its most similar topic. categorizeWords
# extends the lists inside `topics` in place and returns that same object, so
# `new_topics` and `topics` are one and the same list afterwards.
new_topics = categorizeWords(word2vec_model=model, wordList=words_resort, topics=topics)
##########################################

100%|██████████| 50/50 [00:00<00:00, 45.89it/s]
100%|██████████| 78331/78331 [4:58:45<00:00,  1.94it/s]   


In [12]:
##########################################
# Saves New Topics List
##########################################
# Writes the re-sorted topic lists to 'new_topics.pkl'. The value displayed
# below the cell is joblib.dump's return value.
joblib.dump(new_topics, 'new_topics.pkl')
##########################################

['new_topics.pkl']

In [10]:
# Restores the topic lists from 'new_topics.pkl' so the categorizeWords run
# (close to five hours according to its recorded progress bar) need not be repeated.
# joblib.load unpickles the file, so only load files you produced yourself.
new_topics = joblib.load('new_topics.pkl')

In [None]:
##########################################
# Create Dataframe with % of 50 topics
##########################################
# One entry per topic; each entry is a 1-D array with one value per training row.
topics_percent = []
for topic_words in tqdm(new_topics):
    # Count, for every question1, how often each word of this topic occurs.
    vec = CountVectorizer(vocabulary=topic_words)
    # Named topic_counts rather than `data`, so the training-frame name `data`
    # is no longer overwritten by a count matrix.
    topic_counts = vec.fit_transform(df_train.question1)

    # Sum on the sparse matrix instead of calling toarray(): the dense
    # (rows x topic vocabulary) array is very large and was only used for row sums.
    hits_per_question = np.asarray(topic_counts.sum(axis=1)).ravel()

    # Same measure as before: topic-word hits divided by the size of the
    # topic's vocabulary, expressed in percent.
    topic_vocab_size = float(topic_counts.shape[1])
    # The original appended an undefined name (topics_curr) to the per-topic
    # list, so this cell raised NameError and topics_percent was never filled.
    topics_percent.append(hits_per_question / topic_vocab_size * 100)

# Kept for backward compatibility; note it is an alias, not an independent copy.
copy = topics_percent
# topics_percent is topic-major (n_topics x n_rows). A transpose yields one row
# per question; np.reshape to (n_rows, 50) would keep the topic-major memory
# order and scatter each topic's values across unrelated rows.
topics_percent_df = np.array(topics_percent).T
assert topics_percent_df.shape == (len(df_train), len(new_topics))
##########################################



  0%|          | 0/50 [00:00<?, ?it/s][A[A
[A