In [318]:
import os
import pandas as pd
from pyLDAvis import gensim as gs
import seaborn as sns

from gensim import corpora
from gensim import models
from itertools import chain
from nltk.stem.wordnet import WordNetLemmatizer
from pymongo import MongoClient

%matplotlib inline

In [319]:
# taken from http://stackoverflow.com/questions/16249736/how-to-import-data-from-mongodb-to-pandas
def _connect_mongo(host, port, username, password, db):
    """ A util for making a connection to mongo """

    if username and password:
        mongo_uri = 'mongodb://%s:%s@%s:%s/%s' % (username, password, host, port, db)
        conn = MongoClient(mongo_uri)
    else:
        conn = MongoClient(host, port)


    return conn[db]


def read_mongo(db, collection, query={}, host='localhost', port=27017, username=None, password=None, no_id=True):
    """ Read from Mongo and Store into DataFrame """

    # Connect to MongoDB
    db = _connect_mongo(host=host, port=port, username=username, password=password, db=db)

    # Make a query to the specific DB and Collection
    cursor = db[collection].find(query)

    # Expand the cursor and construct the DataFrame
    df =  pd.DataFrame(list(cursor))

    # Delete the _id
    if no_id:
        del df['_id']

    return df
    

def pre_process_text(document):
    '''
    input = document, a string
    output = A list of tokens for analysis in gensim
    '''
    
    # To lowercase
    document = document.lower()
    
    # Remove html
    document = strip_fluff(document)
    
    # Tokenize and lemmatize
    sents = sent_detector.tokenize(document)
    tokens = []
    for sent in sents:
        tokens += tokenizor.tokenize(sent)
    tokens = [x for x in tokens if x not in(string.punctuation)]
    tokens_lemmatized = [lmtzr.lemmatize(token) for token in tokens]

    # Get position
    tagset = None
    tokens_lab = tagger.tag(tokens_lemmatized)
    
    # Focus on nouns and remove stopwords
    tokens_selected = [tok[0] for tok in tokens_lab if tok[0] not in words_to_remove and tok[1]=="NN"]
    
    return tokens_selected

def strip_fluff(doc):
    s = strip_html(doc)
    s = strip_url(s)
    s = strip_newline(s)
    s = strip_nbsp(s)
    return s

def strip_html(doc):
    p = re.compile(r'<.*?>')
    return p.sub(' ', doc)

def strip_url(doc):
    return re.sub(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', '', doc)

def strip_newline(doc):
    return doc.replace('\n', ' ')

def strip_nbsp(doc):
    return doc.replace('&nbsp;', ' ')

def strip_html_entity(doc):
    return doc.replace('&amp;')

### Create stop words and read the data from source

In [320]:
# Read in place names from a file, line by line, as a list
with open(os.path.dirname(os.path.dirname(os.getcwd())) + '/data/towns_and_cities_2015.txt') as fp:
    place_names = [line.rstrip('\n') for line in fp]

# Load lemmatizer
lmtzr = WordNetLemmatizer()

### Analyze Business Groups First

In [321]:
# Business group DataFrame
business_meetups = read_mongo('meetup', 'groups')

In [322]:
# List of lists containing group topics
business_group_topics = [[y['name'] for y in x] for x in business_meetups.topics]

# Create dictionary from list above
business_group_dict = corpora.Dictionary(business_group_topics)

# Create bag of words representation
business_group_corpus = [business_group_dict.doc2bow(gr) for gr in business_group_topics]

# Initialise TF-IDF model
business_group_tfidf = models.TfidfModel(business_group_corpus)

# Fit the TF-IDF model
business_group_tfidf_corpus = business_group_tfidf[business_group_corpus]

In [323]:
#Initialise lda model
business_group_lda = models.LdaMulticore(business_group_tfidf_corpus,
                                                                               id2word=business_group_dict,
                                                                               num_topics=25,
                                                                               passes=100,
                                                                               iterations=300,
                                                                               workers=3)

In [324]:
#Create doc topic distribution with the bow corpus
business_doc_topics = business_group_lda[business_group_tfidf_corpus]

In [325]:
# Tech group DataFrame
tech_meetups = read_mongo('meetup_tech_groups', 'groups')

In [326]:
# List of lists containing group topics
tech_group_topics = [[y['name'] for y in x] for x in tech_meetups.topics]

# Create dictionary from list above
tech_group_dict = corpora.Dictionary(tech_group_topics)

# Create bag of words representation
tech_group_corpus = [tech_group_dict.doc2bow(gr) for gr in tech_group_topics]

# Initialise TF-IDF model
tech_group_tfidf = models.TfidfModel(tech_group_corpus)

# Fit the TF-IDF model
tech_group_tfidf_corpus = tech_group_tfidf[tech_group_corpus]

In [327]:
#Initialise lda model
tech_group_lda = models.LdaMulticore(tech_group_tfidf_corpus,
                                                                        id2word=tech_group_dict,
                                                                        num_topics=25,
                                                                        passes=100,
                                                                        iterations=300,
                                                                        workers=3)

In [328]:
#Create doc topic distribution with the bow corpus
tech_doc_topics = tech_group_lda[tech_group_tfidf_corpus]

In [329]:
tech_group_lda.show_topics(num_topics=25)

[(0,
  '0.013*Artificial Intelligence Programming + 0.013*Artificial Intelligence Machine Learning Robotics + 0.008*Elm Programming + 0.007*Videoblogging + 0.005*SEO for Local Business + 0.004*Intellectual Discussion + 0.004*Business Analysis tools and techniques + 0.004*Business Analysis + 0.004*Business Analysts + 0.004*Idris'),
 (1,
  '0.050*Android Development + 0.037*iOS Development + 0.029*iOS + 0.029*Android + 0.028*Mobile Development + 0.021*Mobile Technology + 0.017*HTML + 0.015*iPhone + 0.014*Objective C + 0.010*ElasticSearch'),
 (2,
  '0.040*Technology Startups + 0.039*Entrepreneurship + 0.037*Startup Businesses + 0.034*Technology + 0.034*New Technology + 0.033*Internet Startups + 0.032*Web Technology + 0.028*Mobile Technology + 0.025*Internet Professionals + 0.024*Professional Networking'),
 (3,
  '0.034*WordPress + 0.025*Blogging + 0.024*Wordpress Themes + 0.023*Wordpress Plugins + 0.021*Wordpress Customization + 0.020*WordPress Users + 0.016*Wordpress Help + 0.016*Graph D

In [330]:
business_group_lda.show_topics(num_topics=25)

[(0,
  '0.012*NLP Coaching + 0.010*Internet Startup Idea Group + 0.010*NLP (Neuro-Linguistic Programming) + 0.007*NLP Practice Group + 0.007*Internet Marketing Strategies For Business Owners + 0.007*NLP, Hypnosis, & Success-Life Coaching + 0.006*Saving Money + 0.005*Building an Online Business + 0.005*NLP Coaching For Personal Success + 0.005*Hypnotherapy'),
 (1,
  '0.047*Leadership + 0.045*Public Speaking + 0.041*Communication Skills + 0.040*Personal Development + 0.036*Fear of Public Speaking + 0.035*Presentations + 0.035*Self-Improvement + 0.032*Self-Empowerment + 0.030*Professional Development + 0.028*Communication'),
 (2,
  '0.029*Sales + 0.019*Sales Professionals + 0.016*Sales Training + 0.016*Presentation Skills + 0.014*Healthcare Innovation + 0.013*Thinking of Starting Your Own Company + 0.012*Healthcare Professionals + 0.010*Healthcare IT + 0.010*Marketing and Sales + 0.010*Medical Device Technology'),
 (3,
  '0.028*Online Marketing + 0.024*E-Business Owners + 0.022*Marketing 

In [331]:
tech_group_vis_data = gs.prepare(tech_group_lda, tech_group_tfidf_corpus, tech_group_dict)

In [332]:
pyLDAvis.display(tech_group_vis_data)

In [333]:
# Lookup topic names
tech_topic_dict = {
    0: 'Web Development',
    1: 'Tech Entrepreneurs',
    2: 'Data Analytics',
    3: 'Agile Workflows',
    4: 'Design',
    5: 'Cloud Computing',
    6: 'Technology Professionals',
    7: 'Game Development',
    8: 'Internet of Things',
    9: 'NoSQL',
    10: 'Makers',
    11: 'WordPress',
    12: 'DevOps',
    13: 'Virtual Reality',
    14: 'Mobile App Development',
    15: 'Functional Programming',
    16: 'Cryptographic Currencies',
    17: 'User Experience',
    18: 'Python',
    19: 'Security',
    20: 'Distributed Computing',
    21: 'SEO',
    22: 'Scrum',
    23: 'Hackathon',
    24: 'Open Data'
}

In [334]:
business_group_vis_data = gs.prepare(business_group_lda, business_group_tfidf_corpus, business_group_dict)

In [335]:
pyLDAvis.display(business_group_vis_data)

In [336]:
# Create DFs with the topic distributions per group
tech_group_topdist = pd.DataFrame([dict(tup) for tup in tech_doc_topics]).fillna(value=0)
business_group_topdist = pd.DataFrame([dict(tup) for tup in business_doc_topics]).fillna(value=0)

In [337]:
# DataFrame with topic integer values based on topic with highest value
tech_group_lda_topics = pd.DataFrame(data={'top_topic': tech_group_topdist.idxmax(axis=1)})

# Add new column with string descriptions
tech_group_lda_topics['topic_names'] = tech_group_lda_topics.apply (lambda row: tech_topic_dict[row['top_topic']], axis=1)

In [338]:
tech_output_df = pd.concat((tech_group_lda_topics, tech_meetups), axis=1)

In [345]:
tech_output_df[['topic_names', 'description', 'topics']]

Unnamed: 0,topic_names,description,topics
0,Internet of Things,"Do you JavaScript, started the journey to ECMA...","[{'name': 'JavaScript', 'urlkey': 'javascript'..."
1,Data Analytics,<p>The meetup where people listen. We cut back...,"[{'name': 'Open Source', 'urlkey': 'opensource..."
2,Internet of Things,"<p>Meet PHP users, developers and recruiters n...","[{'name': 'PHP', 'urlkey': 'php', 'id': 455}, ..."
3,Data Analytics,Also known as the Flag and Bell Pub Crawl for ...,"[{'name': 'Web Technology', 'urlkey': 'web', '..."
4,Data Analytics,<p>MiniBar - London</p>\n<p>Your Monthly Face ...,"[{'name': 'Web Design', 'urlkey': 'webdesign',..."
5,Virtual Reality,<p>A number of members are now doing business ...,"[{'name': 'Marketing', 'urlkey': 'marketing', ..."
6,Hackathon,Vi er en gruppe som eksistere for at snakke da...,"[{'name': 'Scandinavian Languages', 'urlkey': ..."
7,Data Analytics,,"[{'name': 'New Technology', 'urlkey': 'newtech..."
8,Agile Workflows,<p>Meet other local Bloggers to talk about blo...,"[{'name': 'WordPress', 'urlkey': 'wordpress', ..."
9,Internet of Things,"<p>ELEVATOR PITCH <br>Gathering .NET coders, h...","[{'name': '.NET', 'urlkey': 'dotnet', 'id': 827}]"
