In [159]:
#Imports
import re
import json
import requests
import urllib
from urllib.request import urlopen
import random
import datetime
import ratelim
import os

#Import gensim
import gensim
from gensim import corpora
from gensim import models
import nltk
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.tag.perceptron import PerceptronTagger
from nltk.tokenize import TreebankWordTokenizer

import pandas as pd
from pymongo import MongoClient

# Load tagger once instead of unpickling from disk
# http://stackoverflow.com/questions/11610076/slow-performance-of-pos-tagging-can-i-do-some-kind-of-pre-warming
tagger = PerceptronTagger()

# Same with tokenizer
# http://billchambers.me/tutorials/2015/01/14/python-nlp-cheatsheet-nltk-scikit-learn.html
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

tokenizor = TreebankWordTokenizer()

In [225]:
# taken from http://stackoverflow.com/questions/16249736/how-to-import-data-from-mongodb-to-pandas
def _connect_mongo(host, port, username, password, db):
    """ A util for making a connection to mongo """

    if username and password:
        mongo_uri = 'mongodb://%s:%s@%s:%s/%s' % (username, password, host, port, db)
        conn = MongoClient(mongo_uri)
    else:
        conn = MongoClient(host, port)


    return conn[db]


def read_mongo(db, collection, query={}, host='localhost', port=27017, username=None, password=None, no_id=True):
    """ Read from Mongo and Store into DataFrame """

    # Connect to MongoDB
    db = _connect_mongo(host=host, port=port, username=username, password=password, db=db)

    # Make a query to the specific DB and Collection
    cursor = db[collection].find(query)

    # Expand the cursor and construct the DataFrame
    df =  pd.DataFrame(list(cursor))

    # Delete the _id
    if no_id:
        del df['_id']

    return df
    

def pre_process_text(document):
    '''
    input = document, a string
    output = A list of tokens for analysis in gensim
    '''
    
    #To lowercase
    document = document.lower()
    
    #Remove html
    document = strip_fluff(document)
    
    #Tokenize and lemmatize
    sents = sent_detector.tokenize(document)
    tokens = []
    for sent in sents:
        tokens += tokenizor.tokenize(sent)
    tokens = [x for x in tokens if x not in(string.punctuation)]
    tokens_lemmatized = [lmtzr.lemmatize(token) for token in tokens]

    #Get position
    tagset = None
    tokens_lab = tagger.tag(tokens_lemmatized)
    
    #Focus on nouns and remove stopwords
    tokens_selected = [tok[0] for tok in tokens_lab if tok[0] not in words_to_remove and tok[1]=="NN"]
    
    return tokens_selected

def strip_fluff(doc):
    s = strip_html(doc)
    s = strip_url(s)
    s = strip_newline(s)
    s = strip_nbsp(s)
    return s

def strip_html(doc):
    p = re.compile(r'<.*?>')
    return p.sub(' ', doc)

def strip_url(doc):
    return re.sub(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', '', doc)

def strip_newline(doc):
    return doc.replace('\n', ' ')

def strip_nbsp(doc):
    return doc.replace('&nbsp;', ' ')

def strip_html_entity(doc):
    return doc.replace('&amp;')

In [None]:
with open

In [63]:
business_meetups = read_mongo('meetup', 'groups')
tech_meetups = read_mongo('meetup_tech_groups', 'groups')

### Business Groups First

In [216]:
business_meetups = read_mongo('meetup', 'groups')

# Extract group cities. We will remove them later from text descriptions
group_cities = list(set(business_meetups.city.str.lower()))

# Load stop words.
stop_words = stopwords.words('english')

# Lets remove all english words
english_words = words.words()

# Day names
days = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']

#Load lemmatizer to lemmatize words
lmtzr = WordNetLemmatizer()

#Create list of words to remove
words_to_remove = set(stop_words+list(group_cities)+english_words+['meetup', 'meet-up']+days)

In [227]:
# A list of group descriptions
group_descriptions = [strip_fluff(str(x)) for x in business_meetups.description]

#Process all text
group_description_corpus = [pre_process_text(doc) for doc in group_descriptions]

#Create a dictionary of unique tokens
dictionary = corpora.Dictionary(group_description_corpus)
corpus = [dictionary.doc2bow(doc) for doc in group_description_corpus]

tf_idf = models.TfidfModel(corpus)
tf_idf_corpus = tf_idf[corpus]

In [228]:
lda_model = models.LdaModel(tf_idf_corpus, id2word=dictionary, num_topics=25, passes=10, iterations=50)
lda_model.show_topics(num_topics=25)

[(0,
  '0.014*cambridgeshire + 0.013*face-to-face + 0.012*co-founder + 0.010*xx + 0.010*amp + 0.007*lavinia + 0.006*database + 0.006*we’ll + 0.006*responsivecoffee + 0.006*owner/manager'),
 (1,
  '0.030*• + 0.022*forex + 0.017*don’t + 0.015*“ + 0.011*app + 0.010*coffee/glass + 0.010*amp + 0.008*organisation + 0.008*john + 0.007*asc'),
 (2,
  '0.039*bitcoin + 0.021*pma + 0.014*blockchain + 0.013*part-time + 0.013*internet + 0.011*amp + 0.011*and/or + 0.011*wider + 0.010*· + 0.009*uk'),
 (3,
  "0.049*socialise + 0.014*email + 0.011*'nlp + 0.009*ltd + 0.007*hubspot + 0.006*co-worker + 0.006*bbc + 0.005*pricing + 0.005*organising + 0.005*mba"),
 (4,
  '0.228*networking + 0.033*planning + 0.022*you’ll + 0.018*ipse + 0.017*stimulating + 0.015*amp + 0.012*freelancer + 0.012*organisation + 0.010*jci + 0.009*ebay'),
 (5,
  '0.014*play® + 0.009*vibe + 0.009*lego® + 0.007*bos + 0.006*facebook + 0.006*amp + 0.006*geekeasy + 0.006*centred + 0.006*salesforce.com + 0.006*multi-jurisdiction'),
 (6,
  

### Tech Groups

In [231]:
# Extract group cities. We will remove them later from text descriptions
tech_group_cities = list(set(tech_meetups.city.str.lower()))

tech_meetups = read_mongo('meetup_tech_groups', 'groups')

# Extract group cities. We will remove them later from text descriptions
tech_group_cities = list(set(tech_meetups.city.str.lower()))

# A list of group descriptions
tech_group_descriptions = [strip_fluff(str(x)) for x in tech_meetups.description]

#Process all text
tech_group_description_corpus = [pre_process_text(doc) for doc in tech_group_descriptions]

#Create a dictionary of unique tokens
tech_dictionary = corpora.Dictionary(tech_group_description_corpus)
tech_corpus = [tech_dictionary.doc2bow(doc) for doc in tech_group_description_corpus]

tech_tf_idf = models.TfidfModel(tech_corpus)
tech_tf_idf_corpus = tf_idf[tech_corpus]

In [232]:
tech_lda_model = models.LdaModel(tech_tf_idf_corpus, id2word=tech_dictionary, num_topics=30, passes=30, iterations=100)
tech_lda_model.show_topics(num_topics=100)

[(0,
  '0.064*blog + 0.040*blogger + 0.024*reporting + 0.016*django + 0.016*kid + 0.016*haskell + 0.012*github + 0.012*demo + 0.011*startup + 0.010*etc'),
 (1,
  '0.358*software + 0.017*api + 0.016*laptop + 0.012*automation + 0.011*amp + 0.011*arduino + 0.011*lunchtime + 0.008*we’re + 0.008*consultancy + 0.007*virtualization'),
 (2,
  '0.046*facebook + 0.037*website + 0.032*processing + 0.019*organiser + 0.015*elasticsearch + 0.015*we’re + 0.015*software + 0.012*logstash + 0.011*paas + 0.010*inc.'),
 (3,
  '0.108*networking + 0.063*amp + 0.016*sharing + 0.015*jboss + 0.011*java + 0.011*software + 0.011*pentaho + 0.009*copywriter + 0.009*hr + 0.008*workload'),
 (4,
  '0.039*scotland + 0.019*software + 0.019*healthcare + 0.017*c++ + 0.013*ethereum + 0.012*powershell + 0.010*magento + 0.010*java + 0.009*accu + 0.009*magento2'),
 (5,
  '0.062*cm + 0.044*linux + 0.021*amazon + 0.015*hackspace + 0.015*html + 0.011*ibm + 0.011*sencha + 0.010*javascript + 0.008*mk + 0.007*sussex'),
 (6,
  '0.0