In [32]:
%matplotlib inline

import os
import re
import string
from datetime import datetime
from itertools import chain

import pandas as pd
import pyLDAvis
import seaborn as sns
from gensim import corpora
from gensim import models
from nltk.stem.wordnet import WordNetLemmatizer
from pyLDAvis import gensim as gs
from pymongo import MongoClient

In [2]:
# taken from http://stackoverflow.com/questions/16249736/how-to-import-data-from-mongodb-to-pandas
def _connect_mongo(host, port, username, password, db):
    """ A util for making a connection to mongo """

    if username and password:
        mongo_uri = 'mongodb://%s:%s@%s:%s/%s' % (username, password, host, port, db)
        conn = MongoClient(mongo_uri)
    else:
        conn = MongoClient(host, port)


    return conn[db]


def read_mongo(db, collection, query={}, host='localhost', port=27017, username=None, password=None, no_id=True):
    """Run a query against a MongoDB collection and return the result as a DataFrame.

    Parameters
    ----------
    db : name of the database.
    collection : name of the collection inside ``db``.
    query : filter passed to ``find``; the default ``{}`` is only read here,
        never mutated.
    host, port, username, password : forwarded to ``_connect_mongo``.
    no_id : when true, the ``_id`` column is removed from the result.

    Returns
    -------
    pandas.DataFrame built from the list of documents returned by the cursor.
    """
    # Connect to MongoDB (kept in a separate name so the `db` argument is not shadowed)
    database = _connect_mongo(host=host, port=port, username=username, password=password, db=db)

    # Query the requested collection
    cursor = database[collection].find(query)

    # Materialise the cursor and build the DataFrame
    df = pd.DataFrame(list(cursor))

    # Drop the _id column. An empty result set produces a frame with no
    # columns at all, so only delete the column when it actually exists.
    if no_id and '_id' in df.columns:
        del df['_id']

    return df
    

def pre_process_text(document):
    '''
    Turn a raw document string into a list of noun tokens for gensim.

    The text is lowercased, cleaned with strip_fluff, split into sentences and
    then tokens, filtered against string.punctuation, lemmatized and tagged.
    Only tokens tagged "NN" that are not in words_to_remove are returned.

    NOTE(review): relies on the module-level names sent_detector, tokenizor,
    tagger, words_to_remove and lmtzr; apart from lmtzr, their definitions are
    not visible in this part of the notebook -- confirm they exist before
    calling.
    '''
    cleaned = strip_fluff(document.lower())

    # Sentence-split, tokenize, and drop punctuation tokens in one pass
    words = [
        word
        for sentence in sent_detector.tokenize(cleaned)
        for word in tokenizor.tokenize(sentence)
        if word not in (string.punctuation)
    ]

    # Lemmatize every token, then tag the lemmas
    tagged = tagger.tag([lmtzr.lemmatize(word) for word in words])

    # Keep the lemmas tagged "NN" that are not on the removal list
    return [
        entry[0]
        for entry in tagged
        if entry[0] not in words_to_remove and entry[1] == "NN"
    ]

def strip_fluff(doc):
    """Run the text clean-up steps over ``doc`` and return the result.

    Applied in order: strip_html, strip_url, strip_newline, strip_nbsp.
    """
    for clean in (strip_html, strip_url, strip_newline, strip_nbsp):
        doc = clean(doc)
    return doc

def strip_html(doc):
    """Replace every ``<...>`` span in ``doc`` with a single space.

    The match is non-greedy and ``.`` does not cross newlines, so a span
    whose ``<`` and ``>`` sit on different lines is left untouched.
    """
    return re.sub(r'<.*?>', ' ', doc)

def strip_url(doc):
    """Return ``doc`` with every substring matched by the URL pattern removed.

    The pattern is case-insensitive and starts a match at ``http://``,
    ``https://``, a ``www.``-style prefix, or a ``domain.tld/`` prefix.
    """
    url_pattern = re.compile(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))')
    return url_pattern.sub('', doc)

def strip_newline(doc):
    """Return ``doc`` with every newline character turned into a space."""
    return ' '.join(doc.split('\n'))

def strip_nbsp(doc):
    """Return ``doc`` with every literal ``&nbsp;`` entity turned into a space."""
    return ' '.join(doc.split('&nbsp;'))

def strip_html_entity(doc):
    """Return ``doc`` with every ``&amp;`` entity replaced by a literal ``&``.

    The original call passed no replacement to ``str.replace`` and raised
    TypeError on every use. The entity is replaced by the character it
    stands for, in the same way strip_nbsp turns ``&nbsp;`` into a space.
    """
    return doc.replace('&amp;', '&')

### Create stop words and read the data from source

In [3]:
# The place-name file lives under data/, two directory levels above the
# current working directory
towns_path = os.path.dirname(os.path.dirname(os.getcwd())) + '/data/towns_and_cities_2015.txt'

# One place name per line, with the trailing newline removed
with open(towns_path) as towns_file:
    place_names = [entry.rstrip('\n') for entry in towns_file]

# WordNet lemmatizer instance (used by pre_process_text)
lmtzr = WordNetLemmatizer()

### Analyze Business Groups First

In [4]:
# Business group DataFrame: every document of the 'groups' collection in the
# 'meetup' database (default empty query), with the Mongo '_id' column removed
business_meetups = read_mongo('meetup', 'groups')

In [5]:
# One list of topic names per business group
business_group_topics = [
    [topic['name'] for topic in group_topics]
    for group_topics in business_meetups.topics
]

# Dictionary built from the topic-name lists
business_group_dict = corpora.Dictionary(business_group_topics)

# Bag-of-words representation of every group
business_group_corpus = list(map(business_group_dict.doc2bow, business_group_topics))

# Build the TF-IDF model from the bag-of-words corpus
business_group_tfidf = models.TfidfModel(business_group_corpus)

# Apply the TF-IDF model to the bag-of-words corpus
business_group_tfidf_corpus = business_group_tfidf[business_group_corpus]

In [6]:
# Build the 25-topic LDA model for business groups from the TF-IDF corpus.
# NOTE(review): no random seed is passed here, so topic numbering may differ
# between runs -- confirm before relying on fixed topic ids.
business_group_lda = models.LdaMulticore(
    business_group_tfidf_corpus,
    id2word=business_group_dict,
    num_topics=25,
    passes=100,
    iterations=300,
    workers=3,
)

In [7]:
# Per-group topic distribution from the business LDA model, computed on the
# TF-IDF corpus (not the raw bag-of-words corpus)
business_doc_topics = business_group_lda[business_group_tfidf_corpus]

In [8]:
# Tech group DataFrame: every document of the 'groups' collection in the
# 'meetup_tech_groups' database (default empty query), with the Mongo '_id'
# column removed
tech_meetups = read_mongo('meetup_tech_groups', 'groups')

In [9]:
# One list of topic names per tech group
tech_group_topics = [
    [topic['name'] for topic in group_topics]
    for group_topics in tech_meetups.topics
]

# Dictionary built from the topic-name lists
tech_group_dict = corpora.Dictionary(tech_group_topics)

# Bag-of-words representation of every group
tech_group_corpus = list(map(tech_group_dict.doc2bow, tech_group_topics))

# Build the TF-IDF model from the bag-of-words corpus
tech_group_tfidf = models.TfidfModel(tech_group_corpus)

# Apply the TF-IDF model to the bag-of-words corpus
tech_group_tfidf_corpus = tech_group_tfidf[tech_group_corpus]

In [10]:
# Build the 25-topic LDA model for tech groups from the TF-IDF corpus.
# NOTE(review): no random seed is passed here, so topic numbering may differ
# between runs, while tech_topic_dict labels topics by fixed id -- confirm.
tech_group_lda = models.LdaMulticore(
    tech_group_tfidf_corpus,
    id2word=tech_group_dict,
    num_topics=25,
    passes=100,
    iterations=300,
    workers=3,
)

In [11]:
# Per-group topic distribution from the tech LDA model, computed on the
# TF-IDF corpus (not the raw bag-of-words corpus)
tech_doc_topics = tech_group_lda[tech_group_tfidf_corpus]

In [12]:
# Show the weighted terms of all 25 tech topics
tech_group_lda.show_topics(num_topics=25)

  def _ipython_display_formatter_default(self):
  def _singleton_printers_default(self):


[(0,
  '0.038*Internet of Things + 0.025*Smart Sensors + 0.023*Information Security + 0.021*Computer Security + 0.021*IOT hacking + 0.016*Cybersecurity + 0.015*Sensors + 0.015*M2M + 0.015*Smart Home + 0.015*Network Security'),
 (1,
  '0.042*Makers + 0.040*Makerspaces + 0.039*Arduino + 0.035*Electronics + 0.029*Hacking + 0.027*Robotics + 0.026*3D Printing + 0.021*DIY (Do It Yourself) + 0.021*Raspberry Pi + 0.016*Microcontrollers'),
 (2,
  '0.027*Coders + 0.026*hackathon + 0.020*Social Coding + 0.018*hackathons + 0.017*Hacking + 0.014*Startup Pitching + 0.011*Angular + 0.010*Mobile App Development + 0.010*Hack + 0.008*Developing Mobile Apps'),
 (3,
  '0.020*Python + 0.014*Civic Engagement & Technology + 0.014*Django + 0.013*Open Government + 0.012*Open Data + 0.009*Java Server Side + 0.008*Python Web Development + 0.008*Small Business Networking + 0.008*Civic Hacking + 0.008*Java Virtual Machine'),
 (4,
  '0.024*Product Design + 0.022*Product Management + 0.019*Product Development + 0.01

In [13]:
# Show the weighted terms of all 25 business topics
business_group_lda.show_topics(num_topics=25)

[(0,
  '0.018*Business Owners + 0.009*Cashflow Game + 0.008*Rich Dad Cashflow Club + 0.005*Rich Dad, Poor Dad + 0.005*Financial Education + 0.005*Java + 0.005*Business Support + 0.004*Robert Kiyoaski + 0.004*Rich Dad + 0.004*Tax'),
 (1,
  '0.019*Personal Development for Women + 0.012*Professional Development for Women + 0.009*Emotional Intelligence + 0.009*Independent Filmmaking + 0.009*Change management + 0.008*Corporate Communications + 0.008*toastmasters is the proven way to public speaking + 0.008*Change Leadership + 0.007*Public Relations + 0.007*organizational development'),
 (2,
  '0.029*Social + 0.022*Dining Out + 0.022*Fun Times + 0.017*New In Town + 0.016*Social Networking + 0.013*Nightlife + 0.012*Wellness + 0.011*Healthy Living + 0.009*Eating, Drinking, Talking, Laughing, Etc + 0.009*Fitness'),
 (3,
  '0.011*Funding + 0.009*Business and Career Networking + 0.008*Group Coaching + 0.008*Inspirational + 0.008*Creating a Successful Business + 0.006*Networking For the Self Emplo

In [14]:
# Prepare the pyLDAvis data for the tech LDA model from its TF-IDF corpus and dictionary
tech_group_vis_data = gs.prepare(tech_group_lda, tech_group_tfidf_corpus, tech_group_dict)

In [17]:
# Show the prepared tech-topic visualisation in the notebook
pyLDAvis.display(tech_group_vis_data)

In [18]:
# Hand-assigned display names for the 25 tech topics, keyed by topic id
# (position in the tuple below == topic id).
# NOTE(review): the saved show_topics output in this notebook lists topic 0 as
# Internet of Things / security and topic 1 as Makers, which does not match
# ids 0 and 1 here -- the labels may follow a different numbering (e.g. the
# pyLDAvis view) or an earlier run; verify before trusting 'topic_names'.
tech_topic_dict = dict(enumerate((
    'Web Development',            # 0
    'Tech Entrepreneurs',
    'Data Analytics',
    'Agile Workflows',
    'Design',
    'Cloud Computing',            # 5
    'Technology Professionals',
    'Game Development',
    'Internet of Things',
    'NoSQL',
    'Makers',                     # 10
    'WordPress',
    'DevOps',
    'Virtual Reality',
    'Mobile App Development',
    'Functional Programming',     # 15
    'Cryptographic Currencies',
    'User Experience',
    'Python',
    'Security',
    'Distributed Computing',      # 20
    'SEO',
    'Scrum',
    'Hackathon',
    'Open Data',                  # 24
)))

In [19]:
# Prepare the pyLDAvis data for the business LDA model from its TF-IDF corpus and dictionary
business_group_vis_data = gs.prepare(business_group_lda, business_group_tfidf_corpus, business_group_dict)

In [20]:
# Show the prepared business-topic visualisation in the notebook
pyLDAvis.display(business_group_vis_data)

In [21]:
# Topic distributions as DataFrames: one row per group, one column per topic
# id; a topic absent from a group's distribution gets weight 0
tech_group_topdist = pd.DataFrame(list(map(dict, tech_doc_topics))).fillna(0)
business_group_topdist = pd.DataFrame(list(map(dict, business_doc_topics))).fillna(0)

In [22]:
# Dominant topic per group: the column label (topic id) holding the highest
# weight in each row of the topic-distribution frame
tech_group_lda_topics = pd.DataFrame(data={'top_topic': tech_group_topdist.idxmax(axis=1)})

# Add the human-readable label of the dominant topic. Series.map looks each id
# up directly instead of building a row object per group with apply(axis=1),
# and yields an empty column for an empty frame. The explicit lookup still
# raises KeyError for an id that is missing from tech_topic_dict.
tech_group_lda_topics['topic_names'] = tech_group_lda_topics['top_topic'].map(
    lambda topic_id: tech_topic_dict[topic_id])

In [23]:
# Place each group's dominant-topic columns next to its Meetup record.
# NOTE(review): an axis=1 concat aligns rows on the index -- this assumes both
# frames share the same row index and order; confirm.
tech_output_df = pd.concat((tech_group_lda_topics, tech_meetups), axis=1)

In [40]:
# CSV path template under data/meetup, two directory levels above the current
# working directory; '{}' is filled with a timestamp (despite its name,
# out_dir holds a file path, not a directory)
out_dir = os.path.dirname(os.path.dirname(os.getcwd())) + '/data/meetup/{}_tech.csv'

# Columns to export, and the hour-resolution timestamp used in the file name
export_columns = ['category', 'city', 'country', 'lat', 'lon', 'topic_names', 'description', 'topics']
export_stamp = datetime.now().strftime('%Y%m%d%H')

tech_output_df[export_columns].to_csv(out_dir.format(export_stamp))