# Extract Meetup event data

## Tasks:
1. Imports and preliminaries
2. Load group ids
3. Data crawl
 * Extract relevant events
 * Extract relevant RSVPs

## 1. Imports and preliminaries

In [3]:
#Imports
import re
import json
import requests
import urllib
from urllib.request import urlopen
import random
import datetime
import ratelim
import os

#Import gensim
import gensim
from gensim import corpora
from gensim import models
import nltk
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import words

#Import Matt Williams' script (adapted to Python 3)
#import tools_mu
#import crawl_group_activity

In [4]:
#File paths
#Path to data files
data_path = os.getcwd() + "/" + "meetup_data"

#Read api key from config file 'my_api_key'
with open("my_api_key.json",'r') as data_file:
    my_api_key = json.load(data_file)['api_key']

## 2. Load group ids

In [5]:
#Extract json data.

#Observations: json loads works with strings so we had to decode the lines.
tech_groups = [json.loads(line.decode()) for line in open(data_path + "/" + "tech_groups.json","rb")]

#Subset groups in Cardiff - we'll work with these, and a random set.
cardiff_group_ids = [g['id'] for g in tech_groups if g['city']=="Cardiff"]

#Extract random sample of 200 groups
#Extract 200 random indices based on the tech_groups list
random_groups = random.sample(range(0,len(tech_groups)),200)

#Get the ids for those
random_group_ids = [g['id'] for num,g in enumerate(tech_groups) if num in random_groups]

#Final ids (use set to remove duplicates)
selected_groups_ids = set(cardiff_group_ids + random_group_ids)

## 3. Data crawl

In [6]:
#Meetup API url
api_base_url = "https://api.meetup.com/2/"

#Rate limits
RATELIM_DUR = 60 * 60
RATELIM_QUERIES = 9000

In [7]:
#Write event crawler
@ratelim.patient(RATELIM_QUERIES,RATELIM_DUR)
def crawl_events(group_id):
    '''
    Input: a meetup group id
    Output: a json object with information about the group from Meetup
    '''
    
    #Build request
    api_url = api_base_url + 'events'
    request_parameters = "?group_id={}&status=past&key={}".format(group_id,my_api_key)
    
    event_request = api_url + request_parameters
    
    #Make API call and obtain response using the get method in requests.
    response = requests.get(event_request)
    
    return(response.json())

In [8]:
#Extract events
%timeit

#This is a list of dicts, each of which has a group and its attendees
selected_events = [crawl_events(gid)['results'] for gid in selected_groups_ids]

#Extract event ids
selected_event_ids = [event['id'] for all_events in selected_events for event in all_events]

In [9]:
#Write RSVP crawler
@ratelim.patient(RATELIM_QUERIES,RATELIM_DUR)
def crawl_rspvs(event_id):
    '''
    Input: event_id
    Output: a json object with information about attendees
    '''
    
    #Build request:
    api_url = api_base_url + "rsvps"
    request_parameters = "?&event_id={}&key={}".format(event_id,my_api_key)
    
    rsvps_request = api_url + request_parameters
    
    #Make request
    response = requests.get(rsvps_request)
    
    return(response.json())
    

In [8]:
#Extract rsvps for each group/event.
#Want to end with a list where every element is a group with events and rsvps
#%time

#selected_rsvps = [crawl_rspvs('results') for eid in selected_event_ids]

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 25 µs


## 4. Exploratory analysis
### a. Event description

#### Tasks
* Extract topic descriptions, dates and groups.
* Preliminary analysis of descriptions
* Preprocessing (lemmatize, stem, tokenize)
* Model and measure

In [10]:
#Create function that extracts an element from a dict only if that key exists
def extract_element(dictionary,key):
    '''
    Input: a dictionary and a key.
    Output: The value of the key if it exists; "NA" if not.
    '''
    try:
        out = dictionary[key]
    except:
        out = "No Key"
    return(out)

In [11]:
#Extract event descriptions
event_desc = [extract_element(event,"description") for group in selected_events for event in group]

#Also extract group cities. We will remove them later from text descriptions
group_cities = set([g['city'].lower() for g in tech_groups if g['id'] in selected_groups_ids])

In [12]:
#Load stop words.
stop_words = stopwords.words('english')
english_words = words.words()
#Load lemmatizer to lemmatize words
lmtzr = WordNetLemmatizer()

#Create list of words to remove
words_to_remove = list(set(stop_words+list(group_cities)+english_words))

In [13]:
def clean_html(raw_html):
    cleanr =re.compile('<.*?>')
    cleantext = re.sub(cleanr,'', raw_html)
    return(cleantext)

In [14]:
def pre_process_text(document):
    '''
    input = document, a string
    output = A list of tokens for analysis in gensim
    '''
    
    #To lowercase
    doc_low = document.lower()
    
    #Remove html
    s_no_html = re.sub('<[^<]+?>', '', doc_low)
    
    #Remove punctuation
    s_no_punct = "".join([w for w in s_no_html if w not in string.punctuation])
    
    
    #Tokenize and lemmatize
    tokens = nltk.word_tokenize(s_no_punct)
    tokens_lemmatized = [lmtzr.lemmatize(token) for token in tokens]
    
    
    #Get position
    tokens_lab = nltk.pos_tag(tokens_lemmatized)
    
    #Focus on nouns and remove stopwords
    tokens_selected = [tok[0] for tok in tokens_lab if tok[1]=="NN" and 
                      tok[0] not in words_to_remove]
    
    return(tokens_selected)

In [16]:
#Process all text
event_description_corpus = [pre_process_text(doc) for doc in event_desc]

In [18]:
#Create a dictionary of unique tokens
dictionary = corpora.Dictionary(event_description_corpus)
corpus = [dictionary.doc2bow(doc) for doc in event_description_corpus]

In [19]:
tf_idf = models.TfidfModel(corpus)
tf_idf_corpus = tf_idf[corpus]

In [22]:
lda_model = models.LdaModel(tf_idf_corpus, id2word=dictionary, num_topics=20,passes=10,iterations=150)

In [23]:
#Alas, this doesn't seem to work
lda_model.show_topics(num_topics=5)

[(13,
  '0.024*youre + 0.023*django + 0.021*brian + 0.020*conversocial + 0.018*castletypething + 0.013*gareth + 0.013*startup + 0.007*bdd + 0.006*qampa + 0.005*dennisdropin'),
 (14,
  '0.026*eresearch + 0.024*centre + 0.019*meetup + 0.019*wednesday + 0.010*tbc + 0.006*lineup + 0.005*october + 0.005*crowdsource + 0.005*gettogether + 0.005*rsvp'),
 (7,
  '0.044*benugo + 0.015*organiser + 0.015*amp + 0.014*cheerspaul + 0.014*gitter + 0.007*meetup + 0.005*techie + 0.005*sophie + 0.005*networking + 0.004*•'),
 (9,
  '0.075*xpday + 0.009*php + 0.007*janice + 0.006*feb + 0.006*harvey + 0.005*max + 0.005*dropin + 0.004*bitcoin + 0.004*ed + 0.004*that’s'),
 (0,
  '0.022*startup + 0.016*• + 0.015*app + 0.015*google + 0.013*apps + 0.012*webinar + 0.009*im + 0.009*blogger + 0.008*youll + 0.008*meetup')]

### b. Group keywords
#### Tasks
* Extract keywords and date created from groups
* Option 1: bin keywords by month and perform tf-idf. What are the top keywords per month?
* Option 2: topic model all keywords with large n and explore topic distribution per month?

In [10]:
def extract_topics_from_dict(topic,container):
    '''
    input: a container (list) of dictionaries and a key (topic) to extract
    output: a list with the topics
    '''
    
    out = [top[topic] for top in container]
    return(out)

def extract_date_from_epoch(posix_date):
    '''
    input: a POSIX timestamp.
    output: a local date
    '''
    out = datetime.datetime.fromtimestamp(posix_date).strftime("%d-%m-%Y")
    
    return(out)


In [11]:
#Extract keywords and dates created from groups

group_topics = [{"group_id": g["_id"],
                 "group_created": extract_date_from_epoch(
            int(g["created"]['$numberLong'])/1000),
                   "group_topics":extract_topics_from_dict('urlkey',g['topics'])} for
                g in tech_groups]

In [12]:
#Binning
import pandas as pd

In [13]:
group_topics[2]

{'group_created': '03-01-2006',
 'group_id': 218194,
 'group_topics': ['php',
  'opensource',
  'softwaredev',
  'edtech',
  'newtech',
  'ria',
  'internetpro',
  'lampsoftware',
  'web',
  'drupal',
  'technology',
  'web-development',
  'cms',
  'computer-programming']}

In [14]:
#Create dataframe and use groupby
group_topics_df = pd.DataFrame(group_topics)

#Create a month bin where day is always one
group_topics_df['created_date'] = group_topics_df[
    'group_created'].apply(lambda x:
                           datetime.datetime.strptime("01-" + x[3:],"%d-%m-%Y"))    

#Bin over created_date
topics_by_month = group_topics_df.groupby('created_date')['group_topics'].apply(lambda x:
                                                               [t for group in x for t in group])

#Remove keywords that only appear one
def remove_vrare_keywords(keyword_list,threshold=1):
    '''
    input: list of keywords and a threshold for inclusion
    output: counts the number of keyword occurrences in the period and removes
        #those that appear below the threshold
    '''
    topic_subset = [topic for topic in keyword_list if keyword_list.count(topic)>threshold]
    return(topic_subset)

topics_by_month = topics_by_month.apply(remove_vrare_keywords)

In [15]:
#Next steps:
#Create a corpus where periods are documents, and transform via tf idf.

#Create the dictionary.
month_keyword_dictionary = corpora.Dictionary(topics_by_month)

#Create the bag of words
month_keyword_bow = [month_keyword_dictionary.doc2bow(doc) for doc in topics_by_month]

#Apply tf-idf transformation
#Initialise model to learn term document frequencies
month_keyword_tfidf = models.TfidfModel(month_keyword_bow)
month_kword_tfidf_fit = month_keyword_tfidf[month_keyword_bow]

In [16]:
#Map token ids to keywords and extract top 5 per month.

#This dictionary maps token names to id integers
token_ids = month_keyword_dictionary.token2id


#Lookup function between token ids and tokens
def obtain_token_from_id(token_id):
    '''
    input: an id for a token
    output" the token (group keyword)
    '''
    
    #Converts the token ids keys into a list selected by the index of the token_id in
        #the values list
    my_token = list(token_ids.keys())[list(token_ids.values()).index(token_id)]
    return(my_token)     

In [17]:
#Function to extract top topics by month

def obtain_top_tokens_by_month(month_values,threshold=5):
    '''
    input: the tf-idf values for tokens in one month
    returns: the top 10 tokens
    
    '''
    #Create dataframe for sorting easily
    df = pd.DataFrame(month_values,columns=['id','weight'])
    df.sort(columns="weight",inplace=True,ascending=False)
    df.reset_index(drop=True,inplace=True)
    
    #Extract tokens
    df['topic'] = df['id'].apply(lambda x: obtain_token_from_id(x))
    
    if len(df)-1<threshold:
        return(", ".join(list(df.ix[:len(df)-1,'topic'])))
    else:
        return(", ".join(list(df.ix[:threshold-1,'topic'])))

In [18]:
top_tokens_monthly = [
    {"month":topics_by_month.index[i],
     "top_topics":obtain_top_tokens_by_month(j)}
    for i,j in enumerate(list(month_kword_tfidf_fit))]



In [19]:
#Still not super-informative
pd.DataFrame(top_tokens_monthly)

Unnamed: 0,month,top_topics
0,2002-10-01,
1,2006-01-01,
2,2006-04-01,
3,2006-10-01,
4,2007-01-01,
5,2007-02-01,
6,2007-06-01,
7,2007-07-01,
8,2007-11-01,"computer-programming, softwaredev"
9,2008-01-01,


In [20]:
#Option 2: LDA with all keywords.
#Create list of documents with keywords
group_topics_list = [g['group_topics'] for g in group_topics]

#Create dictionary.
gr_keyword_dictionary = corpora.Dictionary(group_topics_list)

#Create bag of words representation
gr_keyword_corpus = [gr_keyword_dictionary.doc2bow(gr) for gr in group_topics_list]

#Initialise tf-idf model
gr_keyword_tfidf = models.TfidfModel(gr_keyword_corpus)

#Fit tf-idf model
gr_tfidf_corpus = gr_keyword_tfidf[gr_keyword_corpus]

#Initialise lda model
gr_keyword_lda = models.LdaModel(gr_tfidf_corpus,
                                id2word=gr_keyword_dictionary,num_topics=100,
                                passes=20,iterations=300)

In [21]:
gr_keyword_lda.show_topics(num_topics=100)

[(0,
  '0.078*it-professionals + 0.029*web-new-technology + 0.024*user-group + 0.020*environment + 0.019*scrum-development + 0.017*small-business-marketing-strategy + 0.012*cleanweb + 0.010*cross-mobile-development + 0.010*sustainability + 0.009*kong'),
 (1,
  '0.046*mobile-app-development + 0.033*developing-mobile-apps + 0.016*windows-mobile + 0.015*websocket + 0.012*basics-of-iphone-app-development + 0.012*startup-businesses + 0.012*html5 + 0.012*new-product-development-software-tech + 0.011*segway + 0.011*solidworks'),
 (2,
  '0.054*cms + 0.044*webstandards + 0.042*drupal + 0.040*ria + 0.038*javascript-libraries + 0.037*web-development + 0.034*web + 0.033*javascript + 0.032*php + 0.029*internetpro'),
 (3,
  '0.090*ux-design + 0.089*user-experience + 0.081*ixd + 0.064*usability + 0.054*ui-design + 0.050*user-research + 0.041*mobile-user-experience + 0.041*ia + 0.031*webdesign + 0.031*user-experience-design'),
 (4,
  '0.049*database-development + 0.047*ebizowners + 0.043*jvm-languages