# Example: Meetup

## Findings:
1) The average number of people who RSVPed for a Meetup event was 14.
2) Half of the Meetup events had a number of RSVPs between 4 and 8.
3) There are some words that come up more often than others in the event titles (women, Denver, happy hour, networking, meditation, code). However, the presence of certain words seems to change based on the time of the week the data is scraped. For example, once Friday events were included in the data I scraped, a common word was "poker." "Poker" did not show up at all in the Thursday events. The opposite was true of the word "code."
4) Overall, the average number of RSVPs was higher when the words "happy hour" or "networking" were included in the event title.
5) However, the events with "meditation" in the event title had very low relative attendance.

In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess


class MUSpider(scrapy.Spider):
    """Scrape event listings from Meetup's find-events page for Denver, CO.

    Every matched listing row is emitted as a dict holding the event title,
    the listing's ``<time datetime>`` value, the organization name and the
    raw attendee-count text.
    """

    name = "MUS"

    # Single entry point: the event search results around Denver, CO.
    start_urls = [
        'https://www.meetup.com/find/events/?allMeetups=true&radius=5&userFreeform=Denver%2C+CO&mcId=z80212&mcName=Denver%2C+CO&eventFilter=all',
    ]

    def parse(self, response):
        """Yield one item per event-listing row found in ``response``."""
        # The class attribute is matched verbatim, trailing spaces included.
        listing_rows = response.xpath('//*[@class="row event-listing clearfix doc-padding  "]')

        for row in listing_rows:
            # Title, time and organization keep every match (lists); the
            # attendee count keeps only the first matching text node.
            title_parts = row.xpath("div[2]/div/a/span[@itemprop='name']/text()").extract()
            start_time = row.xpath("div[1]/a/time/@datetime").extract()
            organizer = row.xpath("div[2]/div/div[1]/a/span[@itemprop='name']/text()").extract()
            attendee_text = row.xpath("div[2]/div/div[2]/div[@class='attendee-count']/text()").extract_first()

            yield {
                "meetup_title": title_parts,
                "time": start_time,
                "organization": organizer,
                "num_attending": attendee_text,
            }

# Configure the crawler to export scraped items as JSON into Meetups22.json,
# with Scrapy's logging switched off.
process = CrawlerProcess(dict(
    FEED_FORMAT='json',
    FEED_URI='Meetups22.json',
    LOG_ENABLED=False,
))

# Queue the Meetup spider, then run the crawl.
process.crawl(MUSpider)
process.start()
print('Success!')


In [None]:
import pandas as pd

# Load the items exported to Meetups22.json into a DataFrame.
firstpage = pd.read_json('Meetups22.json', orient='records')
# Report (rows, columns), then preview the first rows.
print(firstpage.shape)
firstpage.head()

In [None]:
def pull_numbers(string):
    """Return the first all-digit token found in ``string``.

    The text is split on whitespace and the first token made up only of
    digits is returned unchanged, as a string.

    Args:
        string: Raw text to search. ``None`` or any other non-string value
            is treated as "nothing to extract".

    Returns:
        The first digit-only token as a ``str``, or ``None`` when the input
        is not a string or contains no such token.
    """
    # Missing values arrive as non-strings and have no ``split`` method.
    if not isinstance(string, str):
        return None
    return next((token for token in string.split() if token.isdigit()), None)

In [None]:
# Replace each raw attendee-count text with the digit token found by pull_numbers.
firstpage["num_attending"] = firstpage["num_attending"].apply(pull_numbers)

In [None]:
firstpage.head(10)


In [None]:
firstpage["meetup_title"].iloc[1]

In [None]:
def list_to_string(lst):
    """Return the first element of ``lst``, or ``""`` when there is none.

    Args:
        lst: A sequence of extracted text fragments; may be empty or ``None``.

    Returns:
        ``lst[0]`` when ``lst`` is non-empty, otherwise an empty string so
        that substring checks on the result keep working.
    """
    # An empty sequence has no first element; indexing it would raise IndexError.
    if not lst:
        return ""
    return lst[0]

In [None]:
# Each scraped title is a list; keep only its first element (see list_to_string).
firstpage["meetup_title"] = firstpage["meetup_title"].apply(list_to_string)

In [None]:
firstpage.iloc[65:]

In [None]:
# Convert the extracted digit strings to integers so they can be plotted and averaged.
# NOTE(review): int() raises on a missing value (None) — confirm every row has a count.
firstpage["num_attending"] = firstpage["num_attending"].apply(int)

In [None]:
firstpage.dtypes

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import statistics

# Histogram of RSVP counts across events (20 bins).
plt.figure(figsize=(10,10))
plt.hist(firstpage["num_attending"], color="purple", bins=20)
plt.title("Number of People RSVPed for Meetup Event")
plt.ylabel("Number of Meetup Events")
plt.xlabel("Number of RSVPs")
# Vertical line marking the mean RSVP count.
plt.axvline(statistics.mean(firstpage["num_attending"]))

In [None]:
firstpage.head(10)

In [None]:
firstpage["Women"] = 0

In [None]:
def word_woman(string):
    if "Women" in string:
        return firstpage["Women"] == 1
    else:
        return firstpage["Women"] == 0

In [None]:
firstpage["Women"] = firstpage["meetup_title"].apply(word_woman)

In [None]:
def word_denver(string):
    if "Denver" in string:
        return firstpage["Denver"] == 1
    else:
        return firstpage["Denver"] == 0
    
def word_code(string):
    if "Code" in string:
        return firstpage["Code"] == 1
    else:
        return firstpage["Code"] == 0

def word_hh(string):
    if "Happy Hour" in string:
        return firstpage["Happy_Hour"] == 1
    else:
        return firstpage["Happy_Hour"] == 0
    
def word_meditation(string):
    if "Meditation" in string:
        return firstpage["Meditation"] == 1
    else:
        return firstpage["Meditation"] == 0
    
def word_networking(string):
    if "Networking" in string:
        return firstpage["Networking"] == 1
    else:
        return firstpage["Networking"] == 0

In [None]:
firstpage["Denver"] = 0
firstpage["Denver"] = firstpage["meetup_title"].apply(word_denver)

firstpage["Code"] = 0
firstpage["Code"] = firstpage["meetup_title"].apply(word_code)

firstpage["Happy_Hour"] = 0
firstpage["Happy_Hour"] = firstpage["meetup_title"].apply(word_hh)

firstpage["Meditation"] = 0
firstpage["Meditation"] = firstpage["meetup_title"].apply(word_meditation)

firstpage["Networking"] = 0
firstpage["Networking"] = firstpage["meetup_title"].apply(word_networking)

In [None]:
firstpage.head(10)

In [None]:
q = {False: 1, True: 0}

words = ["Women", "Denver", "Code", "Happy_Hour", "Meditation", "Networking"]

for word in words:
    firstpage[word] = firstpage[word].map(q)

In [None]:
firstpage.head()

In [None]:
# Bar chart with one bar per word-flag column; the bar height is the column
# sum, i.e. the number of titles flagged for that word.
plt.figure(figsize=(10,10))
# Columns from position 4 onward are taken to be the flag columns —
# NOTE(review): this relies on the column order; confirm.
for col in firstpage.columns[4:]:
    plt.bar(x=col, height=sum(firstpage[col]))
plt.title("Most Frequent Words In Event Title", size=15)
plt.ylabel("Frequency in Event Titles", size=13)

In [None]:
def hot_words(df):
    """Count how many of the six hot-word flags equal 1 in ``df``.

    ``df`` is a single row (anything indexable by column name) exposing the
    "Women", "Denver", "Code", "Happy_Hour", "Meditation" and "Networking"
    entries. Returns the number of those entries equal to 1.
    """
    flag_columns = (
        "Women",
        "Denver",
        "Code",
        "Happy_Hour",
        "Meditation",
        "Networking",
    )
    return sum(1 for column in flag_columns if df[column] == 1)

In [None]:
# Row-wise (axis=1): store the number of hot-word flags set for each event.
firstpage["Hot_Words"] = firstpage.apply(hot_words, axis=1)

In [None]:
firstpage.head()

In [None]:
firstpage.describe()

In [None]:
# Average attendance for events whose titles contain exactly 0, 1, 2 or 3
# hot words, in that order.
averages_by_num_of_hot_words = []

for n in [0, 1, 2, 3]:
    attendance = firstpage[firstpage["Hot_Words"] == n]["num_attending"]
    # An empty group has no mean: report 0.0 for it rather than padding the
    # denominator, which would skew the averages of the non-empty groups.
    avg = sum(attendance) / len(attendance) if len(attendance) else 0.0
    averages_by_num_of_hot_words.append(avg)

In [None]:
averages_by_num_of_hot_words

In [None]:
# Bar chart of the averages computed above: one bar per hot-word count (0-3).
plt.figure(figsize=(10,8))
plt.bar(x=[0, 1, 2, 3], height=averages_by_num_of_hot_words)
plt.xticks([0, 1, 2, 3])
plt.title("Average Attendance By Number of Hot Words", size=20)
plt.xlabel("Number of Hot Words", size=15)
plt.ylabel("Average Attendance", size=15)

In [None]:
# n is the exclusive end position of the word-flag columns; later cells
# select them with firstpage.columns[4:n].
# NOTE(review): positions 4-9 are assumed to be the six flag columns — confirm column order.
n = 10
# The column slice is taken once, before the loop starts, so deleting columns
# inside the loop does not disturb the iteration.
for col in firstpage.columns[4:10]:

    # Drop any flag column that is zero for every event, and shrink n to match.
    if sum(firstpage[col]) == 0:
        del firstpage[col]
        n -=1

In [None]:
n

In [None]:
# For every remaining word-flag column, collect the mean attendance of the
# events that have the word and of the events that do not.
with_hot_word_attendance = []
without_hot_word_attendance = []

for col in firstpage.columns[4:n]:
    present = firstpage[col] == 1
    absent = firstpage[col] == 0
    with_hot_word_attendance.append(
        statistics.mean(firstpage.loc[present, "num_attending"])
    )
    without_hot_word_attendance.append(
        statistics.mean(firstpage.loc[absent, "num_attending"])
    )

In [None]:
firstpage.head()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Grouped bar chart: for each remaining hot word, average attendance with the
# word (green) next to average attendance without it (yellow).
fig, ax = plt.subplots(figsize=(15,10))

# One group per remaining flag column (positions 4 .. n-1).
ind = np.arange(n-4)    # the x locations for the groups
width = 0.35         # the width of the bars
p1 = ax.bar(ind, with_hot_word_attendance, width, color='green')


# Shift the second series right by one bar width so the pairs sit side by side.
p2 = ax.bar(ind + width, without_hot_word_attendance, width, color='yellow')

ax.set_title('Average Attendance By Presence/Absence of Hot Words')
# Centre each tick between the two bars of its group.
ax.set_xticks(ind + width / 2)
ax.set_xticklabels((firstpage.columns[4:n]), fontsize=15)
ax.legend((p1[0], p2[0]), ('With Hot Word', 'Without Hot Word'), fontsize=20)
plt.show()

# Example 2: Middle East Forum

In [None]:
# Importing in each cell because of the kernel restarts.
import scrapy
from scrapy.crawler import CrawlerProcess


class MEFSpider(scrapy.Spider):
    """Spider that saves the Middle East Forum home page to disk."""

    # The name tells spiders apart when more than one spider of this class
    # is running simultaneously.
    name = "MEF"

    # Only the home page is requested.
    start_urls = ['https://www.meforum.org']

    def parse(self, response):
        """Write the raw response body to mainpage.html."""
        with open('mainpage.html', 'wb') as html_file:
            html_file.write(response.body)


# Instantiate our crawler (no settings are passed).
process = CrawlerProcess()

# Start the crawler with our spider.
process.crawl(MEFSpider)
process.start()

## Step 2
### Note:
You should now have a file called 'mainpage.html' saved to your machine that contains all the code from the page at https://www.meforum.org.

In [None]:
# Importing in each cell because of the kernel restarts.
import scrapy
from scrapy.crawler import CrawlerProcess


class MEFSpider(scrapy.Spider):
    """Spider that extracts article data from the Middle East Forum home page."""

    # The name tells spiders apart when more than one spider of this class
    # is running simultaneously.
    name = "MEF"

    start_urls = ['https://www.meforum.org/']

    def parse(self, response):
        """Yield one dict (name, date, text, tags) per <article> element."""
        for post in response.xpath('//article'):
            # name and date take the first match only; text and tags keep
            # every match. Swap in other XPath expressions to pull other
            # information from the site.
            title = post.xpath('header/h2/a/@title').extract_first()
            published = post.xpath('header/section/span[@class="entry-date"]/text()').extract_first()
            paragraphs = post.xpath('section[@class="entry-content"]/p/text()').extract()
            tag_names = post.xpath('*/span[@class="tag-links"]/a/text()').extract()

            yield {
                'name': title,
                'date': published,
                'text': paragraphs,
                'tags': tag_names,
            }

# Crawler settings: export scraped items as JSON into firstpage.json and
# keep logging turned off for now.
process = CrawlerProcess(dict(
    FEED_FORMAT='json',
    FEED_URI='firstpage.json',
    LOG_ENABLED=False,
))

# Queue the spider, then run the crawl.
process.crawl(MEFSpider)
process.start()
print('Success!')


In [None]:
import scrapy
import scrapy.crawler as crawler
from multiprocessing import Process, Queue
from twisted.internet import reactor

# Spider that prints the text of every matched "quote" block on the start page.
class MEFSpider(scrapy.Spider):
    name = "MEF"
    start_urls = ['https://www.meforum.org/']

    def parse(self, response):
        # NOTE(review): the div.quote / span.text selectors look like they
        # target a quotes-style page — confirm they match anything on
        # meforum.org.
        for quote in response.css('div.quote'):
            # Print the first text node of span.text inside this block.
            print(quote.css('span.text::text').extract_first())


# the wrapper to make it run more times
def run_spider(spider):
    """Run ``spider`` in a separate process and wait for it to finish.

    The crawl runs inside a child process (presumably so every call gets a
    fresh Twisted reactor — TODO confirm). Any exception caught in the child
    is sent back through a queue and re-raised in the caller.

    Args:
        spider: Passed straight to ``CrawlerRunner.crawl``.

    Raises:
        Exception: Whatever exception the child process put on the queue.
    """
    def f(q):
        # Runs in the child process: start the crawl, stop the reactor once
        # the crawl's deferred fires, then report the outcome through q.
        try:
            runner = crawler.CrawlerRunner()
            deferred = runner.crawl(spider)
            # addBoth: stop the reactor on success and on failure alike.
            deferred.addBoth(lambda _: reactor.stop())
            reactor.run()
            # None tells the parent that no exception was caught.
            q.put(None)
        except Exception as e:
            q.put(e)

    q = Queue()
    p = Process(target=f, args=(q,))
    p.start()
    # Blocks until the child reports.
    # NOTE(review): if the child exits without putting anything on the queue,
    # this call never returns — confirm that is acceptable.
    result = q.get()
    p.join()

    if result is not None:
        raise result

In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess


class MEFSpider(scrapy.Spider):
    """Spider that extracts article data from the Middle East Forum home page."""

    # The name tells spiders apart when more than one spider of this class
    # is running simultaneously.
    name = "MEF"

    start_urls = ['https://www.meforum.org/']

    def parse(self, response):
        """Yield one dict (name, date, text, tags) per <article> element."""
        for post in response.xpath('//article'):
            # name and date take the first match only; text and tags keep
            # every match. Swap in other XPath expressions to pull other
            # information from the site.
            title = post.xpath('header/h2/a/@title').extract_first()
            published = post.xpath('header/section/span[@class="entry-date"]/text()').extract_first()
            paragraphs = post.xpath('section[@class="entry-content"]/p/text()').extract()
            tag_names = post.xpath('*/span[@class="tag-links"]/a/text()').extract()

            yield {
                'name': title,
                'date': published,
                'text': paragraphs,
                'tags': tag_names,
            }

# Crawler settings: export scraped items as JSON into firstpage.json and
# keep logging turned off for now.
process = CrawlerProcess(dict(
    FEED_FORMAT='json',
    FEED_URI='firstpage.json',
    LOG_ENABLED=False,
))

# Queue the spider, then run the crawl.
process.crawl(MEFSpider)
process.start()
print('Success!')


In [None]:
import pandas as pd

# Checking whether we got data from all 9 pages
# NOTE(review): the crawler settings visible in this file export to
# 'firstpage.json', and nothing shown here writes 'data.json' — confirm this
# is the intended input file.
MEFdf=pd.read_json('data.json', orient='records')
# Print (rows, columns) and then the first rows of the loaded frame.
print(MEFdf.shape)
print(MEFdf.head())