# Load libraries and extensions

In [3]:
%load_ext rpy2.ipython

In [1]:
import pandas as pd
import re

# Scrape Medium articles

## Articles published 2009-2011

Done - May 11

In [2]:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.log import configure_logging
import logging

class Article(scrapy.Item):
    nameOfAuthor = scrapy.Field()
    linkOfAuthorProfile = scrapy.Field()
    NumOfComments = scrapy.Field()
    article = scrapy.Field()
    postingTime = scrapy.Field()
    NumOfClaps = scrapy.Field()
    articleURL = scrapy.Field()
    articleTags = scrapy.Field()
    readingTime = scrapy.Field()


logger = logging.getLogger('testlogger')


class IntroSpider(scrapy.Spider):
    name = "intro_spider"  # Name of the scraper

    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='/Users/nancy/PycharmProjects/medium-ds-articles/data/raw/medium_scrapy_2009-2011_log.txt',
        format='%(levelname)s: %(message)s',
        level=logging.INFO
    )

    def start_requests(self):
        urls = []

        for year in range(2009, 2012):
            urls.append(f"https://medium.com/tag/data-science/archive/{year}")

        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    custom_settings = {
        'FEED_FORMAT': 'csv',
        'FEED_URI': '/Users/nancy/PycharmProjects/medium-ds-articles/data/raw/medium_scrapy_2009-2011.csv',
        'AUTOTHROTTLE_ENABLED' : True,
        'AUTOTHROTTLE_START_DELAY' : 1,
        'AUTOTHROTTLE_MAX_DELAY' : 3
    }

    def parse(self, response):
        item = Article()

        for story in response.css('div.postArticle'):
            if story.css('div.postArticle-readMore a::attr(href)').extract_first() is not None:
                url = story.css('div.postArticle-readMore a::attr(href)').extract_first()
                yield scrapy.Request(url=url, callback=self.parse_tags, meta={'item': item})

    def parse_tags(self, response):

        item = response.meta['item']
        item['articleURL'] = response.request.url
        # item['article'] = response.css('div.postArticle-content section div.section-content div h1::text, \
        #                                 div.postArticle-content section div.section-content div h4::text, \
        #                                 div.postArticle-content section div.section-content div h1 a::text, \
        #                                 div.postArticle-content section div.section-content div h1 strong::text,\
        #                                 div.postArticle-content section div.section-content div h1 em::text, \
        #                                 div.postArticle-content section div.section-content div p::text, \
        #                                 div.postArticle-content section div.section-content div h1::text').extract_first()


        item['article'] = response.xpath('//div/main/article/div/section/div[2]/div/h1//text() | //div/main/article/div/section/div[2]/div/h4//text() | //div/main/article/div/section/div[2]/div/h1/a//text() | //div/main/article/div/section/div[2]/div/h1/strong//text() | //div/main/article/div/section/div[2]/div/h1/em//text() | //div/main/article/div/section/div[2]/div/p//text()').extract_first()




        try:
            item['linkOfAuthorProfile'] = response.css('div.u-paddingBottom3 a').attrib['href']
        except KeyError:
            item['linkOfAuthorProfile'] = ' '

        try:
            item['readingTime'] = response.css('span.readingTime').attrib['title']
        except KeyError:
            item['readingTime'] = ' '


        item['nameOfAuthor'] = response.css('div.u-paddingBottom3 a::text').extract_first()
        item['postingTime'] = response.css('time::text').extract_first()
        item['articleTags'] = response.css('div.u-paddingBottom10 ul.tags--postTags li a::text').getall()
        item['NumOfComments'] = response.css(
            'div.buttonSet.u-flex0 button.button.button--chromeless.u-baseColor--buttonNormal.u-marginRight12::text').extract_first()
        item['NumOfClaps'] = response.xpath(
            '//div/main/article/footer/div[1]/div[3]/div/div[1]/div/span/button//text()').extract_first()


        yield item


process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

process.crawl(IntroSpider)
process.start()


2019-05-11 22:14:17 [scrapy.utils.log] INFO: Scrapy 1.6.0 started (bot: scrapybot)
2019-05-11 22:14:17 [scrapy.utils.log] INFO: Versions: lxml 4.3.3.0, libxml2 2.9.9, cssselect 1.0.3, parsel 1.5.1, w3lib 1.20.0, Twisted 19.2.0, Python 3.7.1 (default, Dec 14 2018, 13:28:58) - [Clang 4.0.1 (tags/RELEASE_401/final)], pyOpenSSL 18.0.0 (OpenSSL 1.1.1b  26 Feb 2019), cryptography 2.4.2, Platform Darwin-18.2.0-x86_64-i386-64bit
2019-05-11 22:14:17 [scrapy.crawler] INFO: Overridden settings: {'AUTOTHROTTLE_ENABLED': True, 'AUTOTHROTTLE_MAX_DELAY': 3, 'AUTOTHROTTLE_START_DELAY': 1, 'FEED_FORMAT': 'csv', 'FEED_URI': '/Users/nancy/PycharmProjects/medium-ds-articles/data/raw/medium_scrapy_2009-2011.csv', 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
2019-05-11 22:14:17 [scrapy.extensions.telnet] INFO: Telnet Password: 0126fa10063e0868
2019-05-11 22:14:17 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetCo

## Articles published 2012-2013

Done May 11

In [1]:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.log import configure_logging
import logging

class Article(scrapy.Item):
    nameOfAuthor = scrapy.Field()
    linkOfAuthorProfile = scrapy.Field()
    NumOfComments = scrapy.Field()
    article = scrapy.Field()
    postingTime = scrapy.Field()
    NumOfClaps = scrapy.Field()
    articleURL = scrapy.Field()
    articleTags = scrapy.Field()
    readingTime = scrapy.Field()


logger = logging.getLogger('testlogger')


class IntroSpider(scrapy.Spider):
    name = "intro_spider"  # Name of the scraper

    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='/Users/nancy/PycharmProjects/medium-ds-articles/data/raw/medium_scrapy_2012-2013_log.txt',
        format='%(levelname)s: %(message)s',
        level=logging.INFO
    )

    def start_requests(self):
        urls = []

        for year in range(2012, 2014):
            for month in range(1, 13):
                urls.append(f"https://medium.com/tag/data-science/archive/{year}/{month:02}")

        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    custom_settings = {
        'FEED_FORMAT': 'csv',
        'FEED_URI': '/Users/nancy/PycharmProjects/medium-ds-articles/data/raw/medium_scrapy_2012-2013.csv',
        'AUTOTHROTTLE_ENABLED' : True,
        'AUTOTHROTTLE_START_DELAY' : 1,
        'AUTOTHROTTLE_MAX_DELAY' : 3
    }

    def parse(self, response):
        item = Article()

        for story in response.css('div.postArticle'):
            if story.css('div.postArticle-readMore a::attr(href)').extract_first() is not None:
                url = story.css('div.postArticle-readMore a::attr(href)').extract_first()
                yield scrapy.Request(url=url, callback=self.parse_tags, meta={'item': item})

    def parse_tags(self, response):

        item = response.meta['item']
        item['articleURL'] = response.request.url
        # item['article'] = response.css('div.postArticle-content section div.section-content div h1::text, \
        #                                 div.postArticle-content section div.section-content div h4::text, \
        #                                 div.postArticle-content section div.section-content div h1 a::text, \
        #                                 div.postArticle-content section div.section-content div h1 strong::text,\
        #                                 div.postArticle-content section div.section-content div h1 em::text, \
        #                                 div.postArticle-content section div.section-content div p::text, \
        #                                 div.postArticle-content section div.section-content div h1::text').extract_first()


        item['article'] = response.xpath('//div/main/article/div/section/div[2]/div/h1//text() | //div/main/article/div/section/div[2]/div/h4//text() | //div/main/article/div/section/div[2]/div/h1/a//text() | //div/main/article/div/section/div[2]/div/h1/strong//text() | //div/main/article/div/section/div[2]/div/h1/em//text() | //div/main/article/div/section/div[2]/div/p//text()').extract_first()




        try:
            item['linkOfAuthorProfile'] = response.css('div.u-paddingBottom3 a').attrib['href']
        except KeyError:
            item['linkOfAuthorProfile'] = ' '

        try:
            item['readingTime'] = response.css('span.readingTime').attrib['title']
        except KeyError:
            item['readingTime'] = ' '


        item['nameOfAuthor'] = response.css('div.u-paddingBottom3 a::text').extract_first()
        item['postingTime'] = response.css('time::text').extract_first()
        item['articleTags'] = response.css('div.u-paddingBottom10 ul.tags--postTags li a::text').getall()
        item['NumOfComments'] = response.css(
            'div.buttonSet.u-flex0 button.button.button--chromeless.u-baseColor--buttonNormal.u-marginRight12::text').extract_first()
        item['NumOfClaps'] = response.xpath(
            '//div/main/article/footer/div[1]/div[3]/div/div[1]/div/span/button//text()').extract_first()


        yield item


process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

process.crawl(IntroSpider)
process.start()


2019-05-11 22:19:19 [scrapy.utils.log] INFO: Scrapy 1.6.0 started (bot: scrapybot)
2019-05-11 22:19:19 [scrapy.utils.log] INFO: Versions: lxml 4.3.3.0, libxml2 2.9.9, cssselect 1.0.3, parsel 1.5.1, w3lib 1.20.0, Twisted 19.2.0, Python 3.7.1 (default, Dec 14 2018, 13:28:58) - [Clang 4.0.1 (tags/RELEASE_401/final)], pyOpenSSL 18.0.0 (OpenSSL 1.1.1b  26 Feb 2019), cryptography 2.4.2, Platform Darwin-18.2.0-x86_64-i386-64bit
2019-05-11 22:19:19 [scrapy.crawler] INFO: Overridden settings: {'AUTOTHROTTLE_ENABLED': True, 'AUTOTHROTTLE_MAX_DELAY': 3, 'AUTOTHROTTLE_START_DELAY': 1, 'FEED_FORMAT': 'csv', 'FEED_URI': '/Users/nancy/PycharmProjects/medium-ds-articles/data/raw/medium_scrapy_2012-2013.csv', 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
2019-05-11 22:19:19 [scrapy.extensions.telnet] INFO: Telnet Password: 1147e88e07765b15
2019-05-11 22:19:20 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetCo

## Articles published 2014-2015

In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.log import configure_logging
import logging

class Article(scrapy.Item):
    nameOfAuthor = scrapy.Field()
    linkOfAuthorProfile = scrapy.Field()
    NumOfComments = scrapy.Field()
    article = scrapy.Field()
    postingTime = scrapy.Field()
    NumOfClaps = scrapy.Field()
    articleURL = scrapy.Field()
    articleTags = scrapy.Field()
    readingTime = scrapy.Field()


logger = logging.getLogger('testlogger')


class IntroSpider(scrapy.Spider):
    name = "intro_spider"  # Name of the scraper

    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='/Users/nancy/PycharmProjects/medium-ds-articles/data/raw/medium_scrapy_2014-2015_log.txt',
        format='%(levelname)s: %(message)s',
        level=logging.INFO
    )

    def start_requests(self):
        urls = []

        for year in range(2014, 2016):
            for month in range(1, 13):
                for day in range(1, 32):
                    urls.append(f"https://medium.com/tag/data-science/archive/{year}/{month:02}/{day:02}")

        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    custom_settings = {
        'FEED_FORMAT': 'csv',
        'FEED_URI': '/Users/nancy/PycharmProjects/medium-ds-articles/data/raw/medium_scrapy_2014-2015.csv',
        'AUTOTHROTTLE_ENABLED' : True,
        'AUTOTHROTTLE_START_DELAY' : 1,
        'AUTOTHROTTLE_MAX_DELAY' : 3
    }

    def parse(self, response):
        item = Article()

        for story in response.css('div.postArticle'):
            if story.css('div.postArticle-readMore a::attr(href)').extract_first() is not None:
                url = story.css('div.postArticle-readMore a::attr(href)').extract_first()
                yield scrapy.Request(url=url, callback=self.parse_tags, meta={'item': item})

    def parse_tags(self, response):

        item = response.meta['item']
        item['articleURL'] = response.request.url
        # item['article'] = response.css('div.postArticle-content section div.section-content div h1::text, \
        #                                 div.postArticle-content section div.section-content div h4::text, \
        #                                 div.postArticle-content section div.section-content div h1 a::text, \
        #                                 div.postArticle-content section div.section-content div h1 strong::text,\
        #                                 div.postArticle-content section div.section-content div h1 em::text, \
        #                                 div.postArticle-content section div.section-content div p::text, \
        #                                 div.postArticle-content section div.section-content div h1::text').extract_first()


        item['article'] = response.xpath('//div/main/article/div/section/div[2]/div/h1//text() | //div/main/article/div/section/div[2]/div/h4//text() | //div/main/article/div/section/div[2]/div/h1/a//text() | //div/main/article/div/section/div[2]/div/h1/strong//text() | //div/main/article/div/section/div[2]/div/h1/em//text() | //div/main/article/div/section/div[2]/div/p//text()').extract_first()




        try:
            item['linkOfAuthorProfile'] = response.css('div.u-paddingBottom3 a').attrib['href']
        except KeyError:
            item['linkOfAuthorProfile'] = ' '

        try:
            item['readingTime'] = response.css('span.readingTime').attrib['title']
        except KeyError:
            item['readingTime'] = ' '


        item['nameOfAuthor'] = response.css('div.u-paddingBottom3 a::text').extract_first()
        item['postingTime'] = response.css('time::text').extract_first()
        item['articleTags'] = response.css('div.u-paddingBottom10 ul.tags--postTags li a::text').getall()
        item['NumOfComments'] = response.css(
            'div.buttonSet.u-flex0 button.button.button--chromeless.u-baseColor--buttonNormal.u-marginRight12::text').extract_first()
        item['NumOfClaps'] = response.xpath(
            '//div/main/article/footer/div[1]/div[3]/div/div[1]/div/span/button//text()').extract_first()


        yield item


process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

process.crawl(IntroSpider)
process.start()


2019-05-11 22:26:20 [scrapy.utils.log] INFO: Scrapy 1.6.0 started (bot: scrapybot)
2019-05-11 22:26:20 [scrapy.utils.log] INFO: Versions: lxml 4.3.3.0, libxml2 2.9.9, cssselect 1.0.3, parsel 1.5.1, w3lib 1.20.0, Twisted 19.2.0, Python 3.7.1 (default, Dec 14 2018, 13:28:58) - [Clang 4.0.1 (tags/RELEASE_401/final)], pyOpenSSL 18.0.0 (OpenSSL 1.1.1b  26 Feb 2019), cryptography 2.4.2, Platform Darwin-18.2.0-x86_64-i386-64bit
2019-05-11 22:26:20 [scrapy.crawler] INFO: Overridden settings: {'AUTOTHROTTLE_ENABLED': True, 'AUTOTHROTTLE_MAX_DELAY': 3, 'AUTOTHROTTLE_START_DELAY': 1, 'FEED_FORMAT': 'csv', 'FEED_URI': '/Users/nancy/PycharmProjects/medium-ds-articles/data/raw/medium_scrapy_2014-on.csv', 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
2019-05-11 22:26:20 [scrapy.extensions.telnet] INFO: Telnet Password: 63114da82e581390
2019-05-11 22:26:20 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetCons

## Articles published 2016-2017

In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.log import configure_logging
import logging

class Article(scrapy.Item):
    nameOfAuthor = scrapy.Field()
    linkOfAuthorProfile = scrapy.Field()
    NumOfComments = scrapy.Field()
    article = scrapy.Field()
    postingTime = scrapy.Field()
    NumOfClaps = scrapy.Field()
    articleURL = scrapy.Field()
    articleTags = scrapy.Field()
    readingTime = scrapy.Field()


logger = logging.getLogger('testlogger')


class IntroSpider(scrapy.Spider):
    name = "intro_spider"  # Name of the scraper

    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='/Users/nancy/PycharmProjects/medium-ds-articles/data/raw/medium_scrapy_2016-2017_log.txt',
        format='%(levelname)s: %(message)s',
        level=logging.INFO
    )

    def start_requests(self):
        urls = []

        for year in range(2016, 2018):
            for month in range(1, 13):
                for day in range(1, 32):
                    urls.append(f"https://medium.com/tag/data-science/archive/{year}/{month:02}/{day:02}")

        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    custom_settings = {
        'FEED_FORMAT': 'csv',
        'FEED_URI': '/Users/nancy/PycharmProjects/medium-ds-articles/data/raw/medium_scrapy_2016-2017.csv',
        'AUTOTHROTTLE_ENABLED' : True,
        'AUTOTHROTTLE_START_DELAY' : 1,
        'AUTOTHROTTLE_MAX_DELAY' : 3
    }

    def parse(self, response):
        item = Article()

        for story in response.css('div.postArticle'):
            if story.css('div.postArticle-readMore a::attr(href)').extract_first() is not None:
                url = story.css('div.postArticle-readMore a::attr(href)').extract_first()
                yield scrapy.Request(url=url, callback=self.parse_tags, meta={'item': item})

    def parse_tags(self, response):

        item = response.meta['item']
        item['articleURL'] = response.request.url
#         item['article'] = response.css('div.postArticle-content section div.section-content div h1::text, \
#                                         div.postArticle-content section div.section-content div h4::text, \
#                                         div.postArticle-content section div.section-content div h1 a::text, \
#                                         div.postArticle-content section div.section-content div h1 strong::text,\
#                                         div.postArticle-content section div.section-content div h1 em::text, \
#                                         div.postArticle-content section div.section-content div p::text, \
#                                         div.postArticle-content section div.section-content div h1::text').extract_first()


        item['article'] = response.xpath('//div/main/article/div/section/div[2]/div/h1//text() | //div/main/article/div/section/div[2]/div/h4//text() | //div/main/article/div/section/div[2]/div/h1/a//text() | //div/main/article/div/section/div[2]/div/h1/strong//text() | //div/main/article/div/section/div[2]/div/h1/em//text() | //div/main/article/div/section/div[2]/div/p//text()').extract_first()




        try:
            item['linkOfAuthorProfile'] = response.css('div.u-paddingBottom3 a').attrib['href']
        except KeyError:
            item['linkOfAuthorProfile'] = ' '

        try:
            item['readingTime'] = response.css('span.readingTime').attrib['title']
        except KeyError:
            item['readingTime'] = ' '


        item['nameOfAuthor'] = response.css('div.u-paddingBottom3 a::text').extract_first()
        item['postingTime'] = response.css('time::text').extract_first()
        item['articleTags'] = response.css('div.u-paddingBottom10 ul.tags--postTags li a::text').getall()
        item['NumOfComments'] = response.css(
            'div.buttonSet.u-flex0 button.button.button--chromeless.u-baseColor--buttonNormal.u-marginRight12::text').extract_first()
        item['NumOfClaps'] = response.xpath(
            '//div/main/article/footer/div[1]/div[3]/div/div[1]/div/span/button//text()').extract_first()


        yield item


process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

process.crawl(IntroSpider)
process.start()


## Articles published 2018-onwards

In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.log import configure_logging
import logging

class Article(scrapy.Item):
    nameOfAuthor = scrapy.Field()
    linkOfAuthorProfile = scrapy.Field()
    NumOfComments = scrapy.Field()
    article = scrapy.Field()
    postingTime = scrapy.Field()
    NumOfClaps = scrapy.Field()
    articleURL = scrapy.Field()
    articleTags = scrapy.Field()
    readingTime = scrapy.Field()


logger = logging.getLogger('testlogger')


class IntroSpider(scrapy.Spider):
    name = "intro_spider"  # Name of the scraper

    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='/Users/nancy/PycharmProjects/medium-ds-articles/data/raw/medium_scrapy_2014-on_log.txt',
        format='%(levelname)s: %(message)s',
        level=logging.INFO
    )

    def start_requests(self):
        urls = []

        for year in range(2018, 2020):
            for month in range(1, 13):
                for day in range(1, 32):
                    urls.append(f"https://medium.com/tag/data-science/archive/{year}/{month:02}/{day:02}")

        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    custom_settings = {
        'FEED_FORMAT': 'csv',
        'FEED_URI': '/Users/nancy/PycharmProjects/medium-ds-articles/data/raw/medium_scrapy_2014-on.csv',
        'AUTOTHROTTLE_ENABLED' : True,
        'AUTOTHROTTLE_START_DELAY' : 1,
        'AUTOTHROTTLE_MAX_DELAY' : 3
    }

    def parse(self, response):
        item = Article()

        for story in response.css('div.postArticle'):
            if story.css('div.postArticle-readMore a::attr(href)').extract_first() is not None:
                url = story.css('div.postArticle-readMore a::attr(href)').extract_first()
                yield scrapy.Request(url=url, callback=self.parse_tags, meta={'item': item})

    def parse_tags(self, response):

        item = response.meta['item']
        item['articleURL'] = response.request.url
        # item['article'] = response.css('div.postArticle-content section div.section-content div h1::text, \
        #                                 div.postArticle-content section div.section-content div h4::text, \
        #                                 div.postArticle-content section div.section-content div h1 a::text, \
        #                                 div.postArticle-content section div.section-content div h1 strong::text,\
        #                                 div.postArticle-content section div.section-content div h1 em::text, \
        #                                 div.postArticle-content section div.section-content div p::text, \
        #                                 div.postArticle-content section div.section-content div h1::text').extract_first()


        item['article'] = response.xpath('//div/main/article/div/section/div[2]/div/h1//text() | //div/main/article/div/section/div[2]/div/h4//text() | //div/main/article/div/section/div[2]/div/h1/a//text() | //div/main/article/div/section/div[2]/div/h1/strong//text() | //div/main/article/div/section/div[2]/div/h1/em//text() | //div/main/article/div/section/div[2]/div/p//text()').extract_first()




        try:
            item['linkOfAuthorProfile'] = response.css('div.u-paddingBottom3 a').attrib['href']
        except KeyError:
            item['linkOfAuthorProfile'] = ' '

        try:
            item['readingTime'] = response.css('span.readingTime').attrib['title']
        except KeyError:
            item['readingTime'] = ' '


        item['nameOfAuthor'] = response.css('div.u-paddingBottom3 a::text').extract_first()
        item['postingTime'] = response.css('time::text').extract_first()
        item['articleTags'] = response.css('div.u-paddingBottom10 ul.tags--postTags li a::text').getall()
        item['NumOfComments'] = response.css(
            'div.buttonSet.u-flex0 button.button.button--chromeless.u-baseColor--buttonNormal.u-marginRight12::text').extract_first()
        item['NumOfClaps'] = response.xpath(
            '//div/main/article/footer/div[1]/div[3]/div/div[1]/div/span/button//text()').extract_first()


        yield item


process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

process.crawl(IntroSpider)
process.start()


# Scrape story cards pages for validation

## Articles published 2009-2011

In [1]:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.log import configure_logging
import logging

class Article(scrapy.Item):
    article = scrapy.Field()
    articleURL = scrapy.Field()

logger = logging.getLogger('testlogger')


class IntroSpider(scrapy.Spider):
    name = "intro_spider"  # Name of the scraper

    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='/Users/nancy/PycharmProjects/medium-ds-articles/data/raw/medium_scrapy_titles_2009-2011_log.txt',
        format='%(levelname)s: %(message)s',
        level=logging.INFO
    )

    def start_requests(self):
        urls = []

        for year in range(2009, 2012):
            urls.append(f"https://medium.com/tag/data-science/archive/{year}")

        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    custom_settings = {
        'FEED_FORMAT': 'csv',
        'FEED_URI': '/Users/nancy/PycharmProjects/medium-ds-articles/data/raw/medium_scrapy_titles_2009-2011.csv',
        'AUTOTHROTTLE_ENABLED' : True,
        'AUTOTHROTTLE_START_DELAY' : 1,
        'AUTOTHROTTLE_MAX_DELAY' : 3
    }

    def parse(self, response):
        for story in response.css('div.postArticle'):
                yield {
                    'article': story.css('div.postArticle-content section div.section-content div h3::text').extract_first(),
                    'articleURL': story.css('div.postArticle-readMore a::attr(href)').extract_first(),
                }
            
        

process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

process.crawl(IntroSpider)
process.start()

2019-05-13 08:33:15 [scrapy.utils.log] INFO: Scrapy 1.6.0 started (bot: scrapybot)
2019-05-13 08:33:15 [scrapy.utils.log] INFO: Versions: lxml 4.3.3.0, libxml2 2.9.9, cssselect 1.0.3, parsel 1.5.1, w3lib 1.20.0, Twisted 19.2.0, Python 3.7.1 (default, Dec 14 2018, 13:28:58) - [Clang 4.0.1 (tags/RELEASE_401/final)], pyOpenSSL 18.0.0 (OpenSSL 1.1.1b  26 Feb 2019), cryptography 2.4.2, Platform Darwin-18.2.0-x86_64-i386-64bit
2019-05-13 08:33:15 [scrapy.crawler] INFO: Overridden settings: {'AUTOTHROTTLE_ENABLED': True, 'AUTOTHROTTLE_MAX_DELAY': 3, 'AUTOTHROTTLE_START_DELAY': 1, 'FEED_FORMAT': 'csv', 'FEED_URI': '/Users/nancy/PycharmProjects/medium-ds-articles/data/raw/medium_scrapy_titles_2009-2011.csv', 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
2019-05-13 08:33:15 [scrapy.extensions.telnet] INFO: Telnet Password: 21b419b77d684ed6
2019-05-13 08:33:15 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.T

## Articles published 2012-2013

In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.log import configure_logging
import logging

class Article(scrapy.Item):
    article = scrapy.Field()
    articleURL = scrapy.Field()

logger = logging.getLogger('testlogger')


class IntroSpider(scrapy.Spider):
    name = "intro_spider"  # Name of the scraper

    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='/Users/nancy/PycharmProjects/medium-ds-articles/data/raw/medium_scrapy_titles_2012-2013_log.txt',
        format='%(levelname)s: %(message)s',
        level=logging.INFO
    )

    def start_requests(self):
        urls = []

        for year in range(2012, 2014):
            for month in range(1, 13):
                urls.append(f"https://medium.com/tag/data-science/archive/{year}/{month:02}")

        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    custom_settings = {
        'FEED_FORMAT': 'csv',
        'FEED_URI': '/Users/nancy/PycharmProjects/medium-ds-articles/data/raw/medium_scrapy_titles_2012-2013.csv',
        'AUTOTHROTTLE_ENABLED' : True,
        'AUTOTHROTTLE_START_DELAY' : 1,
        'AUTOTHROTTLE_MAX_DELAY' : 3
    }

    def parse(self, response):
        for story in response.css('div.postArticle'):
                yield {
                    'article': story.css('div.postArticle-content section div.section-content div h3::text').extract_first(),
                    'articleURL': story.css('div.postArticle-readMore a::attr(href)').extract_first(),
                }
            
        
process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

process.crawl(IntroSpider)
process.start()

## Articles published 2014-2015

In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.log import configure_logging
import logging


class Article(scrapy.Item):
    article = scrapy.Field()
    postingTime = scrapy.Field()
    articleURL = scrapy.Field()



logger = logging.getLogger('testlogger')


class IntroSpider(scrapy.Spider):
    name = "intro_spider"  # Name of the scraper

    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='/Users/nancy/PycharmProjects/medium-ds-articles/data/raw/medium_scrapy_2014-2015_log.txt',
        format='%(levelname)s: %(message)s',
        level=logging.INFO
    )

    def start_requests(self):
        urls = []

        for year in range(2014, 2016):
            for month in range(1, 13):
                for day in range(1, 32):
                    urls.append(f"https://medium.com/tag/data-science/archive/{year}/{month:02}/{day:02}")

        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    custom_settings = {
        'FEED_FORMAT': 'csv',
        'FEED_URI': '/Users/nancy/PycharmProjects/medium-ds-articles/data/raw/medium_scrapy_2014-2015.csv',
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_START_DELAY': 1,
        'AUTOTHROTTLE_MAX_DELAY': 3
    }

    def parse(self, response):
        for story in response.css('div.postArticle'):
            yield {
                'article': story.css(
                    'div.postArticle-content section div.section-content div h3::text').extract_first(),
                'articleURL': story.css('div.postArticle-readMore a::attr(href)').extract_first(),
                'postingTime':story.css('time::text').extract_first(),
            }


process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

process.crawl(IntroSpider)
process.start()

## Articles published 2016-2017

In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.log import configure_logging
import logging


class Article(scrapy.Item):
    article = scrapy.Field()
    postingTime = scrapy.Field()
    articleURL = scrapy.Field()



logger = logging.getLogger('testlogger')


class IntroSpider(scrapy.Spider):
    name = "intro_spider"  # Name of the scraper

    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='/Users/nancy/PycharmProjects/medium-ds-articles/data/raw/medium_scrapy_title_2016-2017_log.txt',
        format='%(levelname)s: %(message)s',
        level=logging.INFO
    )

    def start_requests(self):
        urls = []

        for year in range(2016, 2018):
            for month in range(1, 13):
                for day in range(1, 32):
                    urls.append(f"https://medium.com/tag/data-science/archive/{year}/{month:02}/{day:02}")

        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    custom_settings = {
        'FEED_FORMAT': 'csv',
        'FEED_URI': '/Users/nancy/PycharmProjects/medium-ds-articles/data/raw/medium_scrapy_title_2016-2017.csv',
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_START_DELAY': 1,
        'AUTOTHROTTLE_MAX_DELAY': 3
    }

    def parse(self, response):
        for story in response.css('div.postArticle'):
            yield {
                'article': story.css(
                    'div.postArticle-content section div.section-content div h3::text').extract_first(),
                'articleURL': story.css('div.postArticle-readMore a::attr(href)').extract_first(),
                'postingTime':story.css('time::text').extract_first(),
            }


process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

process.crawl(IntroSpider)
process.start()

## Articles published 2018-onwards

In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.log import configure_logging
import logging

class Article(scrapy.Item):
    nameOfAuthor = scrapy.Field()
    linkOfAuthorProfile = scrapy.Field()
    NumOfComments = scrapy.Field()
    article = scrapy.Field()
    postingTime = scrapy.Field()
    NumOfClaps = scrapy.Field()
    articleURL = scrapy.Field()
    articleTags = scrapy.Field()
    readingTime = scrapy.Field()


logger = logging.getLogger('testlogger')


class IntroSpider(scrapy.Spider):
    name = "intro_spider"  # Name of the scraper

    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='/Users/nancy/PycharmProjects/medium-ds-articles/data/raw/medium_scrapy_2014-on_log.txt',
        format='%(levelname)s: %(message)s',
        level=logging.INFO
    )

    def start_requests(self):
        urls = []

        for year in range(2018, 2020):
            for month in range(1, 13):
                for day in range(1, 32):
                    urls.append(f"https://medium.com/tag/data-science/archive/{year}/{month:02}/{day:02}")

        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    custom_settings = {
        'FEED_FORMAT': 'csv',
        'FEED_URI': '/Users/nancy/PycharmProjects/medium-ds-articles/data/raw/medium_scrapy_2014-on.csv',
        'AUTOTHROTTLE_ENABLED' : True,
        'AUTOTHROTTLE_START_DELAY' : 1,
        'AUTOTHROTTLE_MAX_DELAY' : 3
    }

    def parse(self, response):
        for story in response.css('div.postArticle'):
                yield {
                    'nameOfAuthor': story.css('div.u-marginBottom10 div div.postMetaInline-authorLockup a::text').extract_first(),
                    'linkOfAuthorProfile': story.css('div.u-marginBottom10 div div.postMetaInline-avatar a::attr(href)').extract_first(),
                    'article': story.css('div.postArticle-content section div.section-content div h3::text').extract_first(),
                    'articleLink': story.css('div.postArticle-readMore a::attr(href)').extract_first(),
                    'postingTime': story.css('div div.u-marginBottom10 div div.postMetaInline-authorLockup div a time::text').extract_first(),
                    'recommendation': story.css('div.u-paddingTop10 div.u-floatLeft div div button.u-disablePointerEvents::text').extract_first(),
                }

# Analyses

## NLP

In [177]:
import pandas as pd
full = pd.read_csv("/Users/nancy/PycharmProjects/medium-ds-articles/data/raw/medium_full_2016.csv")

full['articleLink'] = full['articleURL'].str.split('?').str[0]

full.drop('articleURL', axis=1, inplace=True)

full.drop_duplicates(subset=['article'], keep='first', inplace=True)

full.dropna(subset=['article'], inplace=True)

In [163]:
lexicon = pd.read_csv("/Users/Nancy/Downloads/NRC-VAD-Lexicon-Aug2018Release/NRC-VAD-Lexicon.txt", sep='\t')

lexicon.head()

Unnamed: 0,Word,Valence,Arousal,Dominance
0,aaaaaaah,0.479,0.606,0.291
1,aaaah,0.52,0.636,0.282
2,aardvark,0.427,0.49,0.437
3,aback,0.385,0.407,0.288
4,abacus,0.51,0.276,0.485


In [164]:
import spacy
from spacy_langdetect import LanguageDetector

nlp = spacy.load('en_core_web_sm')

nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)

test = []

for i in full['article']:
    print(i)
    doc = nlp(i)
    if doc._.language['language'] == 'en':
        test1 = []
        for token in doc:
            if token.is_stop==False and token.is_alpha:
                test1.append(token.lemma_.lower())
    test.append(test1)

Data Science and Adphorus
Data Mining, It Ain’t All That
Using Data Science to Better Predict Demand: Case from Twitter and the Flu Season
Heat Maps Track Customers In Store: A Chance to Apply Analytics for a Better Customer Experience
Data Science in Mexico
OHGB: Beating the Wheel of Fortune Bonus Round
So you want to be a data scientist?
Data Warehouse design on Amazon Redshift
Solving class imbalance on Google open images
Find and Watch TV Shows on YouTube with CouchTube.net
Lack of Analytics Skills May Be Holding Up Your Supply Chain
Data: The Next Frontier
So You Want to be a Kaggle Wizard
Insight Data Science Fellows Program launches New York session
Fellow Spotlight: Shelby Sturgis
Spending Your Analytics Time Where It Counts
Managers Need to Know Technical Details of Analytics
Predictions That Save Big Money
What about the women?
Creating, serving & storing data for discovery
Data Inference: Drinking With the Dude
Our Ongoing Analytics Project with the NFL: Winning More Games
B

In [165]:
from statistics import mean 

for title in test:
    valence = []
    arousal = []
    dominance = []
    for word in title:

        if not lexicon[lexicon['Word'] == word]['Valence'].empty:
            valence.append(lexicon.at[lexicon.index[lexicon['Word'] == word].to_list()[0], 'Valence'])
        else:
            pass
        
        if not lexicon[lexicon['Word'] == word]['Arousal'].empty:
            arousal.append(lexicon.at[lexicon.index[lexicon['Word'] == word].to_list()[0], 'Arousal'])
        else:
            pass
        
        if not lexicon[lexicon['Word'] == word]['Dominance'].empty:
            dominance.append(lexicon.at[lexicon.index[lexicon['Word'] == word].to_list()[0], 'Dominance'])
        else:
            pass
        
    if len(valence) > 0 and len(arousal) > 0 and len(dominance) > 0:
        print(title, mean(valence), mean(arousal), mean(dominance))
    else:
        pass

['data', 'science', 'adphorus'] 0.544 0.3395 0.5835
['data', 'mining', 'be'] 0.601 0.36266666666666664 0.5113333333333333
['data', 'science', 'better', 'predict', 'demand', 'case', 'twitter', 'flu', 'season'] 0.492 0.43857142857142856 0.5044285714285714
['heat', 'maps', 'track', 'customer', 'store', 'chance', 'apply', 'analytics', 'better', 'customer', 'experience'] 0.628 0.491625 0.601125
['heat', 'maps', 'track', 'customer', 'store', 'chance', 'apply', 'analytics', 'better', 'customer', 'experience'] 0.628 0.491625 0.601125
['ohgb', 'beat', 'wheel', 'fortune', 'bonus', 'round'] 0.6457999999999999 0.5484 0.5608
['want', 'data', 'scientist'] 0.5776666666666667 0.464 0.6046666666666667
['data', 'warehouse', 'design', 'amazon', 'redshift'] 0.6723333333333333 0.314 0.5013333333333333
['solve', 'class', 'imbalance', 'google', 'open', 'image'] 0.5399999999999999 0.4206 0.4992
['find', 'watch', 'tv', 'shows', 'youtube'] 0.6423333333333333 0.42700000000000005 0.4653333333333333
['lack', 'anal

In [178]:
from textblob import TextBlob
from collections import Counter

import nltk
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')

def textblob_adj(text):
    blobed = TextBlob(text)
    counts = Counter(tag for word,tag in blobed.tags)
    adj_list = []
    adv_list = []
    adj_tag_list = ['JJ','JJR','JJS']
    adv_tag_list = ['RB','RBR','RBS']
    for (a, b) in blobed.tags:
        if b in adj_tag_list:
           adj_list.append(a)
        elif b in adv_tag_list:
           adv_list.append(a)
        else:
            pass
    return adj_list, adv_list, counts['JJ']+counts['JJR']+counts['JJS'], counts['RB']+counts['RBR']+counts['RBS']

for i in full['article']:
    print(textblob_adj(i))

(['s'], ['Here'], 1, 1)
([], [], 0, 0)
([], [], 0, 0)
([], [], 0, 0)
([], [], 0, 0)
([], [], 0, 0)
([], [], 0, 0)
(['Hierarchical'], [], 1, 0)
(['Best'], [], 1, 0)
([], [], 0, 0)
([], [], 0, 0)
([], [], 0, 0)
(['real'], [], 1, 0)
(['Sought-After'], ['Most'], 1, 1)
(['ripe', 'CS-BS'], ['always', 'So', 'as', 'soon', 'enough'], 2, 5)
([], [], 0, 0)
(['all-day', 'm'], ['quite'], 2, 1)
([], [], 0, 0)
([], [], 0, 0)
([], [], 0, 0)
([], [], 0, 0)
(['EEO-1'], [], 1, 0)
([], [], 0, 0)
(['머신러닝과'], [], 1, 0)
(['much'], [], 1, 0)
([], [], 0, 0)
(['dynamical'], [], 1, 0)
([], [], 0, 0)
(['Big'], [], 1, 0)
([], [], 0, 0)
(['important', 'important', 'many', 'big', 's', 'common'], ['enough', 'now', 'even'], 6, 3)
(['re'], [], 1, 0)
([], [], 0, 0)
([], [], 0, 0)
([], [], 0, 0)
([], [], 0, 0)
(['ile'], [], 1, 0)
([], [], 0, 0)
([], ['ve'], 0, 1)
(['unique'], [], 1, 0)
(['’'], ['Plotly'], 1, 1)
(['More', 'real-life'], [], 2, 0)
([], [], 0, 0)
(['Intuitive'], [], 1, 0)
(['open', 'open'], [], 2, 0)
(['Good

In [20]:
import pandas as pd
df = pd.read_csv('/Users/nancy/PycharmProjects/medium-ds-articles/data/raw/medium_scrapy_2014-2017.csv')

In [88]:
full.shape

(4720, 9)

In [21]:
df['postingTime'] = pd.to_datetime(df['postingTime'])

In [22]:
x = df[(df['article'].str.split().str.len() > 4) & (df['article'].str.split().str.len() < 15)]

In [9]:
from gensim import corpora
import gensim


dictionary = corpora.Dictionary(test)
corpus = [dictionary.doc2bow(text) for text in test]

NUM_TOPICS = 3

ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)

ldamodel.save('model3.gensim')

topics = ldamodel.print_topics(num_words=5)

for topic in topics:
    print(topic)
    
    
from __future__ import division

import pyLDAvis

pyLDAvis.enable_notebook()

lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')

import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

2019-05-13 21:52:59 [smart_open.smart_open_lib] DEBUG: {'uri': '/Users/nancy/miniconda3/lib/python3.7/site-packages/smart_open/VERSION', 'mode': 'r', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'ignore_ext': False, 'transport_params': None}
2019-05-13 21:53:00 [gensim.summarization.textcleaner] INFO: 'pattern' package not found; tag filters are not available for English
2019-05-13 21:53:00 [gensim.corpora.dictionary] INFO: adding document #0 to Dictionary(0 unique tokens: [])
2019-05-13 21:53:00 [gensim.corpora.dictionary] INFO: built Dictionary(2032 unique tokens: ['Analytics', 'Heart', 'Prescriptive', 'Logistic', 'Measuring']...) from 1030 documents (total 4887 corpus positions)


## Association rules

In [67]:
full['articleTags'] = full['articleTags']

full['articleTags'] = full['articleTags'].replace({',Data Science':',', ',Data Science,':',', 'Data Science,':','})

In [68]:
full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1030 entries, 0 to 1124
Data columns (total 9 columns):
NumOfClaps             622 non-null object
NumOfComments          206 non-null float64
article                1030 non-null object
articleTags            1030 non-null object
linkOfAuthorProfile    1030 non-null object
nameOfAuthor           1029 non-null object
postingTime            1029 non-null object
readingTime            1030 non-null object
articleLink            1030 non-null object
dtypes: float64(1), object(8)
memory usage: 120.5+ KB


In [29]:
y = x.str.split(',', expand=True)

y = y.fillna('')

In [71]:
y.replace('Data Science', '', inplace=True)
y.replace('Datascience', '', inplace=True)

In [72]:
y

Unnamed: 0,0,1,2,3,4
0,Analytics,,Health,,
1,Java,Apache,,Deep Learning,Engineering
2,,Statistics,,,
3,,Data Security,Computer Science,,
4,Insight Data Science,,,,
6,,Data Visualization,,,
9,Big Data,,White Papers,,
10,Machine Learning,Artificial Intelligence,Kosei,Pinterest,
11,,IoT,Energy,Machine Learning,Jobs
12,Data Visualization,Big Data,Creativity,Productivity,


In [73]:
y.to_csv('test_basket.csv', index=False)

In [74]:
%%R 

library(arules)

tr <- read.transactions("test_basket.csv", format = 'basket', sep=',')

In [75]:
%%R 

tr

transactions in sparse format with
 1031 transactions (rows) and
 662 items (columns)


In [76]:
%%R

summary(tr)

transactions as itemMatrix in sparse format with
 1031 rows (elements/itemsets/transactions) and
 662 columns (items) and a density of 0.00308415 

most frequent items:
          Big Data   Machine Learning          Analytics               Data 
               318                158                 78                 70 
Data Visualization            (Other) 
                63               1418 

element (itemset/transaction) length distribution:
sizes
  0   1   2   3   4   5 
 15 201 650  57 107   1 

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.000   2.000   2.000   2.042   2.000   5.000 

includes extended item information - examples:
  labels
1      0
2      1
3      2


In [77]:
%%R

itms <- itemFrequency(tr, type = "relative")
data.frame(sort(itms, decreasing = TRUE))

                          sort.itms..decreasing...TRUE.
Big Data                                   0.3084384093
Machine Learning                           0.1532492726
Analytics                                  0.0756547042
Data                                       0.0678952473
Data Visualization                         0.0611057226
Statistics                                 0.0484966052
Startup                                    0.0368574200
Python                                     0.0320077595
Artificial Intelligence                    0.0223084384
Hadoop                                     0.0223084384
Tech                                       0.0223084384
Insight Data Science                       0.0203685742
Programming                                0.0203685742
Entrepreneurship                           0.0164888458
Marketing                                  0.0135790495
Data Analytics                             0.0126091174
Science                                    0.012

In [85]:
%%R

itemsets <- apriori(tr, parameter = list(supp=0.001, conf=0.8, target='frequent', minlen=3))

Apriori

Parameter specification:
 confidence minval smax arem  aval originalSupport maxtime support minlen
         NA    0.1    1 none FALSE            TRUE       5   0.001      3
 maxlen            target   ext
     10 frequent itemsets FALSE

Algorithmic control:
 filter tree heap memopt load sort verbose
    0.1 TRUE TRUE  FALSE TRUE    2    TRUE

Absolute minimum support count: 1 

set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[662 item(s), 1031 transaction(s)] done [0.00s].
sorting and recoding items ... [200 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 done [0.00s].
writing ... [20 set(s)] done [0.00s].
creating S4 object  ... done [0.00s].


In [86]:
%%R

quality(itemsets)$lift <- interestMeasure(itemsets, measure='lift', tr)

inspect(sort(itemsets, by ='count', decreasing = T))

     items                         support count         lift
[1]  {Software Engineering,                                  
      Wogrammer,                                             
      Women In Tech}           0.003879728     4 22145.020833
[2]  {Analytics,                                             
      Big Data,                                              
      Data Engineering}        0.003879728     4    17.141767
[3]  {Artificial Intelligence,                               
      Big Data,                                              
      Machine Learning}        0.003879728     4     3.679301
[4]  {Artificial Intelligence,                               
      Big Data,                                              
      Spark}                   0.002909796     3    72.666188
[5]  {Analytics,                                             
      Muppets,                                               
      Statistics}              0.001939864     2   272.554103
[6]  {Bi

In [15]:
%%R

rules <- rules[!is.redundant(rules)]

rules <- sort(rules, by='count', decreasing = TRUE)

inspect(rules[1:30])

R[write to console]: Error in withVisible({ : object 'rules' not found
Calls: <Anonymous> -> <Anonymous> -> withVisible

R[write to console]: In addition: 

R[write to console]: In asMethod(object) :
R[write to console]:  removing duplicated items in transactions




Error in withVisible({ : object 'rules' not found
Calls: <Anonymous> -> <Anonymous> -> withVisible


In [16]:
%%R

topRules <- rules[1:10]

library(arulesViz)

plot(topRules, method='graph', layout=igraph::in_circle())

R[write to console]: Error in withVisible({ : object 'rules' not found
Calls: <Anonymous> -> <Anonymous> -> withVisible




Error in withVisible({ : object 'rules' not found
Calls: <Anonymous> -> <Anonymous> -> withVisible
