In [1]:
import scrapy
from scrapy.crawler import CrawlerProcess
import re


"""
feature categories:
1. PLANETS/POINTS in the SIGNS (e.g. Sun in Aries)
    - there is usually 1 (but up to 2) feature/s per planet/point. e.g. sun may be close 
    to 2 different signs, so that the birth chart indicates the planet/point being in 2 
    signs
    
2. PLANETS/POINTS in the HOUSES (e.g. Moon in 1st House)
    - there are 1 or 2 features per planet/point.(Same explanation as above).

3. SIGNS on the HOUSE CUSPS (e.g. Aries on 1st House Cusp)

4. ASPECTS between two planets/points(for now the website seems to only have sun-related)
    - e.g. Sun conjunct Mercury

5. DOMINANCE/WEAKNESS OF ELEMENTS
    - e.g. Dominance of FIRE, Weakness of Water

6. DOMINANCE/WEAKNESS OF QUALITIES
    - 3 QUALITIES: Mutable, Fixed, Cardinal
    - same as for elements: the feature is based on a DOMINANCE or WEAKNESS of the QUALITY

each feature is potentially associated with a description/interpretation of the person who
bears this feature.

BIG starting question: how do I structure the corpus?
Maybe?:
FEATURE_CAT | FEATURE | DESC
===============================
int         | Str     | Str

So, we start w a given birth chart (generated by inputting birth info to the website),
then from the birth chart we extract features. then we select rows based on the features.
and compare the various descriptions for consistency/conflict.

"""
x = None
cat = 'planets-in-signs'
feature = 'sun'  
# ascendant, sun, mercury, moon, venus, mars, jupiter, saturn, uranus,
# neptune, pluto, black moon lilith, part-of-fortune.

class AstroSpider(scrapy.Spider):
    name = 'astro_spider'
    #start_urls = ['https://astrolibrary.org/interpretations/']
    #start_urls = ['https://astrolibrary.org/interpretations/category/planets-in-signs/']
    #start_urls = ['https://astrolibrary.org/interpretations/sun/']
    start_urls = [f'https://astrolibrary.org/interpretations/{feature}/']

    desc_links = []
    data = []

    # find category links from /interpretations
    def parse2(self, response):
        INTERPS_SELECTOR = '.interps-list-main'
        LINK_SELECTOR = 'a ::attr(href)'
        for interp in response.css(INTERPS_SELECTOR):
            next_page = interp.css(LINK_SELECTOR).extract_first()
            self.main_links.append(next_page)
            if next_page:
                yield scrapy.Request(
                    response.urljoin(next_page),
                    callback=self.page2
                )
                print(next_page)

    # first find links to descriptions from /category/planets-in-signs
    def parse1(self, response):
        LIST_SELECTOR = '.listnice'
        LI_SELECTOR = 'li'
        LINK_SELECTOR = 'a ::attr(href)'

        for nice in response.css(LIST_SELECTOR):
            for li in nice.css(LI_SELECTOR):
                next_page = li.css(LINK_SELECTOR).extract_first()

                if next_page:
                    # todo: exclude links that have the substring 'category'
                    # we want links that bring us to the page of descriptions on a pair
                    # of features.
                    self.desc_links.append(next_page)
                    yield scrapy.Request(
                        response.urljoin(next_page),
                        callback=self.nothing
                    )
                    print(next_page)

    def parse(self, response):
        # find <div id="ris">
        # split descriptions with <h2> tag
        # for each <h2> tag, extract feature name.
        # use each <h2> tag, extract descriptions from <p> tags, making sure to remove
        # google ads.
        global x
        x = response.xpath('//div[@id="ris"]')
        print(x)
        
        # I've gotten the all the h2 and p tags in this long string. But i need to split
        # it up.

    def parse_pof(self, response):
        # use this method if the preceding URL contained "part-of-fortune". This is due to
        # the fact that the descriptions for part-of-fortune are organized and formatted
        # differently.
        pass

    def nothing(self, response):
        pass


def main():
    process = CrawlerProcess(settings={
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'items.csv'
    })

    process.crawl(AstroSpider)
    process.start()  # the script will block here until the crawling is finished
    # print(AstroSpider.data)
    # todo: remove final links that have the substring 'category'


if __name__ == '__main__':
    main()

2019-11-18 17:50:29 [scrapy.utils.log] INFO: Scrapy 1.8.0 started (bot: scrapybot)
2019-11-18 17:50:29 [scrapy.utils.log] INFO: Versions: lxml 4.2.1.0, libxml2 2.9.8, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 19.7.0, Python 3.6.5 |Anaconda, Inc.| (default, Mar 29 2018, 13:32:41) [MSC v.1900 64 bit (AMD64)], pyOpenSSL 18.0.0 (OpenSSL 1.0.2o  27 Mar 2018), cryptography 2.2.2, Platform Windows-10-10.0.17763-SP0
2019-11-18 17:50:29 [scrapy.crawler] INFO: Overridden settings: {'FEED_FORMAT': 'csv', 'FEED_URI': 'items.csv'}
2019-11-18 17:50:29 [scrapy.extensions.telnet] INFO: Telnet Password: a17f95ca1fa00ecf
2019-11-18 17:50:30 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
2019-11-18 17:50:30 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 '

[<Selector xpath='//div[@id="ris"]' data='<div id="ris">\n<h2 id="aries" class="...'>]


In [2]:
y = x.xpath('//p | //h2')

In [3]:
y

[<Selector xpath='//p | //h2' data='<p>Below are the interpretations of t...'>,
 <Selector xpath='//p | //h2' data='<h2 id="aries" class="c1">Sun in Arie...'>,
 <Selector xpath='//p | //h2' data='<p>Aries sun natives can be inspirati...'>,
 <Selector xpath='//p | //h2' data='<p>Because Aries natives can be prett...'>,
 <Selector xpath='//p | //h2' data='<p>Aries natives are very straight-fo...'>,
 <Selector xpath='//p | //h2' data='<p>Aries natives are prone to headach...'>,
 <Selector xpath='//p | //h2' data='<p>Four things an Arian should learn ...'>,
 <Selector xpath='//p | //h2' data='<p><a href="https://astrolibrary.org/...'>,
 <Selector xpath='//p | //h2' data='<h2 id="taurus" class="c2">Sun in Tau...'>,
 <Selector xpath='//p | //h2' data='<p>Taurus natives are generally stron...'>,
 <Selector xpath='//p | //h2' data='<p>Possessions and material things ar...'>,
 <Selector xpath='//p | //h2' data='<p>Taurus people work at a slower pac...'>,
 <Selector xpath='//p | //h2' data='<p>I

In [4]:
#header_flags = [0 for i in range(len(y))]

def flag_h2(selected_arr):
    '''
    marks indices in selected_arr that contain h2 tag.
    returns a list where the element at index i is 1, if element at same index in selected_arr
    contains h2 tag.
    '''

    header_flags = [0 for i in range(len(y))]
    for i in range(len(selected_arr)):
        data = selected_arr[i].get()
        #print(data)
        tmp = re.findall('<h2+', data)
        if len(tmp) > 0:
            #print(tmp)
            header_flags[i] = 1
    return header_flags

header_flags = flag_h2(y)
print(header_flags)

[0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [5]:
def make_data_dict(header_flags, y):
    """
    Returns dict. of features and associated descriptions by parsing y with info from
    header_flags.
    
    Parameters:
    header_flags: list of flags (0/1) indicating which index has h2 header tags. See flag_h2
    function for more details.
    y:list of Scrapy Selectors.
    
    Returns: dict. of features and associated descriptions.
    """
    tmp_dict = dict()
    value = []
    key = None
    for i in range(len(y)):
        if header_flags[i] == 1 or i == len(y) - 1:
            if key is not None:
                tmp_dict[key] = value
            value = []
            key = y[i].xpath('text()').extract_first()
        else:
            desc = y[i].xpath('text()').extract_first()
            if desc is not None:
                value.append(desc)

    keys_list = list(tmp_dict.keys())
    print(keys_list)
    return tmp_dict

In [6]:
tmp_dict = make_data_dict(header_flags, y)
keys_list = list(tmp_dict.keys())
print(keys_list)

['Sun in Aries', 'Sun in Taurus', 'Sun in Gemini', 'Sun in Cancer', 'Sun in Leo', 'Sun in Virgo', 'Sun in Libra', 'Sun in Scorpio', 'Sun in Sagittarius', 'Sun in Capricorn', 'Sun in Aquarius', 'Sun in Pisces']
['Sun in Aries', 'Sun in Taurus', 'Sun in Gemini', 'Sun in Cancer', 'Sun in Leo', 'Sun in Virgo', 'Sun in Libra', 'Sun in Scorpio', 'Sun in Sagittarius', 'Sun in Capricorn', 'Sun in Aquarius', 'Sun in Pisces']


In [7]:
tmp_dict[keys_list[0]]

['Aries sun natives can be inspirational, courageous, enthusiastic, original, independent, impatient, aggressive, headstrong, selfish, self-centered, and impulsive. The Aries person’s energies are directed toward building a new individuality, thus all the Aries’ energies are directed towards themselves and what they want. An Aries likes roles where their leadership abilities are put on display. Aries natives have executive and organizing ability which is mainly directed in starting things. Sustaining projects is not their strength, but initiating projects is. The tendency to "damn the torpedoes" and ram full-speed ahead must be controlled. Aries natives are capable of great accomplishments if they learn how to constructively use their abundant energies. Aries natives are naturally enthusiastic and are always ready for activity and competition. They are inspiring to others because of these tendencies. Ideas and creative projects seem to flow from them in a never-ending stream. They are 

In [8]:
import json

with open(f'{cat}_{feature}_data.json', 'w') as fp:
    json.dump(tmp_dict, fp)