In [1]:
import scrapy
from scrapy.crawler import CrawlerProcess
import re
import json

"""
feature categories:
1. PLANETS/POINTS in the SIGNS (e.g. Sun in Aries)
    - there is usually 1 (but up to 2) feature/s per planet/point. e.g. sun may be close 
    to 2 different signs, so that the birth chart indicates the planet/point being in 2 
    signs
    
2. PLANETS/POINTS in the HOUSES (e.g. Moon in 1st House)
    - there are 1 or 2 features per planet/point.(Same explanation as above).

3. SIGNS on the HOUSE CUSPS (e.g. Aries on 1st House Cusp)

4. ASPECTS between two planets/points(for now the website seems to only have sun-related)
    - e.g. Sun conjunct Mercury

5. DOMINANCE/WEAKNESS OF ELEMENTS
    - e.g. Dominance of FIRE, Weakness of Water

6. DOMINANCE/WEAKNESS OF QUALITIES
    - 3 QUALITIES: Mutable, Fixed, Cardinal
    - same as for elements: the feature is based on a DOMINANCE or WEAKNESS of the QUALITY

each feature is potentially associated with a description/interpretation of the person who
bears this feature.

BIG starting question: how do I structure the corpus?
Maybe?:
FEATURE_CAT | FEATURE | DESC
===============================
int         | Str     | Str

So, we start w a given birth chart (generated by inputting birth info to the website),
then from the birth chart we extract features. then we select rows based on the features.
and compare the various descriptions for consistency/conflict.

"""

def flag_h2(selected_arr):
    '''
    marks indices in selected_arr that contain h2 tag.
    returns a list where the element at index i is 1, if element at same index in selected_arr
    contains h2 tag.
    '''

    header_flags = [0 for i in range(len(selected_arr))]
    for i in range(len(selected_arr)):
        data = selected_arr[i].get()
        #print(data)
        tmp = re.findall('<h2+', data)
        if len(tmp) > 0:
            #print(tmp)
            header_flags[i] = 1
    return header_flags

def make_data_dict(header_flags, y):
    """
    Returns dict. of features and associated descriptions by parsing y with info from
    header_flags.

    Parameters:
    header_flags: list of flags (0/1) indicating which index has h2 header tags. See flag_h2
    function for more details.
    y:list of Scrapy Selectors.

    Returns: dict. of features and associated descriptions.
    """
    tmp_dict = dict()
    value = []
    key = None
    for i in range(len(y)):
        if header_flags[i] == 1 or i == len(y) - 1:
            if key is not None:
                tmp_dict[key] = value
            value = []
            key = y[i].xpath('text()').extract_first()
        else:
            desc = y[i].xpath('text()').extract_first()
            if desc is not None:
                value.append(desc)

    keys_list = list(tmp_dict.keys())
    print(keys_list)
    return tmp_dict

x = None
 
# ascendant, sun, mercury, moon, venus, mars, jupiter, saturn, uranus,
# neptune, pluto, black moon lilith, part-of-fortune.

class AstroSpider(scrapy.Spider):   
    name = 'astro_spider'
    #start_urls = ['https://astrolibrary.org/interpretations/']
    #start_urls = ['https://astrolibrary.org/interpretations/category/planets-in-signs/']
    #start_urls = ['https://astrolibrary.org/interpretations/sun/']
    
    start_urls = [f'https://astrolibrary.org/interpretations/ascendant/',\
                  f'https://astrolibrary.org/interpretations/sun/',\
                  f'https://astrolibrary.org/interpretations/mercury/',\
                  f'https://astrolibrary.org/interpretations/moon/',\
                  f'https://astrolibrary.org/interpretations/venus/',\
                  f'https://astrolibrary.org/interpretations/mars/',\
                  f'https://astrolibrary.org/interpretations/jupiter/',\
                  f'https://astrolibrary.org/interpretations/saturn/',\
                  f'https://astrolibrary.org/interpretations/uranus/',\
                  f'https://astrolibrary.org/interpretations/neptune/',\
                  f'https://astrolibrary.org/interpretations/pluto/',\
                  f'https://astrolibrary.org/interpretations/lilith/']

    def parse(self, response):
        # find <div id="ris">
        # split descriptions with <h2> tag
        # for each <h2> tag, extract feature name.
        # use each <h2> tag, extract descriptions from <p> tags, making sure to remove
        # google ads.
        url = response.url
        tmp = re.split('/', url)
        cat = 'planets-in-signs'
        feature = tmp[-2]
        
        global x
        x = response.xpath('//div[@id="ris"]')
        #print(x)
        y = x.xpath('//p | //h2')
        header_flags = flag_h2(y)
        #print(header_flags)
        tmp_dict = make_data_dict(header_flags, y)
        keys_list = list(tmp_dict.keys())
        print(keys_list)
        with open(f'{cat}_{feature}_data.json', 'w') as fp:
            json.dump(tmp_dict, fp)

    def do_nothing(self, response):
        pass

    
def main():
    process = CrawlerProcess(settings={
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'items.csv'
    })

    process.crawl(AstroSpider)
    process.start()  # the script will block here until the crawling is finished
    # print(AstroSpider.data)
    # todo: remove final links that have the substring 'category'


if __name__ == '__main__':
    main()

2019-11-18 18:45:08 [scrapy.utils.log] INFO: Scrapy 1.8.0 started (bot: scrapybot)
2019-11-18 18:45:08 [scrapy.utils.log] INFO: Versions: lxml 4.2.1.0, libxml2 2.9.8, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 19.7.0, Python 3.6.5 |Anaconda, Inc.| (default, Mar 29 2018, 13:32:41) [MSC v.1900 64 bit (AMD64)], pyOpenSSL 18.0.0 (OpenSSL 1.0.2o  27 Mar 2018), cryptography 2.2.2, Platform Windows-10-10.0.17763-SP0
2019-11-18 18:45:08 [scrapy.crawler] INFO: Overridden settings: {'FEED_FORMAT': 'csv', 'FEED_URI': 'items.csv'}
2019-11-18 18:45:08 [scrapy.extensions.telnet] INFO: Telnet Password: 21d3aa16375706db
2019-11-18 18:45:08 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
2019-11-18 18:45:09 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 '

['Jupiter in Aries', 'Jupiter in Taurus', 'Jupiter in Gemini', 'Jupiter in Cancer', 'Jupiter in Leo', 'Jupiter in Virgo', 'Jupiter in Libra', 'Jupiter in Scorpio', 'Jupiter in Sagittarius', 'Jupiter in Capricorn', 'Jupiter in Aquarius', 'Jupiter in Pisces']
['Jupiter in Aries', 'Jupiter in Taurus', 'Jupiter in Gemini', 'Jupiter in Cancer', 'Jupiter in Leo', 'Jupiter in Virgo', 'Jupiter in Libra', 'Jupiter in Scorpio', 'Jupiter in Sagittarius', 'Jupiter in Capricorn', 'Jupiter in Aquarius', 'Jupiter in Pisces']
['Moon in Aries', 'Moon in Taurus', 'Moon in Gemini', 'Moon in Cancer', 'Moon in Leo', 'Moon in Virgo', 'Moon in Libra', 'Moon in Scorpio', 'Moon in Sagittarius', 'Moon in Capricorn', 'Moon in Aquarius', 'Moon in Pisces']
['Moon in Aries', 'Moon in Taurus', 'Moon in Gemini', 'Moon in Cancer', 'Moon in Leo', 'Moon in Virgo', 'Moon in Libra', 'Moon in Scorpio', 'Moon in Sagittarius', 'Moon in Capricorn', 'Moon in Aquarius', 'Moon in Pisces']


2019-11-18 18:45:10 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://astrolibrary.org/interpretations/uranus/> (referer: None)
2019-11-18 18:45:10 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://astrolibrary.org/interpretations/neptune/> (referer: None)


['Sun in Aries', 'Sun in Taurus', 'Sun in Gemini', 'Sun in Cancer', 'Sun in Leo', 'Sun in Virgo', 'Sun in Libra', 'Sun in Scorpio', 'Sun in Sagittarius', 'Sun in Capricorn', 'Sun in Aquarius', 'Sun in Pisces']
['Sun in Aries', 'Sun in Taurus', 'Sun in Gemini', 'Sun in Cancer', 'Sun in Leo', 'Sun in Virgo', 'Sun in Libra', 'Sun in Scorpio', 'Sun in Sagittarius', 'Sun in Capricorn', 'Sun in Aquarius', 'Sun in Pisces']
['Mars in Aries', 'Mars in Taurus', 'Mars in Gemini', 'Mars in Cancer', 'Mars in Leo', 'Mars in Virgo', 'Mars in Libra', 'Mars in Scorpio', 'Mars in Sagittarius', 'Mars in Capricorn', 'Mars in Aquarius', 'Mars in Pisces']
['Mars in Aries', 'Mars in Taurus', 'Mars in Gemini', 'Mars in Cancer', 'Mars in Leo', 'Mars in Virgo', 'Mars in Libra', 'Mars in Scorpio', 'Mars in Sagittarius', 'Mars in Capricorn', 'Mars in Aquarius', 'Mars in Pisces']
['Venus in Aries', 'Venus in Taurus', 'Venus in Gemini', 'Venus in Cancer', 'Venus in Leo', 'Venus in Virgo', 'Venus in Libra', 'Venus i

2019-11-18 18:45:10 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://astrolibrary.org/interpretations/lilith/> (referer: None)


['Uranus in Aries', 'Uranus in Taurus', 'Uranus in Gemini', 'Uranus in Cancer', 'Uranus in Leo', 'Uranus in Virgo', 'Uranus in Libra', 'Uranus in Scorpio', 'Uranus in Sagittarius', 'Uranus in Capricorn', 'Uranus in Aquarius', 'Uranus in Pisces']
['Uranus in Aries', 'Uranus in Taurus', 'Uranus in Gemini', 'Uranus in Cancer', 'Uranus in Leo', 'Uranus in Virgo', 'Uranus in Libra', 'Uranus in Scorpio', 'Uranus in Sagittarius', 'Uranus in Capricorn', 'Uranus in Aquarius', 'Uranus in Pisces']
['Neptune in Cancer', 'Neptune in Leo', 'Neptune in Virgo', 'Neptune in Libra', 'Neptune in Scorpio', 'Neptune in Sagittarius', 'Neptune in Capricorn', 'Neptune in Aquarius']
['Neptune in Cancer', 'Neptune in Leo', 'Neptune in Virgo', 'Neptune in Libra', 'Neptune in Scorpio', 'Neptune in Sagittarius', 'Neptune in Capricorn', 'Neptune in Aquarius']
[]
[]


2019-11-18 18:45:11 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://astrolibrary.org/interpretations/pluto/> (referer: None)
2019-11-18 18:45:11 [scrapy.core.engine] INFO: Closing spider (finished)
2019-11-18 18:45:11 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 2853,
 'downloader/request_count': 12,
 'downloader/request_method_count/GET': 12,
 'downloader/response_bytes': 190491,
 'downloader/response_count': 12,
 'downloader/response_status_count/200': 12,
 'elapsed_time_seconds': 2.093896,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2019, 11, 18, 10, 45, 11, 329285),
 'log_count/DEBUG': 12,
 'log_count/INFO': 10,
 'response_received_count': 12,
 'scheduler/dequeued': 12,
 'scheduler/dequeued/memory': 12,
 'scheduler/enqueued': 12,
 'scheduler/enqueued/memory': 12,
 'start_time': datetime.datetime(2019, 11, 18, 10, 45, 9, 235389)}
2019-11-18 18:45:11 [scrapy.core.engine] INFO: Spider closed (finished)


['Pluto in Gemini', 'Pluto in Cancer', 'Pluto in Leo', 'Pluto in Virgo', 'Pluto in Libra', 'Pluto in Scorpio', 'Pluto in Sagittarius']
['Pluto in Gemini', 'Pluto in Cancer', 'Pluto in Leo', 'Pluto in Virgo', 'Pluto in Libra', 'Pluto in Scorpio', 'Pluto in Sagittarius']


As of 2019-11-18 1850, does not seem to work for neptune, seems to work for the rest but need to double check against original website to ensure nothing is missing. lilith website format is different.

In [2]:
# import re
# url = 'www.hello.com/planets-in-sign/sun/'
# tmp = re.split('/', url)
# feature = tmp[-2]
# print(feature)
# #feature = re.search('/*+/$', url)

sun
