-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_company_info.py
67 lines (57 loc) · 2.53 KB
/
get_company_info.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import re
import random
import time
import logging
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient
from .step import Step
class GetCompanyInfo(Step):
    """Step that scrapes company contact details from a list of page URLs.

    Each URL is POSTed to together with the form payload in ``SECRET_KEY``.
    The first ``<div class="message break-all">`` of the response is
    flattened to text, and the company name, contact person and e-mail
    address are pulled out of that text with regular expressions.
    """

    # Seconds to wait for an HTTP response before giving up on a URL, so
    # a single stalled connection cannot hang the whole step.
    REQUEST_TIMEOUT = 30

    # Form payload sent with every request.
    SECRET_KEY = {'e_secret_key': 'waimaoba'}

    # Compiled once here instead of on every loop iteration.
    _COMPANY_NAME_RE = re.compile(
        r'Company Name \(公司名称\):(.+) Industry Category \(行业分类\)')
    _CONTACT_PERSON_RE = re.compile(r'Contact Person:(.+) Tel')
    _EMAIL_RE = re.compile(r'Email Address \(电子邮件\):(.+)')

    def process(self, passing_data, inputs):
        """Scrape every URL that is not already stored in MongoDB.

        Args:
            passing_data: Sized iterable of page URLs to scrape.
            inputs: Not used by this step.

        Returns:
            A list of dicts with the keys ``CompanyName``,
            ``ContactPerson``, ``Email`` and ``FromURL``, one per URL
            that was fetched and parsed successfully. URLs that already
            have a document in ``waimaoba_db.company_information`` (matched
            on ``FromURL``), that fail to download, or whose page lacks
            the expected message block are left out.
        """
        logger = logging.getLogger()
        company_info_list = []
        client = MongoClient()
        try:
            collection = client.waimaoba_db.company_information
            total = len(passing_data)
            # enumerate keeps the progress counter correct on every
            # path, including skipped and failed URLs.
            for count, url in enumerate(passing_data, start=1):
                logger.info("%s/%s", count, total)
                if collection.count_documents({'FromURL': url}, limit=1) != 0:
                    logger.info('already exists')
                    continue
                company_info = self._fetch_company_info(url, logger)
                if company_info is not None:
                    company_info_list.append(company_info)
        finally:
            # Release the MongoDB connection even if scraping raised.
            client.close()
        return company_info_list

    def _fetch_company_info(self, url, logger):
        """Download one page and return its company record, or None on failure."""
        logger.info("爬曲ing: %s", url)
        # Random 0.8-1.3 s pause before each request (presumably to avoid
        # hammering the remote server).
        time.sleep(random.randint(80, 130) / 100)
        try:
            response = requests.post(
                url, data=self.SECRET_KEY, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # One unreachable page must not discard the records that
            # were already collected.
            logger.warning("Request failed for %s: %s", url, exc)
            return None
        if response.status_code != requests.codes.ok:
            logger.warning(
                "Unexpected HTTP status %s for %s", response.status_code, url)
            return None

        soup = BeautifulSoup(response.text, 'html.parser')
        message = soup.find('div', {'class': 'message break-all'})
        if message is None:
            # Previously an IndexError that aborted the whole run.
            logger.warning("Can't find message block in %s", url)
            return None
        text = message.get_text(strip=True, separator=' ')

        return {
            'CompanyName': self._extract_field(
                self._COMPANY_NAME_RE, text, 'CompanyName', logger).strip(),
            # Intentionally not stripped, matching the existing records.
            'ContactPerson': self._extract_field(
                self._CONTACT_PERSON_RE, text, 'ContactPerson', logger),
            'Email': self._extract_field(
                self._EMAIL_RE, text, 'Email', logger).strip(),
            'FromURL': url,
        }

    @staticmethod
    def _extract_field(pattern, text, label, logger):
        """Return group 1 of ``pattern`` in ``text``, or '' (with a warning) if absent."""
        match = pattern.search(text)
        if match is None:
            logger.warning("Can't find %s", label)
            return ''
        return match.group(1)