-
Notifications
You must be signed in to change notification settings - Fork 0
/
datablitz.py
140 lines (111 loc) · 4.26 KB
/
datablitz.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import requests, json, math, locale, re
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import numpy as np
# Scrape product name, price and stock status from the paginated collection.
# glob_url ("global url") is the single source of truth for the collection.
glob_url = 'https://ecommerce.datablitz.com.ph/collections/nintendo-switch'
glob_web = requests.get(glob_url, timeout=30)
glob_doc = doc = BeautifulSoup(glob_web.text, 'html.parser')

# presumably: items shown per listing page and number of items to cover;
# 3103 looks like the full catalogue size -- TODO confirm.
display_page, nsw_page = 24, 72  # 3103
consolidated_list = []
# range(1, total) below visits pages 1 .. ceil(nsw_page / display_page).
total = math.ceil(nsw_page / display_page) + 1

with tqdm(total=total - 1) as pbar:
    for i in range(1, total):
        # NOTE(review): the page URL used to hard-code the "ps5" collection
        # although every other name in this script refers to Nintendo Switch;
        # it is now derived from glob_url -- confirm this is the intent.
        url = f'{glob_url}?page={i}'
        web = requests.get(url, timeout=30)
        # Fail loudly on an HTTP error instead of parsing an error page.
        web.raise_for_status()
        doc = BeautifulSoup(web.text, 'html.parser')
        products = doc.find_all(class_='product-item__title text--strong link')
        prices = doc.find_all(class_='product-item__price-list price-list')
        stocks = doc.find_all(string=['Add to cart', 'Sold out'])
        # zip() stops at the shortest list; this assumes the three result
        # lists are in the same order and one-to-one -- TODO confirm.
        for product, price, stock in zip(products, prices, stocks):
            raw_dict = {
                'product_name': product.string,
                # The price text is read from the first child of the price list.
                'price': list(price)[0].string,
                'availability': stock.string,
            }
            consolidated_list.append(raw_dict)
        pbar.update()

# Convert into a dataframe to perform data transformation
df = pd.json_normalize(consolidated_list)
############################################################
# Convert price strings in the '##,###.00' format (prefixed with the peso
# sign) to floats. The currency sign and thousands separators are removed
# explicitly, so parsing does not require the 'en_US.UTF-8' locale to be
# installed and does not alter the process-wide locale.
df['price'] = (
    df['price']
    .str.replace('₱', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.strip()
    .astype(float)
)
# Map the button text to a stock label: 'Add to cart' (any case) means the
# item is in stock; everything else is reported as sold out.
df['availability'] = np.where(df['availability'].str.lower() == 'add to cart', 'In stock', 'Sold out')
# (marker, label) pairs in priority order: the first marker found in the
# lower-cased product name decides the region.
_REGION_MARKERS = (
    ('(mde', 'MDE'),
    ('(us', 'US'),
    ('(eu', 'EU'),
    ('(jpn', 'JPN'),
    ('(asian', 'Asian'),
    ('(au', 'AU'),
    ('(ntsc', 'NTSC'),
    ('(pal', 'PAL'),
)


def region_check(self):
    """Return the region label encoded in a product name.

    The name is searched case-insensitively for an opening parenthesis
    immediately followed by a region code, e.g. ``'(US'`` or ``'(jpn'``.

    Args:
        self: The product name. Despite its name this is a plain string,
            not an instance; the parameter name is kept for compatibility.

    Returns:
        One of 'MDE', 'US', 'EU', 'JPN', 'Asian', 'AU', 'NTSC', 'PAL', or
        'Not Specified' when no marker is present or the value is not a
        string (e.g. a missing name).
    """
    if not isinstance(self, str):
        return 'Not Specified'
    name = self.lower()
    for marker, label in _REGION_MARKERS:
        if marker in name:
            return label
    return 'Not Specified'
# Add a 'region' column by applying region_check to every product name.
df['region'] = df['product_name'].apply(region_check)
# Platform list: tokens that identify a platform in a product name.
platforms = [
    'NSW',
    'PS4',
    'PS5',
    'XBOX ONE',
    'XBOXSX',
    'PC',
    'N-SWITCH',
    'XBOXONE',
]
# Keywords that mark a product as PC hardware even without a platform token.
pc_components = [
    'laptop',
]

# (pattern, label) pairs matched against the lower-cased name. Aliases of the
# same platform share a label so they are counted as one platform.
_PLATFORM_PATTERNS = (
    (re.compile(r'\bnsw'), 'NSW'),
    (re.compile(r'\bn-switch'), 'NSW'),
    (re.compile(r'\bps4'), 'PS4'),
    (re.compile(r'\bps5'), 'PS5'),
    (re.compile(r'\bxbox one'), 'XBOX ONE'),
    (re.compile(r'\bxboxone'), 'XBOX ONE'),
    (re.compile(r'\bxboxsx'), 'XBOX Series S/X'),
    (re.compile(r'\bpc'), 'PC'),
)


def platform_check(self):
    """Return the platform label for a product name.

    Matching is case-insensitive. A name is 'Multi-Platform' when it says
    so explicitly or when it mentions more than one *distinct* platform;
    repeating the same platform token does not count as multi-platform.

    Args:
        self: The product name. Despite its name this is a plain string,
            not an instance; the parameter name is kept for compatibility.

    Returns:
        'Multi-Platform', 'NSW', 'PS4', 'PS5', 'XBOX ONE',
        'XBOX Series S/X', 'PC', or 'Miscellaneous' when nothing matches
        or the value is not a string.
    """
    if not isinstance(self, str):
        return 'Miscellaneous'
    name = self.lower()
    if re.search(r'\bmulti-platform', name):
        return 'Multi-Platform'

    # Collect distinct platform labels in priority order.
    found = []
    for pattern, label in _PLATFORM_PATTERNS:
        if label not in found and pattern.search(name):
            found.append(label)
    if len(found) > 1:
        return 'Multi-Platform'
    if found:
        return found[0]

    if any(re.search(pc_component, name) for pc_component in pc_components):
        return 'PC'
    return 'Miscellaneous'
# Add a 'platform' column by applying platform_check to every product name.
df['platform'] = df['product_name'].apply(platform_check)
# Substrings that mark a product as an accessory (matched case-insensitively).
_ACCESSORY_KEYWORDS = (
    'thumbstick',
    'cover',
    'card',
    'case',
    'analog',
    'protector',
    'thumb',
    'grip',
    'controller',
    'headset',
    'stereo',
    'speaker',
)


def specification_check(self):
    """Classify a product name as an accessory, a game pre-order or a game.

    Accessory keywords take precedence over the pre-order marker, so a
    pre-order for an accessory is still reported as 'Accessories'.
    Keywords are matched as plain substrings of the lower-cased name.

    Args:
        self: The product name. Despite its name this is a plain string,
            not an instance; the parameter name is kept for compatibility.

    Returns:
        'Accessories', 'Game Pre-Order' or 'Game'. Non-string values fall
        into the default 'Game' bucket.
    """
    if not isinstance(self, str):
        return 'Game'
    name = self.lower()
    if any(keyword in name for keyword in _ACCESSORY_KEYWORDS):
        return 'Accessories'
    if 'pre-order' in name:
        # No leading space, consistent with the other labels.
        return 'Game Pre-Order'
    return 'Game'
# Classify each product before the name is cleaned, because the
# classification reads keywords from the raw name.
df['product_type'] = df['product_name'].apply(specification_check)
# Remove parentheses and the data inside them from product_name
df['product_name'] = df['product_name'].str.replace(r'\((.*?)\)', '', regex=True)
# Remove platform names from product_name. The alternation is wrapped in a
# group so the word boundaries apply to every platform token, not just the
# first and last one; tokens are escaped so they are matched literally.
_platform_pattern = r'\b(?:{})\b'.format('|'.join(map(re.escape, platforms)))
df['product_name'] = (
    df['product_name']
    .str.replace(_platform_pattern, '', regex=True)
    # Removing tokens leaves gaps; collapse them and trim the ends.
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
    .str.title()
)
# Serialise the result as a JSON array with one object per product.
data_nsw = df.to_json(orient='records')