In [1]:
# https://diemthi.vnanet.vn
import time
import requests
from threading import Thread
from tqdm import tqdm_notebook

In [2]:
REQUEST_TIME_LIMIT = 120
NAM = 2021 # year of competition  

In [3]:
def get_point(code):
  # description: get point of candidate by candidate identification number:
  # code: candidate identification number
  # nam: year of competition 
  params = (
      ('code', code),
      ('nam', NAM),
  )
  s = requests.session()
  s_time = time.time()
  while True:
    if time.time() - s_time > REQUEST_TIME_LIMIT: # try to get candidate point in the range of REQUEST_TIME_LIMIT second
      break
    try:
      response = s.get('https://diemthi.vnanet.vn/Home/SearchBySobaodanh', params=params)
      return response.json()
    except:
      pass
  raise Exception(f"Cannot request point for candidate with candidate identification number: {code}")

In [4]:
get_point('01000001')

{'result': [{'CityCode': '01',
   'CityArea': None,
   'Code': '01000001',
   'Toan': '2.20',
   'NguVan': '3.50',
   'NgoaiNgu': '',
   'VatLi': '',
   'HoaHoc': '',
   'SinhHoc': '',
   'KHTN': '',
   'DiaLi': '5.50',
   'LichSu': '2.50',
   'GDCD': '',
   'KHXH': '',
   'ResultGroup': '[{"g":"A07","p":10.20},{"g":"C00","p":11.50},{"g":"C03","p":8.20},{"g":"C04","p":11.20}]',
   'Result': ''}]}

In [5]:
get_point('01999999')

{'result': []}

In [6]:
listcityid = [f'{i:02d}' for i in range(1,64+1)]
listcandidate = [f'{i:06d}' for i in range(1,999999+1)]

In [7]:
def get_candidate_list(cityid):
  # description: get list of candidates for each city by city identification number
  # algorithm: binary search
  # cityid: city identification number
  pos = -1
  while True:
    prepos = pos
    l = pos + 1
    r = len(listcandidate)-1
    while l <= r:
      mid = (l+r)>>1
      code = f"{cityid}{listcandidate[mid]}"
      response = get_point(code)
      if response['result'] == []:
        r = mid-1
      else:
        pos = mid
        l = mid+1
    if pos == prepos:
      break
  return [f"{cityid}{i}" for i in listcandidate[:pos+1]]
cityid = listcityid[0]
res = get_candidate_list(cityid)
print(f"Number candidates of city {cityid}: {len(res)}")

Number candidates of city 01: 101288


In [8]:
candidates = []
def collect_candidates(cityid):
  res = get_candidate_list(cityid)
  print(f"Number candidates of city {cityid}: {len(res)}")
  candidates.extend(res)  

threads = []
for cityid in listcityid:
  threads.append(Thread(target = collect_candidates, args = (cityid, )))
  threads[-1].start()
  time.sleep(0.1)
for thread in threads:
  thread.join()

Number candidates of city 20: 0
Number candidates of city 02: 89275
Number candidates of city 08: 7362
Number candidates of city 03: 23564
Number candidates of city 01: 101288
Number candidates of city 04: 12716
Number candidates of city 06: 4768
Number candidates of city 09: 8708
Number candidates of city 07: 3575
Number candidates of city 15: 16215
Number candidates of city 13: 8110
Number candidates of city 11: 2929
Number candidates of city 21: 22347
Number candidates of city 05: 5626
Number candidates of city 19: 16388
Number candidates of city 12: 16973
Number candidates of city 16: 14049
Number candidates of city 42: 14348
Number candidates of city 10: 9573
Number candidates of city 36: 4647
Number candidates of city 25: 20960
Number candidates of city 14: 11428
Number candidates of city 18: 21013
Number candidates of city 29: 34480
Number candidates of city 47: 12089
Number candidates of city 32: 8530
Number candidates of city 26: 22946
Number candidates of city 54: 13791
Numbe

In [9]:
print(f"Total candidates of {NAM}: {len(candidates)}")

Total candidates of 2021: 1021102


In [10]:
batch_size = 2000000
candidates = candidates[:1000] # Shortening candidates list for demo
candidates_result = []
batches = [candidates[i:min(i + batch_size, len(candidates))] for i in range(0, len(candidates), batch_size)]
print(len(batches))
print([len(batch) for batch in batches])

1
[1000]


In [11]:
NUM_FLOW = 10
def process_batch(batch):
  def flow(idflow):
    flow_candidates_position = list(range(idflow, len(batch), NUM_FLOW))
    bar = tqdm_notebook(desc = f'{idflow:03d}-th flow processing', total = len(flow_candidates_position))
    for i in flow_candidates_position:
      res = get_point(batch[i])
      candidates_result.append((batch[i], res))
      bar.update(1)

  threads = []
  for i in range(NUM_FLOW):
    threads.append(Thread(target = flow, args = (i, )))
    threads[-1].start()
    time.sleep(0.1)

  for thread in threads:
    thread.join()
process_batch(batches[0])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  bar = tqdm_notebook(desc = f'{idflow:03d}-th flow processing', total = len(flow_candidates_position))


000-th flow processing:   0%|          | 0/100 [00:00<?, ?it/s]

001-th flow processing:   0%|          | 0/100 [00:00<?, ?it/s]

002-th flow processing:   0%|          | 0/100 [00:00<?, ?it/s]

003-th flow processing:   0%|          | 0/100 [00:00<?, ?it/s]

004-th flow processing:   0%|          | 0/100 [00:00<?, ?it/s]

005-th flow processing:   0%|          | 0/100 [00:00<?, ?it/s]

006-th flow processing:   0%|          | 0/100 [00:00<?, ?it/s]

007-th flow processing:   0%|          | 0/100 [00:00<?, ?it/s]

008-th flow processing:   0%|          | 0/100 [00:00<?, ?it/s]

009-th flow processing:   0%|          | 0/100 [00:00<?, ?it/s]

In [12]:
print(len(candidates_result))
for x in candidates_result[:100]:
    print(x)

1000
('02000002', {'result': [{'CityCode': '02', 'CityArea': None, 'Code': '02000002', 'Toan': '8.80', 'NguVan': '7.00', 'NgoaiNgu': '9.80', 'VatLi': '', 'HoaHoc': '', 'SinhHoc': '', 'KHTN': '', 'DiaLi': '8.00', 'LichSu': '7.75', 'GDCD': '9.00', 'KHXH': '8.25', 'ResultGroup': '[{"g":"A07","p":24.55},{"g":"A08","p":25.55},{"g":"A09","p":25.80},{"g":"C00","p":22.75},{"g":"C03","p":23.55},{"g":"C04","p":23.80},{"g":"C14","p":24.80},{"g":"C15","p":24.05},{"g":"C19","p":23.75},{"g":"C20","p":24.00},{"g":"D01","p":25.60},{"g":"D09","p":26.35},{"g":"D10","p":26.60},{"g":"D14","p":24.55},{"g":"D15","p":24.80},{"g":"D66","p":25.80},{"g":"D78","p":25.05},{"g":"D84","p":27.60},{"g":"D96","p":26.85}]', 'Result': ''}]})
('02000001', {'result': [{'CityCode': '02', 'CityArea': None, 'Code': '02000001', 'Toan': '8.20', 'NguVan': '7.00', 'NgoaiNgu': '8.20', 'VatLi': '8.00', 'HoaHoc': '8.50', 'SinhHoc': '7.75', 'KHTN': '8.08', 'DiaLi': '', 'LichSu': '', 'GDCD': '', 'KHXH': '', 'ResultGroup': '[{"g":"A00

In [13]:
candidates_result.sort(key=lambda x:x[0])

In [14]:
with open(f"diemthi{NAM}.txt", "w", encoding = "UTF8") as f:
    for x in candidates_result:
        f.write(f"{x}\n")

In [15]:
import csv  

header = ['ID', 'CityCode', 'CityArea', 'Math', 'Literature', 'English', 'Physic', 'Chemistry', 'Biology', 'Geography', 'History', 'Civic Education']

with open(f'diemthi{NAM}.csv', 'w', encoding='UTF8') as f:
    writer = csv.writer(f)

    # write the header
    writer.writerow(header)

    # write the data
    for x in candidates_result:
        data = [x[0]]
        try:
            y = x[1]['result'][0]
            data.extend([y['CityCode'], y['CityArea'], y['Toan'], y['NguVan'], y['NgoaiNgu'], y['VatLi'], y['HoaHoc'], y['SinhHoc'], y['DiaLi'], y['LichSu'], y['GDCD']])
        except Exception as e:
            data.extend([None for i in range(10)])
        writer.writerow(data)
        
