In [150]:
import os
import json
import time
import requests
import pandas as pd

from bs4 import BeautifulSoup
from datetime import datetime, timedelta

# Read (not change) the pandas limit on displayed DataFrame columns;
# the cell echoes the current value as its output.
pd.get_option("display.max_columns")

20

In [166]:
def print_tree(source, limit=0, n=0):
    """
    Print the keys of a nested mapping as an indented, numbered outline.

    source -- mapping to walk; only values that are dicts are descended into.
    limit  -- maximum depth to print; 0 (the default) means no depth limit.
    n      -- current depth, used by the recursive calls; leave at 0 when calling.
    """

    # a non-zero limit that has been reached ends this branch
    if limit and n == limit:
        return

    for key in source:
        # one line per key: indentation by depth, 1-based level number, then the key
        print("    " * n, n + 1, "-", key)
        child = source[key]
        if not isinstance(child, dict):
            continue
        print_tree(child, limit, n + 1)

In [183]:
def _spec_value(spec_list, title_prefix):
    """
    Return the 'value' of the first specification whose title starts with title_prefix.

    The title is converted to a lower-case string before the comparison.
    Returns None when no specification matches.
    """
    for spec in spec_list:
        if str(spec['title']).lower().startswith(title_prefix):
            return spec['value']
    return None


def _process_page(data):
    """
    Clean the raw ads of one result page and derive the flat columns used later.

    Takes the DataFrame built from the page's 'elements' list and returns a new
    DataFrame; the argument itself is not modified.
    """
    # kick out the rows without a 'code' (the first weird row of a page)
    data = data.dropna(subset=['code'])
    # drop useless columns; errors='ignore' so a page that lacks one of them does not fail
    data = data.drop(labels=['banner', 'labelDescription', 'compatibilities'], axis=1, errors='ignore')
    # look for columns without a single value and drop them
    counts = data.count()
    data = data.drop(labels=counts[counts == 0].index, axis=1)

    # get ad image: 'fd' of the first media entry, None when the list is empty or missing
    data['media'] = data['media'].apply(lambda x: x[0]['fd'] if isinstance(x, list) and x else None)

    # create publication links
    data['ad_link'] = 'https://www.autodiler.me/' + data['categoryTitle'] + '/' + data['seo']

    # get ad location and location id (the 'city' column is overwritten last on purpose)
    city = data['city'].dropna()
    data['country'] = city.apply(lambda x: x['country'])
    data['region'] = city.apply(lambda x: x['region'])
    data['city_id'] = city.apply(lambda x: x['id'])
    data['city'] = city.apply(lambda x: x['name'])

    # get vehicle make and its id
    brand = data['brand'].dropna()
    data['brand_id'] = brand.apply(lambda x: x['id'])
    data['brand'] = brand.apply(lambda x: x['name'])

    # get vehicle model, series and model id
    product = data['product'].dropna()
    data['product_id'] = product.apply(lambda x: x['id'])
    data['series'] = product.apply(lambda x: x['series'])
    data['model'] = product.apply(lambda x: x['model'])

    # drop thumbnail column; it may already be gone if it was empty for the whole page
    data = data.drop('thumbnail', axis=1, errors='ignore')

    # process a set of dates
    for date_col in ['createdAt', 'listingExpiration', 'listingActivated', 'activated']:
        # !sometimes columns "listingExpiration" and "listingActivated" are excluded
        if date_col in data.columns:
            data[date_col] = pd.to_datetime(data[date_col], format='%Y-%m-%dT%H:%M:%S.000Z')

    # get fuel type, mileage and year from the specification lists
    specs = data['specifications'].dropna()
    data['fuel'] = specs.apply(lambda s: _spec_value(s, 'gorivo'))
    data['km'] = specs.apply(lambda s: _spec_value(s, 'kilometra'))
    data['year'] = specs.apply(lambda s: _spec_value(s, 'godi'))

    return data


def _combine_pages(frames):
    """
    Merge the per-page frames into the final result.

    Returns None for an empty list, the single frame unchanged when only one page
    was collected, and otherwise one concatenated frame with a fresh 0..N-1 index.
    """
    if not frames:
        return None
    if len(frames) == 1:
        return frames[0]
    return pd.concat(frames, ignore_index=True)


def parsing(pages=0):
    """
    Function parses autodiler.me website. As a result we obtain pandas dataframe with all the advertisements.

    Parameters
    ----------
    pages : int, default 0
        Number of result pages to parse. With the default 0 there is no page limit
        and the loop stops at the first page that contains no advertisements.

    Returns
    -------
    pandas.DataFrame or None
        The advertisements collected so far, or None when nothing was collected.
        A non-200 response also ends the run and returns what was collected up to
        that page, so a late server error does not throw away earlier pages.
    """

    # new line for f-strings
    nl = '\n'
    # start stopwatch
    started = time.time()

    url = 'https://www.autodiler.me/automobili/all/'
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0'}

    # processed pages are collected here and concatenated once at the end;
    # DataFrame.append was removed in pandas 2.0 and re-copied all rows on every page
    frames = []
    total_rows = 0

    n = 0
    while True:
        # time lapsed
        lapsed = int(time.time() - started)
        # page number
        n += 1
        print(f">>>   {lapsed} s   Parsing page number  {n}", end='   ')

        r = requests.get(url + str(n), headers=headers)
        if r.status_code != 200:
            print(f'status_code:  {r.status_code}')
            # do not try to parse an error response; keep the pages already collected
            return _combine_pages(frames)

        # (optional) r.content could be written to disk here and parsed later from the saved files

        # pass response to Beautiful Soup for parsing
        code = BeautifulSoup(r.content, 'html.parser')
        # find all script tags and take the second one
        script_tag = code.find_all('script')[1].string
        # strip all the unnecessary information in that tag
        json_string = script_tag[
            script_tag.find('{'):
            script_tag.find('module={')-1
            ].strip()
        # load parsed json string
        src = json.loads(json_string)

        # if necessary
#         print_tree(src)

        data = pd.DataFrame(src['props']['initialState']['ads']['list']['elements'])

        # exit the cycle if nothing was obtained
        if data.shape[0] == 0:
            if n == 1:
                print(f'data.shape:  {data.shape}')
                return None
            print(f'{nl}{nl}done!')
            return _combine_pages(frames)

        # MOVING TO PANDAS AND DATA PROCESSING
        data = _process_page(data)

        # collect the page and report the running total
        frames.append(data)
        total_rows += data.shape[0]
        print(f"{total_rows} rows")

        # break if page limit was set
        if n == pages:
            print(f'{nl}{nl}{n}  pages were parsed')
            return _combine_pages(frames)

In [184]:
# Run the scraper with its default pages=0, i.e. without a page limit,
# and keep the returned frame as `ads`.
ads = parsing()

>>>   0 s   Parsing page number  1   36 rows
>>>   1 s   Parsing page number  2   72 rows
>>>   3 s   Parsing page number  3   108 rows
>>>   4 s   Parsing page number  4   144 rows
>>>   6 s   Parsing page number  5   180 rows
>>>   8 s   Parsing page number  6   216 rows
>>>   11 s   Parsing page number  7   252 rows
>>>   13 s   Parsing page number  8   288 rows
>>>   15 s   Parsing page number  9   324 rows
>>>   16 s   Parsing page number  10   360 rows
>>>   18 s   Parsing page number  11   396 rows
>>>   20 s   Parsing page number  12   432 rows
>>>   22 s   Parsing page number  13   468 rows
>>>   24 s   Parsing page number  14   504 rows
>>>   26 s   Parsing page number  15   540 rows
>>>   27 s   Parsing page number  16   576 rows
>>>   29 s   Parsing page number  17   612 rows
>>>   31 s   Parsing page number  18   648 rows
>>>   32 s   Parsing page number  19   684 rows
>>>   34 s   Parsing page number  20   720 rows
>>>   36 s   Parsing page number  21   756 rows
>>>   38 

>>>   327 s   Parsing page number  165   5940 rows
>>>   328 s   Parsing page number  166   5976 rows
>>>   331 s   Parsing page number  167   6012 rows
>>>   332 s   Parsing page number  168   6048 rows
>>>   334 s   Parsing page number  169   6084 rows
>>>   337 s   Parsing page number  170   6120 rows
>>>   339 s   Parsing page number  171   6156 rows
>>>   341 s   Parsing page number  172   6192 rows
>>>   342 s   Parsing page number  173   6228 rows
>>>   344 s   Parsing page number  174   6264 rows
>>>   346 s   Parsing page number  175   6300 rows
>>>   347 s   Parsing page number  176   6336 rows
>>>   349 s   Parsing page number  177   6372 rows
>>>   351 s   Parsing page number  178   6408 rows
>>>   353 s   Parsing page number  179   6444 rows
>>>   354 s   Parsing page number  180   6480 rows
>>>   356 s   Parsing page number  181   6516 rows
>>>   358 s   Parsing page number  182   6552 rows
>>>   360 s   Parsing page number  183   6588 rows
>>>   362 s   Parsing page numb

# DEBUGGING ZONE

Getting the code

In [180]:
# Fetch a single result page and load its embedded JSON into `data`,
# so the processing steps can be debugged on one page at a time.
url = 'https://www.autodiler.me/automobili/all/'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0'}

# set page number
n = 197

print(f">>>   Parsing page number  {n}")
r = requests.get(url + str(n), headers=headers)
if r.status_code != 200:
    # the page number lives in `n`; the previous `num` was never defined in this cell
    print(f'status_code: {r.status_code}   >>>   page number: {n}')

# (optional) r.content could be written to disk here and parsed later from the saved files

# pass response to Beautiful Soup for parsing
code = BeautifulSoup(r.content, 'html.parser')
# find all script tags and take the second one
script_tag = code.find_all('script')[1].string
# strip all the unnecessary information in that tag
json_string = script_tag[
    script_tag.find('{'):
    script_tag.find('module={')-1
    ].strip()
# load parsed json string
src = json.loads(json_string)

# if necessary
#         print_tree(src)

data = pd.DataFrame(src['props']['initialState']['ads']['list']['elements'])

# warn if the page produced no ads at all
if data.shape[0] == 0:
    print('::ERROR::Something has changed...')

>>>   Parsing page number  197


MOVING TO PANDAS AND DATA PROCESSING

In [181]:
# Process the single page held in `data`.
# This cell overwrites `data` (e.g. the 'city' dicts become plain names), so the cell
# that builds `data` has to be re-run before this one is executed a second time.

# kick out the rows without a 'code' (the first weird row of a page)
data = data.dropna(subset=['code'])
# drop useless columns; errors='ignore' so a page that lacks one of them does not fail
data = data.drop(labels=['banner', 'labelDescription', 'compatibilities'], axis=1, errors='ignore')
# look for columns without a single value...
counts = data.count()
empty = counts[counts == 0].index
# ...and drop them
data = data.drop(labels=empty, axis=1)

# get ad image: 'fd' of the first media entry, None when the list is empty or missing
data['media'] = data['media'].apply(lambda x: x[0]['fd'] if isinstance(x, list) and x else None)

# function for building publication link
def generate_adlink(row):
    """Build the ad URL from the row's 'categoryTitle' and 'seo' values."""
    return 'https://www.autodiler.me/' + row['categoryTitle'] + '/' + row['seo']

# create publications links
data['ad_link'] = data.apply(generate_adlink, axis=1)

# get ad location and location id (the 'city' column is overwritten last on purpose)
data['country'] = data['city'].dropna().apply(lambda x: x['country'])
data['region'] = data['city'].dropna().apply(lambda x: x['region'])
data['city_id'] = data['city'].dropna().apply(lambda x: x['id'])
data['city'] = data['city'].dropna().apply(lambda x: x['name'])

# get vehicle make and its id
data['brand_id'] = data['brand'].dropna().apply(lambda x: x['id'])
data['brand'] = data['brand'].dropna().apply(lambda x: x['name'])

# get vehicle model, series and model id
data['product_id'] = data['product'].dropna().apply(lambda x: x['id'])
data['series'] = data['product'].dropna().apply(lambda x: x['series'])
data['model'] = data['product'].dropna().apply(lambda x: x['model'])

# drop thumbnail column; it may already be gone if it was empty for the whole page
data = data.drop('thumbnail', axis=1, errors='ignore')

# process a set of dates
for date_col in ['createdAt', 'listingExpiration', 'listingActivated', 'activated']:
    # !sometimes columns "listingExpiration" and "listingActivated" are excluded
    if date_col in data.columns:
        data[date_col] = pd.to_datetime(data[date_col], format='%Y-%m-%dT%H:%M:%S.000Z')

# functions to get fuel type, mileage and year
def get_fuel(spec_list):
    """Return the value of the first specification whose title starts with 'gorivo', else None."""
    for spec in spec_list:
        if str(spec['title']).lower().startswith('gorivo'):
            return spec['value']

def get_year(spec_list):
    """Return the value of the first specification whose title starts with 'godi', else None."""
    for spec in spec_list:
        if str(spec['title']).lower().startswith('godi'):
            return spec['value']

def get_km(spec_list):
    """Return the value of the first specification whose title starts with 'kilometra', else None."""
    for spec in spec_list:
        if str(spec['title']).lower().startswith('kilometra'):
            return spec['value']

# apply above mentioned functions
data['fuel'] = data['specifications'].dropna().apply(get_fuel)
data['km'] = data['specifications'].dropna().apply(get_km)
data['year'] = data['specifications'].dropna().apply(get_year)

In [182]:
# Inspect the specification lists of the debugged page, skipping rows where the value is missing.
data['specifications'].dropna()

0     [{'measure': 'km', 'weightCatalog': 1, 'icon':...
1     [{'measure': None, 'weightCatalog': 3, 'icon':...
2     [{'measure': None, 'weightCatalog': 3, 'icon':...
3     [{'measure': 'km', 'weightCatalog': 1, 'icon':...
4     [{'measure': None, 'weightCatalog': 2, 'icon':...
5     [{'measure': 'km', 'weightCatalog': 1, 'icon':...
6     [{'measure': 'km', 'weightCatalog': 1, 'icon':...
7     [{'measure': None, 'weightCatalog': 2, 'icon':...
8     [{'measure': None, 'weightCatalog': 2, 'icon':...
9     [{'measure': None, 'weightCatalog': 3, 'icon':...
10    [{'measure': 'km', 'weightCatalog': 1, 'icon':...
11    [{'measure': 'km', 'weightCatalog': 1, 'icon':...
13    [{'measure': None, 'weightCatalog': 3, 'icon':...
14    [{'measure': None, 'weightCatalog': 3, 'icon':...
15    [{'measure': None, 'weightCatalog': 3, 'icon':...
16    [{'measure': 'km', 'weightCatalog': 1, 'icon':...
17    [{'measure': None, 'weightCatalog': 2, 'icon':...
18    [{'measure': None, 'weightCatalog': 3, 'ic

# Personal Use

In [185]:
# Summarise the scraped frame: index range, column names, non-null counts and dtypes.
ads.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9972 entries, 0 to 9971
Data columns (total 39 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   code                       9972 non-null   object        
 1   city                       9972 non-null   object        
 2   media                      9971 non-null   object        
 3   specifications             9971 non-null   object        
 4   createdAt                  9972 non-null   datetime64[ns]
 5   price                      9972 non-null   float64       
 6   listingStatus              971 non-null    object        
 7   listingExpiration          964 non-null    datetime64[ns]
 8   id                         9972 non-null   float64       
 9   seo                        9972 non-null   object        
 10  listing                    9972 non-null   object        
 11  brand                      9971 non-null   object        
 12  produc

In [189]:
# Build a frame without the last four columns of `ads`; they are selected by position, not by name.
# This rebinds the notebook-level name `data`, which earlier cells used for a single debugged page.
# NOTE(review): which columns end up last depends on the column order produced by the scrape — confirm.
data = ads.drop(ads.columns[-4:], axis=1)

In [190]:
# List the column labels of `data`.
data.columns

Index(['code', 'city', 'media', 'specifications', 'createdAt', 'price',
       'listingStatus', 'listingExpiration', 'id', 'seo', 'listing', 'brand',
       'product', 'refreshed', 'oldPrice', 'categoryTitle', 'priceOnRequest',
       'priceType', 'listingActivated', 'category', 'user', 'status',
       'titleCompiled', 'activated', 'ad_link', 'country', 'region', 'city_id',
       'brand_id', 'product_id', 'series', 'model', 'fuel', 'km', 'year'],
      dtype='object')

In [191]:
# Show the four date columns side by side (a column selection only; `data` is not modified).
data[['createdAt', 'listingActivated', 'activated', 'listingExpiration']]

Unnamed: 0,createdAt,listingActivated,activated,listingExpiration
0,2021-03-28 08:29:25,2021-05-14 06:49:12,2021-05-15 13:48:23,2021-05-24 06:49:12
1,2021-03-29 16:13:33,2021-05-14 06:49:51,2021-05-15 13:46:45,2021-05-24 06:49:51
2,2021-05-05 20:36:38,2021-05-15 06:37:03,2021-05-15 13:08:28,2021-05-25 06:37:03
3,2021-05-15 11:33:26,2021-05-15 12:30:59,2021-05-15 12:30:59,2021-05-25 12:30:59
4,2021-05-15 12:24:04,2021-05-15 12:25:08,2021-05-15 12:26:02,2021-05-25 12:25:08
...,...,...,...,...
9967,2020-08-15 10:16:57,NaT,2020-12-14 20:11:57,NaT
9968,2020-12-14 17:22:43,NaT,2020-12-14 18:40:36,NaT
9969,2020-12-14 17:43:40,NaT,2020-12-14 18:39:33,NaT
9970,2020-12-14 17:16:12,NaT,2020-12-14 17:18:08,NaT
