In [1]:
# imports

import os
import pandas as pd
import re

### Define functions to get files and manipulate dataframes

In [50]:
# get the full paths of all files ending in 'data.csv' under the data folder
def get_files(data_dir='data'):
    """Collect the paths of every file ending in ``data.csv`` below ``data_dir``.

    The directory tree is walked recursively with ``os.walk``.

    Args:
        data_dir: Directory to search. Defaults to ``'data'`` (relative to the
            current working directory).

    Returns:
        list[str]: ``os.path.join(root, name)`` for each matching file, in the
        order ``os.walk`` yields them (not sorted).

    Raises:
        NotADirectoryError: If ``data_dir`` does not exist or is not a
            directory.
    """
    if not os.path.isdir(data_dir):
        # Raise instead of calling exit(): exit() is an interactive helper that
        # tears down the whole session/kernel and cannot be handled by callers.
        # abspath() also reports absolute inputs correctly, unlike gluing the
        # argument onto os.getcwd().
        raise NotADirectoryError(
            f'"{os.path.abspath(data_dir)}" directory does not exist, '
            'please specify correct data directory.'
        )

    return [
        os.path.join(root, name)
        for root, _dirs, filenames in os.walk(data_dir)
        for name in filenames
        if name.endswith('data.csv')
    ]


# rename columns: make lowercase, replace 'space' with '_'
def rename_cols(df):
    """Normalise the column labels of ``df`` in place.

    Every label has surrounding whitespace stripped, inner spaces replaced
    with ``_`` and is converted to lower case.

    Args:
        df: DataFrame whose column labels are strings. It is modified in
            place.

    Returns:
        None.
    """
    # The original loop read an undefined variable ``name`` instead of the
    # loop variable, so it raised NameError on the first column.
    df.columns = [col.strip().replace(' ', '_').lower() for col in df.columns]


# build a list of lists of file paths, one inner list per property type
def df_list(files=None):
    """Group CSV paths by property type.

    The property type of a path is the part of its file name before the first
    ``_``. Paths are sorted first; groups are returned in the order in which
    each property type is first met in that sorted sequence, and paths keep
    their sorted order inside a group.

    Args:
        files: Optional iterable of file paths. When omitted, the paths are
            discovered with ``get_files()``.

    Returns:
        list[list[str]]: One list of paths per property type. Empty when there
        are no files.
    """
    if files is None:
        # The original called an undefined ``get_filenames``; the discovery
        # helper in this notebook is ``get_files``.
        files = get_files()

    groups = {}
    for path in sorted(files):
        # basename() instead of split('/') so Windows separators work too.
        property_type = os.path.basename(path).split('_')[0]
        groups.setdefault(property_type, []).append(path)

    # Keyed grouping keeps the last group and single-file inputs, which the
    # previous index-juggling loop silently dropped.
    return list(groups.values())


# concatenate csv files of the same property type
def df_concat(file_list):
    """Read several CSV files and stack them into one DataFrame.

    Each non-empty file contributes its rows plus a ``region`` column holding
    the name of the directory that directly contains the file. Files that
    yield no rows are skipped.

    Args:
        file_list: Iterable of CSV file paths.

    Returns:
        pandas.DataFrame: All rows with a fresh 0..n-1 index, or an empty
        DataFrame when nothing was read.
    """
    frames = []
    for file in file_list:
        # The original read the undefined name ``df_name`` here.
        region = os.path.basename(os.path.dirname(file))
        frame = pd.read_csv(file)
        if len(frame) < 1:
            continue
        frame['region'] = region
        frames.append(frame)

    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of re-copying the accumulated frame per file.
    return pd.concat(frames, ignore_index=True)


# fix data displacement that occurred while scraping
def fix_displacement(files):
    """Load ``files`` and repair rows whose values were shifted between columns.

    A row counts as displaced when its ``Links`` value does not start with
    ``'https'``. For each such row the cells from ``Datetime`` to the last
    column are scanned left to right (as strings):

    * the first cell starting with ``'https'`` becomes the row's link and ends
      the scan;
    * the first cell before it that looks like a timestamp
      (e.g. ``Dec-15-2022_21-12``) becomes the row's date; if there is none,
      the date of the previous displaced row is reused.

    The recovered values are written back to ``Datetime`` and ``Links`` and
    every column whose name starts with ``Unnamed`` is dropped.

    Args:
        files: CSV paths passed to ``df_concat``.

    Returns:
        pandas.DataFrame: The concatenated, repaired frame.

    Raises:
        ValueError: If a displaced row has no link, or has no date and there
            is no earlier date to fall back on.
    """
    df = df_concat(files)

    # Compute the mask once; the original rebuilt it three times.
    displaced = ~df['Links'].str.startswith('https', na=False)

    if displaced.any():
        date_pattern = re.compile(r'^[A-Z][a-z]{2}-\d+-\d+_\d+-\d+')
        shifted = df.loc[displaced, 'Datetime':]

        links = []
        dates = []
        last_date = None  # date carried over to rows that lost their own
        for label, row in zip(shifted.index, shifted.astype(str).values):
            row_link = None
            row_date = None
            for cell in row:
                if cell.startswith('https'):
                    row_link = cell
                    break
                if row_date is None and date_pattern.match(cell):
                    row_date = cell

            if row_date is None:
                row_date = last_date
            # Exactly one (date, link) pair per row keeps both lists aligned
            # with the mask; fail loudly instead of hitting an unbound
            # variable or an opaque length-mismatch error on assignment.
            if row_link is None or row_date is None:
                raise ValueError(
                    f'cannot recover link/date for displaced row {label!r}'
                )

            last_date = row_date
            links.append(row_link)
            dates.append(row_date)

        df.loc[displaced, 'Datetime'] = dates
        df.loc[displaced, 'Links'] = links

    return df.loc[:, ~df.columns.str.contains('^Unnamed')]


def clean_currency(x):
    """Strip currency symbols and thousands separators from a price value.

    Args:
        x: A cell value. Strings have every ``$``, ``֏``, ``₽``, ``€`` and
            ``,`` removed; anything else is passed through untouched.

    Returns:
        The cleaned string, or ``x`` itself when it is not a string.
    """
    if isinstance(x, str):
        return x.translate(str.maketrans('', '', '$֏₽€,'))
    # The original fell off the end here and returned None, wiping out
    # numeric and missing values when used with Series.apply.
    return x

    
# split 'Price' column (e.g. 10000 monthly) to 2 columns and reorder df columns
def split_price_col(df):
    """Split ``Price`` into an integer amount and a ``Duration`` column.

    ``Price`` values such as ``'10000 monthly'`` are split on the first run of
    whitespace. The amount replaces ``Price`` (as ``int``), the remainder goes
    to ``Duration``, and ``Duration`` is placed directly after ``Currency``.

    Args:
        df: DataFrame with a string ``Price`` column and a ``Currency``
            column. It is not modified.

    Returns:
        pandas.DataFrame: A new frame with the split and reordered columns.

    Raises:
        ValueError: If there is no ``Currency`` column, or an amount cannot be
            converted to ``int``.
    """
    # n=1 keeps the result at two columns even when the duration text contains
    # spaces; reindex guarantees column 1 exists when no row has a duration.
    parts = df['Price'].str.split(n=1, expand=True).reindex(columns=[0, 1])

    result = df.copy()
    result['Price'] = parts[0].astype(int)
    result['Duration'] = parts[1]

    # Remove 'Duration' before inserting it. The original inserted first and
    # then popped by the stale index, which removed the neighbouring column
    # and left 'Duration' listed twice.
    cols = [col for col in result.columns if col != 'Duration']
    cols.insert(cols.index('Currency') + 1, 'Duration')
    return result[cols]
    

In [51]:
# Group the discovered CSV paths by property type (one inner list per type).
df_li = df_list()

In [53]:
# Concatenate the first group of files and repair rows displaced during scraping.
df = fix_displacement(df_li[0])

  df_to_merge = pd.read_csv(df_name)


In [45]:
# Cap how many rows pandas shows when a frame or series is displayed.
pd.set_option('display.max_rows', 10)

In [54]:
# Sanity check: show rows whose 'Links' value still does not start with 'https'
# (columns from 'Datetime' onward); an empty result means none remain.
df[~df['Links'].str.startswith('https', na=False)].loc[:, 'Datetime':]

Unnamed: 0,Datetime,Window Views,Links,region,Type,Room Area


In [59]:
# Apply clean_currency to every 'Price' value (overwrites the column in place).
df['Price'] = df['Price'].apply(clean_currency)

In [61]:
# Display the 'Price' column after cleaning.
df['Price']

0         50000  monthly
1         60000  monthly
2           16000  daily
3        250000  monthly
4        250000  monthly
              ...       
33132       1900 monthly
33133    200000  monthly
33134    240000  monthly
33135       12000  daily
33136       1200 monthly
Name: Price, Length: 33137, dtype: object

Unnamed: 0,Price,Currency,Duration,Description,Datetime,Window Views,Links,region,Type,Room Area
0,50000,AMD,monthly,"Apartment for rent in Aparan city, partially r...",Dec-15-2022_21-12,,https://list.am/en/item/18371048,Aragatsotn,,
1,60000,AMD,monthly,It is for rent. The building is located on the...,Dec-15-2022_21-12,,https://list.am/en/item/17968939,Aragatsotn,,
2,16000,AMD,daily,"New unique house in Ashtarak, with all ameniti...",Dec-15-2022_21-12,Yard view,https://list.am/en/item/16785815,Aragatsotn,,
3,250000,AMD,monthly,"150 sq. M. Apartment, 5 rooms. A large barn. G...",Dec-15-2022_21-12,,https://list.am/en/item/17394032,Aragatsotn,,
4,250000,AMD,monthly,Fully renovated apartment for rent in Ajapnyak...,Dec-15-2022_21-12,,https://list.am/en/item/18072997,Aragatsotn,,
...,...,...,...,...,...,...,...,...,...,...
33132,1900,USD,monthly,"Բնակարանը վերանորոգված է, համալրված անհրաժեշտ ...",Dec-15-2022_21-09,,https://list.am/en/item/15855170,Yerevan,,
33133,200000,AMD,monthly,"The apartment is completely renovated, all fac...",Dec-15-2022_21-09,,https://list.am/en/item/18569064,Yerevan,,
33134,240000,AMD,monthly,"The apartment is completely renovated, all fac...",Dec-15-2022_21-09,,https://list.am/en/item/18527330,Yerevan,,
33135,12000,AMD,daily,A modern one-room apartment is for daily rent ...,Dec-15-2022_21-09,,https://list.am/en/item/12642302,Yerevan,,
