## Step 1: Extracting data from the text files.

In [84]:
import warnings
from bs4 import BeautifulSoup
from pathlib import Path
import pandas as pd
import re
warnings.simplefilter(action='ignore', category=FutureWarning)

In [85]:
### Create the dictionaries (feature template and label mappings) used for extracting features

def initialize_df():
    """Rebind the module-level dictionaries used while parsing one listing.

    Three globals are (re)created on every call:

    * ``features`` - one entry per extracted field, every value ``None``.
    * ``maping_description`` - Arabic label text -> key in ``features`` for
      the description fields.
    * ``maping_additional`` - Arabic label text -> key in ``features`` for
      the additional-equipment fields.
    """
    global features, maping_description, maping_additional

    # Field order is kept as-is; it determines the order of the entries in
    # the resulting dictionary.
    field_names = (
        'name',
        'model',
        'price',
        'color',
        'fuel_type',
        'origin_car',
        'car_license',
        'lime_type',
        'glass',
        'motor_power',
        'car_speedometer',
        'passengers',
        'ex_owners',
        'alarm_devise',
        'air_conditioner',
        'CD_recorder',
        'sunroof',
        'leather_upholstery',
        'central_closing',
        'magnesium_rims',
        'air_cushion',
    )
    features = dict.fromkeys(field_names)

    maping_description = {
        "لون السيارة": 'color',
        "نوع الوقود": 'fuel_type',
        "أصل السيارة": 'origin_car',
        "عداد السيارة": 'car_speedometer',
        "أصحاب سابقون": 'ex_owners',
        "رخصة السيارة": 'car_license',
        "نوع الجير": 'lime_type',
        "الزجاج": 'glass',
        "قوة الماتور": 'motor_power',
        "عدد الركاب": 'passengers',
    }

    maping_additional = {
        'جهاز إنذار': 'alarm_devise',
        'مُكيّف': 'air_conditioner',
        'مسجل CD': 'CD_recorder',
        'فتحة سقف': 'sunroof',
        'فرش جلد': 'leather_upholstery',
        'إغلاق مركزي': 'central_closing',
        'جنطات مغنيسيوم': 'magnesium_rims',
        'وسادة حماية هوائية': 'air_cushion',
    }

In [86]:
### Extracting model year features from file to features dictionary.


def get_model(model_element):
    """Store the model year found in ``model_element`` in ``features['model']``.

    The text of every child of ``model_element`` is split on whitespace and
    the first all-digit token of that child is written, as a string, to the
    global ``features`` dictionary. If several children contain such a token
    the last of those children wins.

    Args:
        model_element: An iterable whose items expose ``.text`` (for example
            the tag returned by ``soup.find``), or ``None`` when the tag was
            not found.

    Returns:
        None. ``features['model']`` is left untouched when nothing matches.
    """
    # ``soup.find`` returns None for a missing tag; iterating None would raise
    # TypeError, so bail out like the other extractors do.
    if not model_element:
        return

    for row in model_element:
        tokens = str(row.text).split()
        year = next((token for token in tokens if token.isdigit()), None)
        if year is not None:
            features['model'] = year

In [87]:
### Extracting the price from the price element; the value is returned as an int (None if no price is found).


def get_price(price_element):
    """Return the first all-digit token of ``price_element.text`` as an int.

    Args:
        price_element: Object exposing ``.text``, or a falsy value when no
            price element is available.

    Returns:
        The parsed integer, or ``None`` when ``price_element`` is falsy or
        its text contains no all-digit token.
    """
    if not price_element:
        return None

    tokens = str(price_element.text).split()
    return next((int(token) for token in tokens if token.isdigit()), None)


In [88]:
### Extracting description features from file to features dictionary.

def get_description(description_element):
    """Fill the description fields of the global ``features`` dictionary.

    For every label in ``maping_description`` the element is searched for a
    string equal to that label. The text of the node that follows the match
    is stored under the mapped feature key; ``None`` is stored when the label
    is not present.

    Args:
        description_element: Object offering ``find(string=...)``, or a falsy
            value, in which case nothing is changed.
    """
    if not description_element:
        return

    with warnings.catch_warnings():
        # Keep DeprecationWarning noise from the lookups out of the output.
        warnings.simplefilter("ignore", category=DeprecationWarning)
        for arabic_label, feature_key in maping_description.items():
            match = description_element.find(string=arabic_label)
            if match is None:
                features[feature_key] = None
            else:
                features[feature_key] = match.next_element.get_text()
        
        

In [89]:
def get_additional_info(additional_element):
    """Flag every known additional feature listed in ``additional_element``.

    Args:
        additional_element: Iterable of items exposing ``.text``. When an
            item's text is a key of ``maping_additional``, the mapped entry
            of the global ``features`` dictionary is set to ``1``; other
            items are ignored.
    """
    for item in additional_element:
        feature_key = maping_additional.get(item.text)
        if feature_key is not None:
            features[feature_key] = 1

In [None]:
# Parse every saved listing in data/ and collect one feature Series per file.
samples = []

# Sorted so that the column (later: row) order does not depend on the order
# in which the file system happens to list the files.
for path in sorted(Path('data/').glob('*.txt')):
    # Start from a fresh `features` dictionary so that values from the
    # previous listing cannot leak into this one.
    initialize_df()

    # The file only has to stay open while BeautifulSoup reads it.
    with path.open(encoding='utf-8') as listing_file:
        soup = BeautifulSoup(listing_file, "html.parser")

    # `soup.find` returns None for a missing tag; a malformed listing then
    # keeps the default None values instead of aborting the whole run.
    name_tag = soup.find('h3', class_=None)
    if name_tag is not None:
        features['name'] = name_tag.text

    model_tag = soup.find('h5', class_=None)
    if model_tag is not None:
        get_model(model_tag)

    # get_price returns the parsed value; it must be stored explicitly,
    # otherwise the price field stays None for every listing.
    features['price'] = get_price(soup.find('h5', class_='post-price'))

    get_description(soup.find('table', class_='list_ads'))

    additions_cell = soup.find('td', class_='list-additions')
    if additions_cell is not None:
        get_additional_info(additions_cell.find_all('li'))

    # Snapshot the dictionary as a Series (one listing).
    samples.append(pd.Series(features))

# A single concat instead of growing the frame inside the loop: each listing
# becomes one column, labelled 0..n-1. pd.concat rejects an empty list, so
# fall back to an empty frame when no file was found.
data = (
    pd.concat(samples, axis=1, ignore_index=True) if samples else pd.DataFrame()
)

In [None]:
# Swap rows and columns of the collected table.
# NOTE: this rebinds `data`, so running the cell twice flips it back.
data = data.transpose()
# Bare expression so the notebook renders the table.
data

In [72]:
# Write the table to disk (the index is written as the first column).
data.to_csv(path_or_buf='data/data.csv')