<div style="color: blue;">

# Data Scraping Project

## Submitted By:
- **Niv Harel**: 208665869
- **Eytan Mutzafi**: 209160308

#### Github: https://github.com/nivrl/Data_Course_3rd_year.git

In [578]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import numpy as np

<div style="color: blue;">

### Getting the url for every car

In [579]:
def get_page_cars_url(page_num):
    url = f'https://www.ad.co.il/car?sp261=13905&pageindex={page_num}'
    html = requests.get(url)
    soup = BeautifulSoup(html.content,'html.parser')
    basic_url = 'https://www.ad.co.il'
    lst1= []
    x=0
    products = soup.find('div',class_="product-list d-flex")
    tags = products.find_all('div',class_="card-body p-md-3")
    for tag in tags:
        all_a_tags = tag.find('a')
        lst1.append(basic_url+all_a_tags.get('href'))
    return lst1


In [580]:
def get_car_page_content(url):
    html = requests.get(url)
    page_soup = BeautifulSoup(html.content,'html.parser')
    return page_soup

<div style="color: blue;">

### Retrieving raw data about every car

In [581]:
def get_car_data(page_url):
## Retrieving the necessary data from the car URL and convert it into a dictionary.
    page=get_car_page_content(page_url)
    page_data={}
    full_contact=page.find('div',class_="single-product-tab tab-pane fade show active")
    first_details=full_contact.find('div',class_="d-flex justify-content-between").find_all('h2')
    for i in range(len(first_details)):
        if i == 0:
            full_statement = first_details[i].get_text().split(' ',1)
            page_data['יצרן'] = full_statement[0]
            page_data['דגם'] = full_statement[-1]
        else:
            page_data['סכום']=first_details[i].get_text()


    table=full_contact.find('table',class_="table table-sm mb-4").find_all('td')
    for i in range(1,len(table),2):
        value=table[i].get_text().replace('\r\n','').replace('  ','')
        key=table[i-1].get_text().replace('\r\n','').replace('  ','')
        page_data[key]=value
    try:
        images=full_contact.find('div',class_="col-12 d-flex mt-3 justify-content-center flex-wrap")
        page_data['Pic_num']=len(images.find_all('div',class_="justify-content-center px-1"))
    except:
        page_data['Pic_num']=0
            
    descriptions=full_contact.find('div',class_="order-first order-sm-0").find_all("p",class_="text-word-break")
    full_text=''
    for i in range(len(descriptions)):
        full_text+=descriptions[i].get_text()+' '
    descr=full_text.replace('\n','').replace('\t','').replace('\r','').replace('תוספות',' ')
    page_data['Description'] = descr.replace('&lt;br/&gt;&lt;br/&gt;פירוט:&lt;br/&gt;', 'פירוט:')
    
    dates=full_contact.find('div',class_="d-flex flex-row align-items-center justify-content-center flex-wrap").find_all("div",class_="px-3")

    for i in dates:
        val=i.get_text().split(' ')[-1]
        key=i.get_text().split(' ')[1]
        page_data[key]=val
        
    return page_data

In [582]:
def append_dict_to_df(df, data_dict):
## Appending each car dict to the main DF.
    temp_df = pd.DataFrame([data_dict])
    df = pd.concat([df, temp_df], ignore_index=True)
    return df


In [583]:
def create_raw_data():
    final_df = pd.DataFrame()
    page_num=1
    urls = get_page_cars_url(page_num)
    while len(urls)>0:
        for car_url in urls:
            page_dict = get_car_data(car_url)
            final_df=append_dict_to_df(final_df,page_dict)
        page_num +=1
        urls = get_page_cars_url(page_num)
    return final_df

<div style="color: blue;">

### Organizing the data & Converting the column types

In [584]:
def convert_to_last_day_of_month(date_str):
    return (pd.to_datetime(date_str, format='%m/%Y') + pd.offsets.MonthEnd(0)).strftime('%d/%m/%Y')

In [585]:
def organizing_data(data):  
    new_column_names = [
    'Manufactor','Model','Price','Year', 'Hand', 'Gear', 'Engine_capacity', 
    'Engine_type','Km', 'Test_date','Color','Prev_ownership','Curr_ownership', 'Area', 'City', 
    'Pic_num','Description','Cre_date', 'Repub_date'
    ]
    
    data.columns = new_column_names
    for name in new_column_names:
        if name in ['Year','Hand','Engine_capacity','Km']:
            data[name] = pd.to_numeric(data[name], errors='coerce').astype('Int64')
        elif name == 'Pic_num':
            data[name] = data[name].astype('Int64')
        elif name == 'Price':
            data[name] = data[name].str.replace(',', '')
            data[name] = data[name].str.replace(' ', '')
            data[name] = data[name].str.replace('₪', '')
            data[name] = data[name].fillna(np.nan)
            data[name] = data[name].astype(float)
        elif name in ['Gear','Engine_type','Prev_ownership','Curr_ownership']:
            data[name] = data[name].astype('category')
        elif name in ['Cre_date','Repub_date']:
            data[name] = pd.to_datetime(data[name], format='%d/%m/%Y')
        elif name == 'Test_date':
            data[name] = data[name].fillna('01/1800')
            data[name] = data[name].apply(lambda x: convert_to_last_day_of_month(x))
            data[name] = pd.to_datetime(data[name], format='%d/%m/%Y')

    data = data[(data['Year'] >= 1980) & (data['Year'] <= 2016)].reset_index().iloc[:,1:]
    return data


In [586]:
def convert_test_into_int(data):
        data['Test'] = (data['Test_date']- pd.to_datetime(datetime.today().date())).dt.days
        data['Test'] = data['Test'].apply(lambda x: x if x >= 0 else (0 if x > -20000 else np.nan))
        data['Test'] = pd.to_numeric(data['Test'], errors='coerce').astype('Int64')
        return data

<div style="color: blue;">

### Executing all the functions and converting the DF into a CSV file

In [587]:
def execute_full_process():
        data = create_raw_data()
        data = organizing_data(data)
        data = convert_test_into_int(data)
        
        actual_columns_list = [
            'Manufactor','Year','Model','Hand','Gear','Engine_capacity','Engine_type',
            'Prev_ownership','Curr_ownership','Area','City','Price',
            'Pic_num','Cre_date','Repub_date','Description','Color','Km','Test'
            ]        
        return data.loc[:,actual_columns_list]


In [588]:
def load_to_csv():
    data = execute_full_process()
    data.to_csv("Data_for_project.csv", encoding='utf-8-sig', index=False)
    return data.dtypes

In [589]:
load_to_csv()

Manufactor                 object
Year                        Int64
Model                      object
Hand                        Int64
Gear                     category
Engine_capacity             Int64
Engine_type              category
Prev_ownership           category
Curr_ownership           category
Area                       object
City                       object
Price                     float64
Pic_num                     Int64
Cre_date           datetime64[ns]
Repub_date         datetime64[ns]
Description                object
Color                      object
Km                          Int64
Test                        Int64
dtype: object

In [590]:
df= pd.read_csv("Data_for_project.csv")
df.head()

Unnamed: 0,Manufactor,Year,Model,Hand,Gear,Engine_capacity,Engine_type,Prev_ownership,Curr_ownership,Area,City,Price,Pic_num,Cre_date,Repub_date,Description,Color,Km,Test
0,מיצובישי,2005,לנסר קלאסיק,8,אוטומטית,,בנזין,פרטית,פרטית,ירושלים והסביבה,ירושלים,4700.0,3,2024-05-22,2024-05-27,רכב במצב פצצה שירת אותי נאמנה הוחלף לפני חודש ...,כסוף,,147.0
1,מיצובישי,2016,GT3000,2,אוטומטית,,בנזין,פרטית,פרטית,חיפה וחוף הכרמל,נשר,92000.0,4,2024-05-05,2024-05-05,אאוטלנדר המפואר פרמיום 7 מקומות טופל אצל היבו...,לבן,,
2,מיצובישי,2016,I-MIEV,2,אוטומטית,,בנזין,פרטית,פרטית,חיפה וחוף הכרמל,נשר,92000.0,4,2024-05-05,2024-05-04,אאוטלנדר הדגם המפואר 7 מקומות שמור ומטופל אצל ...,לבן,,267.0
3,מיצובישי,2010,לנסר,3,אוטומטית,,בנזין,פרטית,פרטית,נתניה והסביבה,נתניה,10000.0,7,2024-03-16,2024-03-16,גיר מנוע תקינים מעככת מוביליין רכב 7 שנים עצלי...,כסוף,,177.0
4,מיצובישי,2008,לנסר,3,אוטומטית,,בנזין,פרטית,פרטית,חיפה וחוף הכרמל,חיפה,7500.0,6,2023-11-06,2023-12-17,לרציניים בלבד!!מוכר את הרכב שלי בצער רב. האוטו...,כחול,,
