In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime
import time
import re
import matplotlib.pyplot as plt

In [205]:
def create_newrow(cols_list,data_list):
    """Pair each column name with its value and return the row as a dict.

    Pairing follows ``zip`` semantics: it stops at the shorter of the two
    sequences, and when a column name is repeated the last paired value wins.
    """
    return dict(zip(cols_list, data_list))

def merge_df(newrow_dict,target_df):
    """Return a new frame made of ``target_df`` followed by one extra row.

    The extra row is built from ``newrow_dict`` (column name -> value) and is
    labelled with the current length of ``target_df``. ``target_df`` itself is
    not modified.
    """
    next_label = len(target_df)
    appended_row = pd.DataFrame(newrow_dict, index=[next_label])
    return pd.concat([target_df, appended_row])

def get_pages(soup):
    """Return the page count advertised by the results pagination bar.

    The pagination bar is the ``<nav id="srp-pagination">`` element. Every
    ``<li>`` matched with an empty ``class`` filter is inspected in document
    order and the number shown by the last numeric link is returned.

    Parameters
    ----------
    soup : parsed document exposing ``find`` (BeautifulSoup-style).

    Returns
    -------
    int
        The last numeric page link found, or 0 when the page has no pagination
        bar or the bar contains no numeric links.
    """
    p_num = 0
    nav = soup.find("nav",attrs={'id':'srp-pagination'})
    if nav is None:
        # No pagination bar on this page: report "no pages" instead of failing
        # with AttributeError on ``None.find_all``.
        return p_num

    for page in nav.find_all("li",attrs={'class':''}):
        link = page.find("a")
        if link is None:
            continue
        try:
            p_num = int(link.text)
        except ValueError:
            # Links whose text is not a number (arrows, "next" labels, ...)
            # do not carry a page number; skip them.
            continue

    return p_num

def get_soup(i,wt,timeout=30):
    """Download one results page and return it parsed with BeautifulSoup.

    Parameters
    ----------
    i : int
        Zero-based page index; the page requested is ``?pagina=<i+1>``.
    wt : int or float
        Seconds to sleep before the request is sent.
    timeout : int or float, optional
        Seconds passed to ``requests.get`` as its timeout. Without it a
        stalled connection would block the whole scrape indefinitely.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the ``html.parser`` backend. The HTTP
        status code is not checked.

    Raises
    ------
    requests.exceptions.RequestException
        Whatever ``requests.get`` raises, including a timeout.
    """
    # Pause between requests before hitting the site again.
    time.sleep(wt)

    print("Retrieving content from page: ",i+1)

    webp = 'https://www.lahaus.mx/venta/propiedades/area-metropolitana-monterrey'
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}

    response = requests.get(webp + '?pagina=' + str(i+1),headers=headers,timeout=timeout)

    return BeautifulSoup(response.content,"html.parser")


def get_categories(res,cat):
    """Collect the badge labels of a listing card and derive its category code.

    Four ``<span>`` badges are looked up by their exact ``class`` string.
    Labels are returned in the order caribbean-green, success-green, yellow,
    gray. The category code becomes 1, 2 or 3 for the first three badges
    respectively (a later badge overrides an earlier one); the gray badge only
    contributes a label. When none of the first three badges is present the
    incoming ``cat`` is returned unchanged.

    Returns
    -------
    tuple
        ``(labels, cat)`` where ``labels`` is a list of badge texts.
    """
    def find_badge(css_classes):
        return res.find("span",attrs={'class':css_classes})

    caribbean_badge = find_badge('leading-8 text-12 lg:text-10 xl:text-12 rounded-sm text-lh-jungle-green bg-lh-caribbean-green px-lh-8 py-lh-4 mr-5 truncate')
    gray_badge = find_badge('leading-10 text-12 lg:text-10 xl:text-12 border rounded-sm text-lh-green-gray border-lh-gray px-lh-8 mr-5 truncate')
    success_badge = find_badge('leading-8 text-12 lg:text-10 xl:text-12 rounded-sm text-success-800 bg-success-50 px-lh-8 py-lh-4 mr-5 truncate')
    yellow_badge = find_badge('leading-8 text-12 lg:text-10 xl:text-12 rounded-sm text-lh-jungle-green bg-lh-yellow px-lh-8 py-lh-4 mr-5')

    labels = []

    if caribbean_badge is not None:
        labels.append(caribbean_badge.text)
        cat = 1

    # These two badges are tested for truthiness rather than against None.
    if success_badge:
        labels.append(success_badge.text)
        cat = 2

    if yellow_badge:
        labels.append(yellow_badge.text)
        cat = 3

    # The gray badge adds a label but never changes the category code.
    if gray_badge is not None:
        labels.append(gray_badge.text)

    return labels,cat

def build_csv_export(df):
    """Write ``df`` to a pipe-delimited file named after the current time.

    The file is created relative to the working directory as
    ``export_lahaus_<YYYYmmddHHMMSS>.dat``, without the index and encoded as
    ``utf-8-sig``. Progress messages are printed before and after the write.
    Returns ``None``.
    """
    # Imported locally: at module level ``datetime`` refers to the module.
    from datetime import datetime

    stamp = datetime.now().strftime("%Y%m%d%H%M%S")
    target_name = 'export_lahaus_{}.dat'.format(stamp)

    print("Starting df export to csv...")
    df.to_csv(target_name, sep='|', index=False, encoding='utf-8-sig')
    print("Export to csv completed.")

In [203]:
# Column order shared by every row appended to the listings frame.
_LISTING_COLUMNS = ['loc','name','desc','ammenities','categories','price_range','unit','unit_area','unit_price','bedrooms','bathrooms','parking_spots','comments','date']


def _icon_fields(nodes, icon_to_field, defaults):
    """Return ``defaults`` updated with the text of nodes holding a known icon.

    ``icon_to_field`` maps the ``class`` string of an ``<i>`` icon to the field
    that the enclosing node's stripped text should be stored under. Fields
    whose icon never appears keep their default value.
    """
    found = dict(defaults)
    for node in nodes:
        for icon_class, field in icon_to_field.items():
            if node.find("i",attrs={'class':icon_class}) is not None:
                found[field] = (node.text).strip()
    return found


def _development_rows(res, categories_con, date):
    """Build one row per unit listed on a category-1 card."""
    name = (res.find("a").contents[0]).strip()
    loc = (res.find("span",attrs={'class':'label-r-12 order-2 text-lh-green-gray mt-6 mb-4'}).text).strip()
    desc_t = (res.find("p",attrs={'class':'order-4 text-12 font-regular text-lh-dark-deep-blue mb-3 leading-7'}).text).strip()
    # Collapse line breaks (and the indentation that follows them) into single spaces.
    desc = re.sub(r'\n *',' ',desc_t)
    ammenities = (res.find("span",attrs={'class':'label-r-12 order-5 text-lh-green-gray mb-8'}).text).strip()
    price_range = (res.find("p",attrs={'class':'order-6 text-14 font-bold text-lh-dark-deep-blue'}).text).strip()

    icon_to_field = {
        'fa fa-real-estate-bedroom': 'bedrooms',
        'fa fa-real-estate-bathroom': 'bathrooms',
        'fa fa-real-estate-parking': 'parking_spots',
    }

    rows = []
    for u,unit in enumerate(res.find_all("li", attrs={'class':'p-lh-12 mr-lh-12 border rounded-lg border-lh-md-gray hover:bg-lh-extra-light-blue'})):
        unit_area = (unit.find("p",attrs={'class':'text-14 text-lh-dark-deep-blue text-lh-green-gray'}).text).strip()
        unit_price = (unit.find("p",attrs={'class':'small-r-14 text-lh-jungle-green mt-4'}).text).strip()

        # Defaults are re-created for every unit so a value can never leak in
        # from the previous unit or listing, and parking is only 0 when no
        # parking span exists at all.
        details = _icon_fields(
            unit.find_all("span",attrs={'class':'mr-lh-16'}),
            icon_to_field,
            {'bedrooms': None, 'bathrooms': None, 'parking_spots': 0},
        )

        rows.append([loc,name,desc,ammenities,categories_con,price_range,u+1,unit_area,unit_price,details['bedrooms'],details['bathrooms'],details['parking_spots'],'',date])

    return rows


def _used_property_row(res, categories_con, date):
    """Build the single row describing a category-2 card."""
    desc_t = (res.find("a").contents[0]).strip()
    desc = re.sub(r'\n *',' ',desc_t)
    loc = (res.find("span",attrs={'class':'label-r-12 order-2 text-lh-green-gray mt-lh-16 mb-lh-8 mx-lh-16'}).text).strip()
    ammenities = (res.find("span",attrs={'class':'order-4 label-r-12 order-2 text-lh-green-gray mx-lh-16'}).text).strip()

    # The price is rendered in one of two layouts; try the <p> first and fall
    # back to the <span> nested in the pricing <div>.
    price_node = res.find("p",attrs={'class':'order-4 text-14 text-lh-dark-deep-blue mt-lh-24 mb-lh-32 ml-lh-16 lh-font-weight-4'})
    if price_node is None:
        div = res.find("div",attrs={'class':'text-14 text-lh-dark-deep-blue pl-lh-24 md:pl-0'})
        price_node = div.find("span",attrs={'class':'text-success-800'})
    unit_price = (price_node.text).strip()

    details = _icon_fields(
        res.find_all("span",attrs = {'class':'mr-lh-24 lg:mr-lh-12 xl:mr-lh-24 text-lh-green-gray'}),
        {
            'fa fa-real-estate-apartment color-silver': 'comments',
            'fa fa-real-estate-bedroom color-silver': 'bedrooms',
            'fa fa-real-estate-bathroom color-silver': 'bathrooms',
            'fa fa-area-outline color-silver': 'unit_area',
            'fa fa-real-estate-parking color-silver': 'parking_spots',
        },
        {'comments': '', 'bedrooms': None, 'bathrooms': None, 'unit_area': None, 'parking_spots': 0},
    )

    return [loc,'Propiedad Usada',desc,ammenities,categories_con,'',1,details['unit_area'],unit_price,details['bedrooms'],details['bathrooms'],details['parking_spots'],details['comments'],date]


def _launch_row(res, categories_con, date):
    """Build the single row describing a category-3 card.

    Only the location and the price range are read from the card. Text fields
    that are not extracted are left as '' and numeric-looking fields as None,
    so they become missing values instead of reusing another listing's data.
    """
    loc = (res.find("p",attrs={'class':'order-3 text-14 font-regular text-lh-green-gray mb-lh-16'}).text).strip()
    price_range = (res.find("p",attrs={'class':'order-4 text-18 font-regular text-lh-jungle-green'}).text).strip()

    # NOTE(review): no name is extracted for these cards; confirm which element
    # carries it before relying on the 'name' column for category-3 rows.
    return [loc,'','Lanzamiento LA HAUS','',categories_con,price_range,1,None,None,None,None,None,'',date]


def build_df(df,wt):
    """Scrape every results page and return ``df`` extended with the listings.

    The first results page is downloaded here and ``get_pages`` reads the page
    count from it; the remaining pages are fetched through ``get_soup``. Each
    ``<article class="property-card-container">`` card is classified with
    ``get_categories`` and converted to rows:

    * category 1 -> one row per unit listed on the card,
    * category 2 -> one row named 'Propiedad Usada',
    * category 3 -> one row described as 'Lanzamiento LA HAUS',
    * any other category -> no row.

    Category-3 rows are now stored. Previously the row was assembled from
    variables left over from an earlier card (or failed with ``NameError``)
    and was then discarded.

    Rows are collected in a list and appended to ``df`` with a single concat,
    labelled ``len(df)``, ``len(df)+1``, ... The combined frame is exported
    with ``build_csv_export`` before it is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame the scraped rows are appended to (not modified in place).
    wt : int or float
        Wait time handed to ``get_soup`` for every page after the first.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by the scraped rows.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
    # A timeout keeps a stalled connection from blocking the run forever.
    response = requests.get("https://www.lahaus.mx/venta/propiedades/area-metropolitana-monterrey",headers=headers,timeout=30)
    soup = BeautifulSoup(response.content,"html.parser")

    num_pages = get_pages(soup)

    start_count = len(df)
    rows = []

    for i in range(num_pages):

        # Page 1 was already downloaded above to read the page count.
        if i > 0:
            soup = get_soup(i,wt)
        else:
            print("Retrieving content from page: ",i+1)

        for res in soup.find_all("article",attrs={"class":"property-card-container"}):
            categories,cat = get_categories(res,0)
            categories_con = "; ".join([str(k).strip() for k in categories])
            date = datetime.date.today()

            if cat == 1:
                card_rows = _development_rows(res, categories_con, date)
            elif cat == 2:
                card_rows = [_used_property_row(res, categories_con, date)]
            elif cat == 3:
                card_rows = [_launch_row(res, categories_con, date)]
            else:
                card_rows = []

            for row in card_rows:
                rows.append(row)
                record_count = start_count + len(rows)
                if record_count % 100 == 0:
                    print("Current record count: {}".format(record_count))

        print("Completed page number {} of {}".format(i+1,num_pages))

    if rows:
        # One concat at the end instead of one per row, which copied the whole
        # frame for every listing.
        scraped = pd.DataFrame(rows, columns=_LISTING_COLUMNS, index=range(start_count, start_count + len(rows)))
        df = pd.concat([df, scraped])

    build_csv_export(df)

    print("Job ended")

    return df


In [None]:
# Put pandas' "display.max_rows" option back to its default value.
pd.reset_option("display.max_rows")
# Alternative kept for reference: passing None shows every row of a frame.
#pd.set_option("display.max_rows",None)

In [204]:
# Start from an empty frame that already carries the listing columns.
df_lahaus = pd.DataFrame(
    columns=[
        'loc',
        'name',
        'desc',
        'ammenities',
        'categories',
        'price_range',
        'unit',
        'unit_area',
        'unit_price',
        'bedrooms',
        'bathrooms',
        'parking_spots',
        'comments',
        'date',
    ]
)

# Run the scraper; the second argument is the wait time handed to build_df.
df_lahaus = build_df(df_lahaus, wt=5)

Retrieving content from page:  1
Completed page number 1 of 46
Retrieving content from page:  2
Current record count: 100
Completed page number 2 of 46
Retrieving content from page:  3
Current record count: 200
Completed page number 3 of 46
Retrieving content from page:  4
Completed page number 4 of 46
Retrieving content from page:  5
Completed page number 5 of 46
Retrieving content from page:  6
Current record count: 300
Completed page number 6 of 46
Retrieving content from page:  7
Completed page number 7 of 46
Retrieving content from page:  8
Completed page number 8 of 46
Retrieving content from page:  9
Completed page number 9 of 46
Retrieving content from page:  10
Completed page number 10 of 46
Retrieving content from page:  11
Current record count: 400
Completed page number 11 of 46
Retrieving content from page:  12
Completed page number 12 of 46
Retrieving content from page:  13
Completed page number 13 of 46
Retrieving content from page:  14
Completed page number 14 of 46
Retr

In [None]:
# Write the scraped frame to a new timestamped export file. build_df already
# calls build_csv_export before returning, so running this cell produces an
# additional file.
build_csv_export(df_lahaus)

In [None]:
# Display the scraped listings frame as the cell output.
df_lahaus



In [None]:
# Remove leading/trailing newline characters from every text value.
# - '\n' is a real newline; the raw string r'\n' is the two characters
#   backslash and "n", which stripped the letter "n" from the ends of values.
# - Non-string values (for example the integer 0 used for "no parking") are
#   left untouched instead of being turned into NaN by the .str accessor.
for col in df_lahaus.columns:
    holds_text = df_lahaus[col].map(lambda value: isinstance(value, str)).any()
    if not holds_text:
        # Report columns without any text values; nothing to strip there.
        print(col)
        continue
    df_lahaus[col] = df_lahaus[col].map(
        lambda value: value.strip('\n') if isinstance(value, str) else value
    )
    

In [None]:
# Keep an independent copy of the frame as it is at this point, before the
# column conversions further down the notebook.
df_lahaus_bkup = df_lahaus.copy()

In [None]:
# Convert 'unit_area' to float after trimming spaces and the '㎡' sign from
# both ends of each value. The trim only runs while the column is still
# non-numeric, so re-running this cell after a successful conversion no longer
# fails on the .str accessor.
if not pd.api.types.is_numeric_dtype(df_lahaus['unit_area']):
    df_lahaus['unit_area']=df_lahaus['unit_area'].str.strip(' ㎡')
df_lahaus['unit_area']=df_lahaus['unit_area'].astype(float)