In [1]:
#imports and definitions
import time
import pandas as pd
import requests
from random import randint
from bs4 import BeautifulSoup
from IPython.display import Markdown, display

def proxy_anonymity_test(proxy, headers=None, timeout=30):
    """Print what http://proxydb.net/anon reports about a request sent through *proxy*.

    Args:
        proxy: requests-style proxy mapping, e.g.
            ``{'http': 'http://user:password@host:port'}``. The test page is
            fetched over plain http, so the mapping needs an ``'http'`` entry
            for the proxy to be used.
        headers: optional mapping of HTTP headers to send. Defaults to the
            same desktop Chrome User-Agent that ``complete_df`` sends.
        timeout: seconds to wait for the test page before requests raises.

    Prints the reported IP, anonymity level, country, city, region and ISP,
    followed by the request headers as echoed back by the page.
    """
    if headers is None:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'}

    with requests.Session() as session:
        session.proxies.update(proxy)
        session.headers.update(headers)
        # A timeout keeps a dead proxy from blocking the notebook forever.
        proxydb_html_page = session.get('http://proxydb.net/anon', timeout=timeout).content

    soup = BeautifulSoup(proxydb_html_page, 'html.parser')
    # The result values are read positionally from the <dd> cells of the
    # first <dl class="row"> on the page.
    tags = soup.find('dl', class_='row').find_all('dd')
    ip = tags[0].text
    anonymity_level = tags[1].text
    country = tags[2].find('img')['alt']  # country name is the flag image's alt text
    city = tags[3].text
    region = tags[4].text
    isp = tags[5].text

    # display(Markdown(...)) renders the bold headings in the notebook output.
    display(Markdown("**Proxy Anonymity Test Results:**"))
    print("IP: " + ip)
    print("ANONYMITY-LEVEL: " + anonymity_level)
    print("COUNTRY: " + country)
    print("CITY: " + city)
    print("REGION: " + region)
    print("ISP: " + isp)
    display(Markdown("**HTTP-Request Headers:**"))
    tag = soup.find('pre')
    print(tag.text)
#----------------------------------------------------------------------------------------------------------------------------------

#Getter functions
#Returns the info on a listing page and also fills the dataframe's corresponding slots
def get_rooms_bedrooms_and_bathrooms(df, row_index, listing_page):
    """Read the room, bedroom and bathroom counts from a listing page.

    Each count is the stripped text of a cell inside the page's
    ``div.row.teaser`` block. A count whose cell (or the teaser block itself)
    is missing is reported as None.

    The three values are written to the 'Rooms', 'Bedrooms' and 'Bathrooms'
    columns of ``df`` at ``row_index`` and also returned as a dict keyed by
    those column names.
    """
    # Dataframe column -> CSS class of the teaser cell holding its value.
    teaser_cells = {
        'Rooms': "col-lg-3 col-sm-6 piece",
        'Bedrooms': "col-lg-3 col-sm-6 cac",
        'Bathrooms': "col-lg-3 col-sm-6 sdb",
    }

    counts = {}
    for column, css_class in teaser_cells.items():
        try:
            cell = listing_page.find('div', class_='row teaser').find(class_=css_class)
            counts[column] = cell.text.strip()
        except AttributeError:
            # find() returned None somewhere along the chain: value not listed.
            counts[column] = None

    df.loc[row_index, list(counts)] = list(counts.values())
    return counts
def get_other_features(df, row_index, listing_page):
    """Read the optional characteristics of a listing page.

    For every feature name below, the page is searched for a
    ``div.carac-title`` whose text equals the name; the value is the text of
    the node two siblings after that title. A feature that cannot be found
    is reported as None.

    Each value is written to the column of ``df`` named after the feature at
    ``row_index``; the same values are returned as a dict keyed by feature
    name.
    """
    feature_names = (
        'Year built',
        'Additional features',
        'Potential gross revenue',
        'Zoning',
        'Lot area',
        'Use of property',
        'Available area',
        'Building area (at ground level)',
    )

    collected = {}
    for name in feature_names:
        value = None
        try:
            title = listing_page.find('div', class_='carac-title', text=name)
            value = title.next_sibling.next_sibling.text
        except AttributeError:
            # Title (or one of its siblings) is absent: keep value as None.
            pass
        collected[name] = value
        df.loc[row_index, name] = value
    return collected
def get_description(df, row_index, listing_page):
    """Read the free-text description of a listing page.

    The description is the stripped text of the page's
    ``div[itemprop="description"]``; None when that element is missing.
    The value is written to the 'Description' column of ``df`` at
    ``row_index`` and returned.
    """
    listing_text = None
    try:
        description_div = listing_page.find('div', itemprop='description')
        listing_text = description_div.text.strip()
    except AttributeError:
        # No description element on this page: leave the value as None.
        pass
    df.loc[row_index, 'Description'] = listing_text
    return listing_text
#----------------------------------------------------------------------------------------------------------------------------------

#Uses the given proxy to complete the given dataframe by calling the getter functions defined above
#Returns the completed dataframe and prints all the information returned by the getter functions
def complete_df(df, proxy=None, timeout=30):
    """Fill in the listing details of ``df`` by scraping each row's 'URL'.

    For every row, the listing page is downloaded and passed to
    ``get_rooms_bedrooms_and_bathrooms``, ``get_other_features`` and
    ``get_description``, which write their values into ``df`` in place. The
    scraped values are also printed as a progress log.

    Args:
        df: dataframe with a 'URL' column; modified in place.
        proxy: optional requests-style proxy mapping. requests selects the
            proxy by URL scheme, so a mapping with only an ``'http'`` key is
            not applied to ``https://`` listing URLs; add an ``'https'``
            entry if those requests must go through the proxy.
        timeout: seconds to wait for each page before requests raises.

    Returns:
        The same ``df`` object, after it has been filled in.
    """
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'

    # Templates are built line by line so that source indentation does not
    # leak into the printed output.
    rooms_template = '\n'.join([
        '\t\tNumber of rooms: {Rooms}',
        '\t\tNumber of bedrooms: {Bedrooms}',
        '\t\tNumber of bathrooms: {Bathrooms}',
    ])
    features_template = '\n'.join([
        '\t\tYear built: {Year built}',
        '\t\tPotential gross revenue: {Potential gross revenue}',
        '\t\tZoning: {Zoning}',
        '\t\tAdditional features: {Additional features}',
        '\t\tUse of property: {Use of property}',
        '\t\tBuilding area (at ground level): {Building area (at ground level)}',
        '\t\tAvailable area: {Available area}',
        '\t\tLot area: {Lot area}',
    ])

    with requests.Session() as session:
        if proxy is not None:
            session.proxies.update(proxy)
        session.headers.update({'User-Agent': user_agent})
        print('Proxy: {}'.format(proxy))

        for row_index in df.index:
            # Without a timeout a stalled proxy or server would block forever.
            response = session.get(df.loc[row_index, 'URL'], timeout=timeout)
            listing_page = BeautifulSoup(response.text, 'html.parser')

            print("\tRow: {}".format(row_index))

            # Each getter fills its dataframe columns and returns the values
            # it found, which are echoed here.
            print(rooms_template.format(**get_rooms_bedrooms_and_bathrooms(df, row_index, listing_page)))
            print(features_template.format(**get_other_features(df, row_index, listing_page)))
            print("\n\t\tDescription: {}".format(get_description(df, row_index, listing_page)))
            print("\n")

            # Random 7-9 second pause between consecutive page requests.
            time.sleep(randint(7, 9))
    return df
#---------------------------------------------------------------------------------------------------------------------------------

#Retrieve the partition of the dataframe and the proxies previously saved with %store
%store -r partition_of_incomplete_df proxies
#Show the first proxy and the first slice of the partition
print(proxies[0])
partition_of_incomplete_df[0]

{'http': 'http://Logmech:AlphaBetaGamma@86.105.96.100:60099'}


Unnamed: 0,Category,Price,PPSF,PPSM,Street,City,URL,Rooms,Bedrooms,Bathrooms,Lot area,Year built,Additional features,Potential gross revenue,Zoning,Description,Use of property,Available area,Building area (at ground level)
0,Lot,280000.0,,,Chemin Édouard-Roy,Sherbrooke (Brompton/Rock Forest/Saint-Élie/De...,https://www.centris.ca/en/lots~for-sale~sherbr...,,,,,,,,,,,,
1,Lot,1695000.0,,,"108, Chemin de Gray Valley",Huberdeau,https://www.centris.ca/en/lots~for-sale~huberd...,,,,,,,,,,,,
2,Hobby farm,2399000.0,,,"21, Rang des Vents",Brébeuf,https://www.centris.ca/en/hobby-farms~for-sale...,,,,,,,,,,,,
3,Lot,,,399900.0,"60, Chemin Grant",Bouchette,https://www.centris.ca/en/lots~for-sale~bouche...,,,,,,,,,,,,
4,Cottage,1749000.0,,,"117, Chemin des Érables",Frelighsburg,https://www.centris.ca/en/cottages~for-sale~fr...,,,,,,,,,,,,
5,Hobby farm,199000.0,,,"30, Chemin du Portage",Rivière-Héva,https://www.centris.ca/en/hobby-farms~for-sale...,,,,,,,,,,,,
6,Lot,689000.0,,,Côte des Intrépides,Mont-Tremblant,https://www.centris.ca/en/lots~for-sale~mont-t...,,,,,,,,,,,,
7,Land,650000.0,,,Chemin Saint-Cyr Sud,Wentworth-Nord,https://www.centris.ca/en/land~for-sale~wentwo...,,,,,,,,,,,,
8,Land,3200000.0,,,Route 148,Grenville,https://www.centris.ca/en/land~for-sale~grenvi...,,,,,,,,,,,,
9,Land,139000.0,,,Chemin Owl's Landing,L'Isle-aux-Allumettes,https://www.centris.ca/en/land~for-sale~l-isle...,,,,,,,,,,,,


In [3]:
#Complete the first slice of the partition using the proxy at the same list position,
#then store the completed slice with %store and display it
df0 = complete_df(partition_of_incomplete_df[0], proxies[0])
%store df0
df0

Proxy: {'http': 'http://Logmech:AlphaBetaGamma@86.105.96.100:60099'}
	Row: 0
		Number of rooms: None
            	Number of bedrooms: None
            	Number of bathrooms: None
		Year built: None
            	Potential gross revenue: None
            	Zoning: Residential
            	Additional features: Located on a stream
            	Use of property: None
            	Building area (at ground level): None
            	Available area: None
            	Lot area: 1,873,080 sqft

		Description: Rareté ,+- 43 acres situé à quelques minutes du centre urbain ,en grande partie boisé avec ruisseau .Site idéal pour implanter son projet de rêve ou comme investisseur pour méga projet pour le futur .Voir avec courtier inscripteur pour les usages permis.


	Row: 1
		Number of rooms: None
            	Number of bedrooms: None
            	Number of bathrooms: None
		Year built: None
            	Potential gross revenue: None
            	Zoning: Other, Forest, Residential, Agricultural
         

	Row: 12
		Number of rooms: None
            	Number of bedrooms: None
            	Number of bathrooms: None
		Year built: None
            	Potential gross revenue: None
            	Zoning: Residential
            	Additional features: Located on a pond, Non-navigable body of water
            	Use of property: None
            	Building area (at ground level): None
            	Available area: None
            	Lot area: 3,354,571 sqft

		Description: « TERRE UNIQUE À SAINT-CALIXTE » 91 arpents de plein air. Verger, arbres matures abritant les animaux RÉALISEZ VOTRE PROJET en harmonie avec la nature! Étang pour ensemencer la truite. Ruisseaux et sentiers vont de pair. INTIMITÉ, à l'abri des regards. SITE PRIVÉ. Sur rendez-vous seulement. Havre de paix. À 5 minutes du village. www.st-calixte.com


	Row: 13
		Number of rooms: None
            	Number of bedrooms: None
            	Number of bathrooms: None
		Year built: None
            	Potential gross revenue: None
            	Zon

	Row: 24
		Number of rooms: None
            	Number of bedrooms: None
            	Number of bathrooms: None
		Year built: None
            	Potential gross revenue: None
            	Zoning: Resort, Recreational and tourism, Residential
            	Additional features: Located on a lake, Non-navigable body of water
            	Use of property: None
            	Building area (at ground level): None
            	Available area: None
            	Lot area: 8,108,050 sqft

		Description: Terre avec lac à vendre, rare sur le marché. Possibilité d'y faire un beau domaine.


	Row: 25
		Number of rooms: None
            	Number of bedrooms: None
            	Number of bathrooms: None
		Year built: None
            	Potential gross revenue: None
            	Zoning: Resort, Residential
            	Additional features: Located on a lake, Navigable body of water
            	Use of property: None
            	Building area (at ground level): None
            	Available area: None
          

	Row: 36
		Number of rooms: 17 rooms
            	Number of bedrooms: 3 bedrooms
            	Number of bathrooms: 1 bathroom and 2 powder rooms
		Year built: Unknown age
            	Potential gross revenue: None
            	Zoning: None
            	Additional features: Located on a pond, Basement 6 feet or +
            	Use of property: None
            	Building area (at ground level): None
            	Available area: None
            	Lot area: 4,057,990 sqft

		Description: Fermette 93 acres, érablière(non-exploitée), prairie,  boisé. Ecurie avec 8 boxs et 2° pour entreposage. Garage pour machinerie. Maison plein sud en façade, ensoleillée, solarium avec SPA, belle fenestration, vue panoramique. Garage attaché. Lac artificiel et 3 ruisseaux. Situé dans cul-de-sac. La campagne à 12 min. de Sherbrooke et des services.


	Row: 37
		Number of rooms: 8 rooms
            	Number of bedrooms: 3 bedrooms
            	Number of bathrooms: 1 bathroom
		Year built: 1989
            	Pote

Unnamed: 0,Category,Price,PPSF,PPSM,Street,City,URL,Rooms,Bedrooms,Bathrooms,Lot area,Year built,Additional features,Potential gross revenue,Zoning,Description,Use of property,Available area,Building area (at ground level)
0,Lot,280000.0,,,Chemin Édouard-Roy,Sherbrooke (Brompton/Rock Forest/Saint-Élie/De...,https://www.centris.ca/en/lots~for-sale~sherbr...,,,,"1,873,080 sqft",,Located on a stream,,Residential,"Rareté ,+- 43 acres situé à quelques minutes d...",,,
1,Lot,1695000.0,,,"108, Chemin de Gray Valley",Huberdeau,https://www.centris.ca/en/lots~for-sale~huberd...,,,,"8,668,431 sqft",,"Located on a river, Non-navigable body of water",,"Other, Forest, Residential, Agricultural",GRAY VALLEY - Remarquable property of over 223...,,,
2,Hobby farm,2399000.0,,,"21, Rang des Vents",Brébeuf,https://www.centris.ca/en/hobby-farms~for-sale...,12 rooms,3 bedrooms,4 bathrooms,"3,907,403 sqft",Unknown age,Located on a river,,,"Fully renovated, this unique contemporary farm...",,,
3,Lot,,,399900.0,"60, Chemin Grant",Bouchette,https://www.centris.ca/en/lots~for-sale~bouche...,,,,"1,374,299 sqft",,"Located on a lake, Navigable body of water",,Residential,A wonderful piece of land on the majestic 31 M...,,,
4,Cottage,1749000.0,,,"117, Chemin des Érables",Frelighsburg,https://www.centris.ca/en/cottages~for-sale~fr...,13 rooms,3 bedrooms,2 bathrooms and 1 powder room,"2,073,456 sqft",Unknown age,"Located on a pond, Non-navigable body of water",,,"Follow your dream, Country Home. Frelighsburg ...",,,
5,Hobby farm,199000.0,,,"30, Chemin du Portage",Rivière-Héva,https://www.centris.ca/en/hobby-farms~for-sale...,10 rooms,2 bedrooms,1 bathroom,"6,048,135 sqft",1954,Located on a river,,,"Fermette de 138.8 acres, accès à la rivière Ca...",,,
6,Lot,689000.0,,,Côte des Intrépides,Mont-Tremblant,https://www.centris.ca/en/lots~for-sale~mont-t...,,,,"4,530,240 sqft",,"Resort, Located on a lake, Navigable body of w...",,"Resort, Residential",104 acres. 261.58 m of frontage on the edge of...,,,
7,Land,650000.0,,,Chemin Saint-Cyr Sud,Wentworth-Nord,https://www.centris.ca/en/land~for-sale~wentwo...,,,,"2,743,914 sqft",,"Located on a lake, Non-navigable body of water",,Residential,Magnificent 63-acre waterfront lot on the pres...,,,
8,Land,3200000.0,,,Route 148,Grenville,https://www.centris.ca/en/land~for-sale~grenvi...,,,,"12,022,560 sqft",,"Located on a river, Navigable body of water",,"Recreational and tourism, Residential",An exceptional 276-acre site awaits you to des...,,,
9,Land,139000.0,,,Chemin Owl's Landing,L'Isle-aux-Allumettes,https://www.centris.ca/en/land~for-sale~l-isle...,,,,"3,049,200 sqft",,Located on a river,,Residential,70 Acres with 477 Feet of Waterfront on the Ot...,,,
