In [15]:
import random
import re
import time
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Function to scrape the link
def scrapelink(link, timeout=15):
    """Fetch ``link`` and parse the response body as HTML.

    Args:
        link: URL to request.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the calling loop.

    Returns:
        A ``BeautifulSoup`` object on success, the string ``'rate_limit'``
        when the server answers with HTTP 429, or ``None`` for any other
        request failure (the error is printed).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    try:
        response = requests.get(link, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Take the status from the exception rather than a local variable:
        # when the request fails before any response exists (connection
        # error, timeout) there is nothing local to inspect.
        failed_response = getattr(e, 'response', None)
        # Use getattr instead of a truthiness test, since an error response
        # object may evaluate as false.
        if getattr(failed_response, 'status_code', None) == 429:
            print(f"Rate limit hit for URL: {link} | Switching search engine...")
            return 'rate_limit'
        print(f"Error fetching the URL: {link} | Error: {e}")
        return None
    return BeautifulSoup(response.text, 'html.parser')

# Function to extract contact information from the soup
def extract_contact_info(soup):
    """Pull the first e-mail address out of a parsed page.

    Args:
        soup: Parsed page exposing its visible text as ``.text``. ``None``
            and the ``'rate_limit'`` sentinel are accepted and yield an
            empty result.

    Returns:
        A dict with keys ``'email'`` and ``'phone'``. ``'email'`` holds the
        first address found in the page text, or ``None``. ``'phone'`` is
        always ``None`` here; this function does not look for phone numbers.
    """
    contact_info = {
        'email': None,
        'phone': None
    }

    if soup is None or soup == 'rate_limit':
        return contact_info

    try:
        # The TLD is letters only and has no upper length bound, so long
        # TLDs such as ".technology" are kept whole instead of being cut
        # off after four characters.
        email_regex = r'[\w.-]+@[\w.-]+\.[A-Za-z]{2,}'
        first_match = re.search(email_regex, soup.text)
        if first_match:
            contact_info['email'] = first_match.group(0)

    except Exception as e:
        # Best effort: a page that cannot be read must not stop the run.
        print(f"Error extracting contact info: {e}")

    return contact_info

# Load your DataFrame
df_exportadores = pd.read_csv('/home/luisvinatea/Data/Gdrive/aquaculture/beraqua/docs/reports/agrupacion_exportadores.csv')  # Assuming you have a CSV file

# Append new columns to store the scraped data if they don't already exist
for column in ['email', 'phone']:
    if column not in df_exportadores.columns:
        df_exportadores[column] = None
    # A column that was empty in the CSV is loaded as a numeric dtype; keep
    # it as object so scraped strings can be written into it with .at.
    df_exportadores[column] = df_exportadores[column].astype(object)

# Search-result URL templates; the URL-encoded keyword replaces "{}".
search_engines = [
    'https://www.google.com/search?q={}',
    'https://search.yahoo.com/search?p={}',
    'https://www.bing.com/search?q={}',
    'https://duckduckgo.com/?q={}']

# True while an engine is usable; flipped to False once it answers HTTP 429.
search_engine_status = {engine: True for engine in search_engines}

# Fill the email column first: one search per company that has no email yet.
for index, row in df_exportadores.iterrows():
    if pd.notna(row['email']):
        # Skip rows where email is already present
        continue

    company = row['probable_exportador']
    if not isinstance(company, str) or not company.strip():
        # A missing or blank company name gives nothing to search for.
        continue

    if not any(search_engine_status.values()):
        # No engine left to query: stop instead of sleeping through every
        # remaining row without making a request.
        print("All search engines are rate limited; stopping email scraping early.")
        break

    keyword = company.replace('_', ' ')

    try:
        for search_engine in search_engines:
            if not search_engine_status[search_engine]:
                # Skip search engines that have hit the rate limit
                continue

            # URL-encode the keyword so spaces and punctuation in company
            # names produce a valid query string.
            search_link = search_engine.format(quote_plus(keyword))
            soup = scrapelink(search_link)

            if soup == 'rate_limit':
                search_engine_status[search_engine] = False
                continue

            if soup:
                contact_info = extract_contact_info(soup)
                if contact_info['email']:
                    # Update DataFrame with the scraped email
                    df_exportadores.at[index, 'email'] = contact_info['email']
                    break

        # Sleep to avoid being blocked by search engines
        time.sleep(random.uniform(10, 15))
    except Exception as e:
        print(f"Error processing keyword '{keyword}': {e}")

# Save the updated DataFrame to a new CSV file
# NOTE(review): this path is relative to the current working directory, while
# the input CSV is read from an absolute path; confirm the file lands where
# later steps expect it.
df_exportadores.to_csv('exportadores_updated.csv', index=False)

print("Scraping completed and data saved to 'exportadores_updated.csv'")

# Now use the filled dataset to search for phone numbers

# Matches a 9-digit mobile number starting with 9, optionally preceded by the
# +593 country code or a single leading 0, with optional space/dot/dash
# separators. The look-arounds stop it matching inside a longer digit run.
# The prefix group is non-capturing so re.findall returns whole matches.
phone_regex = r'(?<!\d)(?:\+?593[\s.-]?|0)?\(?9\d\)?[\s.-]?\d{3}[\s.-]?\d{4}(?!\d)'

for index, row in df_exportadores.iterrows():
    if pd.isna(row['email']) or pd.notna(row['phone']):
        # Skip rows where email is missing or phone is already present
        continue

    if not any(search_engine_status.values()):
        # No engine left to query: stop instead of sleeping through every
        # remaining row without making a request.
        print("All search engines are rate limited; stopping phone scraping early.")
        break

    keyword = row['email']

    try:
        success = False

        for search_engine in search_engines:
            if not search_engine_status[search_engine]:
                # Skip search engines that have hit the rate limit
                continue

            # URL-encode the e-mail so "@" and other reserved characters
            # produce a valid query string.
            search_link = search_engine.format(quote_plus(str(keyword)))
            soup = scrapelink(search_link)

            if soup == 'rate_limit':
                search_engine_status[search_engine] = False
                continue

            if soup:
                for phone in re.findall(phone_regex, soup.text):
                    phone_numbers_only = re.sub(r'\D', '', phone)
                    # Drop the country code or the leading 0 so every stored
                    # number has the same 9-digit form.
                    if len(phone_numbers_only) == 12 and phone_numbers_only.startswith('593'):
                        phone_numbers_only = phone_numbers_only[3:]
                    elif len(phone_numbers_only) == 10 and phone_numbers_only.startswith('0'):
                        phone_numbers_only = phone_numbers_only[1:]
                    # Keep only numbers that start with 9 and have 9 digits.
                    if len(phone_numbers_only) == 9 and phone_numbers_only.startswith('9'):
                        df_exportadores.at[index, 'phone'] = phone_numbers_only
                        success = True
                        break

            if success:
                break

        # Sleep to avoid being blocked by search engines
        time.sleep(random.uniform(10, 15))
    except Exception as e:
        print(f"Error processing keyword '{keyword}': {e}")

# Save the updated DataFrame to a new CSV file
df_exportadores.to_csv('exportadores_updated.csv', index=False)

print("Phone scraping completed and data saved to 'exportadores_updated.csv'")


Rate limit hit for URL: https://www.google.com/search?q=industrial pesquera santa priscila s.a. | Switching search engine...
Scraping completed and data saved to 'exportadores_updated.csv'
Phone scraping completed and data saved to 'exportadores_updated.csv'


In [16]:
# The same file is read, cleaned and written back in place.
scraped_csv_path = '/home/luisvinatea/Data/Gdrive/aquaculture/beraqua/docs/reports/exportadores_updated.csv'


def _strip_if_text(value):
    """Return ``value`` without surrounding whitespace if it is a string, else unchanged."""
    return value.strip() if isinstance(value, str) else value


# Trim surrounding whitespace from every string cell; non-string values
# (numbers, NaN) pass through untouched.
df_scraped = pd.read_csv(scraped_csv_path).map(_strip_if_text)

print(df_scraped.head())
print(df_scraped.shape)

df_scraped.to_csv(scraped_csv_path, index=False)

# Per-column count of missing values, keeping only columns that have any,
# largest count first.
null_counts = df_scraped.isnull().sum()
missing_values = null_counts[null_counts > 0].sort_values(ascending=False)

print(missing_values)


                                 probable_exportador  valor_total_exportado  \
0            industrial_pesquera_santa_priscila_s.a.           8.459977e+09   
1  operadora_y_procesadora_de_productos_marinos_o...           4.762433e+09   
2                sociedad_nacional_de_galapagos_c.a.           3.024318e+09   
3                expalsa_exportadora_de_alimentos_sa           2.203737e+09   
4                                    promarisco_s.a.           1.613648e+09   

   peso_total_mercancia  cantidad_total_unidades pais_mas_frecuente producto  \
0          9.120026e+08               1147696108              china  camaron   
1          5.943874e+08                722805761              china  camaron   
2          4.561799e+08                529295744      united_states  camaron   
3          3.576516e+08                482192051              china  camaron   
4          2.563640e+08                284210200              spain  camaron   

  email  phone  
0   NaN    NaN  
1   NaN   