## Required Libraries 

In [1]:
import time
import warnings
from urllib.parse import urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

## Webscraper Class Creation

In [2]:
class Webscraper():
    """Collect social-media profile links from a list of websites.

    Typical use: build the object with the URLs to visit, call
    ``scrap_smm_info_from_pages`` to download each page and collect the
    social-media links found in its ``<body>``, then pass the result to
    ``create_df_cleaned_from_smm`` to get one row per website with one
    column per platform.

    Attributes:
        data: Initialised to ``None``; no method of this class reads or writes it.
        urls: The URLs to visit, copied into a list.
        smm_dict: Result of the most recent scrape, or ``None`` before the first one.
        request_delay: Seconds to wait between two consecutive page requests.
        timeout: Per-request timeout in seconds passed to ``requests.get``.
    """

    # Output column name -> domains that belong to that platform. The dict
    # order is also the column order of the DataFrame built below.
    PLATFORM_DOMAINS = {
        'Facebook': ('facebook.com',),
        'Instagram': ('instagram.com',),
        'LinkedIn': ('linkedin.com',),
        'Twitter': ('twitter.com', 'x.com'),
    }

    def __init__(self, urls, request_delay=5, timeout=8):
        """Store the URLs to scrape and the request pacing settings.

        Args:
            urls: Iterable of website URLs.
            request_delay: Seconds to sleep between two page requests
                (``0`` disables the pause).
            timeout: Seconds to wait for each HTTP response.
        """
        self.data = None
        self.urls = list(urls)
        self.smm_dict = None
        self.request_delay = request_delay
        self.timeout = timeout

    @classmethod
    def _platform_for_link(cls, href):
        """Return the platform name whose domain hosts ``href``, or ``None``.

        The decision is made on the parsed hostname rather than on a substring
        of the whole link, so ``https://www.wix.com/...`` is not mistaken for
        ``x.com`` and a platform name inside a query string is ignored.
        """
        if not isinstance(href, str):
            return None
        candidate = href.strip()
        try:
            parsed = urlparse(candidate)
            if not parsed.scheme and not parsed.netloc:
                # Scheme-less links such as "www.facebook.com/page" are parsed
                # as a bare path; re-parse them as network-path references.
                parsed = urlparse('//' + candidate)
            host = parsed.hostname
        except ValueError:
            # urlparse rejects some malformed authorities (e.g. bad IPv6 brackets).
            return None
        if not host:
            return None
        host = host.lower()
        for platform, domains in cls.PLATFORM_DOMAINS.items():
            for domain in domains:
                if host == domain or host.endswith('.' + domain):
                    return platform
        return None

    @classmethod
    def _extract_social_links(cls, body):
        """Return the social-media hrefs found in ``body``, de-duplicated in page order.

        ``body`` is the parsed ``<body>`` element, or ``None`` when the page has none.
        """
        if body is None:
            return []
        hrefs = (anchor.get('href') for anchor in body.find_all('a'))
        # dict.fromkeys removes duplicates while keeping first-seen order, so
        # the link later chosen for each platform is reproducible.
        return list(dict.fromkeys(h for h in hrefs if h and cls._platform_for_link(h)))

    def scrap_smm_info_from_pages(self):
        """Download every URL in ``self.urls`` and collect its social-media links.

        Pages that cannot be fetched or parsed are reported with ``print`` and
        skipped. The result is also stored on ``self.smm_dict``.

        Returns:
            dict mapping each lower-cased URL that had at least one
            social-media link to the list of those links.
        """
        ua = UserAgent()
        request_headers = {'User-Agent': ua.chrome}
        smm_dict = {}

        # NOTE(review): the whole URL is lower-cased, as before, so the result
        # keys stay unchanged; this also lower-cases the path and query.
        websites_to_scrap = []
        for raw_url in self.urls:
            if isinstance(raw_url, str) and raw_url.strip():
                websites_to_scrap.append(raw_url.strip().lower())
            else:
                # e.g. NaN coming from an empty CSV cell.
                print(f'Skipping {raw_url!r}: it is not a URL string.')

        # Silence warnings (verify=False triggers one per request) only while
        # scraping, instead of disabling them for the rest of the session.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            for position, smm_find in enumerate(websites_to_scrap):
                # Pause between page requests, not once per matched link.
                if position and self.request_delay and self.request_delay > 0:
                    time.sleep(self.request_delay)

                try:
                    # SECURITY: verify=False disables TLS certificate checks,
                    # so responses could be tampered with in transit.
                    response = requests.get(
                        smm_find,
                        timeout=self.timeout,
                        headers=request_headers,
                        verify=False,
                    )
                    soup = BeautifulSoup(response.text, 'html.parser')
                    smm_links = self._extract_social_links(soup.find('body'))
                except Exception as e:
                    # Deliberate best-effort: one bad site must not stop the run.
                    print(f'Hi, {smm_find} seems to be invalid, we received the error {e}, we are going to the next one!')
                    continue

                if smm_links:
                    smm_dict[smm_find] = smm_links
                    print(f'We found social media for {smm_find}')
                else:
                    print(f'No Social Media Accounts were found for {smm_find}.')

        self.smm_dict = smm_dict
        return smm_dict

    def create_df_cleaned_from_smm(self, smm_dict):
        """Turn a scrape result into a table with one column per platform.

        Args:
            smm_dict: Mapping of website URL to a list of social-media links,
                as returned by ``scrap_smm_info_from_pages``.

        Returns:
            pandas.DataFrame with the columns ``Website``, ``Facebook``,
            ``Instagram``, ``LinkedIn`` and ``Twitter``. Each platform cell
            holds the first link in the website's list hosted on that platform
            (``x.com`` counts as Twitter), or a missing value when there is none.
        """
        df_of_smm_opt = pd.DataFrame(
            list(smm_dict.items()), columns=['Website', 'Social Media Links']
        )

        def first_link_for(links, platform):
            # First link in the list that is hosted on the given platform.
            return next(
                (link for link in links if self._platform_for_link(link) == platform),
                None,
            )

        for platform in self.PLATFORM_DOMAINS:
            df_of_smm_opt[platform] = df_of_smm_opt['Social Media Links'].apply(
                first_link_for, args=(platform,)
            )

        return df_of_smm_opt.drop(columns='Social Media Links')

## Example Data and Example Use

In [None]:
# Read the websites to visit from the `urls` column of the example CSV.
example_data_df = pd.read_csv('example_data.csv')
urls = example_data_df['urls'].tolist()

# Download every page and collect the social-media links found on it.
scrapper_object = Webscraper(urls=urls)
scrapping_websites_method = scrapper_object.scrap_smm_info_from_pages()

# Reshape the {website: [links]} mapping into one column per platform.
data_frame = scrapper_object.create_df_cleaned_from_smm(
    smm_dict=scrapping_websites_method
)


In [12]:
# Preview the result: shows up to the first 19 rows of the per-platform table.
data_frame.head(19)

Unnamed: 0,Website,Facebook,Instagram,LinkedIn,Twitter
0,http://familyfriendsvet.com,https://www.facebook.com/familyfriendsvet/,https://www.instagram.com/familyfriendsveterin...,,
1,http://www.westmichiganaeh.com,https://www.facebook.com/AEHGR,,,
2,http://willowrunvetservices.com,https://www.facebook.com/Willow-Run-Veterinary...,,,
3,http://beercitybread.com,https://www.facebook.com/BeerCityBread/,https://www.instagram.com/beercitybread,,
4,http://123gr.com,https://www.facebook.com/OneTwentyThreeGR/,https://www.instagram.com/onetwentythreegr/,,
5,http://oldgoatgr.com,https://www.facebook.com/OldGoatGR/,https://www.instagram.com/oldgoatgr/,,
6,http://thestraycafe.com,https://www.facebook.com/TheStrayGR/events/,https://www.instagram.com/thestraygr/,,
7,http://karkirestaurant.com,https://www.facebook.com/Karki-Restaurant-1057...,,,
8,http://chwinery.com,https://www.facebook.com/coopershawk,https://www.instagram.com/chwinery,,https://twitter.com/CHWinery
9,http://thegreenwell.com,https://www.facebook.com/thegreenwell,https://www.instagram.com/thegreenwell/,,
