### Extract Information from Websites (use any 100 websites)
#### Objective:
Develop a solution to extract the following information from a list of websites:

Social Media Links,
Tech Stack (MVC, CMS, JS type etc),
Meta Title,
Meta Description,
Payment Gateways (e.g., PayPal, Stripe, Razorpay),
Website language,
Category of website.

Author : Anuroop Arya

In [1]:
!pip install mysql-connector-python
!pip install requests beautifulsoup4
!pip install langdetect



In [2]:
import mysql.connector
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
from langdetect import detect, DetectorFactory

# To Ensure deterministic results
DetectorFactory.seed = 0

# Function : Fetch HTML content from a URL
def fetch_html(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    return response.text

# Function: Extract social media links
def extract_social_media_links(soup):
    social_media_links = {}

    social_media_patterns = {
        'facebook': r"facebook.com/[\w\d.]+",
        'twitter': r"twitter.com/[\w\d.]+",
        'linkedin': r"linkedin.com/[\w\d.-]+"
    }

    for platform, pattern in social_media_patterns.items():
        links = soup.find_all("a", href=re.compile(pattern, re.IGNORECASE))
        if links:
            social_media_links[platform] = [link['href'] for link in links]

    return social_media_links

# Function : Extract tech stack information
def extract_tech_stack(soup):
    tech_stack = []

    tech_stack_patterns = {
        'MVC': r"(mvc|model-view-controller)",
        'CMS': r"(cms|content management system|wordpress|joomla|drupal)",
        'JavaScript': r"(javascript|js)"
    }

    for tech, pattern in tech_stack_patterns.items():
        found = soup.find(string=re.compile(pattern, re.IGNORECASE))
        if found:
            tech_stack.append(tech)

    return ', '.join(tech_stack)

# Function : Extract meta tags
def extract_meta_tags(soup):
    meta_tags = {}

    title = soup.find('title')
    meta_tags['title'] = title.text.strip() if title else ''

    description = soup.find('meta', attrs={'name': 'description'})
    meta_tags['description'] = description['content'].strip() if description and 'content' in description.attrs else ''

    return meta_tags

# Function : Extract payment gateways
def extract_payment_gateways(soup):
    payment_gateways = []

    payment_gateway_patterns = {
        'PayPal': r"paypal.com",
        'Stripe': r"stripe.com",
        'Razorpay': r"razorpay.com"
    }

    for gateway, pattern in payment_gateway_patterns.items():
        found = soup.find("a", href=re.compile(pattern, re.IGNORECASE))
        if found:
            payment_gateways.append(gateway)

    return ', '.join(payment_gateways)

# Function : Extract website language
def extract_website_language(soup):
    texts = soup.stripped_strings
    text_content = ' '.join(texts)

    try:
        language = detect(text_content)
    except:
        language = 'unknown'

    return language

# Function : Extract website category
def extract_website_category(soup):
    category_keywords = {
        'News': ['news', 'journal', 'media', 'bbc', 'cnn', 'reuters', 'nytimes'],
        'E-commerce': ['shop', 'store', 'buy', 'cart', 'product', 'sale', 'ebay', 'amazon', 'etsy', 'alibaba'],
        'Social': ['social', 'network', 'community', 'facebook', 'twitter', 'linkedin', 'instagram'],
        'Technology': ['tech', 'software', 'hardware', 'gadgets', 'github', 'stackoverflow', 'digitalocean'],
        'Education': ['education', 'learning', 'school', 'university', 'college', 'udemy', 'coursera'],
        'Finance': ['finance', 'bank', 'investment', 'money', 'loan', 'bankofamerica', 'chase', 'wellsfargo'],
        'Health': ['health', 'medical', 'doctor', 'clinic', 'hospital', 'webmd', 'mayoclinic'],
        'Entertainment': ['entertainment', 'movies', 'music', 'games', 'imdb', 'rottentomatoes', 'metacritic'],
        'Real Estate': ['real estate', 'property', 'homes', 'rent', 'buy', 'zillow', 'realtor'],
        'Travel': ['travel', 'trip', 'flight', 'hotel', 'expedia', 'airbnb', 'tripadvisor'],
        'Sports': ['sports', 'football', 'basketball', 'soccer', 'nba', 'nfl', 'espn'],
        'Food': ['food', 'recipe', 'cooking', 'restaurant', 'yelp', 'opentable'],
        'Automotive': ['automotive', 'car', 'vehicle', 'auto', 'cars.com', 'autotrader'],
        'Government': ['government', 'official', 'politics', 'whitehouse', 'gov'],
        'Fashion': ['fashion', 'clothing', 'style', 'shopbop', 'farfetch', 'asos'],
        'Art & Design': ['art', 'design', 'creative', 'behance', 'dribbble'],
        'Pets': ['pets', 'animals', 'dog', 'cat', 'petfinder', 'aspca'],
        'Books & Literature': ['books', 'literature', 'read', 'goodreads', 'bookbub'],
        'Science': ['science', 'research', 'scientific', 'nature', 'sciencedirect'],
        'Music': ['music', 'songs', 'band', 'spotify', 'soundcloud'],
        'Environment': ['environment', 'climate', 'green', 'earth', 'nationalgeographic'],
        'Fitness': ['fitness', 'exercise', 'workout', 'fitbit', 'myfitnesspal'],
        'Job Search': ['job', 'career', 'employment', 'linkedin', 'indeed', 'glassdoor'],
        'Art & Culture': ['art', 'culture', 'museum', 'heritage', 'artsy'],
        'DIY & Crafts': ['diy', 'crafts', 'handmade', 'instructables', 'craftsy'],
        'Lifestyle': ['lifestyle', 'wellness', 'selfcare', 'mindfulness', 'thriveglobal'],
        'Legal': ['legal', 'law', 'attorney', 'legalzoom', 'findlaw'],
        'Religion': ['religion', 'faith', 'spirituality', 'church', 'bible'],
        'Personal Finance': ['personal finance', 'money management', 'budgeting', 'mint', 'personalfinance'],
        'Photography': ['photography', 'photo', 'picture', 'flickr', '500px'],
        'Gaming': ['gaming', 'video games', 'gamer', 'ign', 'gamespot'],
        'Cooking': ['cooking', 'recipes', 'food', 'chef', 'foodnetwork'],
        'Tech Support': ['tech support', 'it support', 'computer help', 'geek squad', 'techsupport'],
        'Weather': ['weather', 'forecast', 'meteorology', 'accuweather', 'weather.com'],
        'Charity & Causes': ['charity', 'nonprofit', 'donate', 'charitynavigator', 'gofundme'],
        'Home & Garden': ['home', 'garden', 'house', 'gardening', 'hgtv'],
        'Outdoor Activities': ['outdoor', 'camping', 'hiking', 'rei'],
        'Business': ['business', 'entrepreneurship', 'startup', 'forbes'],
        'Politics': ['politics', 'government', 'political', 'election', 'politico'],
        'History': ['history', 'historical', 'historian', 'history.com'],
        'Home Improvement': ['home improvement', 'diy', 'interior design', 'lowe\'s'],
        'Insurance': ['insurance', 'health insurance', 'auto insurance', 'geico'],
        'Entertainment News': ['entertainment news', 'celebrity news', 'tmz', 'eonline'],
        'Medical': ['medical', 'healthcare', 'hospital', 'medscape'],
        'Fitness & Exercise': ['fitness', 'exercise', 'workout', 'fitnessmagazine'],
        'Technology & Gadgets': ['technology', 'gadgets', 'tech', 'engadget'],
        'Pets & Animals': ['pets', 'animals', 'dog', 'cat', 'petfinder'],
        'Art & Design': ['art', 'design', 'creative', 'artstation', 'dribbble'],
        'Books & Reading': ['books', 'reading', 'literature', 'goodreads', 'bookbub'],
        'Fashion & Style': ['fashion', 'style', 'clothing', 'fashionista'],
        'Food & Cooking': ['food', 'cooking', 'recipes', 'epicurious'],
        'Automotive & Vehicles': ['automotive', 'vehicles', 'car', 'truck', 'autotrader'],
        'Travel & Tourism': ['travel', 'tourism', 'vacation', 'tripadvisor'],
        'Sports & Fitness': ['sports', 'fitness', 'athletics', 'runnersworld'],
        'Music & Entertainment': ['music', 'entertainment', 'movies', 'spotify'],
        'Education & Learning': ['education', 'learning', 'school', 'college'],
        'Tech & Gadgets': ['tech', 'gadgets', 'technology', 'hardware', 'software'],
        'Health & Wellness': ['health', 'wellness', 'medical', 'fitness'],
        'Home & Decor': ['home', 'decor', 'interior design', 'apartment therapy'],
        'Arts & Culture': ['arts', 'culture', 'museum', 'theatre'],
        'Business & Finance': ['business', 'finance', 'investment', 'entrepreneurship'],
        'Science & Nature': ['science', 'nature', 'biology', 'environment'],
        'Family & Parenting': ['family', 'parenting', 'kids', 'children'],
        'Hobbies & Interests': ['hobbies', 'interests', 'crafts', 'diy'],
        'News & Media': ['news', 'media', 'press', 'journalism']
    }
    texts = ' '.join(soup.stripped_strings).lower()
    for category, keywords in category_keywords.items():
        for keyword in keywords:
            if keyword in texts:
                return category

    return 'unknown'


# Function : Insert data into MySQL database
def insert_into_database(url, social_media_links, tech_stack, meta_title, meta_description, payment_gateways, website_language, website_category, cursor):
    try:
        sql = "INSERT INTO websites (url, social_media_links, tech_stack, meta_title, meta_description, payment_gateways, website_language, website_category) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"
        values = (
            url, 
            ', '.join([f"{k}: {', '.join(v)}" for k, v in social_media_links.items()]) if social_media_links else '',
            tech_stack,
            meta_title,
            meta_description,
            payment_gateways,
            website_language,
            website_category
        )
        cursor.execute(sql, values)
        print(f"Data inserted successfully for {url}")

    except mysql.connector.Error as error:
        print(f"Failed to insert data into MySQL table: {error}")

# List of Websites to scrape
urls = [
    "https://www.google.com", "https://www.facebook.com", "https://www.youtube.com",
    "https://www.amazon.com", "https://www.wikipedia.org", "https://www.reddit.com",
    "https://www.twitter.com", "https://www.linkedin.com", "https://www.instagram.com",
    "https://www.pinterest.com", "https://www.netflix.com", "https://www.apple.com",
    "https://www.microsoft.com", "https://www.bbc.com/news", "https://www.cnn.com",
    "https://www.nytimes.com", "https://www.theguardian.com", "https://www.huffpost.com",
    "https://www.foxnews.com", "https://www.nbcnews.com", "https://www.ebay.com",
    "https://www.bing.com", "https://www.yahoo.com", "https://www.quora.com",
    "https://www.tumblr.com", "https://www.medium.com", "https://www.vimeo.com",
    "https://www.dailymotion.com", "https://www.soundcloud.com", "https://www.spotify.com",
    "https://www.imdb.com", "https://www.rottentomatoes.com", "https://www.metacritic.com",
    "https://www.tripadvisor.com", "https://www.expedia.com", "https://www.kayak.com",
    "https://www.airbnb.com", "https://www.booking.com", "https://www.zillow.com",
    "https://www.realtor.com", "https://www.walmart.com", "https://www.target.com",
    "https://www.bestbuy.com", "https://www.lowes.com", "https://www.etsy.com",
    "https://www.shopify.com", "https://www.alibaba.com", "https://www.udemy.com",
    "https://www.coursera.org", "https://www.edx.org", "https://www.khanacademy.org",
    "https://www.duolingo.com", "https://www.codeacademy.com", "https://www.stackoverflow.com",
    "https://www.github.com", "https://www.bitbucket.org", "https://www.digitalocean.com",
    "https://www.vercel.com", "https://www.cloudflare.com", "https://www.slack.com",
    "https://www.zoom.us", "https://www.skype.com", "https://www.netlify.com",
    "https://www.discord.com", "https://www.trello.com", "https://www.asana.com",
    "https://www.jira.com", "https://www.dropbox.com", "https://www.box.com",
    "https://www.googledrive.com", "https://www.onedrive.com", "https://www.heroku.com",
    "https://www.icloud.com", "https://www.paypal.com", "https://www.stripe.com",
    "https://www.venmo.com", "https://www.squareup.com", "https://www.coinbase.com",
    "https://www.robinhood.com", "https://www.justwatch.com", "https://www.wealthfront.com",
    "https://www.acorns.com", "https://www.betterment.com", "https://www.mint.com",
    "https://www.personalcapital.com", "https://www.bankofamerica.com", "https://www.chase.com",
    "https://www.wellsfargo.com", "https://www.citibank.com", "https://www.capitalone.com",
    "https://www.americanexpress.com", "https://www.discover.com", "https://www.synchrony.com","https://netlify.com",
    "https://toffeeshare.com", "https://stackoverflow.com","https://github.com", "https://bitbucket.or","https://digitalocean.com",
]

# Checking the number of URLs
print(len(urls))


# Database connection
db_config = {
    'user': 'Anuroop',
    'password': 'Aryaanurag1@2',
    'host': 'localhost',
    'database': 'website_info',
    'port': '3306'
}

try:
    connection = mysql.connector.connect(**db_config)
    cursor = connection.cursor()

    # Clean up existing data
    delete_query = "DELETE FROM websites"
    cursor.execute(delete_query)
    print("Existing data deleted successfully.")

    # Insert new data
    for url in urls:
        try:
            html_content = fetch_html(url)
            soup = BeautifulSoup(html_content, 'html.parser')

            social_media_links = extract_social_media_links(soup)
            tech_stack = extract_tech_stack(soup)
            meta_tags = extract_meta_tags(soup)
            meta_title = meta_tags['title']
            meta_description = meta_tags['description']
            payment_gateways = extract_payment_gateways(soup)
            website_language = extract_website_language(soup)
            website_category = extract_website_category(soup)

            insert_into_database(url, social_media_links, tech_stack, meta_title, meta_description, payment_gateways, website_language, website_category, cursor)

        except Exception as e:
            print(f"Failed to process {url}: {str(e)}")

    connection.commit()
    print("Data insertion completed.")

except mysql.connector.Error as error:
    print(f"Error connecting to MySQL: {error}")

finally:
    if 'connection' in locals() and connection.is_connected():
        cursor.close()
        connection.close()
        print("MySQL connection is closed.")


99
Existing data deleted successfully.
Data inserted successfully for https://www.google.com
Data inserted successfully for https://www.facebook.com
Data inserted successfully for https://www.youtube.com
Data inserted successfully for https://www.amazon.com
Data inserted successfully for https://www.wikipedia.org
Data inserted successfully for https://www.reddit.com
Data inserted successfully for https://www.twitter.com
Data inserted successfully for https://www.linkedin.com
Data inserted successfully for https://www.instagram.com
Data inserted successfully for https://www.pinterest.com
Data inserted successfully for https://www.netflix.com
Data inserted successfully for https://www.apple.com
Data inserted successfully for https://www.microsoft.com
Data inserted successfully for https://www.bbc.com/news
Data inserted successfully for https://www.cnn.com
Data inserted successfully for https://www.nytimes.com
Data inserted successfully for https://www.theguardian.com
Data inserted succes

In [3]:
import mysql.connector

# Database connection parameters
db_config = {
    'user': 'XXXXXX',                  # Replace with your database username
    'password': 'XXXXXXXX',        # Replace with your database password
    'host': 'localhost',                # Replace with your database host
    'database': 'website_info',         # Replace with your database name
    'port': '3306'                      # Replace with your database port
}

# Connecting to the MySQL database
try:
    connection = mysql.connector.connect(**db_config)
    if connection.is_connected():
        print('Connected to MySQL database')
except mysql.connector.Error as error:
    print(f'Error: {error}')

# Creating a cursor object
cursor = connection.cursor()

# Query to fetch all rows from the websites table
query = "SELECT * FROM websites"
cursor.execute(query)

# Fetching all the rows
records = cursor.fetchall()

# Display fetched records
for record in records:
    print(record)

# Close cursor and connection
cursor.close()
connection.close()

Connected to MySQL database
(1061, 'https://www.google.com', '', 'JavaScript', 'Google', '', '', 'en', 'Gaming')
(1062, 'https://www.facebook.com', 'facebook: https://www.facebook.com/recover/initiate?lwv=110&ars=royal_blue_bar, https://www.facebook.com/watch/, https://l.facebook.com/l.php?u=https%3A%2F%2Fwww.instagram.com%2F&h=AT3T9Jadiadc6cRQHP70S9Q5HqEHn24JZ08vtmTphRiJVCA_edcb8nkOacHQJgttA7BvHUbKU2cXy1EdgdGS1nNG1vlKrZo-oL6JTbtdcKFq68Am82TvTHypGE_8Wg_T5HX9OoWhm-uX791xIJa5j8ccAHLAxFs2nn7SwA, https://www.facebook.com/help/568137493302217', 'JavaScript', 'Facebook', '', '', 'hi', 'Social')
(1063, 'https://www.youtube.com', '', 'JavaScript', 'YouTube', 'Enjoy the videos and music you love, upload original content, and share it all with friends, family, and the world on YouTube.', '', 'en', 'News & Media')
(1064, 'https://www.amazon.com', '', 'JavaScript', 'Amazon.com', '', '', 'en', 'E-commerce')
(1065, 'https://www.wikipedia.org', '', 'JavaScript', 'Wikipedia', 'Wikipedia is a free onli

In [4]:
import pandas as pd

# Convert records to DataFrame
df = pd.DataFrame(records, columns=['id', 'url', 'social_media_links', 'tech_stack', 'meta_title', 'meta_description', 'payment_gateways', 'website_language', 'website_category'])

df

Unnamed: 0,id,url,social_media_links,tech_stack,meta_title,meta_description,payment_gateways,website_language,website_category
0,1061,https://www.google.com,,JavaScript,Google,,,en,Gaming
1,1062,https://www.facebook.com,facebook: https://www.facebook.com/recover/ini...,JavaScript,Facebook,,,hi,Social
2,1063,https://www.youtube.com,,JavaScript,YouTube,"Enjoy the videos and music you love, upload or...",,en,News & Media
3,1064,https://www.amazon.com,,JavaScript,Amazon.com,,,en,E-commerce
4,1065,https://www.wikipedia.org,,JavaScript,Wikipedia,"Wikipedia is a free online encyclopedia, creat...",,pt,News
...,...,...,...,...,...,...,...,...,...
93,1154,https://netlify.com,"twitter: https://twitter.com/netlify, linkedin...","CMS, JavaScript",Scale & Ship Faster with a Composable Web Arch...,"Realize the speed, agility and performance of ...",,en,News
94,1155,https://toffeeshare.com,,JavaScript,"Share files privately, fast and without size l...",Send your files privately and fast. No file si...,,en,E-commerce
95,1156,https://stackoverflow.com,facebook: https://www.facebook.com/officialsta...,JavaScript,"Stack Overflow - Where Developers Learn, Share...","Stack Overflow is the largest, most trusted on...",,en,E-commerce
96,1157,https://github.com,"facebook: https://www.facebook.com/GitHub, lin...",JavaScript,GitHub: Let’s build from here · GitHub,GitHub is where over 100 million developers sh...,,en,News
