# Elon Musk Tweet Scraper for PolitiTweet Data

In [None]:
pip install requests beautifulsoup4 pandas


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import time
import os

# Base URL for Elon Musk's tweets on PolitiTweet
# The `{}` placeholder is filled with the page number via `base_url.format(page)`.
base_url = 'https://polititweet.org/tweets?account=44196397&page={}'

# Initialize an empty list to store tweet data
# Module-level accumulator: scrape_all_pages() extends it with every page's rows,
# in addition to the rows being written to the CSV file.
tweets_data = []

# Function to parse tweet date
def parse_date(date_str):
    """Parse a PolitiTweet date label into a ``datetime``.

    Args:
        date_str: Label text in the form ``'Posted <Month> <day>, <year>'``,
            e.g. ``'Posted March 5, 2021'``. Leading/trailing whitespace is
            ignored.

    Returns:
        The parsed ``datetime`` (time fields are zero), or ``None`` when the
        input is not a string or does not match the expected format (for
        example the ``"Unknown"`` placeholder used when no date tag exists).
    """
    # Non-string input (e.g. None) would make strptime raise TypeError rather
    # than ValueError, so treat it as "no date" up front.
    if not isinstance(date_str, str):
        return None
    try:
        return datetime.strptime(date_str.strip(), 'Posted %B %d, %Y')
    except ValueError:
        return None

# Function to scrape a single page of tweets
def scrape_page(page):
    """Fetch one PolitiTweet listing page and convert its tweets to row dicts.

    Args:
        page: Page number substituted into ``base_url``.

    Returns:
        A list with one dict per tweet element that could be parsed (possibly
        empty), or ``None`` when the server answers with a non-200 status.

    Raises:
        requests.exceptions.RequestException: Propagated from ``requests.get``
            on connection problems or when the request times out.
    """
    url = base_url.format(page)
    # Without a timeout a stalled connection would block the run indefinitely;
    # a timeout surfaces as a RequestException the caller can retry on.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        print(f"Failed to retrieve page {page}. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    tweets = soup.find_all('div', class_='columns is-mobile')

    page_data = []
    for tweet in tweets:
        # Elements without a tweet-text paragraph are reported and skipped
        # explicitly instead of relying on a caught AttributeError.
        text_tag = tweet.find('p', class_='small-top-margin')
        if text_tag is None:
            print(f"Error processing tweet on page {page}: tweet text element not found")
            continue

        # Extract tweet text and remove additional tags or text
        tweet_text = text_tag.text.strip()
        # Keep only what precedes the first em dash (removes PolitiTweet.org text).
        # NOTE(review): this also truncates tweets that contain an em dash themselves.
        tweet_text = tweet_text.split('—')[0].strip()

        # Extract tweet date
        date_tag = tweet.find('span', class_='tag is-white is-paddingless')
        tweet_date_str = date_tag.text.strip() if date_tag else "Unknown"
        tweet_date = parse_date(tweet_date_str)

        # Append the tweet data to the list (key order defines the CSV column order)
        page_data.append({
            'id': None,  # ID not available on the website
            'user_name': 'Elon Musk',
            'user_location': '',
            'user_description': 'Mars & Cars, Chips & Dips',
            'user_created': '2009-06-02 20:12:29+00:00',
            'user_followers': '',  # Static or unavailable on PolitiTweet
            'user_friends': '',
            'user_favourites': '',
            'user_verified': True,
            'date': tweet_date,
            'text': tweet_text,
            'hashtags': '',
            'source': 'PolitiTweet',
            'retweets': '',  # Retweets and favorites not available
            'favorites': '',
            # Retweets start with the "RT @user" prefix; a plain substring
            # test would also flag any tweet that merely contains "RT".
            'is_retweet': tweet_text.startswith('RT @'),
        })

    return page_data

# Function to save data to CSV
def save_to_csv(data, filename):
    """Append ``data`` as rows to the CSV file at ``filename``.

    Args:
        data: Row records accepted by ``pd.DataFrame`` (here: a list of dicts).
        filename: Path of the CSV file; it is created when missing.

    The header row is written only when the file does not exist yet, so
    repeated calls build up a single table.
    """
    frame = pd.DataFrame(data)
    write_header = not os.path.exists(filename)
    frame.to_csv(filename, mode='a', index=False, header=write_header)
    print(f"Data saved to {filename}")

# Main function to scrape all pages and save to CSV incrementally
def scrape_all_pages(start_page=1, max_pages=824, max_retries=3):
    """Scrape pages ``start_page``..``max_pages`` and append each to the CSV.

    Every successfully scraped page is added to the module-level
    ``tweets_data`` list and appended to ``elonmusk_polititweet.csv`` right
    away, so progress survives an interrupted run.

    Args:
        start_page: First page number to fetch.
        max_pages: Last page number to fetch (inclusive).
        max_retries: How many times a page is retried after it returned no
            data or raised a connection error. Once exhausted the page is
            skipped, so one permanently failing page cannot stall the run
            forever.
    """
    page = start_page
    failures = 0        # failed attempts for the current page
    skipped_pages = []  # pages given up on after max_retries

    while page <= max_pages:
        print(f"Scraping page {page}...")

        try:
            page_data = scrape_page(page)
        except requests.exceptions.RequestException as e:
            print(f"Connection error on page {page}: {e}")
            page_data = None
            retry_delay = 10  # connection problems get a longer back-off
        else:
            retry_delay = 5
            # If no data is returned, assume an error occurred
            if not page_data:
                print(f"No data on page {page}.")

        if not page_data:
            failures += 1
            if failures > max_retries:
                print(f"Giving up on page {page} after {max_retries} retries; skipping it.")
                skipped_pages.append(page)
                page += 1
                failures = 0
            else:
                print(f"Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            continue

        # Append current page data to main data list
        tweets_data.extend(page_data)

        # Save each page incrementally to CSV
        save_to_csv(page_data, 'elonmusk_polititweet.csv')

        # Move to the next page
        page += 1
        failures = 0
        time.sleep(1)  # Pause to avoid overloading the server

    if skipped_pages:
        print(f"Skipped pages (no data after retries): {skipped_pages}")
    print("All pages scraped and data saved.")

# Start scraping from the first page, adjust if restarting from a specific page
# (the output CSV is written in append mode, so rows from earlier runs are kept).
scrape_all_pages(start_page=1, max_pages=824)



1. **Web Scraping:**
   - Sends HTTP requests to the PolitiTweet website to retrieve tweet pages.
   - Parses the HTML with BeautifulSoup to extract each tweet's text and posting date; the source field is set to the fixed label `PolitiTweet`.

2. **Data Processing:**
   - Extracts and formats relevant tweet details, including:
     - Text content
     - Posting date
     - User details (static placeholders for PolitiTweet)
     - A retweet indicator (the hashtags field is left empty).

3. **Error Handling and Retries:**
   - Includes mechanisms to handle HTTP errors, connection issues, and malformed HTML.
   - Retries scraping when a page fails or returns no data.

4. **Data Storage:**
   - Saves extracted tweets to a CSV file incrementally.
   - Appends new data to the CSV after scraping each page, ensuring progress is saved.

5. **Customization:**
   - Allows starting from a specific page and defining the total number of pages to scrape (`start_page` and `max_pages`).

### Google Cloud Storage in Colab

In [None]:
# Colab-only step: run Colab's user authentication before touching Google Cloud.
# NOTE(review): the storage client created later in this notebook presumably
# relies on the credentials obtained here — confirm.
from google.colab import auth
auth.authenticate_user()


In [None]:
import pandas as pd

In [None]:
import pandas as pd
from google.cloud import storage

# Define local file path (used for both the sanity-check read and the upload,
# so the filename is spelled out only once)
local_file_path = "elonmusk_polititweet.csv"

# Load the CSV file (optional if you only want to upload it)
df = pd.read_csv(local_file_path)

# Set up Google Cloud Storage client
storage_client = storage.Client()

# Define bucket and destination in GCS
bucket_name = 'elon-musk-chatbot-data'
destination_blob_name = 'elonmusk_polititweet.csv'

def upload_to_gcs(bucket_name, local_file_path, destination_blob_name):
    """Upload a local file to a Google Cloud Storage bucket.

    Args:
        bucket_name: Name of the target bucket.
        local_file_path: Path of the local file to upload.
        destination_blob_name: Object name the file gets inside the bucket.

    Uses the module-level ``storage_client`` and prints a confirmation line
    once the upload call has returned.
    """
    target_blob = storage_client.bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(local_file_path)
    print(f"File {local_file_path} uploaded to {destination_blob_name} in bucket {bucket_name}.")

# Upload CSV to Google Cloud Storage
# (bucket, local path and blob name are the module-level values defined above)
upload_to_gcs(bucket_name, local_file_path, destination_blob_name)


File elonmusk_polititweet.csv uploaded to elonmusk_polititweet.csv in bucket elon-musk-chatbot-data.


This script uploads a local CSV file (elonmusk_polititweet.csv) containing Elon Musk's tweets to a specified Google Cloud Storage bucket. The file can be used for further analysis or integration with other projects in the cloud.