In [2]:
!pip3 install snscrape

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting snscrape
  Downloading snscrape-0.5.0.20230113-py3-none-any.whl (69 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.2/69.2 KB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: snscrape
Successfully installed snscrape-0.5.0.20230113


In [4]:
# importing libraries and packages
import os
import pandas as pd
import datetime
import snscrape.modules.twitter as sntwitter
from pytz import timezone

# Define the keywords and date range.
# Each keyword is searched separately for every calendar month in the range.
keywords = ['Us Healthcare', '#Ushealthcare', 'Medicaid', 'medicare', 'tricare', 'veterans care', 'medical debt', 'medical Cost', '#Medicaid', '#medicare', '#tricare', '#veterans care', '#medicaldebt', '#medicalcost']
start_date = '2021-01-01'
end_date = '2022-12-31'
# Optional cap on tweets collected per (keyword, month) query, useful for quick
# trial runs. None (the default) collects everything the scraper yields.
max_tweets_per_query = None

# Define the output directory
output_dir = '/content/drive/MyDrive/colabDatasets/Twitter_tweets_Thesis/Mined By Date/Month Interated'
if not os.path.exists(output_dir):
    # exist_ok avoids a crash if the directory appears between the check and the call
    os.makedirs(output_dir, exist_ok=True)
    print('Directory Not Present : Creating')

# Define the progress file: one row per (Month, Keyword) pair already mined,
# rewritten after every finished month so an interrupted run can be resumed.
progress_file = os.path.join(output_dir, 'progress.csv')

# If the progress file exists, read it and resume job; otherwise start empty
if os.path.exists(progress_file):
    progress_df = pd.read_csv(progress_file)
    print('Directory Present : Reading Progress Tracker')
    print('Resuming Job')
else:
    progress_df = pd.DataFrame(columns=['Month', 'Keyword', 'Tweets'])

# Progress is kept as a plain list of row dicts and turned back into a
# DataFrame when saving. DataFrame.append was removed in pandas 2.0, and
# building from records avoids growing a frame row by row.
progress_records = progress_df.to_dict('records')
# Set of (month label, keyword) pairs for a cheap "already mined?" lookup
completed = {(rec['Month'], rec['Keyword']) for rec in progress_records}

# Loop-invariant values, computed once instead of once per keyword.
# Both dates are snapped to the first of their month; freq='MS' then yields
# one timestamp per month start between them (inclusive).
start_month = datetime.datetime.strptime(start_date, '%Y-%m-%d').date().replace(day=1)
end_month = datetime.datetime.strptime(end_date, '%Y-%m-%d').date().replace(day=1)
months = pd.date_range(start=start_month, end=end_month, freq='MS')
utc = timezone('UTC')

# Iterate through the keywords
for keyword in keywords:
    print('Mining tweets for keyword:', keyword)
    # Iterate through each month in the date range
    for month in months:
        month_label = month.strftime('%Y-%m')
        print('Mining tweets for month:', month_label)
        # If this keyword has already been mined for this month, skip it
        if (month_label, keyword) in completed:
            print(f'Skipping {keyword} for {month_label} as it has already been mined')
            continue
        # Define the file path for this keyword and month
        file_name = f"{month_label}_{keyword}.xlsx"
        file_path = os.path.join(output_dir, file_name)
        # Search window: from the first day of this month to the first day of the next
        since = month.strftime('%Y-%m-%d')
        until = (month + pd.DateOffset(months=1)).strftime('%Y-%m-%d')
        query = f'{keyword} since:{since} until:{until}'
        # Using TwitterSearchScraper to scrape data and append tweets to list
        tweets_list = []
        for i, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
            if max_tweets_per_query is not None and i >= max_tweets_per_query:
                break
            # Convert to UTC and drop tzinfo so a timezone-naive datetime is stored
            tweet_date = tweet.date.astimezone(utc).replace(tzinfo=None)
            tweets_list.append([tweet_date, tweet.id, tweet.content, tweet.username])
        # Creating a dataframe from the tweets list above
        tweets_df = pd.DataFrame(tweets_list, columns=['Datetime', 'Tweet Id', 'Text', 'Username'])
        # Save the dataframe to Excel file
        tweets_df.to_excel(file_path, index=False)
        tweets_count = len(tweets_list)
        print(f'Saved {tweets_count} tweets for {keyword} in {month_label} to file')
        # Record the number of tweets and persist progress only after the Excel
        # file has been written, so a crash mid-month causes that month to be
        # re-mined on the next run rather than silently skipped.
        progress_records.append({'Month': month_label, 'Keyword': keyword, 'Tweets': tweets_count})
        completed.add((month_label, keyword))
        progress_df = pd.DataFrame(progress_records)
        progress_df.to_csv(progress_file, index=False)



Directory Present : Reading Progress Tracker
Resuming Job
Mining tweets for keyword: Us Healthcare
Mining tweets for month: 2021-01
Skipping Us Healthcare for 2021-01 as it has already been mined
Mining tweets for month: 2021-02
Skipping Us Healthcare for 2021-02 as it has already been mined
Mining tweets for month: 2021-03
Skipping Us Healthcare for 2021-03 as it has already been mined
Mining tweets for month: 2021-04
Skipping Us Healthcare for 2021-04 as it has already been mined
Mining tweets for month: 2021-05
Skipping Us Healthcare for 2021-05 as it has already been mined
Mining tweets for month: 2021-06
Skipping Us Healthcare for 2021-06 as it has already been mined
Mining tweets for month: 2021-07
Skipping Us Healthcare for 2021-07 as it has already been mined
Mining tweets for month: 2021-08
Skipping Us Healthcare for 2021-08 as it has already been mined
Mining tweets for month: 2021-09
Skipping Us Healthcare for 2021-09 as it has already been mined
Mining tweets for month: 202