# Extract top 100 hashtags per day from Ukraine Conflict Tweet Dataset
* need to first combine or split any csvs that are in multiple parts
    * can try to do it automatically using regex
* all csv files have the same three columns: tweetid, hashtags, language
    * only keep 'en' language tweets
    * only keep tweets with at least one hashtag

## Imports and Constants

In [1]:
import re
import os
import sys
import csv
import json
import shutil
import datetime
import calendar
import pandas as pd

# Passed as `low_memory=` to every pandas.read_csv call in this notebook
PD_READ_CSV_LOW_MEMORY_FLAG = False # Set to True if your computer has low memory
# Encoding used when the per-day top-hashtag csv files are written
# NOTE(review): the trailing note does not make clear which encoding caused the corruption -- confirm before changing
CSV_WRITE_UTF_ENCODING = 'utf-8' #'utf16' #'utf-8' causes loss/corruption of certain characters
HASHTAG_FREQUENCY_THRESHOLD = 100 # Minimum number of times a hashtag must be used to be included in the top hashtags
NUMBER_TOP_HASHTAGS = 100 # Number of top hashtags to be included in the report/csv

# All folder paths below are relative and use hard-coded Windows separators (backslashes)
DATA_FOLDER_PATH = '..\\data\\'

BACKUP_RAW_CSV_FILES_FOLDER_PATH = DATA_FOLDER_PATH + "archive_backup\\" # up Jupyter\ Folder, into data\archive_backup\ (source of the raw csvs, must already exist)
RAW_CSV_FILES_FOLDER_PATH = DATA_FOLDER_PATH + "archive\\" # up Jupyter\ Folder, into data\archive\ (working copies that get combined/split/renamed)
TOP_HASHTAGS_PER_DAY_FOLDER_PATH =  DATA_FOLDER_PATH + "top_hashtags_per_day\\" # up Jupyter\ Folder, into data\top_hashtags_per_day\ (output csvs)
    

# Files/Data Setup
* if you want to start from scratch, remove everything in the `data\archive` folder and the `data\top_hashtags_per_day` folder
    * when both folders are empty, the code will automatically copy the raw csvs from the `data\archive_backup` folder to the `data\archive` folder

In [2]:
# Folder creation

# The backup folder holds the raw csvs that everything else is derived from,
# so there is nothing useful to do without it.
if not os.path.exists(BACKUP_RAW_CSV_FILES_FOLDER_PATH):
    print("Unable to find backup folder: " + BACKUP_RAW_CSV_FILES_FOLDER_PATH)
    sys.exit(1)  # use the imported `sys` directly rather than the incidental `os.sys` attribute

# Create the working folder and the output folder on first run
for folder_path in (RAW_CSV_FILES_FOLDER_PATH, TOP_HASHTAGS_PER_DAY_FOLDER_PATH):
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
        print("Created folder: " + folder_path)

## Comment out the opening `"""` (change it to `#"""`) in order to delete all files in both the `data\archive` folder and the `data\top_hashtags_per_day` folder

In [3]:
# ONLY ENABLE (put a # in front of the opening """ below) AND RUN THIS IF YOU WANT TO DELETE the contents of the archive\ and top_hashtags_per_day\ folders

"""
# https://stackoverflow.com/a/12526809
def delete_folder_contents(dirpath):
    for filename in os.listdir(dirpath):
        filepath = os.path.join(dirpath, filename)
        try:
            shutil.rmtree(filepath)
        except OSError:
            os.remove(filepath)

delete_folder_contents(RAW_CSV_FILES_FOLDER_PATH)
delete_folder_contents(TOP_HASHTAGS_PER_DAY_FOLDER_PATH)
           
#""" 

'\n# https://stackoverflow.com/a/12526809\ndef delete_folder_contents(dirpath):\n    for filename in os.listdir(dirpath):\n        filepath = os.path.join(dirpath, filename)\n        try:\n            shutil.rmtree(filepath)\n        except OSError:\n            os.remove(filepath)\n\ndelete_folder_contents(RAW_CSV_FILES_FOLDER_PATH)\ndelete_folder_contents(TOP_HASHTAGS_PER_DAY_FOLDER_PATH)\n           \n#'

# Copy raw csvs from `data\archive_backup` folder to `data\archive` folder (if not already there)

In [4]:

# Stays False unless one of the cases below decides there is work to do
PERFORM_PROCESSING = False

raw_csv_backup_filenames = [name for name in os.listdir(BACKUP_RAW_CSV_FILES_FOLDER_PATH) if name.endswith('.csv')]
raw_csv_backup_filepaths = [os.path.join(BACKUP_RAW_CSV_FILES_FOLDER_PATH, name) for name in raw_csv_backup_filenames]
raw_csv_filepaths = [os.path.join(RAW_CSV_FILES_FOLDER_PATH, name) for name in raw_csv_backup_filenames]

# Snapshot the state of both folders once, before anything is copied
archive_is_empty = len(os.listdir(RAW_CSV_FILES_FOLDER_PATH)) == 0
output_is_empty = len(os.listdir(TOP_HASHTAGS_PER_DAY_FOLDER_PATH)) == 0

if archive_is_empty and output_is_empty:
    # fresh start: seed the archive folder from the backup folder, then process
    print("No raw csv files found in: " + RAW_CSV_FILES_FOLDER_PATH + "\nCopying files from: " + BACKUP_RAW_CSV_FILES_FOLDER_PATH)

    for source_path, destination_path in zip(raw_csv_backup_filepaths, raw_csv_filepaths):
        shutil.copy(source_path, destination_path)

    PERFORM_PROCESSING = True

elif not archive_is_empty:
    # raw files are present: process them only if no output has been produced yet
    PERFORM_PROCESSING = output_is_empty

# remaining case (empty archive, existing output): PERFORM_PROCESSING keeps its initial False

print(PERFORM_PROCESSING)

No raw csv files found in: ..\data\archive\
Copying files from: ..\data\archive_backup\
True


# Extract all Date-Filenames pairings from the `data\archive` folder using regex

In [5]:
# Guard clause: stop this cell when the earlier cell decided that no processing is needed
# (PERFORM_PROCESSING is False, e.g. because top_hashtags_per_day\ already holds output files)
if not PERFORM_PROCESSING:
    sys.exit(1)  # exit with error code 1 if we don't need to perform processing


# Extract the date(s) from each file name, to aid in joining/splitting the parts automatically.
#
# CSVs are named in one of these formats (pattern that recognizes it in parentheses):
#     UkraineCombinedTweetsDeduped20220227-131611.csv     (regex1)
#     UkraineCombinedTweetsDeduped_FEB27.csv              (regex2)
#     UkraineCombinedTweetsDeduped_FEB28_part1.csv        (regex3)
#     UkraineCombinedTweetsDeduped_MAR27_to_28.csv        (regex4, regex4_2 when the end month is spelled out)
#     UkraineCombinedTweetsDeduped_MAR30_REAL.csv         (regex5)
#     0401_UkraineCombinedTweetsDeduped.csv               (regex6)
#     0505_to_0507_UkraineCombinedTweetsDeduped.csv       (regex7, regex7_2 when the end month is spelled out)

UKRAINE_BLURB = "UkraineCombinedTweetsDeduped"
FIRST_CSV_HOUR_MIN_SEC = '131611'
DATA_YEAR = 2022  # year assumed for every file name that does not carry one

# 'jan' -> 1 ... 'dec' -> 12
# NOTE(review): calendar.month_abbr follows the active locale; this assumes English abbreviations
month_abbr_to_int = {abbr.lower(): number for number, abbr in enumerate(calendar.month_abbr[1:], start=1)}

# Every pattern is anchored at both ends and matches the ".csv" extension literally,
# so look-alikes such as "....csv.bak" or "...Xcsv" are not mistaken for data files.
regex1 = re.compile(r'^' + UKRAINE_BLURB + r'(?P<year>\d{4})(?P<month>\d{2})(?P<day>\d{2})-' + FIRST_CSV_HOUR_MIN_SEC + r'\.csv$')
regex2 = re.compile(r'^' + UKRAINE_BLURB + r'_(?P<month_abbr>\w{3})(?P<day>\d{2})\.csv$')
regex3 = re.compile(r'^' + UKRAINE_BLURB + r'_(?P<month_abbr>\w{3})(?P<day>\d{2})_part\d\.csv$')
regex4 = re.compile(r'^' + UKRAINE_BLURB + r'_(?P<month_abbr1>\w{3})(?P<day1>\d{2})_to_(?:\w{3})?(?P<day2>\d{2})\.csv$')
regex4_2 = re.compile(r'^' + UKRAINE_BLURB + r'_(?P<month_abbr1>\w{3})(?P<day1>\d{2})_to_(?P<month_abbr2>\w{3})(?P<day2>\d{2})\.csv$')
regex5 = re.compile(r'^' + UKRAINE_BLURB + r'_(?P<month_abbr>\w{3})(?P<day>\d{2})_REAL\.csv$')
regex6 = re.compile(r'^(?P<month>\d{2})(?P<day>\d{2})_' + UKRAINE_BLURB + r'\.csv$')
regex7 = re.compile(r'^(?P<month1>\d{2})(?P<day1>\d{2})_to_(?:\d{2})?(?P<day2>\d{2})_' + UKRAINE_BLURB + r'\.csv$')
regex7_2 = re.compile(r'^(?P<month1>\d{2})(?P<day1>\d{2})_to_(?P<month2>\d{2})(?P<day2>\d{2})_' + UKRAINE_BLURB + r'\.csv$')


def _inclusive_date_range(sdate, edate):
    # Every date from sdate to edate, both ends included (empty list when edate is before sdate)
    # https://stackoverflow.com/a/66595046
    return [sdate + datetime.timedelta(days=x) for x in range(0, (edate - sdate).days + 1)]


def extract_date_range_from_filename(filename):
    """Return the list of dates covered by a raw csv, judging by its file name.

    filename: bare file name (no folder), in one of the formats listed above.

    Returns a list of datetime.date objects: one entry for single-day files, one
    entry per day (inclusive) for "_to_" range files. Returns None when the name
    matches none of the known formats.

    Raises KeyError when a three-character month abbreviation is not a known month,
    and ValueError when the extracted numbers do not form a valid calendar date.
    """
    # full timestamp in the name: the only format that carries its own year
    m = regex1.match(filename)
    if m:
        return [datetime.date(year=int(m.group('year')), month=int(m.group('month')), day=int(m.group('day')))]

    # single day, month given as an abbreviation (plain, "_partN" and "_REAL" variants)
    for single_day_regex in (regex2, regex3, regex5):
        m = single_day_regex.match(filename)
        if m:
            return [datetime.date(year=DATA_YEAR, month=month_abbr_to_int[m.group('month_abbr').lower()], day=int(m.group('day')))]

    # range of days, months given as abbreviations
    m = regex4.match(filename)
    if m:
        start_month = month_abbr_to_int[m.group('month_abbr1').lower()]
        sdate = datetime.date(year=DATA_YEAR, month=start_month, day=int(m.group('day1')))

        # regex4_2 only matches when the end month is written out; otherwise the range stays in the start month
        m_end_month = regex4_2.match(filename)
        end_month = month_abbr_to_int[m_end_month.group('month_abbr2').lower()] if m_end_month else start_month
        edate = datetime.date(year=DATA_YEAR, month=end_month, day=int(m.group('day2')))

        return _inclusive_date_range(sdate, edate)

    # single day, numeric month
    m = regex6.match(filename)
    if m:
        return [datetime.date(year=DATA_YEAR, month=int(m.group('month')), day=int(m.group('day')))]

    # range of days, numeric months
    m = regex7.match(filename)
    if m:
        start_month = int(m.group('month1'))
        sdate = datetime.date(year=DATA_YEAR, month=start_month, day=int(m.group('day1')))

        # regex7_2 only matches when the end month is written out; otherwise the range stays in the start month
        m_end_month = regex7_2.match(filename)
        end_month = int(m_end_month.group('month2')) if m_end_month else start_month
        edate = datetime.date(year=DATA_YEAR, month=end_month, day=int(m.group('day2')))

        return _inclusive_date_range(sdate, edate)

    return None

            
############### MAIN SCRIPT STARTS HERE ###################

# Map every date to the raw csv file name(s) that cover it
date_to_csv_filenames = {}

for filename in os.listdir(RAW_CSV_FILES_FOLDER_PATH):
    if not filename.endswith('.csv'):
        continue

    date_range = extract_date_range_from_filename(filename)
    if date_range is None:
        # name is in none of the known formats: leave the file out of the mapping
        continue

    for date in date_range:
        date_to_csv_filenames.setdefault(date, []).append(filename)

print(date_to_csv_filenames)


{datetime.date(2022, 6, 13): ['0613_UkraineCombinedTweetsDeduped.csv'], datetime.date(2022, 6, 14): ['0614_UkraineCombinedTweetsDeduped.csv'], datetime.date(2022, 6, 15): ['0615_UkraineCombinedTweetsDeduped.csv'], datetime.date(2022, 6, 16): ['0616_UkraineCombinedTweetsDeduped.csv'], datetime.date(2022, 6, 17): ['0617_UkraineCombinedTweetsDeduped.csv'], datetime.date(2022, 6, 18): ['0618_UkraineCombinedTweetsDeduped.csv'], datetime.date(2022, 6, 19): ['0619_UkraineCombinedTweetsDeduped.csv'], datetime.date(2022, 6, 20): ['0620_UkraineCombinedTweetsDeduped.csv'], datetime.date(2022, 6, 21): ['0621_UkraineCombinedTweetsDeduped.csv'], datetime.date(2022, 6, 22): ['0622_UkraineCombinedTweetsDeduped.csv'], datetime.date(2022, 6, 23): ['0623_UkraineCombinedTweetsDeduped.csv'], datetime.date(2022, 6, 24): ['0624_UkraineCombinedTweetsDeduped.csv'], datetime.date(2022, 6, 25): ['0625_UkraineCombinedTweetsDeduped.csv'], datetime.date(2022, 6, 26): ['0626_UkraineCombinedTweetsDeduped.csv'], datet

# Combine/Split and Rename raw csv files as needed across each Day

In [6]:
# Re-check the folders so this cell also works when run on its own:
# raw files present and no output yet means there is still work to do
if os.listdir(RAW_CSV_FILES_FOLDER_PATH) and not os.listdir(TOP_HASHTAGS_PER_DAY_FOLDER_PATH):
    PERFORM_PROCESSING = True

# Guard clause: stop this cell when no processing is needed
# (e.g. because top_hashtags_per_day\ already holds output files)
if not PERFORM_PROCESSING:
    sys.exit(1)  # exit with error code 1 if we don't need to perform processing

def get_new_filename_from_date(date):
    # Canonical per-day file name: zero-padded YYYY_MM_DD.csv built from the date's year, month and day
    return '{:04d}_{:02d}_{:02d}.csv'.format(date.year, date.month, date.day)

def combine_files_and_remove_predecessors(date, old_filenames):
    """Merge several raw csvs that belong to one date into a single per-day csv.

    date: the date all of the files belong to; it determines the output file name.
    old_filenames: names (inside the raw archive folder) of the files to merge.

    The rows are concatenated in the order of `old_filenames`, written to the
    archive folder under the name given by get_new_filename_from_date(date),
    and the original files are then deleted.
    """
    print('Combining files ' + str(old_filenames) + ' for date: ' + str(date))

    old_filepaths = [os.path.join(RAW_CSV_FILES_FOLDER_PATH, old_filename) for old_filename in old_filenames]

    df_combined = pd.concat(
        [pd.read_csv(old_filepath, low_memory=PD_READ_CSV_LOW_MEMORY_FLAG) for old_filepath in old_filepaths]
    )

    df_combined.to_csv(os.path.join(RAW_CSV_FILES_FOLDER_PATH, get_new_filename_from_date(date)), index=False)

    # remove old files -- they are plain files, so os.remove is all that is needed
    # (no recursive shutil.rmtree attempt that could wipe a directory by mistake)
    for old_filename, old_filepath in zip(old_filenames, old_filepaths):
        print("Removing file: " + old_filename)
        os.remove(old_filepath)


# from https://ws-dl.blogspot.com/2019/08/2019-08-03-tweetedat-finding-tweet.html
def get_date_from_tweetid(tid):
    """Return the UTC calendar date (datetime.date) on which tweet id `tid` was created.

    The id's bits above the lowest 22 hold the creation time in milliseconds,
    counted from the Twitter Snowflake epoch.
    """
    offset = 1288834974657 # Twitter Snowflake epoch, in milliseconds since the Unix epoch
    tstamp = (tid >> 22) + offset
    # timezone-aware replacement for the deprecated datetime.utcfromtimestamp; the date is the same
    return datetime.datetime.fromtimestamp(tstamp/1000, tz=datetime.timezone.utc).date()


def split_file_over_necessary_dates(filename, dates):
    """Split one raw csv that spans several days into one csv per date.

    filename: name (inside the raw archive folder) of the multi-day file.
    dates: the datetime.date values to produce a file for.

    Each row is assigned the date decoded from its `tweetid`; for every entry of
    `dates` the matching rows are written to the archive folder under the name
    given by get_new_filename_from_date (a header-only file if no row matches).
    Rows whose date is not in `dates` are not written anywhere. The written
    files carry an extra `date` column. The original file is deleted afterwards.
    """
    print("Spliting filename: " + filename + " over " + str(dates))

    filepath = os.path.join(RAW_CSV_FILES_FOLDER_PATH, filename)

    df_to_split = pd.read_csv(filepath, low_memory=PD_READ_CSV_LOW_MEMORY_FLAG)
    df_to_split['date'] = df_to_split['tweetid'].map(get_date_from_tweetid)

    # Create a new dataframe for each date, by filtering the original dataframe by the date.
    # The column already holds datetime.date objects, so it can be compared directly.
    for date in dates:
        df_date = df_to_split[df_to_split['date'] == date]

        df_date.to_csv(os.path.join(RAW_CSV_FILES_FOLDER_PATH, get_new_filename_from_date(date)), index=False)

    # Remove the original file -- it is a plain file, so os.remove is all that is needed
    print("Removing file: " + filename)
    os.remove(filepath)
   

############### MAIN SCRIPT STARTS HERE ###################

# Walk the dates in the order they were recorded. Dates with several files get
# combined; a run of consecutive dates that all point at the same single file
# means that file spans several days and has to be split once the run ends.
# NOTE(review): a file that is both part of a multi-file date and spans several
# dates is not handled (it is removed by the combine step) -- confirm this cannot occur.

last_date = None
last_filename = ""
current_filename = ""
num_dates_emcompassed_by_single_file = 1

for date in date_to_csv_filenames:
    current_filename = date_to_csv_filenames[date][0]

    # The run of dates sharing `last_filename` has just ended: split that file per day.
    # This check is independent of the combine step below, so a multi-day file is
    # still split when the date that follows it happens to need combining.
    if current_filename != last_filename and num_dates_emcompassed_by_single_file > 1:

        dates = [last_date-datetime.timedelta(days=x) for x in reversed(range(0, num_dates_emcompassed_by_single_file))]
        split_file_over_necessary_dates(last_filename, dates)

        num_dates_emcompassed_by_single_file = 1

    if len(date_to_csv_filenames[date]) > 1:

        combine_files_and_remove_predecessors(date, date_to_csv_filenames[date])

    if last_filename == current_filename:
        num_dates_emcompassed_by_single_file += 1

    last_date = date
    last_filename = current_filename

# Case where the last date is part of a file that needs to be split.
# After the loop last_filename always equals current_filename, so the pending
# run length is the only thing that can signal an unsplit multi-day file.
if num_dates_emcompassed_by_single_file > 1:
    dates = [last_date-datetime.timedelta(days=x) for x in reversed(range(0, num_dates_emcompassed_by_single_file))]
    split_file_over_necessary_dates(last_filename, dates)


# Rename all files to the format indicated by get_new_filename_from_date(date)
print(f'Renaming files in {RAW_CSV_FILES_FOLDER_PATH} to match format: {get_new_filename_from_date(datetime.datetime.now().date())}') 

# matches filenames of the form: 2019_08_03.csv, as defined by the get_new_filename_from_date(date) function
regex8 = re.compile(r'^(?P<year>\d{4})_(?P<month>\d{2})_(?P<day>\d{2})\.csv$')

for filename in os.listdir(RAW_CSV_FILES_FOLDER_PATH):
    if regex8.match(filename):
        continue # already in the target format

    dates_in_filename = extract_date_range_from_filename(filename)
    if not dates_in_filename:
        # no date can be derived from the name, so there is nothing to rename it to
        print("Skipping file with unrecognized name: " + filename)
        continue

    os.rename(os.path.join(RAW_CSV_FILES_FOLDER_PATH, filename),
              os.path.join(RAW_CSV_FILES_FOLDER_PATH, get_new_filename_from_date(dates_in_filename[0])))

Renaming files in ..\data\archive\ to match format: 2022_07_26.csv


# Apply Processing to the raw data

In [7]:
# Guard clause: stop this cell when no processing is needed
# (PERFORM_PROCESSING is False, e.g. because top_hashtags_per_day\ already holds output files)
if not PERFORM_PROCESSING:
    sys.exit(1)  # exit with error code 1 if we don't need to perform processing

def simplify_hashtags(htag_json):
    """Reduce a raw `hashtags` cell to a list of lower-cased hashtag texts.

    htag_json: string holding a list of objects that each have a 'text' entry.
    Single quotes are swapped for double quotes first so the string parses as JSON.
    """
    parsed_entries = json.loads(htag_json.replace("'", '"'))

    lowered_texts = []
    for entry in parsed_entries:
        lowered_texts.append(entry['text'].lower())
    return lowered_texts
    
    
# from https://ws-dl.blogspot.com/2019/08/2019-08-03-tweetedat-finding-tweet.html
def get_tweet_timestamp(tid):
    """Return the creation time of tweet id `tid` as a naive datetime expressed in UTC.

    The id's bits above the lowest 22 hold the creation time in milliseconds,
    counted from the Twitter Snowflake epoch.
    """
    offset = 1288834974657 # Twitter Snowflake epoch, in milliseconds since the Unix epoch
    tstamp = (tid >> 22) + offset
    # Timezone-aware replacement for the deprecated datetime.utcfromtimestamp;
    # tzinfo is dropped again so callers keep receiving naive UTC datetimes.
    return datetime.datetime.fromtimestamp(tstamp/1000, tz=datetime.timezone.utc).replace(tzinfo=None)


def clean_ukraine_conflict_twitter_dataframe(df):
    """Return a cleaned copy of one day's raw tweet dataframe.

    df: dataframe with at least the columns 'tweetid', 'hashtags' and 'language'.

    Keeps only rows whose language is 'en' and whose hashtags cell is present and
    not the empty list '[]'. The result has three columns: 'tweetid', 'hashtags'
    (converted by simplify_hashtags) and 'tweet_timestamp' (derived from the
    tweet id by get_tweet_timestamp). The input dataframe is not modified.
    """
    is_english = df['language'] == 'en'
    # a missing hashtags cell cannot be parsed, so it is treated like "no hashtags"
    has_hashtags = df['hashtags'].notna() & (df['hashtags'] != '[]')

    # explicit copy: the new columns are added to an independent frame, not to a slice of the input
    df = df.loc[is_english & has_hashtags, ['tweetid', 'hashtags']].copy()

    df['hashtags'] = df['hashtags'].map(simplify_hashtags)

    df['tweet_timestamp'] = df['tweetid'].map(get_tweet_timestamp)

    return df


def get_top_n_hashtags_to_freq_dict(df, num_top_hashtags=NUMBER_TOP_HASHTAGS):
    """Return the most frequent hashtags of `df` as a {hashtag: frequency} dict.

    df: dataframe whose 'hashtags' column holds a list of hashtag strings per row.
    num_top_hashtags: maximum number of hashtags to return.

    Hashtags used fewer than HASHTAG_FREQUENCY_THRESHOLD times are left out. The
    dict is ordered from most to least frequent; hashtags with equal frequency
    keep the order in which they first appear in `df`.
    """
    from collections import Counter
    from itertools import chain

    # Count hashtags. Iterating the column itself visits every row, including rows
    # that share an index label (going through to_dict() would collapse those).
    hashtag_to_freq = Counter(chain.from_iterable(df['hashtags']))

    # remove hashtags below threshold (for faster sorting, smaller filesizes)
    frequent_hashtags = [(ht, freq) for ht, freq in hashtag_to_freq.items() if freq >= HASHTAG_FREQUENCY_THRESHOLD]

    # sort hashtags by frequency (stable, so ties keep first-seen order)
    frequent_hashtags.sort(key=lambda item: item[1], reverse=True)

    return dict(frequent_hashtags[0:num_top_hashtags])


def write_top_n_hashtags_to_csv(hashtag_to_freq_dict, filename, field_names):
    """Write a {hashtag: frequency} dict to a csv in the top-hashtags-per-day folder.

    hashtag_to_freq_dict: mapping of hashtag to frequency, written in its own order.
    filename: name of the csv to create inside TOP_HASHTAGS_PER_DAY_FOLDER_PATH.
    field_names: csv header; rows are emitted under the keys 'hashtag' and 'frequency'.
    """
    output_path = os.path.join(TOP_HASHTAGS_PER_DAY_FOLDER_PATH, filename)

    with open(output_path, 'w', encoding=CSV_WRITE_UTF_ENCODING) as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=field_names, lineterminator = '\n')
        writer.writeheader()

        # one row per hashtag, in the order of the dict
        rows = [{'hashtag': ht, 'frequency': freq} for ht, freq in hashtag_to_freq_dict.items()]
        writer.writerows(rows)


############### MAIN SCRIPT STARTS HERE ###################

field_names = ['hashtag','frequency']

# By now every file in the archive folder is expected to hold exactly one day of tweets;
# each one yields an output csv of the same name in the top-hashtags folder.
for filename in os.listdir(RAW_CSV_FILES_FOLDER_PATH):
    raw_csv_path = os.path.join(RAW_CSV_FILES_FOLDER_PATH, filename)

    df = clean_ukraine_conflict_twitter_dataframe(
        pd.read_csv(raw_csv_path, low_memory=PD_READ_CSV_LOW_MEMORY_FLAG)
    )
    hashtag_to_freq_dict = get_top_n_hashtags_to_freq_dict(df, NUMBER_TOP_HASHTAGS)

    write_top_n_hashtags_to_csv(hashtag_to_freq_dict, filename, field_names)
    print("Wrote top " + str(NUMBER_TOP_HASHTAGS) + " hashtags to file: " + filename)
    

Wrote top 100 hashtags to file: 2022_06_13.csv
Wrote top 100 hashtags to file: 2022_06_14.csv
Wrote top 100 hashtags to file: 2022_06_15.csv
Wrote top 100 hashtags to file: 2022_06_16.csv
Wrote top 100 hashtags to file: 2022_06_17.csv
Wrote top 100 hashtags to file: 2022_06_18.csv
Wrote top 100 hashtags to file: 2022_06_19.csv
Wrote top 100 hashtags to file: 2022_06_20.csv
Wrote top 100 hashtags to file: 2022_06_21.csv
Wrote top 100 hashtags to file: 2022_06_22.csv
Wrote top 100 hashtags to file: 2022_06_23.csv
Wrote top 100 hashtags to file: 2022_06_24.csv
Wrote top 100 hashtags to file: 2022_06_25.csv
Wrote top 100 hashtags to file: 2022_06_26.csv
Wrote top 100 hashtags to file: 2022_06_27.csv
Wrote top 100 hashtags to file: 2022_06_28.csv
Wrote top 100 hashtags to file: 2022_06_29.csv
Wrote top 100 hashtags to file: 2022_06_30.csv
Wrote top 100 hashtags to file: 2022_07_01.csv
Wrote top 100 hashtags to file: 2022_07_02.csv
Wrote top 100 hashtags to file: 2022_07_03.csv
Wrote top 100