## Concatonate & Dedup Datasets

In [1]:
# Import packages
import os
import pandas as pd

#### Get pathways of all collected data

In [2]:
# Set pathway to get to collected data
data_path = '../Data_Collection/output'

# Instantiate lists to hold pathways for files of each topic
election_files = []
abrams_files = []
kemp_files = []

# Loop over all files of collected data and append file pathways to appropriate topic list
for file in os.listdir(data_path):
    complete_path = data_path + '/' + file
    if file.startswith('election'):
        election_files.append(complete_path)
    elif file.startswith('abrams'):
        abrams_files.append(complete_path)
    elif file.startswith('kemp'):
        kemp_files.append(complete_path)

# See lists of filepaths for each topic
print('General Election:\n', election_files)
print('Abrams:\n', abrams_files)
print('Kemp:\n', kemp_files)

General Election:
 ['../Data_Collection/output/election_1109.csv', '../Data_Collection/output/election_1030.csv', '../Data_Collection/output/election_1025.csv', '../Data_Collection/output/election_1103.csv']
Abrams:
 ['../Data_Collection/output/abrams_1103.csv', '../Data_Collection/output/abrams_1030.csv', '../Data_Collection/output/abrams_1025.csv', '../Data_Collection/output/abrams_1109.csv']
Kemp:
 ['../Data_Collection/output/kemp_1103.csv', '../Data_Collection/output/kemp_1030.csv', '../Data_Collection/output/kemp_1025.csv', '../Data_Collection/output/kemp_1109.csv']


#### Concatonate and Dedup Datasets for Each Topic

In [3]:
# Import package
from datetime import datetime

# Get start date at beginning of first day of complete data
earliest_date = datetime.strptime('10/18/22 00:00:00', '%m/%d/%y %H:%M:%S')
print('Start Date:', earliest_date)

# Get end date at end of last day of election
latest_date = datetime.strptime('11/09/22 00:00:00', '%m/%d/%y %H:%M:%S')
print('End Date:', latest_date)

Start Date: 2022-10-18 00:00:00
End Date: 2022-11-09 00:00:00


In [4]:
# Method to concatonate and deduplicate datasets
def handle_datasets(list_of_file_path, min_date, max_date, topic):

    # Read in all csv files into single concatonated dataframe
    df = pd.concat(map(pd.read_csv, list_of_file_path), ignore_index=True)
    
    # Convert created_at column to type datetime
    df['created_at'] = pd.to_datetime(df['created_at'])

    # Remove any tweets before earliest allowed date
    df = df[df['created_at'] >= min_date]
    
    # Remove any tweets after latest allowed date
    df = df[df['created_at'] < max_date]

    # Drop duplicates and print number of rows removed as duplicates
    df.sort_values(by=['created_at', 
                       'retweet_count', 
                       'favorite_count', 
                       'user_friends_count',
                       'user_followers_count', 
                       'user_favourites_count',
                       'user_statuses_count'], 
                   ascending=[True, False, False, False, False, False, False], 
                   ignore_index=True,
                   inplace=True)
    df.drop_duplicates(subset=['created_at', 'text', 'user_id'], 
                       inplace=True)

    # Add column to indicate topic
    df['topic'] = topic
    
    # Return the final dataframe
    return df

**General Election Data:**

In [5]:
# Concatonate and dedup election datasets
election_df = handle_datasets(election_files, earliest_date, latest_date, 'election')

# See shape and head of dataframe
print('Election Dataframe Shape:', election_df.shape)
election_df.head(3)

Election Dataframe Shape: (6291, 21)


Unnamed: 0,created_at,text,hashtags,user_mention_ids,user_mention_screen_names,retweet_count,favorite_count,in_reply_to_user_id,in_reply_to_screen_name,geo,...,user_id,user_screen_name,user_name,user_location,user_friends_count,user_followers_count,user_favourites_count,user_verfied,user_statuses_count,topic
0,2022-10-18 00:00:31,begins TODAY in Georgia. Skip the crowds by go...,"['EarlyVoting', 'gapol', 'earlyvoting', 'yourv...",[],[],1,2,,,,...,22877669,CHRIS180_Change,CHRIS 180,"Metro Atlanta, GA",350,1647,1189,False,2985,election
1,2022-10-18 00:00:36,Who won the Georgia Governor debate?,"['GApol', 'GAGov']",[],[],1,2,,,,...,1312203178226651137,dw_subbed,Jensen will win in 14 days,,750,469,15197,False,41634,election
2,2022-10-18 00:00:37,"Stacey Abrams, Brian Kemp face off in Georgia ...",[],[],[],0,0,,,,...,1569940749000785926,hjtvnews_in,hjtvnews.in,,2,4,0,False,19018,election


**Abrams Data:**

In [6]:
# Concatonate and dedup abrams datasets
abrams_df = handle_datasets(abrams_files, earliest_date, latest_date, 'abrams')

# See shape and head of dataframe
print('Abrams Dataframe Shape:', abrams_df.shape)
abrams_df.head(3)

Abrams Dataframe Shape: (35883, 21)


Unnamed: 0,created_at,text,hashtags,user_mention_ids,user_mention_screen_names,retweet_count,favorite_count,in_reply_to_user_id,in_reply_to_screen_name,geo,...,user_id,user_screen_name,user_name,user_location,user_friends_count,user_followers_count,user_favourites_count,user_verfied,user_statuses_count,topic
0,2022-10-18 00:00:00,Sharp words on guns in Shane Hazel to Stacey A...,['gagovdebate'],[],[],5,24,,,,...,25282846,SimonesNews,Simone Sebastian,Washington DC,3110,5830,1445,True,4400,abrams
1,2022-10-18 00:00:01,Stacey Abrams won tonight. She kept to the fac...,[],[],[],0,6,,,,...,1312393604439183361,nching0,Thee Lost Edges of Candace 🪥,"34.2073° N, 84.1402° W",922,752,101529,False,61963,abrams
2,2022-10-18 00:00:01,"Why did Joe Rogan send his little brother, Sha...",['GAGovDebate'],[],[],0,5,,,,...,897218253826555905,JTaylorSkinner,Jenn Taylor-Skinner (she/her),Seattle,17762,25727,82402,False,43808,abrams


**Kemp Data:**

In [7]:
# Concatonate and dedup kemp datasets
kemp_df = handle_datasets(kemp_files, earliest_date, latest_date, 'kemp')

# See shape and head of dataframe
print('Kemp Dataframe Shape:', kemp_df.shape)
kemp_df.head(3)

Kemp Dataframe Shape: (14735, 21)


Unnamed: 0,created_at,text,hashtags,user_mention_ids,user_mention_screen_names,retweet_count,favorite_count,in_reply_to_user_id,in_reply_to_screen_name,geo,...,user_id,user_screen_name,user_name,user_location,user_friends_count,user_followers_count,user_favourites_count,user_verfied,user_statuses_count,topic
0,2022-10-18 00:00:11,THE MOST DANGEROUS THING FACING GEORGIA IS 4 M...,[],[],[],212,528,,,,...,1169707149167140867,MentallyDivine,Brian Baez,"Atlanta, GA",1990,9076,42697,False,9657,kemp
1,2022-10-18 00:00:16,Mrs. Abrams showed the same poise and skill as...,[],[345110600],['tify330'],1,14,345110600.0,tify330,,...,33382494,RobPalmerIRL,Rob Palmer IRL🇺🇦 🏳️‍🌈,,447,1240,43289,False,53336,kemp
2,2022-10-18 00:00:19,"Stacey Abrams: ""The most dangerous thing facin...","['GAGov', 'GAGovDebate']",[],[],2,5,,,,...,16214875,4everVaughn,Vaughn A.,"Atlanta, GA",2436,2703,347,False,23267,kemp


#### Combine all dataframes and dedup

In [8]:
# Concatonate all dataframes
all_df = pd.concat([election_df,abrams_df,kemp_df])

# Get count of words before deduplication
rows_start = len(all_df)

# Drop exact duplicates
all_df.drop_duplicates(inplace=True)

# Get all topics mentioned for each tweet
tweet_topics = all_df.groupby(['created_at', 'text']).agg({'topic': lambda d: ", ".join(set(d))})

# Add these combined topics back into dataframe
all_df = all_df.merge(tweet_topics, how='left', on=['created_at', 'text'])
all_df = all_df.drop('topic_x', axis=1)
all_df = all_df.rename({'topic_y':'topics'})

# Sort rows as indicated
all_df.sort_values(by=['created_at', 
                       'retweet_count', 
                       'favorite_count', 
                       'user_friends_count',
                       'user_followers_count', 
                       'user_favourites_count',
                       'user_statuses_count'], 
                   ascending=[True, False, False, False, False, False, False], 
                   ignore_index=True,
                   inplace=True)

# Deduplicate rows
all_df.drop_duplicates(subset=['created_at', 'text', 'user_id'], 
                       inplace=True)

# See how many rows were removed as duplicates
print('Number of Duplicate Rows Removed:', rows_start-len(all_df))

# Get all data's shape
print('All Dataframe Shape:', all_df.shape)

Number of Duplicate Rows Removed: 5573
All Dataframe Shape: (51336, 21)


#### Save all dataframes to csvs

In [9]:
election_df.to_csv('election_data.csv')
abrams_df.to_csv('abrams_data.csv')
kemp_df.to_csv('kemp_data.csv')
all_df.to_csv('all_data.csv')