## Concatonate & Dedup Datasets

In [1]:
# Import packages
import os
import pandas as pd

#### Get pathways of all collected data

In [2]:
# Set pathway to get to collected data
data_path = '../Data_Collection/output'

# Instantiate lists to hold pathways for files of each topic
election_files = []
abrams_files = []
kemp_files = []

# Loop over all files of collected data and append file pathways to appropriate topic list
for file in os.listdir(data_path):
    complete_path = data_path + '/' + file
    if file.startswith('election'):
        election_files.append(complete_path)
    elif file.startswith('abrams'):
        abrams_files.append(complete_path)
    elif file.startswith('kemp'):
        kemp_files.append(complete_path)

# See lists of filepaths for each topic
print('General Election:\n', election_files)
print('Abrams:\n', abrams_files)
print('Kemp:\n', kemp_files)

General Election:
 ['../Data_Collection/output/election_1030.csv', '../Data_Collection/output/election_1025.csv']
Abrams:
 ['../Data_Collection/output/abrams_1030.csv', '../Data_Collection/output/abrams_1025.csv']
Kemp:
 ['../Data_Collection/output/kemp_1030.csv', '../Data_Collection/output/kemp_1025.csv']


#### Concatonate and Dedup Datasets for Each Topic

In [3]:
# Get earliest date in election dataset
earliest_date_df = pd.read_csv('../Data_Collection/output/election_1025.csv')
earliest_date = min(pd.to_datetime(earliest_date_df['created_at']))
earliest_date

Timestamp('2022-10-17 08:10:19')

In [4]:
# Method to concatonate and deduplicate datasets
def handle_datasets(list_of_file_path, min_date):

    # Read in all csv files into single concatonated dataframe
    df = pd.concat(map(pd.read_csv, list_of_file_path), ignore_index=True)
    
    # Convert created_at column to type datetime
    df['created_at'] = pd.to_datetime(df['created_at'])

    # Remove any tweets before earliest allowed date
    df = df[df['created_at'] >= min_date]

    # Drop duplicates and print number of rows removed as duplicates
    rows_start = len(df)
    df.sort_values(by=['created_at', 
                       'retweet_count', 
                       'favorite_count', 
                       'user_friends_count',
                       'user_followers_count', 
                       'user_favourites_count',
                       'user_statuses_count'], 
                   ascending=[True, False, False, False, False, False, False], 
                   ignore_index=True,
                   inplace=True)
    df.drop_duplicates(subset=['created_at', 'text', 'user_id'], 
                       inplace=True)
    rows_end = len(df)
    print('Number of Duplicate Rows Removed:', rows_start-rows_end)

    # Return the final dataframe
    return df

**General Election Data:**

In [6]:
# Concatonate and dedup election datasets
election_df = handle_datasets(election_files, earliest_date)

# See shape and head of dataframe
print('Election Dataframe Shape:', election_df.shape)
election_df.head(3)

Number of Duplicate Rows Removed: 739
Election Dataframe Shape: (3795, 20)


Unnamed: 0,created_at,text,hashtags,user_mention_ids,user_mention_screen_names,retweet_count,favorite_count,in_reply_to_user_id,in_reply_to_screen_name,geo,coordinates,user_id,user_screen_name,user_name,user_location,user_friends_count,user_followers_count,user_favourites_count,user_verfied,user_statuses_count
0,2022-10-17 08:10:19,"Looks like c-span is carrying it live. ,",[],[1553531142321709056],['CaseyReber35'],0,1,1.553531e+18,CaseyReber35,,,754913164827959300,WahcaMia,MiaMarie #TeamShapiroBetoAbramsDemingsMcMullin...,On a Mountain Somewhere,5467,5743,291152,False,253115
1,2022-10-17 08:16:09,Georgia just became the latest state to requir...,['rxa'],[],[],0,0,,,,,757814965,KinteSpace,the kinte space,Los Angeles,300,480,28,False,30228
2,2022-10-17 08:17:18,"With the Governor election just weeks away, a ...","['Georgia', 'Democrat', 'BLEXIT']",[216065430],['staceyabrams'],0,0,,,,,1519406689451184129,EddieTarazonaFL,Eddie Tarazona,"Florida, USA",629,88,608,False,1952


**Abrams Data:**

In [7]:
# Concatonate and dedup abrams datasets
abrams_df = handle_datasets(abrams_files, earliest_date)

# See shape and head of dataframe
print('Abrams Dataframe Shape:', abrams_df.shape)
abrams_df.head(3)

Number of Duplicate Rows Removed: 5564
Abrams Dataframe Shape: (26535, 20)


Unnamed: 0,created_at,text,hashtags,user_mention_ids,user_mention_screen_names,retweet_count,favorite_count,in_reply_to_user_id,in_reply_to_screen_name,geo,coordinates,user_id,user_screen_name,user_name,user_location,user_friends_count,user_followers_count,user_favourites_count,user_verfied,user_statuses_count
0,2022-10-17 08:12:17,Oh great another Stacey Abrams,[],[759251],['CNN'],0,0,759251.0,CNN,,,1307710729957437440,michael_rah_SF,Low-intensity High-perdiem,,298,37,5983,False,3501
1,2022-10-17 08:17:39,Stacey Abrams took Kemp to court over his canc...,[],[91882544],['DineshDSouza'],0,0,91882544.0,DineshDSouza,,,2729832932,KalDraken,KalDraken,,93,111,6791,False,2359
2,2022-10-17 08:24:36,Barack Obama to host October midterm rallies t...,[],[],[],0,0,,,,,1963887212,Fam4Fun,KP🙌🏿💙🙌🏻,"California, USA",3172,3865,183381,False,46788


**Kemp Data:**

In [8]:
# Concatonate and dedup kemp datasets
kemp_df = handle_datasets(kemp_files, earliest_date)

# See shape and head of dataframe
print('Kemp Dataframe Shape:', kemp_df.shape)
kemp_df.head(3)

Number of Duplicate Rows Removed: 1411
Kemp Dataframe Shape: (7958, 20)


Unnamed: 0,created_at,text,hashtags,user_mention_ids,user_mention_screen_names,retweet_count,favorite_count,in_reply_to_user_id,in_reply_to_screen_name,geo,coordinates,user_id,user_screen_name,user_name,user_location,user_friends_count,user_followers_count,user_favourites_count,user_verfied,user_statuses_count
0,2022-10-17 08:16:09,Georgia just became the latest state to requir...,['rxa'],[],[],0,0,,,,,757814965,KinteSpace,the kinte space,Los Angeles,300,480,28,False,30222
1,2022-10-17 09:02:31,Brian Kemp to battle Stacey Abrams in gubernat...,[],[10228272],['YouTube'],0,0,,,,,3193701535,lklonelove,Lawrence K Lancaster,"Los Angeles, CA",4318,1556,28,False,12896
2,2022-10-17 09:14:47,HOW MANY GEORGIANS DIE EACH YEAR DUE TO GOVERN...,[],"[739844197935644672, 612557998, 216065430]","['AmoneyResists', 'ccharlamb8', 'staceyabrams']",0,0,26241121.0,FirstJamesBond,,,26241121,FirstJamesBond,Jim Bond,"Dallas, Texas",3006,756,23094,False,60100


#### Combine all dataframes and dedup

In [9]:
# Concatonate all dataframes
all_df = pd.concat([election_df,abrams_df,kemp_df])

# Deduplicate rows
rows_start = len(all_df)
all_df.sort_values(by=['created_at', 
                       'retweet_count', 
                       'favorite_count', 
                       'user_friends_count',
                       'user_followers_count', 
                       'user_favourites_count',
                       'user_statuses_count'], 
                   ascending=[True, False, False, False, False, False, False], 
                   ignore_index=True,
                   inplace=True)
all_df.drop_duplicates(subset=['created_at', 'text', 'user_id'], 
                       inplace=True)
rows_end = len(all_df)
print('Number of Duplicate Rows Removed:', rows_start-rows_end)

# Get all data's shape
print('All Dataframe Shape:', all_df.shape)

Number of Duplicate Rows Removed: 4071
All Dataframe Shape: (34217, 20)


#### Save all dataframes to csvs

In [10]:
election_df.to_csv('election_data.csv')
abrams_df.to_csv('abrams_data.csv')
kemp_df.to_csv('kemp_data.csv')
all_df.to_csv('all_data.csv')