In [None]:
import pandas as pd
import sys
sys.path.append('..')
sys.path.append('../utils')
from ETL_utils import*

# Understanding the data

Each user sample contains:
- 'ID': the ID from Twitter identifying the user.
- 'profile': the profile information obtained from Twitter API.
- 'tweet': the 200 most recent tweets of this user.
- 'neighbor': 20 randomly selected followers and followings of this user.
- 'domain': the domain(s) of this user (a user may belong to several); the possible domains are politics, business, entertainment and sports.
- 'label': the label of this user; '1' means it is a bot and '0' means it is a human.

# Create Dataset of tweets per user

In [None]:
# Tweets DataFrame: load the train split, then the test split, and stack
# them row-wise under a fresh 0..n-1 index.
df_tweets_train, df_tweets_test = (
    make_tweets_df(json_path)
    for json_path in (
        "../Datasets/Raw/Twi20/train.json",
        "../Datasets/Raw/Twi20/test.json",
    )
)
df_tweets = pd.concat(
    [df_tweets_train, df_tweets_test],
    axis=0,
    ignore_index=True,
)

In [None]:
# Preview the combined (train + test) tweets frame.
df_tweets

Unnamed: 0,tweet,ID
0,RT @CarnivalCruise: 🎉 Are you ready to see wha...,17461978
1,Who has time for receipts? Not me. @epson rece...,17461978
2,Steady wants to encourage you to invest in you...,17461978
3,"Good one, @rishid. But let’s see if y'all can ...",17461978
4,#lsunationalchamps\n,17461978
...,...,...
1598323,"Man, the 70s was a bad-looking decade. Take th...",3385331674
1598324,RT @RobinsonCano: The RC22 DREAM School is sta...,3385331674
1598325,RT @JonHeymanCBS: this is last season for @Don...,3385331674
1598326,RT @whitesox: #SoxSurprise! @BoJackson takes o...,3385331674


In [None]:
# Number of distinct user IDs that appear in the tweets frame.
df_tweets['ID'].nunique()

9461

# Get domain of interest of each user

In [None]:
# ID/domain DataFrame: load the train split, then the test split, and stack
# them row-wise under a fresh 0..n-1 index.
df_id_domain_train, df_id_domain_test = (
    create_id_domain_df(json_path)
    for json_path in (
        "../Datasets/Raw/Twi20/train.json",
        "../Datasets/Raw/Twi20/test.json",
    )
)
df_id_domain = pd.concat(
    [df_id_domain_train, df_id_domain_test],
    axis=0,
    ignore_index=True,
)

In [None]:
# Each 'domain' entry is a ', '-separated string: split it, flatten to one
# domain per row, and keep the distinct values.
domains_per_user = df_id_domain['domain'].str.split(', ')
all_domains = domains_per_user.explode().unique()
all_domains

array(['Politics', 'Business', 'Entertainment', 'Sports'], dtype=object)

In [None]:
# Preview the combined (train + test) ID -> domain frame.
df_id_domain

Unnamed: 0,ID,domain
0,17461978,"Politics, Business, Entertainment"
1,1297437077403885568,Politics
2,17685258,"Politics, Entertainment, Sports"
3,15750898,Politics
4,1659167666,Politics
...,...,...
9456,452754350,Sports
9457,850435801687183360,Sports
9458,2188795745,Sports
9459,940687680,Sports


# Create Dataset of profiles for each user

In [None]:
# List of the columns used to select specific data from a DataFrame.
# 'url' and 'entities' are currently excluded (kept below as comments).
USECOLS = [
    # Identifiers and names
    'id', 'id_str', 'name', 'screen_name',
    # Location and description
    'location', 'profile_location', 'description',
    # 'url',
    # 'entities',
    # Account flags, counters and settings
    'protected',
    'followers_count', 'friends_count', 'listed_count',
    'created_at', 'favourites_count',
    'utc_offset', 'time_zone', 'geo_enabled',
    'verified', 'statuses_count', 'lang',
    'contributors_enabled', 'is_translator', 'is_translation_enabled',
    # Profile background
    'profile_background_color',
    'profile_background_image_url',
    'profile_background_image_url_https',
    'profile_background_tile',
    # Profile image and colours
    'profile_image_url', 'profile_image_url_https',
    'profile_link_color',
    'profile_sidebar_border_color', 'profile_sidebar_fill_color',
    'profile_text_color', 'profile_use_background_image',
    # Extended/default profile flags
    'has_extended_profile', 'default_profile', 'default_profile_image',
]

In [None]:
# Profile DataFrame: load the USECOLS fields of the train split, then the
# test split, and stack them row-wise under a fresh 0..n-1 index.
df_profile_train, df_profile_test = (
    make_profile_df(json_path, usecols=USECOLS)
    for json_path in (
        "../Datasets/Raw/Twi20/train.json",
        "../Datasets/Raw/Twi20/test.json",
    )
)
df_profile = pd.concat(
    [df_profile_train, df_profile_test],
    axis=0,
    ignore_index=True,
)

In [None]:
# Drop 'id_str' when it is an exact duplicate of 'id'.
# Both columns must be present before comparing: once 'id' has been renamed
# to 'ID', re-running this cell would otherwise raise KeyError on
# df_profile['id'] whenever 'id_str' had been kept.
if {'id', 'id_str'} <= set(df_profile.columns) and df_profile['id'].equals(df_profile['id_str']):
    # Rebind instead of mutating in place so the cell stays safe to re-run.
    df_profile = df_profile.drop(columns=['id_str'])

# Rename the 'id' column to 'ID' (a no-op if it was already renamed).
df_profile = df_profile.rename(columns={'id': 'ID'})

In [None]:
# Show the first rows of the profile frame.
df_profile.head()

Unnamed: 0,ID,name,screen_name,location,profile_location,description,protected,followers_count,friends_count,listed_count,...,profile_image_url,profile_image_url_https,profile_link_color,profile_sidebar_border_color,profile_sidebar_fill_color,profile_text_color,profile_use_background_image,has_extended_profile,default_profile,default_profile_image
0,17461978,SHAQ,SHAQ,"Orlando, FL","{'id': '55b4f9e5c516e0b6', 'url': 'https://api...","VERY QUOTATIOUS, I PERFORM RANDOM ACTS OF SHAQ...",False,15349596,692,45568,...,http://pbs.twimg.com/profile_images/1673907275...,https://pbs.twimg.com/profile_images/167390727...,2FC2EF,181A1E,252429,666666,True,False,False,False
1,1297437077403885568,Jennifer Fishpaw,JenniferFishpaw,,,,False,0,44,0,...,http://pbs.twimg.com/profile_images/1297437406...,https://pbs.twimg.com/profile_images/129743740...,1DA1F2,C0DEED,DDEEF6,333333,True,True,True,False
2,17685258,Brad Parscale,parscale,Florida,,Owner @ Parscale Strategy. Senior Advisor Digi...,False,762839,475,3201,...,http://pbs.twimg.com/profile_images/1295453225...,https://pbs.twimg.com/profile_images/129545322...,AB2316,FFFFFF,FFFFFF,666666,False,False,False,False
3,15750898,FOX 13 Tampa Bay,FOX13News,"Tampa, FL",,Bringing you the important stuff like breaking...,False,327587,4801,1744,...,http://pbs.twimg.com/profile_images/1293193013...,https://pbs.twimg.com/profile_images/129319301...,0B2F8A,FFFFFF,E8EEF0,333333,True,False,False,False
4,1659167666,Vonte The Plug 🎤🔌,VonteThePlugNC,"Jacksonville Beach, FL","{'id': '5e281c17a74c170f', 'url': 'https://api...",MOTIVATION 3 OUT NOW 🔥 Singles: ‘Lil Shawdy’ &...,False,13324,647,44,...,http://pbs.twimg.com/profile_images/1181662400...,https://pbs.twimg.com/profile_images/118166240...,1DA1F2,C0DEED,DDEEF6,333333,True,False,True,False


In [None]:
# Count the distinct values in 'default_profile'
# (presumably a sanity check that the flag is binary).
df_profile['default_profile'].nunique()

2

In [None]:
# List the columns currently present in the profile frame.
df_profile.columns

Index(['ID', 'name', 'screen_name', 'location', 'profile_location',
       'description', 'protected', 'followers_count', 'friends_count',
       'listed_count', 'created_at', 'favourites_count', 'utc_offset',
       'time_zone', 'geo_enabled', 'verified', 'statuses_count', 'lang',
       'contributors_enabled', 'is_translator', 'is_translation_enabled',
       'profile_background_color', 'profile_background_image_url',
       'profile_background_image_url_https', 'profile_background_tile',
       'profile_image_url', 'profile_image_url_https', 'profile_link_color',
       'profile_sidebar_border_color', 'profile_sidebar_fill_color',
       'profile_text_color', 'profile_use_background_image',
       'has_extended_profile', 'default_profile', 'default_profile_image'],
      dtype='object')

In [None]:
# Number of distinct users in the profile frame.
# The key column is 'ID': 'id' was renamed to 'ID' in an earlier cell, so
# indexing by 'id' raises KeyError on a fresh top-to-bottom run.
df_profile['ID'].nunique()

9461

# Get following and followers for each user

In [None]:
# ID/followers/following DataFrame: load the train split, then the test
# split, stack them row-wise under a fresh 0..n-1 index, and display it.
df_id_neighbor_train, df_id_neighbor_test = (
    create_id_neighbor_df(json_path)
    for json_path in (
        "../Datasets/Raw/Twi20/train.json",
        "../Datasets/Raw/Twi20/test.json",
    )
)
df_id_neighbor = pd.concat(
    [df_id_neighbor_train, df_id_neighbor_test],
    axis=0,
    ignore_index=True,
)
df_id_neighbor

Unnamed: 0,ID,followers,following
0,17461978,,
1,1297437077403885568,[],"[170861207, 23970102, 47293791, 29458079, 1799..."
2,17685258,"[1275068515666386945, 2535843469, 129365759103...","[46464108, 21536398, 18643437, 589490020, 1363..."
3,15750898,"[855194021458739200, 1267566832598290432, 1290...","[2324715174, 24030137, 2336676015, 192684124, ..."
4,1659167666,"[893137540185718785, 1063858543, 26665819, 241...","[1628313708, 726405625, 130868956, 26652768, 3..."
...,...,...,...
9456,452754350,"[2308703630, 230020648, 20673104, 818336445102...","[2924422992, 2365623499, 3383893516, 304921770..."
9457,850435801687183360,"[333490198, 905966469929979904, 12875470492238...","[704144006129692674, 953363306244227072, 84551..."
9458,2188795745,"[249907794, 4843189571, 694904945393426432, 29...","[66762778, 2981733093, 186186153, 198600462, 7..."
9459,940687680,"[942435278, 280899355, 1262431498751184896, 13...","[559791853, 1008065499136249856, 107059213, 36..."


# Create the labels

In [None]:
# ID/label DataFrame: load the train split, then the test split, and stack
# them row-wise under a fresh 0..n-1 index.
df_id_label_train, df_id_label_test = (
    create_id_label_df(json_path)
    for json_path in (
        "../Datasets/Raw/Twi20/train.json",
        "../Datasets/Raw/Twi20/test.json",
    )
)
df_id_label = pd.concat(
    [df_id_label_train, df_id_label_test],
    axis=0,
    ignore_index=True,
)

In [None]:
# Preview the combined (train + test) ID -> label frame.
df_id_label

Unnamed: 0,ID,label
0,17461978,0
1,1297437077403885568,1
2,17685258,0
3,15750898,0
4,1659167666,1
...,...,...
9456,452754350,1
9457,850435801687183360,1
9458,2188795745,1
9459,940687680,1


# Save datasets

In [None]:
# Write each processed frame to its own Parquet file, in this order:
# label, tweets, neighbor, domain, profile.
parquet_targets = [
    (df_id_label, '../Datasets/Processed/ETL/Twi20_label.parquet'),
    (df_tweets, '../Datasets/Processed/ETL/Twi20_tweets.parquet'),
    (df_id_neighbor, '../Datasets/Processed/ETL/Twi20_neighbor.parquet'),
    (df_id_domain, '../Datasets/Processed/ETL/Twi20_domain.parquet'),
    (df_profile, '../Datasets/Processed/ETL/Twi20_profile.parquet'),
]
for frame_to_save, target_path in parquet_targets:
    frame_to_save.to_parquet(target_path)