In [75]:
import pandas as pd
import sys
import numpy as np
sys.path.append('../utils')
from ETL_utils import*

# Read Data

In [3]:
# Lendo cada arquivo parquet
df_id_label = pd.read_parquet('../Data/Twi20/ETL/Twi20_label.parquet')
df_tweets = pd.read_parquet('../Data/Twi20/ETL/Twi20_tweets.parquet')
df_id_neighbor = pd.read_parquet('../Data/Twi20/ETL/Twi20_neighbor.parquet')
df_id_domain = pd.read_parquet('../Data/Twi20/ETL/Twi20_domain.parquet')
df_profile = pd.read_parquet('../Data/Twi20/ETL/Twi20_profile.parquet')

## Twitter User Object Fields

Used link: https://developer.twitter.com/en/docs/twitter-api/v1/data-dictionary/object-model/user

Below are descriptions of the fields found in a Twitter user object:

- **ID (String)**: The string representation of the unique identifier for the User. Should be used instead of the integer id. Example:
  - `"ID": "6253282"`

- **name (String)**: The name of the user as they've defined it. This is not necessarily a person’s name. Example:
  - `"name": "Twitter API"`

- **screen_name (String)**: The screen name, handle, or alias of the user. Screen names are unique but subject to change. Example:
  - `"screen_name": "twitterapi"`

- **location (String)**: Nullable. The user-defined location for the account's profile, which is not guaranteed to be a real location. Example:
  - `"location": "San Francisco, CA"`


- **url (String)**: Nullable. A URL provided by the user in association with their profile. Example:
  - `"url": "https://developer.twitter.com"`

- **description (String)**: Nullable. A UTF-8 string describing the user's account. Example:
  - `"description": "The Real Twitter API."`

- **protected (Boolean)**: When `true`, indicates that the user has chosen to protect their Tweets. Example:
  - `"protected": true`

- **verified (Boolean)**: When `true`, indicates that the user has a verified account. Example:
  - `"verified": false`

- **followers_count (Int)**: The number of followers the user has. Example:
  - `"followers_count": 21`

- **friends_count (Int)**: The number of users the account is following. Example:
  - `"friends_count": 32`

- **listed_count (Int)**: The number of public lists the user is a member of. Example:
  - `"listed_count": 9274`

- **favourites_count (Int)**: The number of tweets the user has liked in the account’s lifetime. Example:
  - `"favourites_count": 13`

- **statuses_count (Int)**: The number of Tweets, including retweets, issued by the user. Example:
  - `"statuses_count": 42`

- **created_at (String)**: The UTC datetime the user account was created on Twitter. Example:
  - `"created_at": "Mon Nov 29 21:18:15 +0000 2010"`

- **profile_banner_url (String)**: A URL pointing to the user's profile banner. Example:
  - `"profile_banner_url": "https://si0.twimg.com/profile_banners/819797/1348102824"`

- **profile_image_url_https (String)**: A URL pointing to the user's profile image. Example:
  - `"profile_image_url_https": "https://abs.twimg.com/sticky/default_profile_images/default_profile_normal.png"`

- **default_profile (Boolean)**: Indicates if the user has the default theme or background. Example:
  - `"default_profile": false`

- **default_profile_image (Boolean)**: Indicates if the user has the default profile image. Example:
  - `"default_profile_image": false`

- **withheld_in_countries (Array of String)**: Lists country codes where the content is withheld. Example:
  - `"withheld_in_countries": ["GR", "HK", "MY"]`

- **withheld_scope (String)**: Indicates if the content withheld is a "user." Example:
  - `"withheld_scope": "user"`


In [5]:
df_id_domain

Unnamed: 0,ID,domain
0,17461978,"Politics, Business, Entertainment"
1,1297437077403885568,Politics
2,17685258,"Politics, Entertainment, Sports"
3,15750898,Politics
4,1659167666,Politics
...,...,...
9456,452754350,Sports
9457,850435801687183360,Sports
9458,2188795745,Sports
9459,940687680,Sports


## Columns to maintain

- ID
- name
- screen_name
- location
- description
- protection
- followers_count
- friends_count
- listed_count
- favourites_counts
- statused_count
- created_at
- geo_enabled
- verified
- profile_background_color
- profile_background_image_url_https
- profile_image_url_https
- profile_link_color
- profile_sidebar_border_color
- profile_sidebar_fill_color
- profile_text_color
- profile_use_background_image
- default_profile
- default_profile_image
- domain








In [66]:
columns_to_keep = [
    'ID', 'name', 'screen_name', 'location', 'description',
    'protected', 'followers_count', 'friends_count', 'listed_count',
    'favourites_count', 'statuses_count', 'created_at', 'geo_enabled',
    'verified', 'profile_background_color',
    'profile_background_image_url_https', 'profile_image_url_https',
    'profile_link_color', 'profile_sidebar_border_color',
    'profile_sidebar_fill_color', 'profile_text_color',
    'profile_use_background_image', 'default_profile',
    'default_profile_image']

# Filtrando o DataFrame para manter apenas as colunas selecionadas
df_profile = df_profile[columns_to_keep]

In [67]:
df_profile['domain'] = df_id_domain['domain']
df_profile['label'] = df_id_label['label']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_profile['domain'] = df_id_domain['domain']


### Convert True/False to Boolean

In [69]:
# Lista de colunas que contêm valores booleanos como strings
boolean_columns = ['protected', 'geo_enabled', 'verified', 
    'profile_use_background_image', 'default_profile', 
    'default_profile_image'
]

# Função para limpar a string e converter para booleano
def clean_and_convert_to_bool(value):
    if isinstance(value, str):
        return value.strip().lower() == 'true'
    return value

# Aplicando a função para cada coluna booleana
for column in boolean_columns:
    df_profile[column] = df_profile[column].apply(clean_and_convert_to_bool)

In [74]:
# Assuming your DataFrame is named df
rows_with_empty_location = df_profile['location'].apply(lambda x: x.strip() == '').sum()

# Or, if by "empty" you mean both NaN and empty strings:
rows_with_empty_location_or_blank = (df_profile['location'].isna() | (df_profile['location'] == '')).sum()

print(f"Number of rows with empty 'location': {rows_with_empty_location}")
print(f"Number of rows with empty or blank 'location': {rows_with_empty_location_or_blank}")

Number of rows with empty 'location': 2758
Number of rows with empty or blank 'location': 0


In [76]:
df_profile['location'].replace(r'^\s*$', np.nan, regex=True, inplace=True)

In [79]:
# Or, if by "empty" you mean both NaN and empty strings:
rows_with_empty_location_or_blank = (df_profile['location'].isna() | (df_profile['location'] == '')).sum()

print(f"Number of rows with empty or blank 'location': {rows_with_empty_location_or_blank}")

Number of rows with empty or blank 'location': 2758


In [71]:
df_profile.to_parquet('../Data/Twi20/ETL/ETL2_Twi20_profile.parquet')