In [18]:
import pandas as pd
from utilities import to_float, select_best_created_at, len_with_int, get_path
import numpy as np

In [19]:
DATA_PATH = get_path()

In [20]:
df_tweets = pd.read_csv(DATA_PATH + 'tweets.csv')

In [21]:
df_users = pd.read_csv(DATA_PATH + 'users.csv')

In [22]:
df_users.head()

Unnamed: 0,id,name,lang,bot,created_at,statuses_count
0,2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76.0
1,2358850842,Lourie Botton,en,0,2019-02-26 03:02:32,54.0
2,137959629,Dadan Syarifudin,en,1,2015-04-30 07:09:56,3.0
3,466124818,Carletto Focia,it,1,2017-01-18 02:49:18,50.0
4,2571493866,MBK Ebook,en,0,2019-06-18 19:30:21,7085.0


In [23]:
df_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11508 entries, 0 to 11507
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              11508 non-null  int64  
 1   name            11507 non-null  object 
 2   lang            11508 non-null  object 
 3   bot             11508 non-null  int64  
 4   created_at      11508 non-null  object 
 5   statuses_count  11109 non-null  float64
dtypes: float64(1), int64(2), object(3)
memory usage: 539.6+ KB


Check if any user record has a null value in it
Extract only record with a null name (id = 1535)

In [24]:
df_null = df_users.isnull()
idx_null = df_users.index[df_null["name"] == True].tolist()
df_users.iloc[idx_null]

Unnamed: 0,id,name,lang,bot,created_at,statuses_count
1535,2166124159,,en,0,2018-11-02 06:39:14,6566.0


In [25]:
df_tweets.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13664696 entries, 0 to 13664695
Data columns (total 10 columns):
 #   Column          Non-Null Count     Dtype 
---  ------          --------------     ----- 
 0   id              13664694 non-null  object
 1   user_id         13447413 non-null  object
 2   retweet_count   13227562 non-null  object
 3   reply_count     13016818 non-null  object
 4   favorite_count  13017154 non-null  object
 5   num_hashtags    12607172 non-null  object
 6   num_urls        13016073 non-null  object
 7   num_mentions    12810531 non-null  object
 8   created_at      13664696 non-null  object
 9   text            13126975 non-null  object
dtypes: object(10)
memory usage: 1.0+ GB


  df_tweets.info(null_counts=True)


In [26]:
df_tweets.size

136646960

## Check for duplicate records

In [27]:
df_tweets.duplicated().value_counts()

False    11712597
True      1952099
dtype: int64

#  Null user_id and text
We remove the records that have invalid user_id and text, since these records can't be used neither to analyze user behaviour, neither to perform any kind of topic analysis on the text. The only utility that they can have could be related to the twitter density for a given period of time, but we go for the deletion since 56280 is not a great amount of records w.r.t. the total number of tweets. Also, notice that the counters have several null values.

In [28]:
df_tweets = df_tweets.drop(df_tweets.index[df_tweets.user_id.isnull() & df_tweets.text.isnull()])

## File Checkpoint

In [29]:
df_tweets.to_csv(path_or_buf=DATA_PATH + 'tweets_no_dupl.csv', sep='#', index=False)

In [31]:
df_tweets = pd.read_csv(DATA_PATH + 'tweets_no_dupl.csv', sep='#')

## 1.1 Drop duplicates over all columns
First, we drop the duplicates by all attributes

In [32]:
df_tweets = df_tweets.drop_duplicates(keep='first')
df_tweets.duplicated().value_counts()

False    11656230
dtype: int64

In [33]:
df_tweets.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11656230 entries, 0 to 13608328
Data columns (total 10 columns):
 #   Column          Non-Null Count     Dtype 
---  ------          --------------     ----- 
 0   id              11656228 non-null  object
 1   user_id         11495314 non-null  object
 2   retweet_count   11223759 non-null  object
 3   reply_count     11038934 non-null  object
 4   favorite_count  11014021 non-null  object
 5   num_hashtags    10654681 non-null  object
 6   num_urls        11037813 non-null  object
 7   num_mentions    10857966 non-null  object
 8   created_at      11656230 non-null  object
 9   text            11182415 non-null  object
dtypes: object(10)
memory usage: 978.2+ MB


## File checkpoint

In [34]:
df_tweets.to_csv(path_or_buf=DATA_PATH + 'tweets_no_dupl_1.1.csv', sep='#', index=False)

In [35]:
df_tweets = pd.read_csv(DATA_PATH + 'tweets_no_dupl_1.1.csv', sep='#')

# Checking for duplicates on a subset of columns

### ID
The are a considerable amount of tweets with same id, but we can't consider them duplicates since the id has no semantic that we can check.

In [36]:
df_tweets.duplicated(subset=['id']).value_counts()

False    11618246
True        37984
dtype: int64

In [37]:
df_tweets.duplicated(subset=['id', 'user_id']).value_counts()

False    11656119
True          111
dtype: int64

In [38]:
df_tweets.duplicated(subset=['id', 'user_id', 'text']).value_counts()

False    11656218
True           12
dtype: int64

In [39]:
df_tweets.duplicated(subset=['id', 'user_id', 'created_at', 'text']).value_counts()

False    11656230
dtype: int64

# 1.2 ['user_id', 'text','created_at']
Considering the triple  `['user_id', 'text','created_at']`, we can see that about 10% of the data consists in duplicated records and we are not able to detect them if we include the `id` value in the duplicate definition.

In [40]:
duplicates_bool = df_tweets.duplicated(subset=['user_id', 'text', 'created_at'], keep=False)
df_tweets[duplicates_bool].info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2094281 entries, 6 to 11656225
Data columns (total 10 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   id              2094281 non-null  object
 1   user_id         2094281 non-null  object
 2   retweet_count   1798357 non-null  object
 3   reply_count     1798162 non-null  object
 4   favorite_count  1646543 non-null  object
 5   num_hashtags    1644687 non-null  object
 6   num_urls        1797622 non-null  object
 7   num_mentions    1797318 non-null  object
 8   created_at      2094281 non-null  object
 9   text            2082549 non-null  object
dtypes: object(10)
memory usage: 175.8+ MB


We can observe that the other columns of these duplicates have a considerable amount of null values. In order to keep the correct values, we proceed by performing a merge of the copies, keeping the non null value.
We first convert the counts and nums to numeric type, setting to -1 the meaningless values.
Then, the merge will proceed by taking the max over the duplicates.

In [41]:
for attr in ['retweet_count', 'reply_count', 'favorite_count', 'num_hashtags', 'num_urls', 'num_mentions', 'user_id']:
    df_tweets[attr + '_conv'] = df_tweets[attr].apply(to_float)

# recreate the index column for the groupby
df_tweets = df_tweets.reset_index()
df_tweets.index = df_tweets['index']

In [42]:
aggregated = df_tweets[duplicates_bool].groupby(['user_id', 'text', 'created_at']).agg(
    {'retweet_count_conv': max, 'reply_count_conv': max, 'favorite_count_conv': max,
     'num_hashtags_conv': max, 'num_urls_conv': max, 'num_mentions_conv': max,
     'id': tuple, 'index': min}).reset_index()

In [43]:
aggregated.index = aggregated['index']

In [44]:
attributes = ['retweet_count_conv', 'reply_count_conv', 'favorite_count_conv', 'num_hashtags_conv', 'num_mentions_conv',
              'num_urls_conv']
df_tweets.loc[aggregated.index, attributes] = aggregated[attributes]

In [45]:
# reset the -1 to NaN to check how many null values have been removed
for a in attributes:
    df_tweets[a] = df_tweets[a].replace(-1, np.NaN)

df_tweets[duplicates_bool].info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2094281 entries, 6 to 11656225
Data columns (total 18 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   index                2094281 non-null  int64  
 1   id                   2094281 non-null  object 
 2   user_id              2094281 non-null  object 
 3   retweet_count        1798357 non-null  object 
 4   reply_count          1798162 non-null  object 
 5   favorite_count       1646543 non-null  object 
 6   num_hashtags         1644687 non-null  object 
 7   num_urls             1797622 non-null  object 
 8   num_mentions         1797318 non-null  object 
 9   created_at           2094281 non-null  object 
 10  text                 2082549 non-null  object 
 11  retweet_count_conv   1873666 non-null  float64
 12  reply_count_conv     1873258 non-null  float64
 13  favorite_count_conv  1805122 non-null  float64
 14  num_hashtags_conv    1804057 non-null  float64
 1

In [46]:
df_tweets.drop_duplicates(['user_id', 'text', 'created_at'], inplace=True)

In [47]:
# check that the non nan copies have been kept
df_tweets[df_tweets.index.isin(aggregated.index)].info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1041126 entries, 6 to 11643564
Data columns (total 18 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   index                1041126 non-null  int64  
 1   id                   1041126 non-null  object 
 2   user_id              1041126 non-null  object 
 3   retweet_count        903874 non-null   object 
 4   reply_count          903763 non-null   object 
 5   favorite_count       832944 non-null   object 
 6   num_hashtags         832871 non-null   object 
 7   num_urls             903390 non-null   object 
 8   num_mentions         903277 non-null   object 
 9   created_at           1041126 non-null  object 
 10  text                 1041126 non-null  object 
 11  retweet_count_conv   1041126 non-null  float64
 12  reply_count_conv     1041126 non-null  float64
 13  favorite_count_conv  1041126 non-null  float64
 14  num_hashtags_conv    1041126 non-null  float64
 1

Everything seems fine, so we rename the 'conv' columns to the original name and drop the uncleaned ones.

In [48]:
dropped = df_tweets.drop(
    columns=['retweet_count', 'reply_count', 'favorite_count', 'num_hashtags', 'num_urls', 'num_mentions', 'index'])

In [49]:
df_tweets = dropped.rename(columns={'retweet_count_conv': 'retweet_count',
                                    'reply_count_conv': 'reply_count',
                                    'favorite_count_conv': 'favorite_count',
                                    'num_hashtags_conv': 'num_hashtags',
                                    'num_urls_conv': 'num_urls',
                                    'num_mentions_conv': 'num_mentions'})

In [50]:
df_tweets = df_tweets.reset_index()
df_tweets = df_tweets.drop(columns=['index'])

In [51]:
df_tweets.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10608879 entries, 0 to 10608878
Data columns (total 11 columns):
 #   Column          Non-Null Count     Dtype  
---  ------          --------------     -----  
 0   id              10608877 non-null  object 
 1   user_id         10447963 non-null  object 
 2   created_at      10608879 non-null  object 
 3   text            10140992 non-null  object 
 4   retweet_count   10402486 non-null  float64
 5   reply_count     10243011 non-null  float64
 6   favorite_count  10348371 non-null  float64
 7   num_hashtags    10040725 non-null  float64
 8   num_urls        10242724 non-null  float64
 9   num_mentions    10087919 non-null  float64
 10  user_id_conv    10608879 non-null  float64
dtypes: float64(7), object(4)
memory usage: 890.3+ MB


## File checkpoint

In [52]:
df_tweets.to_csv(path_or_buf=DATA_PATH + 'tweets_no_dupl_user_text_createdat.csv', sep='#', index=False)

In [53]:
df_tweets = pd.read_csv(DATA_PATH + 'tweets_no_dupl_user_text_createdat.csv', sep='#')

# Check how many duplicates we have for each attribute

In [54]:
df_tweets.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10608879 entries, 0 to 10608878
Data columns (total 11 columns):
 #   Column          Non-Null Count     Dtype  
---  ------          --------------     -----  
 0   id              10608877 non-null  object 
 1   user_id         10447963 non-null  object 
 2   created_at      10608879 non-null  object 
 3   text            10140992 non-null  object 
 4   retweet_count   10402486 non-null  float64
 5   reply_count     10243011 non-null  float64
 6   favorite_count  10348371 non-null  float64
 7   num_hashtags    10040725 non-null  float64
 8   num_urls        10242724 non-null  float64
 9   num_mentions    10087919 non-null  float64
 10  user_id_conv    10608879 non-null  float64
dtypes: float64(7), object(4)
memory usage: 890.3+ MB


In [55]:
df_tweets[df_tweets.user_id.isnull() & df_tweets.text.isnull()].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              0 non-null      object 
 1   user_id         0 non-null      object 
 2   created_at      0 non-null      object 
 3   text            0 non-null      object 
 4   retweet_count   0 non-null      float64
 5   reply_count     0 non-null      float64
 6   favorite_count  0 non-null      float64
 7   num_hashtags    0 non-null      float64
 8   num_urls        0 non-null      float64
 9   num_mentions    0 non-null      float64
 10  user_id_conv    0 non-null      float64
dtypes: float64(7), object(4)
memory usage: 0.0+ bytes


In [56]:
df_tweets.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10608879 entries, 0 to 10608878
Data columns (total 11 columns):
 #   Column          Non-Null Count     Dtype  
---  ------          --------------     -----  
 0   id              10608877 non-null  object 
 1   user_id         10447963 non-null  object 
 2   created_at      10608879 non-null  object 
 3   text            10140992 non-null  object 
 4   retweet_count   10402486 non-null  float64
 5   reply_count     10243011 non-null  float64
 6   favorite_count  10348371 non-null  float64
 7   num_hashtags    10040725 non-null  float64
 8   num_urls        10242724 non-null  float64
 9   num_mentions    10087919 non-null  float64
 10  user_id_conv    10608879 non-null  float64
dtypes: float64(7), object(4)
memory usage: 890.3+ MB


# 1.3 [text, created_at]
We now consider the situation where `text` and `created_at` are equal among different records. Despite is certainly possible that two users post the same text at the same time, the situation is suspicious, and we check the possibility that the two records are the same tweet from the same user, but with mispelled `user_id`.
For this analysis we consider only the tweets with non-null text.

In [57]:
duplicates_bool = ~df_tweets.text.isnull() & df_tweets.duplicated(subset=['text', 'created_at'], keep=False)
duplicates_bool.value_counts()

False    9962254
True      646625
dtype: int64

In [58]:
df_tweets[duplicates_bool & (df_tweets.user_id_conv == -1)].info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 321006 entries, 83 to 10608844
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id              321006 non-null  object 
 1   user_id         160090 non-null  object 
 2   created_at      321006 non-null  object 
 3   text            321006 non-null  object 
 4   retweet_count   230806 non-null  float64
 5   reply_count     230909 non-null  float64
 6   favorite_count  184227 non-null  float64
 7   num_hashtags    184243 non-null  float64
 8   num_urls        230847 non-null  float64
 9   num_mentions    231042 non-null  float64
 10  user_id_conv    321006 non-null  float64
dtypes: float64(7), object(4)
memory usage: 29.4+ MB


We consider a NaN or alphanumeric `user_id` to be wrong, since the correct format is the one with only numerical characters.
Out of the 646625 duplicates by `[text,created_at]`, 321006 have wrong `user_id`.
We proceed by keeping the copy with the right `user_id`, and selecting the counter attributes by a max operation, as we did at step **1.2**.

In [59]:
for a in ['retweet_count', 'reply_count', 'favorite_count', 'num_hashtags', 'num_urls', 'num_mentions']:
    df_tweets[a] = df_tweets[a].replace(np.NaN, -1)

In [60]:
# recreate the index column for the groupby
df_tweets = df_tweets.reset_index()
df_tweets.index = df_tweets['index']

In [61]:
def select_best_user_id(user_ids):
    greater_than_zero = []
    smaller_than_zero = []
    for id in user_ids:
        if id > 0:
            greater_than_zero.append(id)
        else:
            smaller_than_zero.append(id)

    # if all but one are < 0, keep the one that is > 0
    if len(greater_than_zero) == 1:
        return greater_than_zero[0]
    if len(greater_than_zero) > 1:
        return tuple(greater_than_zero)

In [62]:
aggregated = df_tweets[duplicates_bool].groupby(['text', 'created_at']).agg(
    {'retweet_count': max, 'reply_count': max, 'favorite_count': max,
     'num_hashtags': max, 'num_urls': max, 'num_mentions': max,
     'user_id_conv': select_best_user_id,
     'index': min}).reset_index()

In [63]:
aggregated.index = aggregated['index']

In [64]:
tweets_with_valid_user_id = aggregated.user_id_conv.apply(type) == float
tweets_with_valid_user_id.value_counts()

True     320882
False      2339
Name: user_id_conv, dtype: int64

In [65]:
aggregated_to_replace = aggregated[tweets_with_valid_user_id]

In [66]:
attributes = ['retweet_count', 'reply_count', 'favorite_count', 'num_hashtags', 'num_mentions', 'num_urls',
              'user_id_conv']
df_tweets.loc[aggregated_to_replace.index, attributes] = aggregated_to_replace[attributes]

In [67]:
indices_to_drop = duplicates_bool & ~df_tweets.index.isin(aggregated.index)
indices_to_drop.value_counts()

False    10285475
True       323404
dtype: int64

In [68]:
df_tweets = df_tweets[~indices_to_drop]

In [69]:
df_tweets = df_tweets.drop(columns=['index'])

In [70]:
df_tweets = df_tweets.reset_index()

In [71]:
df_tweets.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10285475 entries, 0 to 10285474
Data columns (total 12 columns):
 #   Column          Non-Null Count     Dtype  
---  ------          --------------     -----  
 0   index           10285475 non-null  int64  
 1   id              10285473 non-null  object 
 2   user_id         10210489 non-null  object 
 3   created_at      10285475 non-null  object 
 4   text            9817588 non-null   object 
 5   retweet_count   10285475 non-null  float64
 6   reply_count     10285475 non-null  float64
 7   favorite_count  10285475 non-null  float64
 8   num_hashtags    10285475 non-null  float64
 9   num_urls        10285475 non-null  float64
 10  num_mentions    10285475 non-null  float64
 11  user_id_conv    10285475 non-null  object 
dtypes: float64(6), int64(1), object(5)
memory usage: 941.7+ MB


## File checkpoint

In [72]:
df_tweets.to_csv(path_or_buf=DATA_PATH + 'tweets_no_dupl_1.3.csv', sep='#', index=False)

In [73]:
df_tweets = pd.read_csv(DATA_PATH + 'tweets_no_dupl_1.3.csv', sep='#')

## Additional checks of duplicates

In [74]:
for attr in ['id', ['id', 'user_id'], ['id', 'created_at'], ['id', 'user_id', 'created_at'], ['user_id', 'text'],
             ['id', 'user_id', 'text']]:
    counts = df_tweets.duplicated(attr).value_counts()
    if len(counts) > 1:
        dupl_count = counts[1]
    else:
        dupl_count = 0
    print(f"{attr} {dupl_count}")

id 28418
['id', 'user_id'] 38
['id', 'created_at'] 0
['id', 'user_id', 'created_at'] 0
['user_id', 'text'] 1389039
['id', 'user_id', 'text'] 12


# 1.4 [user_id,text]

In [75]:
df_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10285475 entries, 0 to 10285474
Data columns (total 12 columns):
 #   Column          Dtype  
---  ------          -----  
 0   index           int64  
 1   id              object 
 2   user_id         object 
 3   created_at      object 
 4   text            object 
 5   retweet_count   float64
 6   reply_count     float64
 7   favorite_count  float64
 8   num_hashtags    float64
 9   num_urls        float64
 10  num_mentions    float64
 11  user_id_conv    float64
dtypes: float64(7), int64(1), object(4)
memory usage: 941.7+ MB


The duplicates by ['user_id', 'text'] are 1389039, which is a considerable amount and is worth further investigation. It's surely possible that a user, especially if it is a bot, tweets many times the same text. What we want to check is just that among these duplicates all the dates are valid, which is, in a range that goes from the tweeter foundation up to september 2022. If the dates are valid, we keep the tweets. Otherwise it is considered a noisy duplicate and is removed.

In [76]:
twitter_foundation = pd.to_datetime(["20060321"]).astype(np.int64)[0]
sep_2022 = pd.to_datetime(["20220915"]).astype(np.int64)[0]

In [77]:
def wrong_date(date):
    return (date < twitter_foundation) | (date > sep_2022)

In [78]:
df_tweets['created_at_conv'] = pd.to_datetime(df_tweets['created_at']).astype(np.int64)

In [79]:
duplicates_bool = df_tweets.duplicated(['user_id', 'text'], keep=False)
duplicates_bool.value_counts()

False    8259264
True     2026211
dtype: int64

In [80]:
for attr in ['retweet_count', 'reply_count', 'favorite_count', 'num_hashtags', 'num_mentions', 'num_urls']:
    df_tweets[attr].replace(-1, np.NaN)

df_tweets[duplicates_bool][wrong_date(df_tweets.created_at_conv)].info(show_counts=True)

  df_tweets[duplicates_bool][wrong_date(df_tweets.created_at_conv)].info(show_counts=True)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 97602 entries, 222 to 10285466
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   index            97602 non-null  int64  
 1   id               97602 non-null  object 
 2   user_id          97602 non-null  object 
 3   created_at       97602 non-null  object 
 4   text             97236 non-null  object 
 5   retweet_count    97602 non-null  float64
 6   reply_count      97602 non-null  float64
 7   favorite_count   97602 non-null  float64
 8   num_hashtags     97602 non-null  float64
 9   num_urls         97602 non-null  float64
 10  num_mentions     97602 non-null  float64
 11  user_id_conv     97602 non-null  float64
 12  created_at_conv  97602 non-null  int64  
dtypes: float64(7), int64(2), object(4)
memory usage: 10.4+ MB


We will not aggregate the counters using a max operator as in previous steps, since the multiple "copies" in this case could be just a periodic tweet. Furthermore, there is no null value in the counters.
Given that there is no null value in the other fields, we will simply remove the records having a wrong date.

In [81]:
df_tweets[duplicates_bool].created_at_conv.apply(wrong_date).value_counts()

False    1928609
True       97602
Name: created_at_conv, dtype: int64

In these duplicates 97602 records have wrong created_at.

In [82]:
indices_to_drop = ~(duplicates_bool & df_tweets[duplicates_bool].created_at_conv.apply(wrong_date))
indices_to_drop.value_counts()

True     10187873
False       97602
dtype: int64

In [83]:
df_tweets = df_tweets[indices_to_drop]

## File checkpoint

In [84]:
df_tweets = df_tweets.drop(columns='index').reset_index()
df_tweets.to_csv(path_or_buf=DATA_PATH + 'tweets_no_dupl_1.4.csv', sep='#', index=False)

In [85]:
df_tweets = pd.read_csv(DATA_PATH + 'tweets_no_dupl_1.4.csv', sep='#')

# 1.5 [user_id,created_at]

In [86]:
for attr in ['id', ['id', 'user_id'], ['id', 'created_at'], ['user_id', 'created_at'], ['created_at', 'text'],
             ['id', 'user_id', 'created_at'], ['user_id', 'text'], ['id', 'user_id', 'text']]:
    counts = df_tweets.duplicated(attr).value_counts()
    if len(counts) > 1:
        dupl_count = counts[1]
    else:
        dupl_count = 0
    print(f"{attr} {dupl_count}")

id 28402
['id', 'user_id'] 38
['id', 'created_at'] 0
['user_id', 'created_at'] 403327
['created_at', 'text'] 5946
['id', 'user_id', 'created_at'] 0
['user_id', 'text'] 1291437
['id', 'user_id', 'text'] 12


In [87]:
duplicates_bool = df_tweets.duplicated(['user_id', 'created_at'], keep=False) & ~df_tweets.user_id.isnull()
duplicates_bool.value_counts()

False    9394762
True      793111
dtype: int64

The duplicates that have non-null text also have non-null counters

In [88]:
df_tweets[duplicates_bool & ~df_tweets.text.isnull()].info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 420116 entries, 2 to 10187871
Data columns (total 13 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   index            420116 non-null  int64  
 1   id               420116 non-null  object 
 2   user_id          420116 non-null  object 
 3   created_at       420116 non-null  object 
 4   text             420116 non-null  object 
 5   retweet_count    420116 non-null  float64
 6   reply_count      420116 non-null  float64
 7   favorite_count   420116 non-null  float64
 8   num_hashtags     420116 non-null  float64
 9   num_urls         420116 non-null  float64
 10  num_mentions     420116 non-null  float64
 11  user_id_conv     420116 non-null  float64
 12  created_at_conv  420116 non-null  int64  
dtypes: float64(7), int64(2), object(4)
memory usage: 44.9+ MB


In [89]:
df_tweets[duplicates_bool & df_tweets.text.isnull()].info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 372995 entries, 32 to 10187870
Data columns (total 13 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   index            372995 non-null  int64  
 1   id               372993 non-null  object 
 2   user_id          372995 non-null  object 
 3   created_at       372995 non-null  object 
 4   text             0 non-null       object 
 5   retweet_count    372995 non-null  float64
 6   reply_count      372995 non-null  float64
 7   favorite_count   372995 non-null  float64
 8   num_hashtags     372995 non-null  float64
 9   num_urls         372995 non-null  float64
 10  num_mentions     372995 non-null  float64
 11  user_id_conv     372995 non-null  float64
 12  created_at_conv  372995 non-null  int64  
dtypes: float64(7), int64(2), object(4)
memory usage: 39.8+ MB


The duplicates removal procedeed with the merge procedure as in 1.2

In [90]:
for a in ['retweet_count', 'reply_count', 'favorite_count', 'num_hashtags', 'num_urls', 'num_mentions']:
    df_tweets[a] = df_tweets[a].replace(np.NaN, -1)

In [91]:
def select_non_null_text(texts):
    nonnull_texts = []
    for text in texts:
        if ~pd.Series(text).isnull()[0]:
            nonnull_texts.append(text)
    if len(nonnull_texts) == 1:
        return nonnull_texts[0]
    elif len(nonnull_texts) == 0:
        return np.NaN
    elif len(nonnull_texts) > 1:
        return nonnull_texts

In [92]:
df_tweets = df_tweets.drop(columns='index').reset_index()

In [93]:
groupby_userid_createdat = df_tweets[duplicates_bool].groupby(['user_id', 'created_at'])

In [94]:
aggregated = groupby_userid_createdat.agg(
    retweet_count=('retweet_count', max),
    reply_count=('reply_count', max),
    favorite_count=('favorite_count', max),
    num_hashtags=('num_hashtags', max),
    num_urls=('num_urls', max),
    num_mentions=('num_mentions', max),
    text=('text', select_non_null_text),
    Keep_index=('index', min),
    all_indices=('index', list)).reset_index()

In [95]:
aggregated

Unnamed: 0,user_id,created_at,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,text,Keep_index,all_indices
0,100043740,2019-04-13 00:53:45,5.0,0.0,8.0,0.0,0.0,1.0,Monacan Girls Soccer falls 3-0 to Midlothian a...,1858549,"[1858549, 2268739]"
1,100043740,2019-04-15 00:18:50,0.0,0.0,0.0,0.0,0.0,1.0,@Natty032,24709,"[24709, 8952293]"
2,100043740,2019-04-18 21:20:51,1.0,0.0,0.0,0.0,0.0,1.0,RT @The_Messiah_57: Neymar tricked it,976287,"[976287, 6317603]"
3,100043740,2019-04-18 23:04:51,0.0,0.0,1.0,0.0,0.0,1.0,@THE_thompson5 it's what I would envision you ...,3137394,"[3137394, 9786760]"
4,100043740,2019-04-21 22:07:28,0.0,0.0,0.0,0.0,0.0,1.0,@NygelLee_20 could say the same about KD could...,4331051,"[4331051, 9453499]"
...,...,...,...,...,...,...,...,...,...,...,...
389923,999662546,2020-03-23 05:49:37,2.0,0.0,0.0,0.0,0.0,1.0,RT @laurnaylor: FAB FILS ARE NATIONAL CHAMPS Y...,708890,"[708890, 2935888]"
389924,999662546,2020-03-23 05:51:07,25.0,0.0,0.0,0.0,0.0,1.0,RT @tristankennedy8: NATIONAL CHAMPIONS,3315989,"[3315989, 8115935]"
389925,999662546,2020-03-23 05:53:07,18.0,0.0,0.0,1.0,0.0,1.0,RT @klrenfro: So proud of all they accomplishe...,6253564,"[6253564, 9858793]"
389926,999662546,2020-03-30 20:48:54,1.0,0.0,1.0,0.0,0.0,1.0,@brianandrews43 so proud of you!,373820,"[373820, 8886312]"


In [96]:
aggregated.index = aggregated['Keep_index']

In [97]:
aggregated['text_type'] = aggregated.text.apply(type)
aggregated.text_type.value_counts()

<class 'str'>     371323
<class 'list'>     18605
Name: text_type, dtype: int64

In [98]:
aggregated_to_replace = aggregated[aggregated.text_type == str]

In [99]:
aggregated[aggregated.text_type == list].all_indices

Keep_index
2273218             [2273218, 9079798]
3837342             [3837342, 9026570]
2625568             [2625568, 4712729]
4828552             [4828552, 8507617]
495103               [495103, 2422781]
                      ...             
9789457            [9789457, 10092639]
6234073             [6234073, 8993786]
2823175    [2823175, 7817741, 9073954]
5082524             [5082524, 8776950]
264846                [264846, 339303]
Name: all_indices, Length: 18605, dtype: object

In [100]:
not_replaceable = np.concatenate(aggregated[aggregated.text_type == list].all_indices.values)

In [101]:
attributes = ['retweet_count', 'reply_count', 'favorite_count', 'num_hashtags', 'num_mentions', 'num_urls', 'text']
df_tweets.loc[aggregated_to_replace.index, attributes] = aggregated_to_replace[attributes]

In [102]:
indices_to_keep = ~(
            duplicates_bool.index.isin(aggregated_to_replace.index) & duplicates_bool.index.isin(not_replaceable))
pd.Series(indices_to_drop).value_counts()

True     10187873
False       97602
dtype: int64

In [103]:
df_tweets = df_tweets[indices_to_keep]

In [104]:
df_tweets = df_tweets.reset_index()
df_tweets = df_tweets.drop(columns=['index'])

In [105]:
df_tweets.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10187873 entries, 0 to 10187872
Data columns (total 13 columns):
 #   Column           Non-Null Count     Dtype  
---  ------           --------------     -----  
 0   level_0          10187873 non-null  int64  
 1   id               10187871 non-null  object 
 2   user_id          10112887 non-null  object 
 3   created_at       10187873 non-null  object 
 4   text             9894377 non-null   object 
 5   retweet_count    10187873 non-null  float64
 6   reply_count      10187873 non-null  float64
 7   favorite_count   10187873 non-null  float64
 8   num_hashtags     10187873 non-null  float64
 9   num_urls         10187873 non-null  float64
 10  num_mentions     10187873 non-null  float64
 11  user_id_conv     10187873 non-null  float64
 12  created_at_conv  10187873 non-null  int64  
dtypes: float64(7), int64(2), object(4)
memory usage: 1010.5+ MB


## File checkpoint

In [106]:
df_tweets.to_csv(path_or_buf=DATA_PATH + 'tweets_no_dupl_1.5.csv', sep='#', index=False)

In [107]:
df_tweets = pd.read_csv(DATA_PATH + 'tweets_no_dupl_1.5.csv', sep='#')

# Last checks

In [108]:
for attr in ['id', ['id', 'user_id'], ['id', 'created_at'], ['id', 'text'], ['id', 'user_id', 'text'], ['id','user_id','created_at'], \
            ['user_id', 'created_at'], ['user_id', 'text'], ['created_at', 'text']]:
    counts = df_tweets.duplicated(attr).value_counts()
    if len(counts) > 1:
        dupl_count = counts[1]
    else:
        dupl_count = 0
    print(f"{attr} {dupl_count}")

id 28402
['id', 'user_id'] 38
['id', 'created_at'] 0
['id', 'text'] 10592
['id', 'user_id', 'text'] 3
['id', 'user_id', 'created_at'] 0
['user_id', 'created_at'] 403327
['user_id', 'text'] 1292350
['created_at', 'text'] 177003


In [109]:
df_tweets[df_tweets.duplicated(['id', 'user_id', 'text'], keep=False)].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6 entries, 6361653 to 10093503
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   level_0          6 non-null      int64  
 1   id               6 non-null      object 
 2   user_id          6 non-null      object 
 3   created_at       6 non-null      object 
 4   text             0 non-null      object 
 5   retweet_count    6 non-null      float64
 6   reply_count      6 non-null      float64
 7   favorite_count   6 non-null      float64
 8   num_hashtags     6 non-null      float64
 9   num_urls         6 non-null      float64
 10  num_mentions     6 non-null      float64
 11  user_id_conv     6 non-null      float64
 12  created_at_conv  6 non-null      int64  
dtypes: float64(7), int64(2), object(4)
memory usage: 672.0+ bytes
