In [1]:
import pandas as pd
from utilities import to_float
import numpy as np

In [2]:
DATA_PATH = 'data/'

In [3]:
df_tweets = pd.read_csv(DATA_PATH+'tweets.csv')

In [4]:
df_users = pd.read_csv(DATA_PATH+'users.csv')

In [5]:
df_users.head()

Unnamed: 0,id,name,lang,bot,created_at,statuses_count
0,2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76.0
1,2358850842,Lourie Botton,en,0,2019-02-26 03:02:32,54.0
2,137959629,Dadan Syarifudin,en,1,2015-04-30 07:09:56,3.0
3,466124818,Carletto Focia,it,1,2017-01-18 02:49:18,50.0
4,2571493866,MBK Ebook,en,0,2019-06-18 19:30:21,7085.0


In [6]:
df_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11508 entries, 0 to 11507
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              11508 non-null  int64  
 1   name            11507 non-null  object 
 2   lang            11508 non-null  object 
 3   bot             11508 non-null  int64  
 4   created_at      11508 non-null  object 
 5   statuses_count  11109 non-null  float64
dtypes: float64(1), int64(2), object(3)
memory usage: 539.6+ KB


Check if any user record has a null value in it
Extract only record with a null name (id = 1535)

In [7]:
df_null = df_users.isnull()
idx_null = df_users.index[df_null["name"] == True].tolist()
df_users.iloc[idx_null]

Unnamed: 0,id,name,lang,bot,created_at,statuses_count
1535,2166124159,,en,0,2018-11-02 06:39:14,6566.0


In [8]:
df_tweets.info(null_counts=True)

  df_tweets.info(null_counts=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13664696 entries, 0 to 13664695
Data columns (total 10 columns):
 #   Column          Non-Null Count     Dtype 
---  ------          --------------     ----- 
 0   id              13664694 non-null  object
 1   user_id         13447413 non-null  object
 2   retweet_count   13227562 non-null  object
 3   reply_count     13016818 non-null  object
 4   favorite_count  13017154 non-null  object
 5   num_hashtags    12607172 non-null  object
 6   num_urls        13016073 non-null  object
 7   num_mentions    12810531 non-null  object
 8   created_at      13664696 non-null  object
 9   text            13126975 non-null  object
dtypes: object(10)
memory usage: 1.0+ GB


In [9]:
df_tweets.size

136646960

## Verifica esistenza record duplicati

In [10]:
df_tweets.duplicated().value_counts()

False    11712597
True      1952099
dtype: int64

## 1.1 Drop duplicates over all columns
First, we drop the duplicates by all attributes

In [11]:
df_tweets = df_tweets.drop_duplicates(keep = 'first')
df_tweets.duplicated().value_counts()

False    11712597
dtype: int64

In [12]:
df_tweets.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11712597 entries, 0 to 13664695
Data columns (total 10 columns):
 #   Column          Non-Null Count     Dtype 
---  ------          --------------     ----- 
 0   id              11712595 non-null  object
 1   user_id         11495314 non-null  object
 2   retweet_count   11275463 non-null  object
 3   reply_count     11064719 non-null  object
 4   favorite_count  11065055 non-null  object
 5   num_hashtags    10655073 non-null  object
 6   num_urls        11063974 non-null  object
 7   num_mentions    10858432 non-null  object
 8   created_at      11712597 non-null  object
 9   text            11182415 non-null  object
dtypes: object(10)
memory usage: 983.0+ MB


## File checkpoint

In [13]:
df_tweets.to_csv(path_or_buf=DATA_PATH+'tweets_no_dupl_1.1.csv', sep='#', index=False)

In [14]:
df_tweets = pd.read_csv(DATA_PATH+'tweets_no_dupl_1.1.csv', sep='#')

# Verifica di duplicati su un sottoinsieme di colonne

### ID
The are a considerable amount of tweets with same id, but we can't consider them duplicates since the id has no semantic that we can check.

In [15]:
df_tweets.duplicated(subset=['id']).value_counts()

False    11672136
True        40461
dtype: int64

In [16]:
df_tweets.duplicated(subset=['id', 'user_id']).value_counts()

False    11712237
True          360
dtype: int64

In [17]:
df_tweets.duplicated(subset=['id', 'user_id', 'text']).value_counts()

False    11712336
True          261
dtype: int64

In [18]:
df_tweets.duplicated(subset=['id', 'user_id', 'created_at', 'text']).value_counts()

False    11712597
dtype: int64

# 1.2 ['user_id', 'text','created_at']
Considering the triple  `['user_id', 'text','created_at']`, we can see that about 10% of the data consists in duplicated records and we are not able to detect them if we include the `id` value in the duplicate definition.

In [21]:
duplicates_bool = df_tweets.duplicated(subset=['user_id', 'text','created_at'], keep=False)
df_tweets[duplicates_bool].info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2094454 entries, 6 to 11712592
Data columns (total 10 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   id              2094454 non-null  object
 1   user_id         2094281 non-null  object
 2   retweet_count   1798521 non-null  object
 3   reply_count     1798239 non-null  object
 4   favorite_count  1646696 non-null  object
 5   num_hashtags    1644691 non-null  object
 6   num_urls        1797710 non-null  object
 7   num_mentions    1797322 non-null  object
 8   created_at      2094454 non-null  object
 9   text            2082549 non-null  object
dtypes: object(10)
memory usage: 175.8+ MB


We can observe that the other columns of these duplicates have a considerable amount of null values. In order to keep the correct values, we proceed by performing a merge of the copies, keeping the non null value.
We first convert the counts and nums to numeric type, setting to -1 the meaningless values.
Then, the merge will proceed by taking the max over the duplicates.

In [22]:
for attr in ['retweet_count','reply_count','favorite_count','num_hashtags','num_urls','num_mentions','user_id']:
    df_tweets[attr+'_conv'] = df_tweets[attr].apply(to_float)

# recreate the index column for the groupby
df_tweets = df_tweets.reset_index()
df_tweets.index = df_tweets['index']

In [23]:
aggregated = df_tweets[duplicates_bool]\
    .groupby(['user_id', 'text','created_at'])\
    .agg({'retweet_count_conv':max,'reply_count_conv':max,'favorite_count_conv':max,
          'num_hashtags_conv':max,'num_urls_conv':max,'num_mentions_conv':max,
          'id':tuple, 'index':min}).reset_index()

In [24]:
aggregated.index = aggregated['index']

In [25]:
attributes=['retweet_count_conv','reply_count_conv','favorite_count_conv','num_hashtags_conv','num_mentions_conv','num_urls_conv']
df_tweets.loc[aggregated.index,attributes] = aggregated[attributes]

In [26]:
# reset the -1 to NaN to check how many null values have been removed
for a in attributes:
    df_tweets[a] = df_tweets[a].replace(-1, np.NaN)

df_tweets[duplicates_bool].info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2094454 entries, 6 to 11712592
Data columns (total 18 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   index                2094454 non-null  int64  
 1   id                   2094454 non-null  object 
 2   user_id              2094281 non-null  object 
 3   retweet_count        1798521 non-null  object 
 4   reply_count          1798239 non-null  object 
 5   favorite_count       1646696 non-null  object 
 6   num_hashtags         1644691 non-null  object 
 7   num_urls             1797710 non-null  object 
 8   num_mentions         1797322 non-null  object 
 9   created_at           2094454 non-null  object 
 10  text                 2082549 non-null  object 
 11  retweet_count_conv   1873818 non-null  float64
 12  reply_count_conv     1873331 non-null  float64
 13  favorite_count_conv  1805259 non-null  float64
 14  num_hashtags_conv    1804061 non-null  float64
 1

In [28]:
df_tweets.drop_duplicates(['user_id', 'text','created_at'], inplace=True)

In [29]:
# check that the non nan copies have been kept
df_tweets[df_tweets.index.isin(aggregated.index)].info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1041126 entries, 6 to 11699854
Data columns (total 18 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   index                1041126 non-null  int64  
 1   id                   1041126 non-null  object 
 2   user_id              1041126 non-null  object 
 3   retweet_count        903874 non-null   object 
 4   reply_count          903763 non-null   object 
 5   favorite_count       832944 non-null   object 
 6   num_hashtags         832871 non-null   object 
 7   num_urls             903390 non-null   object 
 8   num_mentions         903277 non-null   object 
 9   created_at           1041126 non-null  object 
 10  text                 1041126 non-null  object 
 11  retweet_count_conv   1041126 non-null  float64
 12  reply_count_conv     1041126 non-null  float64
 13  favorite_count_conv  1041126 non-null  float64
 14  num_hashtags_conv    1041126 non-null  float64
 1

Everything seems fine, so we rename the 'conv' columns to the original name and drop the uncleaned ones.

In [30]:
dropped = df_tweets.drop(columns=['retweet_count','reply_count','favorite_count','num_hashtags','num_urls','num_mentions','index'])

In [34]:
df_tweets = dropped.rename(columns={'retweet_count_conv':'retweet_count',
                                          'reply_count_conv':'reply_count',
                                          'favorite_count_conv':'favorite_count',
                                          'num_hashtags_conv':'num_hashtags',
                                          'num_urls_conv':'num_urls',
                                          'num_mentions_conv':'num_mentions'})

In [32]:
df_tweets = df_tweets.reset_index()
df_tweets = df_tweets.drop(columns=['index'])

In [35]:
df_tweets.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10665159 entries, 0 to 11712596
Data columns (total 11 columns):
 #   Column          Non-Null Count     Dtype  
---  ------          --------------     -----  
 0   id              10665157 non-null  object 
 1   user_id         10447963 non-null  object 
 2   created_at      10665159 non-null  object 
 3   text            10140992 non-null  object 
 4   retweet_count   10450488 non-null  float64
 5   reply_count     10266908 non-null  float64
 6   favorite_count  10395697 non-null  float64
 7   num_hashtags    10041085 non-null  float64
 8   num_urls        10266954 non-null  float64
 9   num_mentions    10088348 non-null  float64
 10  user_id_conv    10665159 non-null  float64
dtypes: float64(7), object(4)
memory usage: 1.2+ GB


## File checkpoint

In [36]:
df_tweets.to_csv(path_or_buf=DATA_PATH+'tweets_no_dupl_user_text_createdat.csv', sep='#', index=False)

In [37]:
df_tweets = pd.read_csv(DATA_PATH+'tweets_no_dupl_user_text_createdat.csv', sep='#')

# Check how many duplicates we have for each attribute

In [38]:
df_tweets.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10665159 entries, 0 to 10665158
Data columns (total 11 columns):
 #   Column          Non-Null Count     Dtype  
---  ------          --------------     -----  
 0   id              10665157 non-null  object 
 1   user_id         10447963 non-null  object 
 2   created_at      10665159 non-null  object 
 3   text            10140992 non-null  object 
 4   retweet_count   10450488 non-null  float64
 5   reply_count     10266908 non-null  float64
 6   favorite_count  10395697 non-null  float64
 7   num_hashtags    10041085 non-null  float64
 8   num_urls        10266954 non-null  float64
 9   num_mentions    10088348 non-null  float64
 10  user_id_conv    10665159 non-null  float64
dtypes: float64(7), object(4)
memory usage: 895.1+ MB


# 1.3 Null user_id and text
We remove the records that have invalid user_id and text, since these records can't be used neither to analyze a user behaviour, neither to perform any kind of topic analysis on the text. The only utility that they can have could be related to the twitter density for a given period of time, but we go for the deletion since 56280 is not a great amount of records. Also, notice that the counters are quite dirty.

In [39]:
df_tweets[df_tweets.user_id.isnull() & df_tweets.text.isnull()].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 56280 entries, 42 to 10664903
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              56280 non-null  object 
 1   user_id         0 non-null      object 
 2   created_at      56280 non-null  object 
 3   text            0 non-null      object 
 4   retweet_count   48002 non-null  float64
 5   reply_count     23897 non-null  float64
 6   favorite_count  47326 non-null  float64
 7   num_hashtags    360 non-null    float64
 8   num_urls        24230 non-null  float64
 9   num_mentions    429 non-null    float64
 10  user_id_conv    56280 non-null  float64
dtypes: float64(7), object(4)
memory usage: 5.2+ MB


In [40]:
df_tweets=df_tweets.drop(df_tweets.index[df_tweets.user_id.isnull() & df_tweets.text.isnull()])

In [41]:
df_tweets.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10608879 entries, 0 to 10665158
Data columns (total 11 columns):
 #   Column          Non-Null Count     Dtype  
---  ------          --------------     -----  
 0   id              10608877 non-null  object 
 1   user_id         10447963 non-null  object 
 2   created_at      10608879 non-null  object 
 3   text            10140992 non-null  object 
 4   retweet_count   10402486 non-null  float64
 5   reply_count     10243011 non-null  float64
 6   favorite_count  10348371 non-null  float64
 7   num_hashtags    10040725 non-null  float64
 8   num_urls        10242724 non-null  float64
 9   num_mentions    10087919 non-null  float64
 10  user_id_conv    10608879 non-null  float64
dtypes: float64(7), object(4)
memory usage: 971.3+ MB


## File Checkpoint

In [42]:
df_tweets.to_csv(path_or_buf=DATA_PATH+'tweets_no_dupl_1.3.csv', sep='#', index=False)

In [43]:
df_tweets = pd.read_csv(DATA_PATH+'tweets_no_dupl_1.3.csv', sep='#')

# 1.4 Text + created at
We now consider the situation where `text` and `created_at` are equal among different records. Despite is certainly possible that two users post the same text at the same time, the situation is suspect, and we check the possibility that the two records are the same tweet from the same user, but with mispelled `user_id`.
For this analysis we consider only the tweets with non-null text.

In [47]:
duplicates_bool = ~df_tweets.text.isnull() & df_tweets.duplicated(subset=['text','created_at'], keep=False)
duplicates_bool.value_counts()

False    9962254
True      646625
dtype: int64

In [48]:
df_tweets[duplicates_bool].sort_values(by=['text','created_at'])

Unnamed: 0,id,user_id,created_at,text,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,user_id_conv
7038982,203425837784113152,89699349,2017-05-20 10:04:33,!,0.0,0.0,0.0,0.0,0.0,0.0,8.969935e+07
8106319,dehmktdo,ns1uijmf,2017-05-20 10:04:33,!,,,,,0.0,0.0,-1.000000e+00
6015008,461275103569772544,2366417726,2019-05-01 22:45:29,!,0.0,0.0,0.0,0.0,0.0,0.0,2.366418e+09
7651090,461275103464923136,2352555318,2019-05-01 22:45:29,!,0.0,0.0,0.0,0.0,0.0,0.0,2.352555e+09
996155,592045387881455616,2419936286,2020-04-26 19:19:54,!,0.0,0.0,5.0,0.0,0.0,0.0,2.419936e+09
...,...,...,...,...,...,...,...,...,...,...,...
10514097,588710655449964545,2500522076,2020-04-17 14:28:52,ｽﾗｯｼｭｗｗｗｗｗｗｗｗｗｗﾊﾞｯｸｽﾗｯｼｭｗｗｗｗｗﾊﾞｯｸｽﾗｯｼｭｗｗｗｗｗｗｗｗ...,0.0,0.0,1.0,0.0,0.0,0.0,2.500522e+09
3696350,454740993052876,,2020-04-15 09:09:58,ﾃﾞｨｽｺﾌﾗｰｲ,,0.0,,,,,-1.000000e+00
4471874,587905624404398080,1658247169,2020-04-15 09:09:58,ﾃﾞｨｽｺﾌﾗｰｲ,0.0,0.0,0.0,0.0,0.0,0.0,1.658247e+09
3862737,6405695013504003,,2020-03-25 01:00:46,ﾘｱﾆ45000か　いけるかな,1.0,0.0,0.0,,0.0,0.0,-1.000000e+00


In [49]:
df_tweets[duplicates_bool & (df_tweets.user_id_conv == -1)]

Unnamed: 0,id,user_id,created_at,text,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,user_id_conv
83,949116854133810,nviem1q,2018-06-02 03:24:58,"@danloving It’a a rarity, for sure. But for cr...",0.0,0.0,,,,,-1.0
126,744248635197745,wdquew,2019-07-30 18:58:59,@PerfectDKids I'm not a bad kid,,,,,0.0,1.0,-1.0
173,59506645605,,2019-10-02 17:25:04,"I get to go to a lot of overseas places, like ...",0.0,0.0,0.0,,0.0,0.0,-1.0
177,93343051,,2019-09-25 22:43:31,Check out these spooky skull candles! http://...,0.0,0.0,0.0,0.0,1.0,0.0,-1.0
250,135700362264385,uzx,2018-07-28 16:20:17,"""Tutte le persone crudeli si definiscono campi...",0.0,0.0,,0.0,0.0,0.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...
10608738,611887654008977075,slbu2,2019-05-23 11:49:51,Vedo il sole e vedo te.... vedo il mare e vedo...,6.0,0.0,9.0,0.0,0.0,,-1.0
10608755,8090168717868031,,2020-03-08 14:22:11,RT @LDdota: WRECKING CREW HERE! @Mineski's Jay...,,0.0,0.0,0.0,0.0,2.0,-1.0
10608792,202576489,q0h,2019-08-20 18:33:34,Io muoro male Tu ciaone proprio Egli lollone N...,0.0,0.0,,,,,-1.0
10608802,33701124988218741,fig6,2020-04-01 00:58:42,Dreams ✨,,0.0,0.0,0.0,0.0,0.0,-1.0


We consider a NaN or alphanumeric `user_id` to be wrong, since the correct format is the one with only numerical characters.
Out of the 646625 duplicates by `[text,created_at]`, 321006 have wrong `user_id`.
We proceed by keeping the copy with the right `user_id`, and selecting the counter attributes by a max operation, as we did at step **1.2**.

In [51]:
for a in ['retweet_count','reply_count','favorite_count','num_hashtags','num_urls','num_mentions']:
    df_tweets[a] = df_tweets[a].replace(np.NaN, -1)

In [52]:
# recreate the index column for the groupby
df_tweets = df_tweets.reset_index()
df_tweets.index = df_tweets['index']

In [54]:
def select_best_user_id(user_ids):
    greater_than_zero = []
    smaller_than_zero = []
    for id in user_ids:
        if id > 0:
            greater_than_zero.append(id)
        else:
            smaller_than_zero.append(id)

    # if all but one are < 0, keep the one that is > 0
    if len(greater_than_zero) == 1:
        return greater_than_zero[0]
    if len(greater_than_zero) > 1:
        return tuple(greater_than_zero)

In [62]:
aggregated = df_tweets[duplicates_bool].groupby(['text','created_at'])\
                                .agg({'retweet_count':max,'reply_count':max,'favorite_count':max,
                                      'num_hashtags':max,'num_urls':max,'num_mentions':max,
                                      'user_id_conv':select_best_user_id,
                                      'index':min}).reset_index()

In [63]:
aggregated.index = aggregated['index']

In [64]:
tweets_with_more_valid_user_ids = aggregated.user_id_conv.apply(type) == float
tweets_with_more_valid_user_ids.value_counts()

True     320882
False      2339
Name: user_id_conv, dtype: int64

In [65]:
aggregated_to_replace = aggregated[tweets_with_more_valid_user_ids]

In [66]:
attributes=['retweet_count','reply_count','favorite_count','num_hashtags','num_mentions','num_urls','user_id_conv']
df_tweets.loc[aggregated_to_replace.index,attributes] = aggregated_to_replace[attributes]

In [67]:
indices_to_drop = duplicates_bool & ~df_tweets.index.isin(aggregated.index)
indices_to_drop.value_counts()

False    10285475
True       323404
dtype: int64

In [69]:
df_tweets.drop(columns=df_tweets.index())

TypeError: 'Int64Index' object is not callable

In [231]:
# check that the non nan copies have been kept
dropped[dropped.index.isin(aggregated.index)].info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 320882 entries, 25 to 10658976
Data columns (total 13 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   level_0              320882 non-null  int64  
 1   index                320882 non-null  int64  
 2   id                   320882 non-null  object 
 3   user_id              245912 non-null  object 
 4   created_at           320882 non-null  object 
 5   text                 320882 non-null  object 
 6   retweet_count_conv   320882 non-null  float64
 7   reply_count_conv     320882 non-null  float64
 8   favorite_count_conv  320882 non-null  float64
 9   num_hashtags_conv    320882 non-null  float64
 10  num_urls_conv        320882 non-null  float64
 11  num_mentions_conv    320882 non-null  float64
 12  user_id_float        320882 non-null  object 
dtypes: float64(6), int64(2), object(5)
memory usage: 34.3+ MB


Everything seems fine, so we rename the 'conv' columns to the original name and drop the uncleaned ones.

In [233]:
dropped = dropped.drop(columns=['index'])

In [70]:
dropped.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10665159 entries, 0 to 11712596
Data columns (total 11 columns):
 #   Column               Dtype  
---  ------               -----  
 0   id                   object 
 1   user_id              object 
 2   created_at           object 
 3   text                 object 
 4   retweet_count_conv   float64
 5   reply_count_conv     float64
 6   favorite_count_conv  float64
 7   num_hashtags_conv    float64
 8   num_urls_conv        float64
 9   num_mentions_conv    float64
 10  user_id_conv         float64
dtypes: float64(7), object(4)
memory usage: 1.2+ GB


In [71]:
df_tweets = dropped.rename(columns={'retweet_count_conv':'retweet_count',
                                          'reply_count_conv':'reply_count',
                                          'favorite_count_conv':'favorite_count',
                                          'num_hashtags_conv':'num_hashtags',
                                          'num_urls_conv':'num_urls',
                                          'num_mentions_conv':'num_mentions'})

In [160]:
df_tweets = df_tweets.reset_index()
df_tweets = df_tweets.drop(columns=['index'])

In [161]:
df_tweets.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10665159 entries, 0 to 10665158
Data columns (total 10 columns):
 #   Column               Non-Null Count     Dtype  
---  ------               --------------     -----  
 0   id                   10665157 non-null  object 
 1   user_id              10447963 non-null  object 
 2   created_at           10665159 non-null  object 
 3   text                 10140992 non-null  object 
 4   retweet_count_conv   10450488 non-null  float64
 5   reply_count_conv     10266908 non-null  float64
 6   favorite_count_conv  10395697 non-null  float64
 7   num_hashtags_conv    10041085 non-null  float64
 8   num_urls_conv        10266954 non-null  float64
 9   num_mentions_conv    10088348 non-null  float64
dtypes: float64(6), object(4)
memory usage: 813.7+ MB


## File checkpoint

In [72]:
df_tweets.to_csv(path_or_buf=DATA_PATH+'tweets_no_dupl_1.4.csv', sep='#', index=False)

In [73]:
df_tweets = pd.read_csv(DATA_PATH+'tweets_no_dupl_1.4.csv', sep='#')

# 1.5 [user_id,text]

In [151]:
df_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10665159 entries, 0 to 10665158
Data columns (total 13 columns):
 #   Column           Dtype  
---  ------           -----  
 0   index            int64  
 1   id               object 
 2   user_id          object 
 3   created_at       object 
 4   text             object 
 5   retweet_count    float64
 6   reply_count      float64
 7   favorite_count   float64
 8   num_hashtags     float64
 9   num_urls         float64
 10  num_mentions     float64
 11  user_id_conv     float64
 12  created_at_conv  int64  
dtypes: float64(7), int64(2), object(4)
memory usage: 1.4+ GB


In [152]:
for attr in ['id',['id','user_id'],['id','created_at'],['id','user_id','created_at'],['user_id','text'],['id','user_id','text']]:
    counts = df_tweets.duplicated(attr).value_counts()
    if len(counts) > 1:
        dupl_count = counts[1]
    else:
        dupl_count = 0
    print(f"{attr} {dupl_count}")

id 31968
['id', 'user_id'] 357
['id', 'created_at'] 0
['id', 'user_id', 'created_at'] 0
['user_id', 'text'] 1485294
['id', 'user_id', 'text'] 259


The duplicates by ['user_id', 'text'] are 1485294, which is a considerable amount and is worth further investigation. It's surely possible that a user, especially if it is a bot, tweets many times the same text. What we want to check is just that among these duplicates all the dates are valid, which is, in a range that goes from the tweeter foundation up to september 2022. If the dates are valid, we keep the tweets. Otherwise it is considered a noisy duplicate and is removed.

In [153]:
twitter_foundation = pd.to_datetime(["20060321"]).astype(np.int64)[0]
sep_2022 = pd.to_datetime(["20220915"]).astype(np.int64)[0]

In [144]:
def wrong_date(date):
    return (date< twitter_foundation) | (date > sep_2022)

In [147]:
df_tweets['created_at_conv'] = pd.to_datetime(df_tweets['created_at']).astype(np.int64)

In [149]:
duplicates_bool = df_tweets.duplicated(['user_id', 'text'], keep=False)
duplicates_bool.value_counts()

False    8527602
True     2137557
dtype: int64

In [154]:
for attr in ['retweet_count','reply_count','favorite_count','num_hashtags','num_mentions','num_urls']:
    df_tweets[attr].replace(-1, np.NaN)

df_tweets[duplicates_bool][wrong_date(df_tweets.created_at_conv)].info(show_counts=True)

  df_tweets[duplicates_bool][wrong_date(df_tweets.created_at_conv)].info(show_counts=True)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 97605 entries, 223 to 10665149
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   index            97605 non-null  int64  
 1   id               97605 non-null  object 
 2   user_id          97605 non-null  object 
 3   created_at       97605 non-null  object 
 4   text             97239 non-null  object 
 5   retweet_count    97605 non-null  float64
 6   reply_count      97605 non-null  float64
 7   favorite_count   97605 non-null  float64
 8   num_hashtags     97605 non-null  float64
 9   num_urls         97605 non-null  float64
 10  num_mentions     97605 non-null  float64
 11  user_id_conv     97605 non-null  float64
 12  created_at_conv  97605 non-null  int64  
dtypes: float64(7), int64(2), object(4)
memory usage: 10.4+ MB


We will not aggregate the counters using a max operator as in previous steps, since the multiple "copies" in this case could be just a periodic tweet. Furthermore, there is no null value in the counters.
Given that there is no null value in the other fields, we will simply remove the records having wrong date.

In [160]:
df_tweets[duplicates_bool].created_at_conv.apply(wrong_date).value_counts()

False    2039952
True       97605
Name: created_at_conv, dtype: int64

In these duplicates 97605 records have wrong created_at.

In [172]:
df_tweets = df_tweets[~(duplicates_bool & df_tweets[duplicates_bool].created_at_conv.apply(wrong_date))]

## File checkpoint

In [173]:
df_tweets = df_tweets.drop(columns='index').reset_index()
df_tweets.to_csv(path_or_buf=DATA_PATH+'tweets_no_dupl_1.5.csv', sep='#', index=False)

In [174]:
df_tweets = pd.read_csv(DATA_PATH+'tweets_no_dupl_1.5.csv', sep='#')

# 1.6 [user_id,created_at]

In [175]:
for attr in ['id',['id','user_id'],['id','created_at'],['user_id', 'created_at'],['created_at','text'],['id','user_id','created_at'],['user_id','text'],['id','user_id','text']]:
    counts = df_tweets.duplicated(attr).value_counts()
    if len(counts) > 1:
        dupl_count = counts[1]
    else:
        dupl_count = 0
    print(f"{attr} {dupl_count}")

id 31949
['id', 'user_id'] 357
['id', 'created_at'] 0
['user_id', 'created_at'] 404981
['created_at', 'text'] 331351
['id', 'user_id', 'created_at'] 0
['user_id', 'text'] 1387689
['id', 'user_id', 'text'] 259


In [181]:
duplicates_bool = df_tweets.duplicated(['user_id', 'created_at'], keep=False) & ~df_tweets.user_id.isnull()
duplicates_bool.value_counts()

False    9773243
True      794311
dtype: int64

The duplicates that have non-null text also have non-null counters

In [187]:
df_tweets[duplicates_bool & ~df_tweets.text.isnull()].info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 421231 entries, 2 to 10567552
Data columns (total 13 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   index            421231 non-null  int64  
 1   id               421231 non-null  object 
 2   user_id          421231 non-null  object 
 3   created_at       421231 non-null  object 
 4   text             421231 non-null  object 
 5   retweet_count    421231 non-null  float64
 6   reply_count      421231 non-null  float64
 7   favorite_count   421231 non-null  float64
 8   num_hashtags     421231 non-null  float64
 9   num_urls         421231 non-null  float64
 10  num_mentions     421231 non-null  float64
 11  user_id_conv     421231 non-null  float64
 12  created_at_conv  421231 non-null  int64  
dtypes: float64(7), int64(2), object(4)
memory usage: 45.0+ MB


In [188]:
df_tweets[duplicates_bool & df_tweets.text.isnull()].info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 373080 entries, 32 to 10567551
Data columns (total 13 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   index            373080 non-null  int64  
 1   id               373078 non-null  object 
 2   user_id          373080 non-null  object 
 3   created_at       373080 non-null  object 
 4   text             0 non-null       object 
 5   retweet_count    266383 non-null  float64
 6   reply_count      131293 non-null  float64
 7   favorite_count   259781 non-null  float64
 8   num_hashtags     56 non-null      float64
 9   num_urls         131063 non-null  float64
 10  num_mentions     57 non-null      float64
 11  user_id_conv     373080 non-null  float64
 12  created_at_conv  373080 non-null  int64  
dtypes: float64(7), int64(2), object(4)
memory usage: 39.8+ MB


In [51]:
for a in ['retweet_count','reply_count','favorite_count','num_hashtags','num_urls','num_mentions']:
    df_tweets[a] = df_tweets[a].replace(np.NaN, -1)

In [214]:
def select_non_null_text(texts):
    nonnull_texts = []
    for text in texts:
        if ~pd.Series(text).isnull()[0]:
            nonnull_texts.append(text)
    if len(nonnull_texts) == 1:
        return nonnull_texts[0]
    elif len(nonnull_texts) == 0:
        return np.NaN
    elif len(nonnull_texts) > 1:
        return nonnull_texts

In [227]:
df_tweets = df_tweets.drop(columns='index').reset_index()

In [228]:
aggregated = df_tweets[duplicates_bool].groupby(['user_id', 'created_at'])\
                                .agg({'retweet_count':max,'reply_count':max,'favorite_count':max,
                                      'num_hashtags':max,'num_urls':max,'num_mentions':max,
                                      'text':select_non_null_text,
                                      'index':min}).reset_index()

In [229]:
aggregated.index = aggregated['index']

In [230]:
aggregated

Unnamed: 0_level_0,user_id,created_at,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,text,index
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1874420,100043740,2019-04-13 00:53:45,5.0,0.0,8.0,0.0,0.0,1.0,Monacan Girls Soccer falls 3-0 to Midlothian a...,1874420
24836,100043740,2019-04-15 00:18:50,0.0,0.0,0.0,0.0,0.0,1.0,@Natty032,24836
982508,100043740,2019-04-18 21:20:51,1.0,0.0,0.0,0.0,0.0,1.0,RT @The_Messiah_57: Neymar tricked it,982508
3174385,100043740,2019-04-18 23:04:51,0.0,0.0,1.0,0.0,0.0,1.0,@THE_thompson5 it's what I would envision you ...,3174385
4396249,100043740,2019-04-21 22:07:28,0.0,0.0,0.0,0.0,0.0,1.0,@NygelLee_20 could say the same about KD could...,4396249
...,...,...,...,...,...,...,...,...,...,...
712900,999662546,2020-03-23 05:49:37,2.0,0.0,0.0,0.0,0.0,1.0,RT @laurnaylor: FAB FILS ARE NATIONAL CHAMPS Y...,712900
3356641,999662546,2020-03-23 05:51:07,25.0,0.0,0.0,0.0,0.0,1.0,RT @tristankennedy8: NATIONAL CHAMPIONS,3356641
6385884,999662546,2020-03-23 05:53:07,18.0,0.0,0.0,1.0,0.0,1.0,RT @klrenfro: So proud of all they accomplishe...,6385884
375665,999662546,2020-03-30 20:48:54,1.0,0.0,1.0,0.0,0.0,1.0,@brianandrews43 so proud of you!,375665


In [231]:
aggregated['text_type'] = aggregated.text.apply(type)
aggregated.text_type.value_counts()

<class 'str'>     371387
<class 'list'>     18981
Name: text_type, dtype: int64

In [232]:
aggregated_to_replace = aggregated[aggregated.text_type == str]

In [233]:
attributes=['retweet_count','reply_count','favorite_count','num_hashtags','num_mentions','num_urls','text']
df_tweets.loc[aggregated_to_replace.index,attributes] = aggregated_to_replace[attributes]

In [237]:
indices_to_drop = duplicates_bool & df_tweets.index.isin(aggregated_to_replace.index)
indices_to_drop.value_counts()

False    10196167
True       371387
dtype: int64

In [238]:
df_tweets = df_tweets[~indices_to_drop]

In [240]:
df_tweets = df_tweets.reset_index()
df_tweets = df_tweets.drop(columns=['index'])

In [241]:
df_tweets.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10196167 entries, 0 to 10196166
Data columns (total 13 columns):
 #   Column           Non-Null Count     Dtype  
---  ------           --------------     -----  
 0   level_0          10196167 non-null  int64  
 1   id               10196165 non-null  object 
 2   user_id          9978971 non-null   object 
 3   created_at       10196167 non-null  object 
 4   text             9846437 non-null   object 
 5   retweet_count    10031442 non-null  float64
 6   reply_count      9910585 non-null   float64
 7   favorite_count   9979626 non-null   float64
 8   num_hashtags     9746128 non-null   float64
 9   num_urls         9910903 non-null   float64
 10  num_mentions     9793390 non-null   float64
 11  user_id_conv     10196167 non-null  float64
 12  created_at_conv  10196167 non-null  int64  
dtypes: float64(7), int64(2), object(4)
memory usage: 1011.3+ MB


## File checkpoint

In [None]:
df_tweets.to_csv(path_or_buf=DATA_PATH+'tweets_no_dupl_1.6.csv', sep='#', index=False)

In [None]:
df_tweets = pd.read_csv(DATA_PATH+'tweets_no_dupl_1.6.csv', sep='#')

# Last checks

In [None]:
for attr in ['id', ['id', 'user_id'], ['id', 'created_at'], ['user_id', 'created_at'], ['created_at', 'text'],
             ['id', 'user_id', 'created_at'], ['user_id', 'text'], ['id', 'user_id', 'text']]:
    counts = df_tweets.duplicated(attr).value_counts()
    if len(counts) > 1:
        dupl_count = counts[1]
    else:
        dupl_count = 0
    print(f"{attr} {dupl_count}")

# 1.5 bis - userful for further studies on the user behaviour

In [162]:
def select_best_created_at(dates):
    correct=[]
    wrong=[]

    for date in dates:
        if date<sep_2022 and date>=twitter_foundation:
            correct.append(date)
        else:
            wrong.append(date)

    if len(correct)  == 0:
        # we have no correct candidate, so do not perform any substitution. We will deal with this record in the data cleaning step.
        return []
    if len(correct)  == 1:
        # we have only one correct candidate, so we can merge the duplicates into one.
        return correct[0]
    if len(correct)  > 1:
        # we return all the candidates.
        return correct

def len_with_int(x):
    try:
        return len(x)
    except TypeError:
        return 1

In [163]:
aggregated = df_tweets[duplicates_bool].groupby(['user_id','text']).agg({'created_at_conv':select_best_created_at,'index':list}).reset_index()

In [None]:
aggregated['created_at_candidates'] = aggregated.created_at_conv.apply(len_with_int)
aggregated['n_copies'] = aggregated['index'].apply(len_with_int)

The records that we are not able to correct are the one that:
1) the ones that have no correct candidate, so `created_at_candidates == 0`
2) the ones that have more correct candidate, so `(created_at_candidates) > 1 & (n_copies > created_at_candidates)`
There is no record in the first category.
For the second category we have a total of 2013 copies for which the date is not decidable. We will simply remove these records because they is an high probability that they are just noise.

In [None]:
# 1
(aggregated.created_at_candidates == 0).value_counts()

In [None]:
# 2
undecidable = (aggregated.created_at_candidates>1) & (aggregated.n_copies > aggregated.created_at_candidates)
aggregated[undecidable].n_copies.sum() - aggregated[undecidable].created_at_candidates.sum()

We now proceed with deletion of the wrong copies.

In [67]:
indices_to_drop = duplicates_bool & ~df_tweets.index.isin(aggregated.index)
indices_to_drop.value_counts()

False    10285475
True       323404
dtype: int64