In [None]:
import math
import os
import re

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from numpy import logical_and as land, logical_or as lor
from scipy.stats import entropy

from utilities import plot_boxplot, plot_hist

In [None]:
# Folder prefix prepended to every CSV file name that this notebook reads or writes.
DATA_PATH = 'G:/Shared drives/DM_tweets/data/'

In [None]:
# The data folder is reachable under two names ('Drive condivisi' is Italian for
# 'Shared drives'). Pick the first one that exists on this machine so that
# Restart & Run All works on either setup; when neither exists, keep the last
# candidate, which is the value this cell has always assigned.
DATA_PATH_CANDIDATES = (
    'G:/Shared drives/DM_tweets/data/',
    'G:/Drive condivisi/DM_tweets/data/',
)
DATA_PATH = next(
    (path for path in DATA_PATH_CANDIDATES if os.path.isdir(path)),
    DATA_PATH_CANDIDATES[-1],
)

In [None]:
# Load the cleaned tweet and user tables; both files use '#' as field separator.
df_tweets, df_users = (
    pd.read_csv(DATA_PATH + file_name, sep='#')
    for file_name in ('tweets_clean.csv', 'users_clean.csv')
)

# How many tweets were published by the user?

In [None]:
# One row per user_id with the number of rows (tweets) that user has in df_tweets.
df_indicators = df_tweets.groupby('user_id').size().reset_index(name='n_tweets')

In [None]:
# Display the per-user indicator table built so far.
df_indicators

# How many tweets are published by the user in a given period of time?

In [None]:
# Add one zero-initialised counter column per year, 2012 through 2020.
yearly_columns = [f"{year}_tweets" for year in range(2012, 2021)]
for column_name in yearly_columns:
  df_indicators[column_name] = 0

df_indicators.info()

In [None]:
# Derive the calendar year of every tweet from the parsed timestamp.
# Comparing the raw strings against "<year>-12-31" loses every tweet posted on
# 31 December whenever the string carries a time component, because e.g.
# "2019-12-31 10:00:00" sorts after "2019-12-31".
tweet_years = pd.to_datetime(df_tweets['created_at']).dt.year

for y in range(2012, 2021):
  # Number of tweets each user published in year y, indexed by user_id
  tweets_in_year = df_tweets.loc[tweet_years == y].groupby('user_id').size()

  # Match counts to users by user_id rather than by row position;
  # users without any tweet in that year get 0
  df_indicators[str(y) + '_tweets'] = (
      df_indicators['user_id'].map(tweets_in_year).fillna(0).astype(int)
  )

In [None]:
# Bar chart (log-scaled heights) of the total number of tweets per year.
years = range(2012, 2021)
years_col_list = [f"{year}_tweets" for year in years]
sum_list = [df_indicators[column_name].sum() for column_name in years_col_list]
plt.bar(years, sum_list, log=True)

# Total number of tweets

In [None]:
# Row count of df_tweets, taken as the total number of tweets.
tot_n_tweets = len(df_tweets)
tot_n_tweets

# Total number of likes, comments and retweets

In [None]:
# Sum of favorite_count over all tweets.
total_favorite = df_tweets['favorite_count'].sum()
total_favorite

In [None]:
# Sum of reply_count over all tweets.
total_replies = df_tweets['reply_count'].sum()
total_replies

In [None]:
# Sum of retweet_count over all tweets.
total_retweets = df_tweets['retweet_count'].sum()
total_retweets

# Ratio between the number of tweets and the number of likes

In [None]:
# Number of tweets divided by the total number of likes.
ratio_tweets_fav = df_tweets.shape[0] / total_favorite
ratio_tweets_fav

## Ratio between the number of tweets and the number of retweets

In [None]:
# Number of tweets divided by the total number of retweets.
ratio_tweets_rt = df_tweets.shape[0] / total_retweets
ratio_tweets_rt

## Ratio between the number of tweets and the number of comments

In [None]:
# Number of tweets divided by the total number of replies.
ratio_tweets_replies = df_tweets.shape[0] / total_replies
ratio_tweets_replies

# Average length of the tweets per user

In [None]:
# Character count of each tweet's text (the text is cast to str before measuring).
df_tweets['length'] = df_tweets['text'].astype(str).map(len)

In [None]:
# Box plot of the per-tweet character counts stored in the 'length' column.
df_tweets.boxplot('length')

In [None]:
# Mean tweet length per user, as a one-column frame indexed by user_id.
len_groupby = df_tweets.groupby('user_id')[['length']].mean()

In [None]:
# Values are copied by position, so this relies on df_indicators having its rows
# in the same order as the per-user groupby result.
df_indicators['mean_length'] = len_groupby['length'].to_numpy()
df_indicators.info()

In [None]:
# Show the column-wise maximum of every indicator computed so far.
df_indicators.max()

# Average number of special characters in the tweets per user

In [None]:
# Zero-width pattern matching the position in front of every character that is
# neither a word character nor whitespace, so the number of matches is the number
# of "special" characters. Written as a raw string: in a plain literal, \W and \S
# are invalid escape sequences. Compiled once instead of once per tweet.
SPECIAL_CHAR_PATTERN = re.compile(r"(?=\W)(?=\S)")

df_tweets['special_chars'] = df_tweets.text.astype(str).apply(
    lambda text: len(SPECIAL_CHAR_PATTERN.findall(text))
)

In [None]:
# Mean number of special characters per user, indexed by user_id.
len_groupby = df_tweets.groupby('user_id')[['special_chars']].mean()

# Copied by position: relies on df_indicators rows being in groupby order.
df_indicators['mean_special_chars'] = len_groupby['special_chars'].to_numpy()
df_indicators

# File Checkpoint

In [None]:
# Checkpoint: persist the tweets together with the per-tweet columns added so far.
tweets_checkpoint_path = DATA_PATH + 'tweets_with_indicators.csv'
df_tweets.to_csv(tweets_checkpoint_path, sep='#', index=False)

In [None]:
# Checkpoint: persist the per-user indicators computed so far.
indicators_checkpoint_path = DATA_PATH + 'indicators_1.csv'
df_indicators.to_csv(indicators_checkpoint_path, sep='#', index=False)

In [None]:
# Reload the indicators from the 'indicators_1.csv' checkpoint, replacing the
# in-memory df_indicators.
df_indicators = pd.read_csv(DATA_PATH+'indicators_1.csv', sep='#')

# Mean, standard deviation and entropy for each attribute by user

In [None]:
# Aggregate numeric columns only: pandas >= 2.0 raises a TypeError when 'mean' or
# 'std' reach a text column instead of silently dropping it. user_id is the
# grouping key, so it is excluded from the aggregated values. Column order is
# preserved so the flattened column names come out in the original order.
numeric_cols = [
    col for col in df_tweets.select_dtypes(include='number').columns
    if col != 'user_id'
]
user_groupby = df_tweets.groupby('user_id')[numeric_cols].agg(['mean', 'std', entropy])

In [None]:
# Remove the statistics of 'length' and 'special_chars' from the aggregated frame.
# Rebinding with errors='ignore' keeps the cell idempotent: running it a second
# time no longer raises KeyError for columns that are already gone.
user_groupby = user_groupby.drop(columns=['length', 'special_chars'], errors='ignore')

In [None]:
# Flatten the two-level (attribute, statistic) column labels into
# 'attribute_statistic' names.
colz = ['_'.join(column_levels).strip('_') for column_levels in user_groupby.columns]

In [None]:
# Values are copied by position, not matched on user_id, so this relies on
# df_indicators rows being in the same order as the groupby result.
df_indicators[colz] = user_groupby.to_numpy()

In [None]:
# Show the column-wise maximum of every indicator, including the newly added statistics.
df_indicators.max()

# Tweeting regularity
We want an indicator that tells us whether a user publishes tweets with some regularity.
We define the tweeting regularity of a user as:
$$ Var( \{\ timestamp_i - timestamp_j\ |\ j = i + 1 \} )$$

In [86]:
def tweeting_regularity(user_timestamps):
    """Score how regularly a user tweets from the gaps between consecutive tweets.

    Parameters
    ----------
    user_timestamps : pandas.Series
        Timestamps of a single user's tweets, in any order. The values must
        support subtraction. The series is not modified.

    Returns
    -------
    0 when fewer than two timestamps are given (no gap exists); otherwise the
    result of ``entropy`` applied to the array of differences between
    consecutive timestamps after sorting.

    NOTE(review): the markdown above this cell defines the indicator as the
    variance of these gaps, whereas the code returns their entropy - confirm
    which of the two is intended.
    """
    # np.sort returns a sorted copy. Sorting `.values` in place would reorder
    # the caller's data (or fail outright when pandas hands out a read-only array).
    sorted_timestamps = np.sort(user_timestamps.values)

    if len(sorted_timestamps) < 2:
        return 0

    # Gap between every tweet and the one published right before it
    return entropy(np.diff(sorted_timestamps))

# One regularity score per user, copied into df_indicators by position.
# NOTE(review): assumes df_tweets already has a 'created_at_conv' column - it is
# not created in the cells shown here; confirm where it comes from.
regularity_by_user = df_tweets.groupby('user_id').agg({'created_at_conv': tweeting_regularity})
df_indicators['tweeting_regularity'] = regularity_by_user.values

# Maximum number of tweets in a day by user

Add a new column that encodes the date as the number of the day within the year (e.g. the 1st of January is 1, the 2nd of January is 2, [...], the 31st of December is 365 or 366), followed by an underscore and the year.
For example, the 2nd of January 2019 becomes "2_2019".

In [87]:
# Build a "<day-of-year>_<year>" key per tweet from a single parsed DatetimeIndex.
created_index = pd.DatetimeIndex(df_tweets.created_at)
day_labels = created_index.day_of_year.map(str)
year_labels = created_index.year.map(str)
df_tweets['day_of_year'] = day_labels + "_" + year_labels

In [88]:
# Preview the "<day-of-year>_<year>" keys just built.
df_tweets['day_of_year']

0           254_2019
1            92_2020
2           122_2019
3           308_2019
4            71_2020
              ...   
10187868    116_2019
10187869    108_2020
10187870    192_2016
10187871     66_2019
10187872    191_2019
Name: day_of_year, Length: 10187873, dtype: object

In [89]:
# Number of tweets per (user, day) pair, flattened into ordinary columns.
user_date_groupby = (
    df_tweets
    .groupby(['user_id', 'day_of_year'])
    .size()
    .reset_index(name='n_tweets_by_day')
)

In [34]:
# Display the per-user, per-day tweet counts.
user_date_groupby

Unnamed: 0,user_id,day_of_year,n_tweets_by_day
0,000vk,118_2020,1
1,0013tom,99_2018,1
2,001gi,98_2020,1
3,001rx,70_2020,1
4,00201f,43_2019,1
...,...,...,...
1151381,zzyb,306_2019,1
1151382,zzyd6,170_2019,1
1151383,zzyyu,88_2020,1
1151384,zzz9v,106_2019,1


In [35]:
# Rows (user, day) that reach the highest daily tweet count in the whole dataset
highest_daily_count = user_date_groupby['n_tweets_by_day'].max()
is_busiest = user_date_groupby['n_tweets_by_day'] == highest_daily_count
user_date_groupby[is_busiest]

Unnamed: 0,user_id,day_of_year,n_tweets_by_day
778142,492649414,115_2020,3238


Identify maximum number of tweets posted in a day for each user.

In [90]:
# Highest number of tweets each user posted in a single day. Uses the native
# groupby max instead of passing the Python builtin `max` to agg, which pandas
# deprecates (FutureWarning). Values are copied by position, so df_indicators
# must have its rows in the same user_id order as the groupby result.
df_indicators['max_daily_tweets'] = (
    user_date_groupby.groupby('user_id')['n_tweets_by_day'].max().to_numpy()
)

# File Checkpoint

In [37]:
# Final checkpoint: persist the tweets with all per-tweet columns added above.
tweets_output_path = DATA_PATH + 'tweets_with_indicators.csv'
df_tweets.to_csv(tweets_output_path, sep='#', index=False)

In [38]:
# Final checkpoint: persist the complete per-user indicator table.
indicators_output_path = DATA_PATH + 'indicators.csv'
df_indicators.to_csv(indicators_output_path, sep='#', index=False)