In [1]:
import math
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt

from datetime import datetime 
from collections import defaultdict
from scipy.stats import pearsonr

In [2]:
#create a pandas dataframes from csv files
tweets = pd.read_csv('tweets.csv')

## Tweets Analysis

## Trasformazione delle colonne in tipi

In [3]:
tweets['id'] = pd.to_numeric(tweets['id'], errors='coerce')
tweets['user_id'] = pd.to_numeric(tweets['user_id'], errors='coerce')
tweets['retweet_count'] = pd.to_numeric(tweets['retweet_count'], errors='coerce')
tweets['reply_count'] = pd.to_numeric(tweets['reply_count'], errors='coerce')
tweets['favorite_count'] = pd.to_numeric(tweets['favorite_count'], errors='coerce')
tweets['num_hashtags'] = pd.to_numeric(tweets['num_hashtags'], errors='coerce')
tweets['num_mentions'] = pd.to_numeric(tweets['num_mentions'], errors='coerce')
tweets['num_urls'] = pd.to_numeric(tweets['num_urls'], errors='coerce')
tweets['created_at'] = pd.to_datetime(tweets['created_at'], errors='coerce')

## Eliminazione dei valori negativi

In [4]:
tweets['retweet_count'] = tweets['retweet_count'].abs()
tweets['reply_count'] = tweets['reply_count'].abs()
tweets['favorite_count'] = tweets['favorite_count'].abs()
tweets['num_hashtags'] = tweets['num_hashtags'].abs()
tweets['num_mentions'] = tweets['num_mentions'].abs()
tweets['num_urls'] = tweets['num_urls'].abs()

## Eliminazione dei valori 'inf'

In [5]:
tweets.replace('inf', np.nan, inplace=True)
tweets.replace(np.inf, np.nan, inplace=True)

tweets.replace('inf', math.nan, inplace=True)
tweets.replace(math.inf, math.nan, inplace=True)

### Drop duplicates

In [6]:
count = tweets['id'].count()
tweets.drop_duplicates(inplace=True)
print("Dropped " + str(count - tweets['id'].count()) + " duplicates")

Dropped 1952099 duplicates


### Replace NaN values in 'id'

Replace nan values with a value (-1) that indicates that this information is missing

In [7]:
tweets.loc[tweets['id'].isna(), 'id'] = -1

Check if there are some id duplicated

In [8]:
mask = tweets.duplicated('id', keep=False)
duplicated_id = tweets[(mask) & (tweets['id'] != -1)]
duplicated_id.sort_values('id').head(10)

Unnamed: 0,id,user_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,created_at,text
8592560,218442.0,466317600.0,0.0,,0.0,,,,2019-09-05 06:37:24,
3498751,218442.0,2380298000.0,0.0,0.0,0.0,,,,2019-05-03 09:50:40,
7967612,261723.0,2877875000.0,285.0,0.0,,0.0,,1.0,2020-01-11 07:55:12,"RT @LOLGOP: For the cost of the Iraq War, we c..."
10205294,261723.0,467060700.0,,0.0,,1.0,0.0,0.0,2019-09-17 19:36:23,In paradiso non c'e' sesso. Approfittatene ade...
13085390,457781.0,,,,,,,,2020-04-28 15:05:38,@twistynipple did they
10408486,457781.0,,,,,,,,2020-01-29 02:53:42,@noraxxalien stupid
12205716,558867.0,47161970.0,,,0.0,,0.0,,2017-05-17 22:32:44,
10807050,558867.0,,0.0,,,,0.0,,2019-04-05 03:10:53,
805194,785227.0,708043500.0,0.0,0.0,,,0.0,0.0,2020-02-06 11:57:16,@MichaelbMedlin HA u got me
13549631,785227.0,,590.0,0.0,0.0,0.0,1.0,,2019-06-22 16:30:35,RT @zullylully: You raised your daughter right


Assign -1 to all the id duplicated

In [9]:
tweets.loc[(mask) & (tweets['id'] != -1), 'id'] = -1

### Replace NaN values in user_id

In [None]:
tweets.loc[tweets['user_id'].isna(), 'user_id'] = -1

### Infer num_hashtags, num_urls, num_mentions

In [10]:
tweets['infer_hashtags'] = tweets.text.str.count('#')
tweets['infer_mentions'] = tweets.text.str.count('@')
tweets['infer_urls'] = tweets.text.str.count('http://')

print("num_hashtags nan: " + str(tweets['num_hashtags'].isna().sum()) + \
      " num_mentions nan: " + str(tweets['num_mentions'].isna().sum()) + \
      " num_urls nan: " + str(tweets['num_mentions'].isna().sum()))

tweets['num_hashtags'] = tweets['num_hashtags'].fillna(tweets['infer_hashtags'])
tweets['num_mentions'] = tweets['num_mentions'].fillna(tweets['infer_mentions'])
tweets['num_urls'] = tweets['num_urls'].fillna(tweets['infer_urls'])

print("num_hashtags nan: " + str(tweets['num_hashtags'].isna().sum()) + \
      " num_mentions nan: " + str(tweets['num_mentions'].isna().sum()) + \
      " num_urls nan: " + str(tweets['num_mentions'].isna().sum()))

tweets.drop(columns=['infer_hashtags', 'infer_mentions', 'infer_urls'], inplace=True)

num_hashtags nan: 1163675 num_mentions nan: 987988 num_urls nan: 987988
num_hashtags nan: 489183 num_mentions nan: 488513 num_urls nan: 488513


### Replace NaN values in tweets

In [11]:
t = tweets
user_median = t.groupby('user_id', as_index=False).median()
user_median.rename(columns={
    'retweet_count' : 'retweet_median',
    'reply_count' : 'reply_median',
    'favorite_count' : 'favorite_median',
    'num_hashtags' : 'hashtags_median',
    'num_urls' : 'urls_median',
    'num_mentions' : 'mentions_median'
}, inplace=True)
user_median.drop(columns=['id'], inplace=True)
tweets = t.merge(user_median, on='user_id')

print("retweet_count nan: " + str(tweets['retweet_count'].isna().sum()) + \
      ", reply_count nan: " + str(tweets['reply_count'].isna().sum()) + \
      ", favorite_count nan: " + str(tweets['favorite_count'].isna().sum()) + \
      ", num_hashtags nan: " + str(tweets['num_hashtags'].isna().sum()) + \
      ", num_mentions nan: " + str(tweets['num_mentions'].isna().sum()) + \
      ", num_urls nan: " + str(tweets['num_urls'].isna().sum()))

#sostituzione valori
tweets['retweet_count'] = tweets['retweet_count'].fillna(tweets['retweet_median'])
tweets['reply_count'] = tweets['reply_count'].fillna(tweets['reply_median'])
tweets['favorite_count'] = tweets['favorite_count'].fillna(tweets['favorite_median'])
tweets['num_hashtags'] = tweets['num_hashtags'].fillna(tweets['hashtags_median'])
tweets['num_mentions'] = tweets['num_mentions'].fillna(tweets['mentions_median'])
tweets['num_urls'] = tweets['num_urls'].fillna(tweets['urls_median'])

tweets.drop(columns=['retweet_median', 'reply_median', 'favorite_median', 'hashtags_median', 'mentions_median', 'urls_median'], \
            inplace=True)

print("retweet_count nan: " + str(tweets['retweet_count'].isna().sum()) + \
      ", reply_count nan: " + str(tweets['reply_count'].isna().sum()) + \
      ", favorite_count nan: " + str(tweets['favorite_count'].isna().sum()) + \
      ", num_hashtags nan: " + str(tweets['num_hashtags'].isna().sum()) + \
      ", num_mentions nan: " + str(tweets['num_mentions'].isna().sum()) + \
      ", num_urls nan: " + str(tweets['num_urls'].isna().sum()))


retweet_count nan: 518685, reply_count nan: 654510, favorite_count nan: 653433, num_hashtags nan: 377024, num_mentions nan: 376488, num_urls nan: 244865
retweet_count nan: 158, reply_count nan: 257, favorite_count nan: 247, num_hashtags nan: 170, num_mentions nan: 170, num_urls nan: 88


In [12]:
tweets['retweet_count'].replace(math.nan, tweets['retweet_count'].median(), inplace=True)
tweets['reply_count'].replace(math.nan, tweets['reply_count'].median(), inplace=True)
tweets['favorite_count'].replace(math.nan, tweets['favorite_count'].median(), inplace=True)
tweets['num_hashtags'].replace(math.nan, tweets['num_hashtags'].median(), inplace=True)
tweets['num_mentions'].replace(math.nan, tweets['num_mentions'].median(), inplace=True)
tweets['num_urls'].replace(math.nan, tweets['num_urls'].median(), inplace=True)

### Replacing invalid timestamp

In [13]:
tweets.loc[(tweets.created_at < datetime.strptime("2006-07-15 00:00:00", "%Y-%m-%d %H:%M:%S")) | (tweets.created_at > datetime.now()),\
     'created_at'] = datetime.strptime("2000-01-01 00:00:00", "%Y-%m-%d %H:%M:%S")
tweets[tweets['created_at'] == datetime.strptime("2000-01-01 00:00:00", "%Y-%m-%d %H:%M:%S")]['id'].count()

97605

### Number of special char

In [14]:
special = '[(|!£$%&/=?^@#§,.;:-_<>ç@)]' # Define special characters
tweets['special'] = tweets['text'].str.count(special) # Count them
tweets['special'] = tweets['special'].fillna(0)
tweets['special'] = tweets['special'].astype(int)


### Lenght

In [15]:
tweets['text_lenght'] = tweets['text'].str.len()
tweets['text_lenght'] = tweets['text_lenght'].fillna(0)
tweets['text_lenght'] = tweets['text_lenght'].astype(int)


### Outliers

In [16]:
tweets.loc[(tweets['retweet_count'] > tweets['retweet_count'].quantile(.99))] = 0
tweets.loc[(tweets['reply_count'] > tweets['reply_count'].quantile(.99)) ] = 0
tweets.loc[(tweets['favorite_count'] > tweets['favorite_count'].quantile(.99))] = 0
tweets.loc[(tweets['num_hashtags'] > tweets['num_hashtags'].quantile(.99)) ] = 0
tweets.loc[(tweets['num_mentions'] > tweets['num_mentions'].quantile(.99))] = 0
tweets.loc[(tweets['num_urls'] > tweets['num_urls'].quantile(.99))] = 0

#tweets.loc[(tweets['text_lenght'] > tweets['text_lenght'].quantile(.95))] = 0
#tweets.loc[(tweets['special'] > tweets['special'].quantile(.95))] = 0
        

In [17]:
tweets['retweet_count'].max()

7014.0

In [18]:
tweets.to_csv('new_tweets.csv')