In [1]:
import math
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt

from datetime import datetime 
from collections import defaultdict
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Tweets data preparation

In [2]:
# Load the raw tweets dataset from the working directory.
tweets_path = 'tweets.csv'
tweets = pd.read_csv(tweets_path)

In [3]:
# Coerce identifier and counter columns to numbers; any value that cannot be
# parsed becomes NaN instead of raising.
# NOTE(review): coercion with NaN present yields float64, which may not
# represent very large ids exactly -- confirm this is acceptable for 'id'.
numeric_tweet_columns = [
    'id', 'user_id',
    'retweet_count', 'reply_count', 'favorite_count',
    'num_hashtags', 'num_mentions', 'num_urls',
]
for numeric_column in numeric_tweet_columns:
    tweets[numeric_column] = pd.to_numeric(tweets[numeric_column], errors='coerce')

# Parse timestamps; unparseable values become NaT.
tweets['created_at'] = pd.to_datetime(tweets['created_at'], errors='coerce')

## Removing negative values (by taking the absolute value)

In [4]:
# Counters cannot be negative: flip the sign of any negative value.
non_negative_columns = [
    'retweet_count', 'reply_count', 'favorite_count',
    'num_hashtags', 'num_mentions', 'num_urls',
]
for counter in non_negative_columns:
    tweets[counter] = tweets[counter].abs()

## Removing infinite values

In [5]:
# Treat infinities as missing values. Both signs are replaced: only the counter
# columns went through abs(), so a negative infinity could still be present in
# the remaining numeric columns (e.g. 'id', 'user_id').
tweets = tweets.replace([math.inf, -math.inf], math.nan)

## Manage duplicates:

### Dropping duplicates:

In [6]:
# Remove rows that are exact copies of an earlier row (first occurrence is kept).
tweets = tweets.drop_duplicates()

### Find and replace duplicated 'id'

In [7]:
# An id shared by several (different) rows is unreliable: flag every
# occurrence of it with the sentinel -1.
mask = tweets['id'].duplicated(keep=False)
tweets.loc[mask, 'id'] = -1

## Replacing NaN values:

### Replace NaN values in 'id'

Replace nan values with a value (-1) that indicates that this information is missing

In [8]:
# Missing ids get the same -1 sentinel used for unreliable ids.
tweets['id'] = tweets['id'].fillna(-1)

### Replacing NaN values in 'num_hashtags', 'num_urls' and 'num_mentions' by inferring them from the tweet's text

In [9]:
# Estimate each counter directly from the tweet text and use the estimate only
# where the recorded value is missing. Rows whose text is missing produce a NaN
# estimate and are therefore left untouched here.
# str.count takes a regular expression: 'https?://' matches both http:// and
# https:// links (counting only 'http://' would miss every secure URL).
inferred_counts = {
    'num_hashtags': tweets['text'].str.count('#'),
    'num_mentions': tweets['text'].str.count('@'),
    'num_urls': tweets['text'].str.count(r'https?://'),
}

for counter_name, inferred in inferred_counts.items():
    tweets[counter_name] = tweets[counter_name].fillna(inferred)

### Replacing NaN values with the median of the user

In [10]:
# Fill the missing counters of a tweet with the median of the same counter
# computed over all tweets of the same user.
user_filled_columns = [
    'retweet_count', 'reply_count', 'favorite_count',
    'num_hashtags', 'num_mentions', 'num_urls',
]

# Aggregate only the counter columns: taking the median of every column would
# fail on the text column and needlessly aggregate ids and dates.
user_median = tweets.groupby('user_id')[user_filled_columns].median()

# Value replacement: look up each tweet's per-user median through 'user_id'.
# Unlike an inner merge, this keeps tweets whose user_id is missing (they get
# no per-user median and simply stay NaN at this step).
for counter_name in user_filled_columns:
    median_for_row = tweets['user_id'].map(user_median[counter_name])
    tweets[counter_name] = tweets[counter_name].fillna(median_for_row)

# Renumber the rows 0..n-1 so the index has no gaps left by removed duplicates.
tweets = tweets.reset_index(drop=True)


### Fill the remaining NaN values with the median of the attribute

In [11]:
# Whatever is still missing is filled with the median of the whole column.
# The result is assigned back to the frame: calling replace(..., inplace=True)
# on `tweets[col]` mutates a temporary Series and leaves `tweets` unchanged
# when pandas Copy-on-Write is active.
globally_filled_columns = [
    'retweet_count', 'reply_count', 'favorite_count',
    'num_hashtags', 'num_mentions', 'num_urls',
]
for counter_name in globally_filled_columns:
    column_median = tweets[counter_name].median()
    tweets[counter_name] = tweets[counter_name].fillna(column_median)

## Replacing invalid 'created_at' with symbolic date (01/01/2000)

In [12]:
# Dates before 2006-07-15 (presumably Twitter's public launch -- TODO confirm)
# or later than the current moment are considered invalid and replaced by the
# symbolic date 2000-01-01. NaT values match neither test and stay NaT.
earliest_valid_date = datetime(2006, 7, 15, 0, 0, 0)
symbolic_date = datetime(2000, 1, 1, 0, 0, 0)

created_too_early = tweets['created_at'] < earliest_valid_date
created_in_future = tweets['created_at'] > datetime.now()
tweets.loc[created_too_early | created_in_future, 'created_at'] = symbolic_date

## Manage outliers

In [13]:
# Every value strictly above the 99th percentile of its own column is treated
# as an outlier and reset to 0.
outlier_columns = [
    'retweet_count', 'reply_count', 'favorite_count',
    'num_hashtags', 'num_mentions', 'num_urls',
]
for counter_name in outlier_columns:
    upper_bound = tweets[counter_name].quantile(.99)
    is_outlier = tweets[counter_name] > upper_bound
    tweets.loc[is_outlier, counter_name] = 0

## Tweets indicators

### Number of special characters:

In [14]:
# Character class listing the symbols considered "special".
# The hyphen is escaped on purpose: an unescaped ':-_' inside [...] is a RANGE
# from ':' to '_', which would also match every uppercase letter as well as
# '[', '\' and ']'.
special = r'[(|!£$%&/=?^@#§,.;:\-_<>ç)]'

# Count the matches per tweet; tweets without text count as 0.
tweets['special'] = tweets['text'].str.count(special).fillna(0).astype(int)

### Text length:

In [15]:
# Number of characters in each tweet; tweets without text get length 0.
# (The column name keeps its existing spelling, 'text_lenght'.)
tweets['text_lenght'] = tweets['text'].str.len().fillna(0).astype(int)

## Write a new '.csv' file with all the modifications applied

In [16]:
# Persist the cleaned tweets; the row index is written as the first column
# (pandas default).
cleaned_tweets_path = 'new_tweets.csv'
tweets.to_csv(cleaned_tweets_path)

# User Preparation

This part uses the modifications previously made to the tweets dataframe.

In [17]:
# Load the raw users dataset from the working directory.
users_path = 'users.csv'
users = pd.read_csv(users_path)

In [18]:
# Coerce the numeric user columns; values that cannot be parsed become NaN.
for user_numeric_column in ('id', 'statuses_count'):
    users[user_numeric_column] = pd.to_numeric(users[user_numeric_column], errors='coerce')

# Parse the account creation timestamp; unparseable values become NaT.
users['created_at'] = pd.to_datetime(users['created_at'], errors='coerce')