In [None]:
import math
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime 
from collections import defaultdict
from scipy.stats import pearsonr

# Tweets data preparation

In [None]:
# Load the raw tweets dump; the cleaning cells below all operate on this frame.
tweets = pd.read_csv(filepath_or_buffer='tweets.csv')

In [None]:
# Coerce identifiers and counters to numbers and the timestamp to datetime.
# errors='coerce' turns every unparsable entry into NaN / NaT instead of raising.
for numeric_col in ('id', 'user_id', 'retweet_count', 'reply_count',
                    'favorite_count', 'num_hashtags', 'num_mentions', 'num_urls'):
    tweets[numeric_col] = pd.to_numeric(tweets[numeric_col], errors='coerce')

tweets['created_at'] = pd.to_datetime(tweets['created_at'], errors='coerce')

## Elimination of negative values

In [None]:
# Negative counters are replaced by their absolute value, column by column.
counter_cols = ['retweet_count', 'reply_count', 'favorite_count',
                'num_hashtags', 'num_mentions', 'num_urls']
tweets[counter_cols] = tweets[counter_cols].abs()

## Elimination of inf values

In [None]:
# Treat infinities as missing values. Both signs are mapped to NaN: the counter
# columns were made non-negative above, but 'id' and 'user_id' were not, so a
# -inf could otherwise survive this step and slip past the NaN handling below.
tweets = tweets.replace([np.inf, -np.inf], np.nan)

## Manage duplicates:

### Dropping duplicates:

In [None]:
# Remove rows that are exact copies (all columns equal) of an earlier row.
tweets = tweets.drop_duplicates()

### Find and replace duplicated 'id'

In [None]:
# Exact duplicate rows are already gone, so an 'id' that still appears on more
# than one row is ambiguous. keep=False flags every occurrence, and all of them
# get the sentinel id -1.
mask = tweets['id'].duplicated(keep=False)
tweets.loc[mask, 'id'] = -1

## Replacing NaN values:

### Replace NaN values in 'id'

Replace nan values with a value (-1) that indicates that this information is missing

In [None]:
# Missing ids receive the sentinel -1 (meaning "id unknown").
tweets['id'] = tweets['id'].fillna(-1)

### Replacing NaN values in 'num_hashtags', 'num_urls' and 'num_mentions' by inferring them from the tweet's text

In [None]:
# Estimate the missing counters from the tweet text by counting marker strings.
# Series.str.count interprets its argument as a regular expression, so
# 'https?://' counts both http:// and https:// links (the plain 'http://'
# pattern silently ignored every https link).
infer_hashtags = tweets['text'].str.count('#')
infer_mentions = tweets['text'].str.count('@')
infer_urls = tweets['text'].str.count(r'https?://')

# Only NaN entries are filled; values already present are left as they are.
# Rows whose text is missing yield NaN estimates and therefore stay NaN here.
tweets['num_hashtags'] = tweets['num_hashtags'].fillna(infer_hashtags)
tweets['num_mentions'] = tweets['num_mentions'].fillna(infer_mentions)
tweets['num_urls'] = tweets['num_urls'].fillna(infer_urls)

### Replacing NaN values with the median of the user

In [None]:
# Per-user imputation: a counter that is still NaN is replaced by the median of
# the same counter over all tweets of the same user.
t = tweets  # alias kept in case other cells still reference `t`

# counter column -> name of the temporary per-user median column
median_names = {
    'retweet_count': 'retweet_median',
    'reply_count': 'reply_median',
    'favorite_count': 'favorite_median',
    'num_hashtags': 'hashtags_median',
    'num_urls': 'urls_median',
    'num_mentions': 'mentions_median',
}

# Aggregate only the six counters. Calling .median() on the whole frame fails on
# pandas >= 2.0, where numeric_only defaults to False and the 'text' column
# cannot be averaged; selecting the columns also makes dropping 'id' unnecessary.
user_median = (
    t.groupby('user_id', as_index=False)[list(median_names)]
     .median()
     .rename(columns=median_names)
)

# NOTE(review): this is an inner join and groupby discards NaN keys, so tweets
# whose user_id is NaN are removed from `tweets` here - confirm this is intended.
tweets = t.merge(user_median, on='user_id')

# Replace the missing values with the per-user medians
for count_col, median_col in median_names.items():
    tweets[count_col] = tweets[count_col].fillna(tweets[median_col])

# The helper columns are no longer needed once the gaps are filled.
tweets = tweets.drop(columns=list(median_names.values()))


### Fill the remaining NaN values with the median of the attribute

In [None]:
# Counters that are still NaN (for example, users for whom every value of that
# counter is missing) fall back to the median of the whole column.
# The result is assigned back explicitly: calling an inplace method on
# tweets[col] acts on a column selection, which raises a warning on recent
# pandas and does not update `tweets` at all once Copy-on-Write is enabled.
for count_col in ('retweet_count', 'reply_count', 'favorite_count',
                  'num_hashtags', 'num_mentions', 'num_urls'):
    tweets[count_col] = tweets[count_col].fillna(tweets[count_col].median())

## Replacing invalid 'created_at' with symbolic date (01/01/2000)

In [None]:
# Timestamps earlier than 2006-07-15 or later than the current local time are
# overwritten with the symbolic date 2000-01-01. Rows whose 'created_at' is NaT
# fail both comparisons and are therefore left untouched.
lower_bound = datetime.strptime("2006-07-15 00:00:00", "%Y-%m-%d %H:%M:%S")
symbolic_date = datetime.strptime("2000-01-01 00:00:00", "%Y-%m-%d %H:%M:%S")

too_early = tweets['created_at'] < lower_bound
too_late = tweets['created_at'] > datetime.now()
tweets.loc[too_early | too_late, 'created_at'] = symbolic_date

## Manage outliers

In [None]:
# For each counter, every value strictly above that column's 99th percentile is
# overwritten with 0. The threshold is computed per column before the overwrite.
for count_col in ('retweet_count', 'reply_count', 'favorite_count',
                  'num_hashtags', 'num_mentions', 'num_urls'):
    cutoff = tweets[count_col].quantile(.99)
    tweets.loc[tweets[count_col] > cutoff, count_col] = 0

## Tweets indicators

### Number of special characters:

In [None]:
# Character class listing the symbols counted as "special".
# The hyphen is escaped on purpose: written as ':-_' it is a range from ':' to
# '_' (U+003A-U+005F), which matches every uppercase letter plus '[', '\' and
# ']' and does NOT match a literal '-'.
special = r'[(|!£$%&/=?^@#§,.;:\-_<>ç)]'

# Number of special characters per tweet; tweets without text count as 0.
tweets['special'] = tweets['text'].str.count(special).fillna(0).astype(int)

### Text length:

In [None]:
# Length of each tweet in characters; a missing text counts as length 0.
# (The column name keeps its original spelling so existing consumers still work.)
tweets['text_lenght'] = tweets['text'].str.len().fillna(0).astype(int)

## Write a new '.csv' file with all the modifications applied

In [None]:
# Persist the cleaned tweets. The DataFrame index is written as well, because
# to_csv is called with its default index setting.
tweets.to_csv(path_or_buf='new_tweets.csv')

# User data Preparation

This part uses the modifications previously made to the tweets dataframe.

In [None]:
# Load the raw users table that the rest of this section cleans and enriches.
users = pd.read_csv(filepath_or_buffer='users.csv')

In [None]:
# Print a structural summary of the raw users frame before any cleaning.
users.info()
# Last expression of the cell, so the per-column dtypes are shown as its output.
users.dtypes

## Column type conversion

In [None]:
# Coerce the numeric columns and the creation timestamp to proper dtypes.
# errors='coerce' maps unparsable entries to NaN / NaT instead of raising.
users = users.assign(
    id=pd.to_numeric(users['id'], errors='coerce'),
    statuses_count=pd.to_numeric(users['statuses_count'], errors='coerce'),
    created_at=pd.to_datetime(users['created_at'], errors='coerce'),
)

## Elimination of negative values

In [None]:

# A negative number of statuses is replaced by its absolute value.
users = users.assign(statuses_count=users['statuses_count'].abs())

## Lang Correction

In [None]:
# Align the users' column names with the tweets frame so both can be joined on
# 'user_id', and normalise the language codes to lower case.
users = users.rename(columns={'id': 'user_id', 'created_at': 'subscribing_date'})
users['lang'] = users['lang'].str.lower()

# Users whose language is one of the two placeholder values.
nal = users[users['lang'].isin(['select language...', 'xx-lc'])]

# Tweets written by those users, shown for manual inspection.
text_nal = tweets.merge(nal, on='user_id')
text_nal

In [None]:
# Show the first three tweets of one of the users that has a placeholder language.
text_nal.loc[text_nal['name'].eq('Leanne Arker')].head(3)

##### Users without a valid language are set to 'en'

In [None]:
# Map the two placeholder language values to 'en'.
# The result is assigned back to the column: an inplace replace on users['lang']
# works on a column selection, which warns on recent pandas and leaves `users`
# unchanged once Copy-on-Write is enabled.
users['lang'] = users['lang'].replace({'select language...': 'en', 'xx-lc': 'en'})

## Calculate for each user how many tweets we have in tweets.csv

In [None]:

# Number of non-null tweet ids per user in the tweets frame.
tweets_per_user = (
    tweets[['user_id', 'id']]
    .groupby('user_id', as_index=False)
    .count()
)

# Inner join: users that have no tweet in the tweets frame are dropped here.
users = (
    users
    .merge(tweets_per_user, on='user_id')
    .rename(columns={'id': 'count'})
)
users.head(3)

## Summing parameters in users (likes received, retweet and reply received)

In [None]:
# Keep only the user key and the six counters to be totalled per user.
new_tweets = tweets.filter(
    items=['user_id', 'retweet_count', 'reply_count', 'favorite_count',
           'num_hashtags', 'num_urls', 'num_mentions'],
    axis=1,
)

# Per-user totals, indexed by 'user_id'; the merge joins on that index level.
per_user_totals = new_tweets.groupby('user_id').sum()

# errors='raise' makes the rename fail loudly if an expected column is missing.
received_names = {
    'retweet_count': 'retweet_received',
    'reply_count': 'reply_received',
    'favorite_count': 'favorite_received',
    'num_hashtags': 'hashtag_used',
    'num_urls': 'urls_used',
    'num_mentions': 'mentions_used',
}
new_users = users.merge(per_user_totals, on='user_id').rename(
    columns=received_names,
    errors='raise',
)
new_users.head()

In [None]:
# Show the first users that have a missing value in any aggregated column.
aggregated_cols = ['retweet_received', 'reply_received', 'favorite_received',
                   'hashtag_used', 'urls_used', 'mentions_used']
new_users[new_users[aggregated_cols].isna().any(axis=1)].head()

In [None]:
# Mark missing values in these three columns with the sentinel -1.
new_users = new_users.fillna(value={
    'reply_received': -1,
    'retweet_received': -1,
    'statuses_count': -1,
})

In [None]:
# Cast the aggregated counters and 'statuses_count' to integers in a single call.
integer_cols = ['reply_received', 'retweet_received', 'favorite_received',
                'hashtag_used', 'urls_used', 'mentions_used', 'statuses_count']
new_users = new_users.astype(dict.fromkeys(integer_cols, int))

## Ratio

In [None]:
# Average number of likes received per tweet, rounded to two decimals.
# The denominator is 'count' (tweets of the user present in the tweets frame),
# not 'statuses_count'.
new_users['like/statuses'] = (
    new_users['favorite_received'].div(new_users['count']).round(2)
)
users = new_users

# The preview must be the last expression of the cell to be rendered; placed
# before the assignment above it was evaluated and silently discarded.
new_users.head(3)

## Write new file csv

In [None]:
# Persist the enriched users table. The DataFrame index is written as well,
# because to_csv is called with its default index setting.
users.to_csv(path_or_buf='new_users.csv')