In [28]:
import math
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt

from datetime import datetime 
from collections import defaultdict
from scipy.stats import pearsonr

In [49]:
#create a pandas dataframes from csv files
users = pd.read_csv('users.csv')
tweets = pd.read_csv('tweets.csv')

In [None]:
users.info()
users.dtypes

In [None]:
tweets.info()
tweets.dtypes

## Trasformazione delle colonne in tipi

In [3]:
tweets['id'] = pd.to_numeric(tweets['id'], errors='coerce')
tweets['user_id'] = pd.to_numeric(tweets['user_id'], errors='coerce')
tweets['retweet_count'] = pd.to_numeric(tweets['retweet_count'], errors='coerce')
tweets['reply_count'] = pd.to_numeric(tweets['reply_count'], errors='coerce')
tweets['favorite_count'] = pd.to_numeric(tweets['favorite_count'], errors='coerce')
tweets['num_hashtags'] = pd.to_numeric(tweets['num_hashtags'], errors='coerce')
tweets['num_mentions'] = pd.to_numeric(tweets['num_mentions'], errors='coerce')
tweets['num_urls'] = pd.to_numeric(tweets['num_urls'], errors='coerce')
tweets['created_at'] = pd.to_datetime(tweets['created_at'], errors='coerce')

users['id'] = pd.to_numeric(users['id'], errors='coerce')
users['created_at'] = pd.to_datetime(users['created_at'], errors='coerce')

In [None]:
users.lang.unique()

## User analysis 

In [None]:
users['lang'].value_counts().plot(kind='bar', title='Languages count')
plt.yscale('log')
plt.show()

#### Lang Analysis

In [None]:
users['lang'] = users['lang'].str.lower()
users['lang'].value_counts().plot(kind='bar', title='Languages count')

users.rename(columns={
    'id' : 'user_id',
    'created_at' : 'subscribing_date'
}, inplace=True)
nal = users[(users['lang'] == 'select language...') | (users['lang'] == 'xx-lc')]

text_nal = tweets.merge(nal, on='user_id')
text_nal

In [None]:
text_nal[text_nal['name'] == 'Leanne Arker']

Visti i tweet tutti so inglesi

In [None]:
users['lang'].replace('select language...', 'en', inplace=True)
users['lang'].replace('xx-lc', 'en', inplace=True)

### Bot Analysis

Check if there are only 0 and 1 values

In [None]:
users.bot.unique()

### Statues Count analysis

Calculate for each user how much tweets we have in tweets.csv

In [None]:

count = tweets.groupby('user_id', as_index=False)['user_id'].count()

#cnt['count'] = tweets.groupby(by='user_id', as_index=False, dropna=False)['user_id'].count()
c = users
c.sort_values('user_id')
c['count'] = count
c['statuses_count'].sum() - c['count'].sum()

#max = tweets.groupby('user_id', as_index=False).count()
#max['max'] = max.max(axis=0)
#max

In [None]:
c['count'].median()

### User's created at analysis (previously we changed the name of the attribute in 'subscribing_date')

In [None]:
users[users['subscribing_date'] > datetime.now()]

In [None]:
users[users['subscribing_date'] < datetime.strptime("2006-07-15 00:00:00", "%Y-%m-%d %H:%M:%S")]

Non cene di timestamp sbagliati in italia

### Resulting Plots

In [None]:
fig = plt.figure(figsize=(20,23))
fig_dims = (4,2)
fig.subplots_adjust(hspace=0.4, wspace=0.4)

###fig 1
plt.subplot2grid(fig_dims, (0,0))
users['lang'].value_counts().plot(kind='bar', title='Languages count')
plt.yscale('log')

###fig 2
plt.subplot2grid(fig_dims, (0,1))
users['bot'].value_counts().plot(kind='bar', title='Bots count')

plt.show()

## Tweets Analysis

### Drop duplicates

In [44]:
count = tweets['id'].count()
tweets.drop_duplicates(inplace=True)
print("Dropped " + str(count - tweets['id'].count()) + " duplicates")

Dropped 1952099 duplicates


### Replace NaN and 'inf' values in 'id'

In [59]:
tweets[(tweets['id'].isna() == True) | (tweets['id'] == 'inf')]

Unnamed: 0,id,user_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,created_at,text
7801213,inf,1546896846,,,,,0.0,0.0,2020-03-08 08:22:28,Anas 7yo was injured by a rubber bullet (fired...
9706367,,466475273,,,,,,,2019-09-08 07:09:46,
9878261,inf,215693377,,,,,0.0,,2019-11-03 02:51:31,It's not whether you get knocked down; it's wh...
12171674,,87334609,,,,,,,2019-04-13 01:10:44,
13105703,inf,177455411,,,,,0.0,2.0,2020-04-06 15:12:11,@briOKC @NewsOK


In [64]:
tweets.loc[(tweets['id'].isna() == True) , 'id'] = -1
tweets.loc[(tweets['id'] == 'inf'), 'id'] = -1
tweets[tweets['id'] == -1]

Unnamed: 0,id,user_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,created_at,text
7801213,-1,1546896846,,,,,0.0,0.0,2020-03-08 08:22:28,Anas 7yo was injured by a rubber bullet (fired...
9706367,-1,466475273,,,,,,,2019-09-08 07:09:46,
9878261,-1,215693377,,,,,0.0,,2019-11-03 02:51:31,It's not whether you get knocked down; it's wh...
12171674,-1,87334609,,,,,,,2019-04-13 01:10:44,
13105703,-1,177455411,,,,,0.0,2.0,2020-04-06 15:12:11,@briOKC @NewsOK


### Replace wrong values in 'user_id'

In [84]:
len(tweets[(tweets['user_id'].isna() == True)])

217283

In [95]:
tweets.loc[(tweets['user_id'].isna() == True), 'user_id'] = -1
len(tweets[(tweets['user_id'].isna() == True)])

0

### Infer num_hashtags, num_urls and num_mentions

In [96]:
tweets['infer_hashtags'] = tweets.text.str.count('#')
tweets['infer_mentions'] = tweets.text.str.count('@')
tweets['infer_urls'] = tweets.text.str.count('http://')

print("num_hashtags nan: " + str(tweets['num_hashtags'].isna().sum()) + \
      " num_mentions nan: " + str(tweets['num_mentions'].isna().sum()) + \
      " num_urls nan: " + str(tweets['num_mentions'].isna().sum()))

tweets['num_hashtags'] = tweets['num_hashtags'].fillna(tweets['infer_hashtags'])
tweets['num_mentions'] = tweets['num_mentions'].fillna(tweets['infer_mentions'])
tweets['num_urls'] = tweets['num_urls'].fillna(tweets['infer_urls'])

print("num_hashtags nan: " + str(tweets['num_hashtags'].isna().sum()) + \
      " num_mentions nan: " + str(tweets['num_mentions'].isna().sum()) + \
      " num_urls nan: " + str(tweets['num_mentions'].isna().sum()))


num_hashtags nan: 1057524 num_mentions nan: 854165 num_urls nan: 854165
num_hashtags nan: 488757 num_mentions nan: 487992 num_urls nan: 487992


### Replace NaN values in tweets

In [None]:
t = tweets
user_median = t.groupby('user_id', as_index=False).median()
user_median.rename(columns={
    'retweet_count' : 'retweet_median',
    'reply_count' : 'reply_median',
    'favorite_count' : 'favorite_median',
    'num_hashtags' : 'hashtags_median',
    'num_urls' : 'urls_median',
    'num_mentions' : 'mentions_median'
}, inplace=True)
user_median.drop(columns=['id'], inplace=True)
tweets = t.merge(user_median, on='user_id')

print("retweet_count nan: " + str(tweets['retweet_count'].isna().sum()) + \
      ", reply_count nan: " + str(tweets['reply_count'].isna().sum()) + \
      ", favorite_count nan: " + str(tweets['favorite_count'].isna().sum()) + \
      ", num_hashtags nan: " + str(tweets['num_hashtags'].isna().sum()) + \
      ", num_mentions nan: " + str(tweets['num_mentions'].isna().sum()) + \
      ", num_urls nan: " + str(tweets['num_urls'].isna().sum()))

#sostituzione valori
tweets['retweet_count'] = tweets['retweet_count'].fillna(tweets['retweet_median'])
tweets['reply_count'] = tweets['reply_count'].fillna(tweets['reply_median'])
tweets['favorite_count'] = tweets['favorite_count'].fillna(tweets['favorite_median'])
tweets['num_hashtags'] = tweets['num_hashtags'].fillna(tweets['hashtags_median'])
tweets['num_mentions'] = tweets['num_mentions'].fillna(tweets['mentions_median'])
tweets['num_urls'] = tweets['num_urls'].fillna(tweets['urls_median'])

tweets.drop(columns=['retweet_median', 'reply_median', 'favorite_median', 'hashtags_median', 'mentions_median', 'urls_median'])

print("retweet_count nan: " + str(tweets['retweet_count'].isna().sum()) + \
      ", reply_count nan: " + str(tweets['reply_count'].isna().sum()) + \
      ", favorite_count nan: " + str(tweets['favorite_count'].isna().sum()) + \
      ", num_hashtags nan: " + str(tweets['num_hashtags'].isna().sum()) + \
      ", num_mentions nan: " + str(tweets['num_mentions'].isna().sum()) + \
      ", num_urls nan: " + str(tweets['num_urls'].isna().sum()))


### Replacing invalid timestamp

In [None]:
tweets.loc[(tweets.created_at < datetime.strptime("2006-07-15 00:00:00", "%Y-%m-%d %H:%M:%S")) | (tweets.created_at > datetime.now()),\
     'created_at'] = datetime.strptime("2000-01-01 00:00:00", "%Y-%m-%d %H:%M:%S")
tweets[tweets['created_at'] == datetime.strptime("2000-01-01 00:00:00", "%Y-%m-%d %H:%M:%S")]['id'].count()