In [1]:
import pandas as pd
import plotly.express as px
import gensim
from collections import defaultdict
import numpy as np
import json

# User Level Analysis
The goal of this notebook is to analyze the data at the user level.

In [2]:
# The user table ships as three CSV parts; read each part into its own frame.
user_paths = (
    "../data/users1.csv",
    "../data/users2.csv",
    "../data/users3.csv",
)
users1, users2, users3 = (pd.read_csv(path) for path in user_paths)

In [3]:
# Stack the three user parts into one frame. Each part keeps its own row
# labels (pd.concat default), so index labels repeat across parts; select rows
# by column values rather than by index label.
users = pd.concat([users1, users2, users3])

In [4]:
# The tweet table ships as five CSV parts (the third archive is itself split
# into three files); read each part into its own frame.
# NOTE(review): the captured output below looks like the tail of a pandas
# warning raised while reading these files - confirm whether explicit dtypes
# should be passed to read_csv.
tweet_paths = (
    "../data/tweets1.csv",
    "../data/tweets2.csv",
    "../data/tweets3_1.csv",
    "../data/tweets3_2.csv",
    "../data/tweets3_3.csv",
)
tweets1, tweets2, tweets3, tweets4, tweets5 = (pd.read_csv(path) for path in tweet_paths)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# Stack the five tweet parts into one frame. As with pd.concat's default, each
# part keeps its own row labels, so index labels are not unique in `tweets`.
tweets = pd.concat([tweets1, tweets2, tweets3, tweets4, tweets5])

First, we write a function to randomly pick a user from our dataset. Since some users in our dataset have no recorded tweets, the function only returns a user who has at least one.

In [12]:
def find_random_user(random_state=None):
    """Return all tweets of one randomly chosen user who has at least one tweet.

    Users are matched to tweets through the ``user_display_name`` column of the
    module-level ``users`` and ``tweets`` frames. The draw is uniform over the
    rows of ``users`` whose display name occurs in ``tweets``, which is the
    same distribution as re-sampling until a user with tweets is found, but it
    needs a single pass and cannot loop forever.

    Parameters
    ----------
    random_state : int, numpy Generator/RandomState or None, optional
        Forwarded to ``Series.sample`` so the pick can be reproduced.
        ``None`` (the default) draws a different user on each call.

    Returns
    -------
    pandas.DataFrame
        The rows of ``tweets`` whose ``user_display_name`` equals the sampled
        user's display name.

    Raises
    ------
    ValueError
        If no user in ``users`` has any recorded tweet.
    """
    display_names = users["user_display_name"]
    # isin() treats NaN as matching NaN, whereas `==` never matches it, so
    # missing display names are excluded explicitly.
    has_tweets = display_names.notna() & display_names.isin(tweets["user_display_name"])
    candidates = display_names[has_tweets]
    if candidates.empty:
        raise ValueError("no user in `users` has any recorded tweets")
    chosen_name = candidates.sample(1, random_state=random_state).item()
    # NOTE(review): display names may not be unique per account; if they are
    # not, this returns the tweets of every account sharing the name - confirm
    # whether matching on `userid` would be more appropriate.
    return tweets.loc[tweets["user_display_name"] == chosen_name]

Now, we will visualize the tweet frequency. In order to proceed with the same user in our future investigation, we rerun our find_random_user function until we see English results.

In [13]:
# NOTE: despite its name, `user` holds the tweets DataFrame returned by
# find_random_user() (all tweets of the sampled user), not a row of `users`.
user = find_random_user()
# First, we want to see at what time of day this specific user tweets.

In [None]:
#Visualize tweet times
user_copy = user.copy()
# Reduce each timestamp to its hour of day (0-23). Later cells reuse
# user_copy["tweet_time"] as an hour, so the column name is kept.
user_copy["tweet_time"] = pd.to_datetime(user_copy["tweet_time"]).dt.hour
# Count tweets per hour, then reindex over all 24 hours so that hours without
# any tweets are plotted as 0 instead of being left out (the line chart would
# otherwise connect the neighbouring hours straight across the gap).
tweet_times = (
    user_copy.groupby("tweet_time")["userid"]
    .count()
    .reindex(range(24), fill_value=0)
    .rename_axis("hour")
    .reset_index(name="# of tweets")
)
fig = px.line(tweet_times, x="hour", y="# of tweets")
fig.update_layout(
    title_text="@"+user["user_display_name"].iloc[0]+" Tweet Times",
    font=dict(
        family="Times",
        size=18,
        color="#7f7f7f"
    ))
fig.show()

The goal of this part was to look for the times of day at which a user's tweeting peaks. We expect a normal user to tweet roughly uniformly across a 14-18 hour period.

Next, we will look at the content of the tweets and at the variance of the tweet times.

In [None]:
# Peek at the first few tweet texts of the sampled user.
user["tweet_text"].head()

In [None]:
# Variance of the sampled user's tweet hours (tweet_time was reduced to the
# hour of day in the plotting cell above).
user_copy.tweet_time.var()

In [None]:
# List the columns available in the combined tweets frame.
tweets.columns

Let's look at the tweet-time variance per user (using the hour of day to determine the variance in each user's tweeting habits). First, let's drop all the columns we don't need for this.

In [None]:
# Keep only the identifiers and the timestamp needed for the per-user
# tweet-time analysis.
id_time_columns = ['tweetid', 'userid', 'tweet_time']
tweets_id = tweets.loc[:, id_time_columns]
tweets_id.shape

In [None]:
# Histogram of the sampled user's tweet hours.
user_copy.tweet_time.hist()

In [None]:
# Replace the raw timestamp with the hour of day (0-23) in which the tweet was
# posted. The source is always the untouched `tweets` frame, so re-running
# this cell gives the same result.
tweet_timestamps = pd.to_datetime(tweets['tweet_time'])
tweets_id['tweet_time'] = tweet_timestamps.dt.hour
tweets_id['tweet_time']

In [None]:
# Variance of the tweet hour for every user, computed once and reused for both
# the histogram and the printed average.
hour_variance_by_user = tweets_id.groupby('userid').tweet_time.var()
hour_variance_by_user.hist()
print('average user variance from this df is: ' + str(hour_variance_by_user.mean()))

The variance seems to be pretty low. Let's compare this to the user data we got from https://botometer.iuni.iu.edu/bot-repository/datasets.html (verified_2019).

In [None]:
# Number of non-null tweetid / userid entries for each hour of the day.
tweets_id.groupby('tweet_time').count()

In [None]:
# load json
# Read the file as UTF-8 explicitly: without an encoding, open() falls back to
# the platform default, which can fail or mis-decode non-ASCII tweet text.
with open('../data/verified-2019_tweets.json', encoding='utf-8') as f:
    verified_json = json.load(f)

In [None]:
# Keep only the embedded 'user' object of every record.
# NOTE: this rebinds `verified_json`, so the cell is not idempotent - reload
# the JSON file before running it a second time.
verified_json = [tweet_record['user'] for tweet_record in verified_json]

In [None]:
# One row per extracted 'user' object (so, despite the name, this frame holds
# user records rather than tweets).
tweets_verified = pd.DataFrame(verified_json)
tweets_verified

At first glance, it seems like many of the users tweet in multiple languages. We will be using the gensim package, so first we will preprocess the data.

In [36]:
# first, remove stop words
# This cell has to run: the following cells read `cleaned_tweets`, which is
# defined nowhere else in the notebook.
stop_words = set(
    pd.read_csv("../data/stop_words.csv")["words"].dropna().astype(str).str.lower()
)
# Characters stripped from every token before it is compared or counted.
PUNCTUATION_TABLE = str.maketrans("", "", ',.!?*"')


def clean_tweet(tweet):
    """Split a tweet into lower-cased tokens without punctuation or stop words.

    Each whitespace-separated word has the characters , . ! ? * " removed and
    is lower-cased. Tokens that end up empty or that appear in `stop_words`
    are dropped. The stop-word check runs on the normalised token, so "The"
    and "the," are filtered just like "the".

    Non-string input (e.g. NaN for a missing tweet text) yields an empty list.
    """
    if not isinstance(tweet, str):
        return []
    normalised = (
        word.translate(PUNCTUATION_TABLE).strip().lower() for word in tweet.split()
    )
    return [token for token in normalised if token and token not in stop_words]


cleaned_tweets = user["tweet_text"].apply(clean_tweet).tolist()

To further clean our data, we remove words that occur only once, as those will likely not be significant when we calculate the similarity between documents.

In [None]:
# Count how often each token occurs across all cleaned tweets; the next cell
# uses these counts to drop tokens that occur only once.
frequency = defaultdict(int)
for word in (tok for tweet_tokens in cleaned_tweets for tok in tweet_tokens):
    frequency[word] += 1

In [None]:
# Keep only tokens seen more than once, then build the gensim dictionary and
# the bag-of-words corpus from what is left.
texts = [
    [word for word in tweet_tokens if frequency[word] > 1]
    for tweet_tokens in cleaned_tweets
]
dictionary = gensim.corpora.Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))

We've preprocessed the data, but knowing that there may be stop words we have missed, that there are tweets in other languages, and that users (particularly this user) tend to repeat words, we will transform our corpus using tf-idf.

In [None]:
# Fit a tf-idf model on the bag-of-words corpus.
# NOTE(review): smartirs='ntc' is SMART notation; per the gensim docs this
# should mean raw term frequency, idf weighting and cosine normalisation -
# confirm against the installed gensim version.
tfidf = gensim.models.TfidfModel(corpus, smartirs='ntc')

In [None]:
# Print every document as [token, weight] pairs, with weights rounded to two
# decimals.
for weighted_doc in tfidf[corpus]:
    print([[dictionary[term_id], np.around(weight, 2)] for term_id, weight in weighted_doc])

In [None]:
# Apply the fitted tf-idf model to the bag-of-words corpus.
tfidf_corpus = tfidf[corpus]

In [None]:
# Build a similarity index over the tf-idf documents and query it with the
# same documents, giving the similarity of every tweet to every other tweet.
vocabulary_size = len(dictionary)
index = gensim.similarities.MatrixSimilarity(tfidf_corpus, num_features=vocabulary_size)
sims = index[tfidf_corpus]

In [None]:
# Similarity scores of the first tweet against every tweet in the corpus.
sims[0]

In [20]:
# For every tweet_language value, count the non-null entries in each of the
# remaining columns.
tweet_language_count = tweets.groupby('tweet_language').count()

In [28]:
#tweet_language_count['tweetid'].sort_values(ascending=False).to_dict()

In [30]:
# List the columns available in the combined users frame.
users.columns

Index(['userid', 'user_display_name', 'user_screen_name',
       'user_reported_location', 'user_profile_description',
       'user_profile_url', 'follower_count', 'following_count',
       'account_creation_date', 'account_language'],
      dtype='object')

In [46]:
# Show the account_language column of the users frame.
users.account_language

0       zh-cn
1       zh-cn
2       zh-cn
3       zh-cn
4       zh-cn
        ...  
4296    zh-cn
4297       en
4298    zh-cn
4299       en
4300       en
Name: account_language, Length: 5241, dtype: object

In [37]:
# Tweets whose tweet_language is 'in'.
# NOTE(review): 'in' is presumably the language code used for Indonesian in
# this dataset - confirm.
is_indonesian = tweets['tweet_language'].eq('in')
tweets_indonesian = tweets[is_indonesian]

In [6]:
# Tweets whose language differs from the language recorded on the posting
# account. Rows where either language is missing are excluded: in pandas
# `NaN != anything` evaluates to True, so without this guard every tweet with
# a missing language would be counted as a mismatch.
has_both_languages = tweets['tweet_language'].notna() & tweets['account_language'].notna()
languages_differ = tweets['tweet_language'] != tweets['account_language']
mismatched_users = tweets[has_both_languages & languages_differ]
# NOTE(review): account_language contains region-qualified codes such as
# 'zh-cn'. If tweet_language uses bare codes (e.g. 'zh'), those tweets still
# compare as mismatched - confirm whether only the primary language subtag
# should be compared.

In [13]:
# Distinct user ids that have at least one mismatched tweet (the group keys of
# a per-user grouping).
per_user_counts = mismatched_users.groupby('userid').count()
mismatched_userid = per_user_counts.index.tolist()

In [17]:
# Account rows of the users with at least one mismatched tweet, shown next to
# the size of the full user table for comparison.
is_mismatched_account = users['userid'].isin(mismatched_userid)
mismatched_user_acc = users[is_mismatched_account]
mismatched_user_acc.shape, users.shape

((5179, 10), (5241, 10))

In [40]:
# For every userid, count the non-null entries in each of the other columns
# (the tweetid column gives the number of tweets per user).
tweets.groupby('userid').count()

Unnamed: 0_level_0,tweetid,user_display_name,user_screen_name,user_reported_location,user_profile_description,user_profile_url,follower_count,following_count,account_creation_date,account_language,...,latitude,longitude,quote_count,reply_count,like_count,retweet_count,hashtags,urls,user_mentions,poll_choices
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
54107005,74836,74836,74836,0,0,0,74836,74836,74836,74836,...,74836,74836,74836,74836,74836,74836,71489,71489,74836,0
109273574,1698,1698,1698,1698,1698,0,1698,1698,1698,1698,...,1698,1698,1698,1698,1698,1698,1516,1698,1698,0
534528879,593,593,593,593,593,0,593,593,593,593,...,593,593,593,593,593,593,547,552,593,0
824772841,39569,39569,39569,39569,39569,0,39569,39569,39569,39569,...,39569,39569,39567,39567,39567,39567,39569,39569,39569,0
907691540,321,321,321,321,321,0,321,321,321,321,...,321,321,321,321,321,321,321,321,321,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ztZzXBmqhWF5N2qG+b91QU65UAZemBn2hspUFwJjP5k=,6,6,6,0,0,0,6,6,6,6,...,6,6,6,6,6,6,6,6,6,0
zunmJ76UoF+3kyYtRDKq+ODXXMYv6xx9hkgncIy1vA=,14,14,14,14,14,0,14,14,14,14,...,14,14,14,14,14,14,14,14,14,0
zvf1RuTkvS2pnDSXmAnXeaZghEnVSp02gJ2wlW6Z2C0=,34,34,34,0,0,0,34,34,34,34,...,34,34,34,34,34,34,34,34,34,0
zx0B5rBTzMBRZWTqQmMEpRThexa1SKOqv5gPiv86zB4=,97,97,97,0,0,0,97,97,97,97,...,97,97,97,97,97,97,97,97,97,0
