In [1]:
import pandas as pd
import plotly.express as px
import gensim
from collections import defaultdict
import numpy as np

ModuleNotFoundError: No module named 'plotly'

# User Level Analysis
The goal of this notebook is to analyze data on a user level.

In [6]:
users1 = pd.read_csv("data/users1.csv")
users2 = pd.read_csv("data/users2.csv")
users3 = pd.read_csv("data/users3.csv")

In [7]:
users = pd.concat([users1, users2, users3])

In [8]:
tweets1 = pd.read_csv("data/tweets1.csv")
tweets2 = pd.read_csv("data/tweets2.csv")
tweets3 = pd.read_csv("data/tweets3_1.csv")
tweets4 = pd.read_csv("data/tweets3_2.csv")
tweets5 = pd.read_csv("data/tweets3_3.csv")


Columns (6,15,19,30) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (15,19,30) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (15) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (1,15,30) have mixed types. Specify dtype option on import or set low_memory=False.



In [9]:
tweets = pd.concat([tweets1, tweets2, tweets3, tweets4, tweets5])

First, we write function to randomly pick a user from our dataset. Since there are a users in our dataset with no recorded tweets, we will continue to choose random users until we find one.

In [10]:
def find_random_user():
    df = None
    while df is None or df.shape[0] == 0:
        user = users.sample(1)["user_display_name"].item()
        df = tweets.loc[tweets["user_display_name"] == user]
    return df

Now, we will visualize the tweet frequency. In order to proceed with the same user in our future investigation, we rerun our find_random_user function until we see English results.

In [23]:
user = find_random_user()
#First, we want to see what time this specific user tweets everyday.

In [25]:
#Visualize tweet times
user_copy = user.copy()
user_copy["tweet_time"] = pd.to_datetime(user_copy["tweet_time"]).apply(lambda x: int(x.hour))
tweet_times = user_copy.groupby("tweet_time")["userid"].count().to_frame().reset_index()
tweet_times.columns = ["hour", '# of tweets']
fig = px.line(tweet_times, x="hour", y="# of tweets")
fig.update_layout(
    title_text="@"+user["user_display_name"].iloc[0]+" Tweet Times", 
    font=dict(
        family="Times",
        size=18,
        color="#7f7f7f"
    ))
fig.show()

The goal of this part was to look for peaks of the day in which a user tweets. We expect a normal user to tweet uniformly between a 14-18 hour period.

Next, we will look at the content of the tweets, looking at the variance .

In [26]:
user["tweet_text"].head()

724983     RT @sheeraf: Here’s a story about what it took...
798480     RT @SpursCoyote: With the guys for a good caus...
916678                               https://t.co/zPgFBK1uhk
1003890                              https://t.co/RV4J8w2Tc7
1089139                              https://t.co/9W4AinvklK
Name: tweet_text, dtype: object

At first glance, it seems like this user many of the users tweet in multiple languages. We will be using the gensim package, so first we will preprocess the data.

In [27]:
# first, remove stop words
stop_words = pd.read_csv("data/stop_words.csv")["words"].tolist()
cleaned_tweets = user["tweet_text"].apply(lambda tweet: [x.lower() for x in [x.replace(",", "").replace(".", "").replace("!", "").replace("?", "").replace(" ", "").replace("*", "").replace('"', '').strip().lower() for x in tweet.split() if x not in stop_words] if x != ''])
cleaned_tweets = cleaned_tweets.tolist()

To further clean our data, we remove words with only 1 frequency, as those will likely not be as significant when we calculate similarity between documents.

In [28]:
# remove words with only 1 frequency
frequency = defaultdict(int)
for tweet in cleaned_tweets:
    for token in tweet:
        frequency[token] += 1

In [29]:
texts = [
    [token for token in tweets if frequency[token] > 1]
    for tweets in cleaned_tweets
]
dictionary = gensim.corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

We've preprocessed the data, but knowing that there may be stop words we have missed, that there are tweets in other languages, and that users (particularly this user) tend to repeat words, we will transform our corpus using tf-idf.

In [30]:
tfidf = gensim.models.TfidfModel(corpus, smartirs='ntc')

In [31]:
for doc in tfidf[corpus]:
    print([[dictionary[token], np.around(freq, 2)] for token, freq in doc])

[['rt', 0.01], ['story', 0.88], ['talk', 0.48]]
[['rt', 0.01], ['#gospursgo', 0.48], ['good', 0.6], ['with', 0.64]]
[]
[]
[]
[]
[]
[['rt', 0.01], ['22', 0.46], ['3', 0.46], ['5', 0.42], ['@okcthunder:', 0.32], ['https://tco…', 0.46], ['in', 0.31]]
[['rt', 0.01], ['@okcthunder:', 0.45], ['#nationalhandshakeday', 0.6], ['partner', 0.66]]
[['rt', 0.01], ['@okcthunder:', 0.3], ['christmas', 0.43], ['nba', 0.43], ['night', 0.4], ['opening', 0.43], ['thunder', 0.43]]
[['rt', 0.01], ['@karaswisher:', 0.45], ['francisco', 0.41], ['i', 0.29], ['love', 0.45], ['san', 0.37], ['strong', 0.45]]
[['rt', 0.02], ['@karaswisher:', 1.0]]
[['rt', 0.01], ['5', 0.36], ['https://tco…', 0.39], ['@cavs', 0.36], ['@kingjames', 0.34], ['game', 0.29], ['pts', 0.36], ['when', 0.39], ['|', 0.32]]
[['rt', 0.01], ['@soundcloud:', 0.32], ['first', 0.57], ['get', 0.34], ['soundcloud', 0.4], ['start', 0.37], ['your', 0.4]]
[['rt', 0.03], ['@alfredburnejon1:', 1.0]]
[['rt', 0.0], ['get', 0.23], ['fast', 0.24], ['powerfu

In [32]:
tfidf_corpus = tfidf[corpus]

In [34]:
index = gensim.similarities.MatrixSimilarity(tfidf_corpus, num_features=len(dictionary))
sims = index[tfidf_corpus]

In [49]:
sims[0]

array([1.00000000e+00, 1.05164370e-04, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 6.89509179e-05,
       9.88897591e-05, 6.52322196e-05, 6.78398792e-05, 1.50307780e-04,
       5.91925091e-05, 6.01213396e-05, 2.58653890e-04, 3.96803407e-05,
       4.31218323e-05, 1.62426368e-04, 7.87234167e-05, 5.76148268e-05,
       1.07979067e-04, 1.01511963e-04, 5.52076453e-05, 6.93856200e-05,
       6.70188820e-05, 4.45996520e-05, 8.12133367e-05, 4.76690911e-05,
       9.56076183e-05, 7.42952034e-05, 5.81770582e-05, 1.22467914e-04,
       8.80326697e-05, 8.21931026e-05, 8.20923087e-05, 2.55564004e-01,
       8.46294861e-05, 8.48926647e-05, 1.73791428e-04, 7.04605191e-05,
       2.03243835e-04, 7.41878684e-05, 6.21148502e-05, 8.23514929e-05,
       8.16735337e-05, 5.80983469e-05, 8.95843768e-05, 6.56869452e-05,
       1.12447677e-04, 6.70177033e-05, 4.36484406e-05, 8.15375897e-05,
       2.64765695e-04, 1.02267681e-04, 1.74277069e-04, 1.31361783e-04,
      