# Extract information for the speaker landscape

Running this notebook reads in the `word_embedding.emb` and `clean_data.txt` files and extracts the information needed to visualise and analyse the speaker landscape.

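The information is collected in a pandas dataframe and saved to the file `landscape_info.pkl`; it can be read back into a dataframe using `output = pd.read_pickle("landscape_info.pkl")`. The annotating words (hashtags) and their vectors are saved in the same way to `annotations_info.pkl`.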

In [1]:
from gensim.models import KeyedVectors
import pandas as pd
import numpy as np
import umap.umap_ as umap

  from .autonotebook import tqdm as notebook_tqdm


## 1. Create a dataframe with authors, their tweets and vector representations

In [4]:
# Tunable settings read by the cells below.
retain_threshold = 0  # agents need more than this many tweets to be kept; the PAPER used 15
umap_seed = 42  # handed to UMAP as its random_state

In [2]:
# Load the saved embedding; it is queried below for both agent tokens and hashtags.
# NOTE(review): gensim's load is pickle-based, so only open files from a trusted source.
embedding_path = "word_embedding.emb"
embedding = KeyedVectors.load(embedding_path)

In [5]:
# Turn the cleaned text file into a pandas dataframe with one row per agent.
# Each usable line has the form "<agent_token> <tweet text ...>".

quotes = []
agents = []

# Read explicitly as UTF-8: the tweets contain emoji and non-Latin scripts, so
# relying on the platform default encoding can fail or garble the text.
with open("clean_data.txt", "r", encoding="utf-8") as f:
    for line in f:
        tokens = line.strip().split(" ")
        # skip lines that have no tweet text after the first token
        if tokens[1:]:
            agents.append(tokens[0])
            quotes.append(" ".join(tokens[1:]))

df = pd.DataFrame({"author": agents, "quotes": quotes})

# summarise same agents: gather every tweet of an agent into a single list
# (linear, unlike concatenating one-element lists with 'sum')
df = df.groupby("author", as_index=False).agg({"quotes": list})
print("Number of agents in training set: ", len(df.index))

# only keep agents with more than `retain_threshold` tweets; .copy() makes the
# result an independent frame so adding columns below does not write into a slice
df = df[df.quotes.map(len) > retain_threshold].copy()
print("Number of agents with more than " + str(retain_threshold) + " tweets: ", len(df.index))

# store the vector representation of the agent, looked up by its token;
# a token that is absent from the embedding raises a KeyError here
df["vec"] = df["author"].map(lambda author: embedding[author])

Number of agents in training set:  442
Number of agents with more than 0 tweets:  442


In [6]:
# Project the agent vectors down to 2-d with UMAP and attach the result to the dataframe.

vecs = list(df["vec"])
umap_settings = {
    "metric": "cosine",
    "min_dist": 0.01,
    "n_neighbors": 40,
    "random_state": umap_seed,
}
reducer = umap.UMAP(**umap_settings)  # the fitted reducer is reused further down
smaller_vecs = reducer.fit_transform(vecs)

# store one low-dimensional array per row
df["low_dim_vec"] = [point for point in smaller_vecs]

df

Unnamed: 0,author,quotes,vec,low_dim_vec
0,agent_0sternchen,[@mountaindream5 @spaet68er @zuma_monty nur fü...,"[-0.012958794, -0.031758953, -0.028638883, 0.0...","[1.5367218, 6.6103783]"
1,agent_1st_rins,[@zuma_okemaru @1st_rins 1st_rins],"[-0.0073439083, -0.023276674, 0.0032795235, 0....","[3.6051142, 7.071133]"
2,agent_80pfarelo,[@frasimphi @coruscakhaya you will follow this...,"[-0.0310981, -0.10229473, -0.009040911, 0.0498...","[8.01554, 6.591486]"
3,agent___xmo4,[@zuma_okemaru ご飯食べたらあそぼ！],"[-0.005339413, -0.027761472, 8.615251e-05, 0.0...","[4.36039, 6.634388]"
4,agent__africansoil,[💻pres zuma discussion with the top 6 presiden...,"[-0.017641576, -0.09126312, 0.033324387, 0.060...","[5.116426, 4.220506]"
...,...,...,...,...
437,agent_zenande_monegi,[@mugabebobby @flawmade @100kmokone @mightijam...,"[-0.006726697, -0.072259985, -0.008394595, 0.0...","[7.759946, 5.646287]"
438,agent_zukile_lize,[@advobarryroux zuma is the worst to tell us t...,"[-0.008673304, -0.099911675, 0.035729747, 0.05...","[5.5338826, 3.4033587]"
439,agent_zuma0240,[参加型第5人格！！サバイバー達我が勝利への糧となれ！ランク戦まで！ #identityv ...,"[-0.01504922, -0.029157735, -0.011365483, 0.00...","[2.1843588, 6.8459115]"
440,agent_zuma_0807,[2021年2月27日 zuma_0807さんがnew眠しました。 時刻 615 入眠潜時 ...,"[-0.020659124, -0.027711466, -0.012286215, 0.0...","[2.200496, 6.8264565]"


In [45]:
# Persist the agent dataframe (author, quotes, vec, low_dim_vec) to disk.
df.to_pickle(path="landscape_info.pkl")

## 2. Create a dataframe with annotating words and their vector representation

In [46]:
# Use hashtags as an example set of annotating words:
# keep every vocabulary entry that contains a "#" character.

vocab = embedding.index_to_key
hashtags = list(filter(lambda word: "#" in word, vocab))

In [47]:
# Compute the full-size and the 2-d vector representation of every hashtag.
hashtags_vecs = [embedding[h] for h in hashtags]

# Reuse the reducer fitted on the agent vectors so the hashtags are projected
# with the same mapping. An empty hashtag list is handled explicitly:
# np.array([]) is 1-d, which the reducer's input validation rejects.
if hashtags_vecs:
    hashtags_low_dim_vecs = list(reducer.transform(np.array(hashtags_vecs)))
else:
    hashtags_low_dim_vecs = []

In [48]:
# Bundle each hashtag with its two vector representations and save the table to disk.
annotation_columns = {
    "word": hashtags,
    "vec": hashtags_vecs,
    "low_dim_vec": hashtags_low_dim_vecs,
}
df_annotations = pd.DataFrame(annotation_columns)
df_annotations.to_pickle(path="annotations_info.pkl")