In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the tweet archive, indexed by tweet id, with the 'date' column parsed
# as datetimes, then preview five random rows.
data = pd.read_csv(
    "tweets_01-08-2021.csv",
    index_col="id",
    parse_dates=['date'],
)
data.sample(5)

In [None]:
# Recode the 'f'/'t' markers of the three flag columns as 0/1.
data = data.replace({
    column: {'f': 0, 't': 1}
    for column in ('isRetweet', 'isDeleted', 'isFlagged')
})

In [None]:
# (number of rows, number of columns) of the tweet table
data.shape

In [None]:
from nltk.tokenize import RegexpTokenizer
import re

# Turn every tweet into a list of lowercase tokens after stripping noise.
text = data.text.to_list()
processed_tweets = []
# Raw strings keep the regex escapes (\w, \S, \/ ...) intact without relying on
# Python passing unknown string escapes through, which is deprecated.
# Tokens are runs of word characters, or otherwise runs of non-space characters.
tokenizer = RegexpTokenizer(r'\w+|\S+')
# Removed entirely: URLs, the ellipsis character, a leading "RT" marker and
# double quotes (curly and straight).
noise_pattern = re.compile(r'(https?:[\w\/\.\d]+)|…|(^RT)|“|”|"')
# "&amp" with an optional trailing ";" is spelled out as "and".
ampersand_pattern = re.compile(r"&amp;?")
for tweet in text:
    tweet = noise_pattern.sub("", tweet)
    tweet = ampersand_pattern.sub("and", tweet)
    processed_tweets.append(tokenizer.tokenize(tweet.lower()))
        

In [None]:
# Plain whitespace split of one raw tweet, for eyeballing against the
# tokenizer output stored at the same position in processed_tweets.
data.iloc[6].text.split()

In [None]:
# Adapted from:
# https://www.kdnuggets.com/2019/11/markov-chains-train-text-generation.html
#
# Build a k-th order Markov model over the tokenized tweets: every row of the
# matrix is a chain of k consecutive tokens, every column is a token, and each
# cell counts how often that token directly followed that chain.

# chains of 2 words
k = 2
# per tweet: every chain of k consecutive tokens, in order of appearance
sets_of_k_words = []

# create chains with k-length
# A tweet with n tokens has n-k+1 chains (none when n < k). An explicit range
# is used instead of the slice tweet[:-k+1], which collapses to tweet[:0]
# (no chains at all) when k == 1.
for tweet in processed_tweets:
    sets_of_k_words.append([' '.join(tweet[i:i+k]) for i in range(len(tweet) - k + 1)])

from scipy.sparse import dok_matrix
# unique chains; sorted so that row indices do not depend on set iteration order
distinct_sets_of_k_words = sorted({chain for s in sets_of_k_words for chain in s})
# unique words; sorted so that column indices do not depend on set iteration order
distinct_words = sorted({elem for l in processed_tweets for elem in l})
# sparse matrix: rows - k-length sequence, columns - all possible words in tweets
next_after_k_words_matrix = dok_matrix((len(distinct_sets_of_k_words), len(distinct_words)), dtype=np.uint16)

# to look up the index of a chain (row) for the matrix
k_words_idx_dict = {word: i for i, word in enumerate(distinct_sets_of_k_words)}
# to look up the index of a word (column) for the matrix
word_idx_dict = {word: i for i, word in enumerate(distinct_words)}

# for each chain of each tweet, find the word that follows it and
# increment the count for that (chain, word) pair
for i, set_of_k_words in enumerate(sets_of_k_words):
    # Only the last chain of a tweet has no following word, so exactly one
    # chain is skipped. The former slice [:-k+1] did that only for k == 2 and
    # threw away valid transitions at the end of every tweet for larger k.
    for j, k_word in enumerate(set_of_k_words[:-1]):
        # index for a row (chain)
        word_sequence_idx = k_words_idx_dict[k_word]
        # index for a column: the token right after the chain starting at j
        next_word_idx = word_idx_dict[processed_tweets[i][j+k]]
        next_after_k_words_matrix[word_sequence_idx, next_word_idx] += 1

In [None]:
# simple case with 0 alpha and fixed length

from scipy.sparse import csr_matrix
from numpy.random import choice

def sample_next_word_after_sequence(word_sequence, alpha = 0):
    """Sample the word that follows ``word_sequence`` from the observed counts.

    NOTE: a later cell of this notebook redefines this function name and
    gives ``alpha`` a different meaning there.

    Args:
        word_sequence: A chain of k space-joined tokens. It must be a key of
            ``k_words_idx_dict``; an unknown chain raises ``KeyError``.
        alpha: Constant added to the count of every word before normalising.
            With the default 0 only observed followers can be drawn.

    Returns:
        The sampled word, or "" when every weight is zero (the chain was
        never followed by anything and ``alpha`` is 0).
    """
    row = next_after_k_words_matrix[k_words_idx_dict[word_sequence]]
    # Work on the dense row: adding a scalar to the sparse row would visit
    # every column in Python anyway, and the row is densified for sampling.
    counts = row.toarray().ravel().astype(float) + alpha
    total = counts.sum()
    # if no words possible - terminate (checked before dividing so that an
    # empty row never triggers a division by zero)
    if total == 0.0:
        return ""
    return choice(distinct_words, p=counts / total)
    
def stochastic_chain(seed, chain_length=10, seed_length=k):
    """Generate text by repeatedly sampling the next word after the last words.

    NOTE: a later cell of this notebook redefines this function name with a
    different signature.

    Args:
        seed: Starting chain, words separated by single spaces.
        chain_length: Maximum number of words to append to the seed.
        seed_length: Required number of words in ``seed``; defaults to the
            value ``k`` had when this function was defined.

    Returns:
        The seed followed by the generated words, separated by single spaces.
        Generation stops early when no next word can be sampled.

    Raises:
        ValueError: If ``seed`` does not contain exactly ``seed_length`` words.
    """
    current_words = seed.split(' ')
    if len(current_words) != seed_length:
        raise ValueError(f'wrong number of words, expected {seed_length}')

    # Collect the words and join once at the end; appending the separator
    # before knowing whether a word follows left a trailing space on
    # sentences that terminated early.
    generated = []
    for _ in range(chain_length):
        next_word = sample_next_word_after_sequence(' '.join(current_words))
        if next_word == "":
            break
        generated.append(next_word)
        # slide the window: drop the oldest word, append the new one
        current_words = current_words[1:]+[next_word]
    return ' '.join([seed, *generated])
  
# Demo: start from a randomly chosen known chain and append up to 15 words.
stochastic_chain(choice(distinct_sets_of_k_words), chain_length=15)

In [None]:
# generating CDF
import matplotlib.pyplot as plt

# Token count of every processed tweet, in ascending order.
lens = np.array([len(tokens) for tokens in processed_tweets])
lens.sort()
# longest tweet, in tokens
print(lens[-1])

def gen_prob(val, lens=lens):
    """Return the fraction of entries in ``lens`` that are strictly below ``val``.

    Args:
        val: Length to evaluate.
        lens: Lengths sorted in ascending order; defaults to the sorted
            tweet lengths as they were when this function was defined.

    Returns:
        ``i / len(lens)`` where ``i`` is the position of the first entry that
        is >= ``val``; 1.0 when no entry reaches ``val`` or ``lens`` is empty.
    """
    if len(lens) == 0:
        return 1.00
    # lens is sorted, so a binary search finds the first entry >= val without
    # scanning the whole array in Python.
    first_not_below = np.searchsorted(lens, val, side="left")
    return float(first_not_below) / len(lens)
 
fig, ax = plt.subplots()
# Empirical CDF of tweet length: lens is sorted, so the i-th shortest tweet
# (1-based) sits at cumulative probability i / n. The previous curve,
# cumsum(lens) / sum(lens), showed the cumulative share of tokens by tweet
# rank, which is not the distribution named in the title.
ax.plot(lens, np.arange(1, len(lens) + 1) / len(lens))
ax.set_title("CDF for tweet length")
ax.set_xlabel("tweet length (tokens)")
ax.set_ylabel("fraction of tweets")
ax.set_xlim(0, lens[-1])
ax.set_ylim(0, 1)
plt.show()

In [None]:
from numpy.random import random
from nltk.tokenize import sent_tokenize

def sample_next_word_after_sequence(word_sequence, alpha = 0):
    """Sample the word that follows ``word_sequence``.

    Args:
        word_sequence: A chain of k space-joined tokens.
        alpha: Probability of ignoring the chain and returning a word drawn
            uniformly from the whole vocabulary instead.

    Returns:
        The sampled word, or "" when the chain is unknown or was never
        followed by any word (callers treat "" as "stop generating").
    """
    # generate a random word by chance; strict '<' so that alpha == 0 can
    # never trigger it, even if random() returns exactly 0.0
    if random() < alpha:
        # choice(n) draws from 0..n-1, so pass the full length: with
        # len(...) - 1 the last word could never be picked.
        return distinct_words[choice(len(distinct_words))]
    # A chain that never occurred in the data (likely right after a random
    # word was inserted) has no row; stop instead of raising KeyError.
    row_idx = k_words_idx_dict.get(word_sequence)
    if row_idx is None:
        return ""
    counts = next_after_k_words_matrix[row_idx].toarray().ravel()
    total = counts.sum()
    # no observed follower: stop, and avoid dividing by zero
    if total == 0:
        return ""
    return choice(distinct_words, p=counts / total)

def stochastic_chain(seed, alpha=0):
    """Generate a post-processed sentence starting from ``seed``.

    Words are sampled one at a time from the chain formed by the last k
    words. Generation stops when no next word can be sampled, or after a
    sentence-ending token (".", "!" or "?") with probability
    ``gen_prob(<number of space-separated words so far>)``.

    Args:
        seed: Starting words separated by single spaces. If it does not
            contain exactly k words, a random known chain beginning with
            these words is used instead; an empty seed picks any chain.
        alpha: Passed through to ``sample_next_word_after_sequence``.

    Returns:
        The generated text after ``postprocess``.

    Raises:
        ValueError: If no known chain starts with ``seed``.
    """
    # if fewer (or more) than k words were provided
    if len(seed.split(' ')) != k:
        # complete the chain; match whole words only, since a bare prefix
        # test would let 'twitter' also select chains such as 'twitters are'
        prefix = seed + ' ' if seed else ''
        possible_words = [s for s in distinct_sets_of_k_words if s.startswith(prefix)]
        if not possible_words:
            raise ValueError(f'no known chain of {k} words starts with {seed!r}')
        seed = choice(possible_words)
    current_words = seed.split(' ')
    sentence = seed

    # NOTE: the loop ends only via the two returns below.
    while True:
        next_word = sample_next_word_after_sequence(' '.join(current_words), alpha)
        if next_word == "":
            return postprocess(sentence)
        sentence += ' ' + next_word
        # after sentence-ending punctuation, stop with a probability that
        # grows with the current length
        if next_word in (".", "!", "?") and random() <= gen_prob(len(sentence.split())):
            return postprocess(sentence)
        # slide the window: drop the oldest word, append the new one
        current_words = current_words[1:]+[next_word]
        
def postprocess(sent):
    """Tidy generated text sentence by sentence.

    The text is split with ``sent_tokenize``; in each sentence the space in
    front of a final ".", "?" or "!" is removed and the sentence is passed
    through ``str.capitalize``. The sentences are re-joined with single spaces.
    """
    tidied = (
        re.sub(r" ([.?!])$", r"\1", piece).capitalize()
        for piece in sent_tokenize(sent)
    )
    return " ".join(tidied)
        
# Demo: seed with a single word; alpha is forwarded to the sampler as the
# chance of inserting a uniformly random word at each step.
stochastic_chain('twitter', alpha=0.001)

In [None]:
# Funny generated samples noted down for k=2 (bare strings kept for reference).
'@ojmart thank you @stevedaines for being named , by virtue of the debate . no better person for the people ! love the state of michigan .'
'wonderful president was greatly helped by tariffs from china . we will hopefully come through with his candidacy ! #maga #imwithyou '
'Democrats pushing the radical left! #sc01'
'Obama fair and square. 5 :30 pm est on @nbc. Enjoy! @foxnews just wrote a story on women.'
'Obama betrays israel yet again! #trump2016 ~ @usplaymoney @realdonaldtrump @beny_benson'
'Republicans prepare to #kag!'

In [None]:
# Generated samples noted down for k=3 (bare strings kept for reference).
"After 2 years of action – it ’s a primary record for a sitting president , you 're off to a good provider"
'Forget that joe biden did in 47 years. A vicious killer who destroyed so many great things ,he has woke america up and people are talking about.'

In [None]:
# Generated sample noted down for k=4 (bare string kept for reference).
'Pelosi says she got set up by the obama administration.'