# __Tweet Generator using Markov's Chain__

In [1]:
import numpy as np

def get_sorted_vocab(words): return sorted(np.unique(words))
    
def get_encoding(words): 
    vocabulary_sorted = get_sorted_vocab(words)
    rank = 0
    vocabulary_code = {}
    for word in vocabulary_sorted:
        vocabulary_code[word] = rank
        rank+=1
    return vocabulary_code

def _get_first_key(val, my_dict):
    for key, value in my_dict.items():
         if val == value:
            return key
    return "key not found"

def _get_all_keys(val, my_dict):
    keys = []
    for key, value in my_dict.items():
         if val == value:
            keys.append(key)
    return keys

def encode(words, vocabulary_code):
    return list(map(lambda w: vocabulary_code.get(w), words))

def decode(codes, encoding):
    return list(map(lambda w: _get_first_key(w, encoding), codes))


## __Public Tweet Dataset__

In [2]:
import pandas as pd

tweets = pd.read_csv('data/training.1600000.processed.noemoticon.zip', compression='zip', encoding='latin1')
tweets.columns = ['target','ids','date','flag','user','text']
tweets.head(5)

# Data Source: https://www.kaggle.com/kazanova/sentiment140

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [3]:
# Possible Extentions
# Utilize hastags and mentions

def preprocessing(text):
    # Remove URLs
    processed_text = re.sub(r'http\S+', '', text)
    processed_text = processed_text.replace(".", " ") \
                                    .replace("-", " ")\
                                    .replace("0", "")\
                                    .replace("1", "")\
                                    .replace("2", "")\
                                    .replace("3", "")\
                                    .replace("4", "")\
                                    .replace("5", "")\
                                    .replace("6", "")\
                                    .replace("7", "")\
                                    .replace("8", "")\
                                    .replace("9", "")
    return processed_text.lower()

## __Text Preprocessing__

In [4]:
# Text Cleaning

import string
import re

from collections import Counter

tweets['processed_text'] = tweets.text.apply(preprocessing)
raw_text = ' '.join(list(tweets.processed_text.sample(frac=0.0001, replace=False)))

# Text Tokenization
raw_words = raw_text.split()

word_counter = Counter(raw_words)

single_words = _get_all_keys(1, word_counter)

# To avoid pitfall of sinks in markov chains
words = raw_words + single_words
print(f'No. of words = {len(words)}')
print(f'No. of unique words = {len(set(words))}')

No. of words = 2633
No. of unique words = 936


In [5]:
raw_text[0:500]

"@alistardean  off to ocean terminal   will tweet later all     @casper that'll be cool    hihi    but i think eagles like cats too   we failed that song  probably gonna head up to may fest with @kiravonsutra after lunch  good mood gone (it better comes back soon) right now i feel rubbish     i miss you so much joe  @bluorchid where did you get the dr pepper, i love it but can't find it anywhere anymore  @bellasoul the inside of my house was much more colder than outside, too  @mediamadam thank y"

## __Text Encoding and Transition Matrix__

In [6]:
# Credit: https://stackoverflow.com/questions/46657221/generating-markov-transition-matrix-in-python
def get_transition_matrix(transitions):
    n = 1 + max(transitions) # no. of states

    M = [[0]*n for _ in range(n)]
    for (i,j) in zip(transitions,transitions[1:]):
        M[i][j] += 1

    # Extracting probabilities
    for row in M:
        s = sum(row)
        if s > 0:
            row[:] = [f/s for f in row]
    return M


print('Text encoding with sorted word\'s rank')
encoding = get_encoding(words)
words_encoded = encode(words, encoding)

print('Building transition matrix ....')
p = get_transition_matrix(words_encoded)

print('Running sanity check of transition matrix (p)')
for row in p: 
    if sum(row)== 0:
        print(row)
        raise Exception('Try to remove last word from text')

print('SUCCESS! Dataset ready ....')
# Edge Case: 
# If the frequency of last word in text is 1 then this condition would not be satisfied
# The "p" (transition matrix) parameter rows must sum to 1.


Text encoding with sorted word's rank
Building transition matrix ....
Running sanity check of transition matrix (p)
SUCCESS! Dataset ready ....


## __Building Markov Chain__

In [7]:
from pydtmc import MarkovChain
# from pydtmc import plot_graph
# from pydtmc import plot_eigenvalues
# from pydtmc import plot_walk

mc = MarkovChain(p, get_sorted_vocab(words))
print(mc)

# When chain is small, better plot
# import matplotlib.pyplot as plt
# %matplotlib inline
# plot_graph(mc)
# plot_eigenvalues(mc)


DISCRETE-TIME MARKOV CHAIN
 SIZE:           936
 RANK:           921
 CLASSES:        1
  > RECURRENT:   1
  > TRANSIENT:   0
 ERGODIC:        YES
  > APERIODIC:   YES
  > IRREDUCIBLE: YES
 ABSORBING:      NO
 REGULAR:        NO
 REVERSIBLE:     NO
 SYMMETRIC:      NO



## __Tweet Generator__

In [30]:
import random

def post_processing_tweet(tweet):
    emojis = [' !',' ....',' ;)', ' :(', ' Ö']
    end = random.sample(emojis, 1)[0]
    return tweet[0].upper() + tweet[1:] + end

# Start random walk of x given steps in markov chain starting from given word
tweet = ' '.join(mc.walk(15))
post_processing_tweet(tweet)

"It was sold out before even heard coming sucks, but can't find it congratulations, queen Ö"