In [None]:
import os
import re
import json
import string
import random
import numpy as np
import pandas as pd
from time import time
from nltk.tokenize import word_tokenize

# Importing The Data

In [None]:
# Only a handful of archives is needed; cap how many are loaded.
MAX_FILES = 5

# Walk the data directory and stop as soon as enough candidate files are known,
# instead of traversing the whole tree and discarding most of it afterwards.
csv_links = []
for dirname, _, filenames in os.walk('Data'):
    csv_links.extend(os.path.join(dirname, filename) for filename in filenames)
    if len(csv_links) >= MAX_FILES:
        break

csv_links = csv_links[:MAX_FILES]
random.Random().shuffle(csv_links)

# Read every archive first and concatenate once: calling pd.concat inside the
# loop copies all previously loaded rows on each iteration.
frames = []
for data in csv_links:
    try:
        frames.append(pd.read_csv(data, compression='gzip', index_col=0))
    except Exception as exc:
        # Best effort: skip an unreadable archive, but report which one and why.
        print(f"Error reading {data}: {exc}")

if not frames:
    raise FileNotFoundError("No readable gzip-compressed CSV files found under 'Data'")

df = pd.concat(frames, axis=0)
        

1. **EXTRACTING ONLY ENGLISH TWEETS**
2. **EXTRACTING ONLY TEXT AND RETWEETCOUNT COLUMN**
3. **GETTING ONLY TWEETS WHICH HAVE MORE THAN 1000 RETWEETS**

In [None]:
# Keep English tweets only; the language column carries no information afterwards.
is_english = df['language'] == 'en'
df_en = df[is_english].drop('language', axis=1)

# Select the widely retweeted tweets, most retweeted first.
is_popular = df_en['retweetcount'] > 1000
sorted_tweets = (
    df_en.loc[is_popular, ['text', 'retweetcount']]
    .sort_values(by='retweetcount', ascending=False)
)

# CLEANING TEXT
**REMOVING LINKS, URLS, HASHTAGS, NON-ASCII LETTERS, PUNCTUATION**

In [None]:
def clean_txt(txt, tokenizer=None):
    """Turn an iterable of raw tweets into one flat list of lowercase words.

    Each tweet is lowercased, stripped of URLs, hashtags and punctuation,
    tokenized, and reduced to the tokens for which ``str.isalpha`` is true.
    Note that ``str.isalpha`` is Unicode-aware, so alphabetic tokens written
    in non-ASCII scripts are kept.

    Parameters
    ----------
    txt : iterable
        Raw tweet texts. Entries that are not ``str`` (for example NaN from a
        pandas column) are skipped.
    tokenizer : callable, optional
        Function mapping a string to a list of tokens. Defaults to the
        ``word_tokenize`` imported at the top of the notebook.

    Returns
    -------
    list of str
        All surviving words, in tweet order.
    """
    if tokenizer is None:
        tokenizer = word_tokenize

    cleaned_txt = []
    for line in txt:
        if not isinstance(line, str):
            continue
        line = line.lower()
        line = re.sub(r"http\S+", "", line)
        line = re.sub("#[A-Za-z0-9_]+", "", line)
        # The hyphen is escaped so it is a literal: written as `=-\\` it formed
        # a character range and hyphens were never removed.
        line = re.sub(r"[\"\.,'!@#$%^&*(){}?/;`~:<>+=\-\\\[\]]", "", line)
        # Collapse runs of whitespace to one space; deleting them outright
        # would glue the neighbouring words together.
        line = re.sub(r"\s+", " ", line)
        cleaned_txt.extend(word for word in tokenizer(line) if word.isalpha())
    return cleaned_txt

# Flatten every selected tweet into one long list of cleaned word tokens.
tweet_texts = sorted_tweets["text"]
cleaned_tweets = clean_txt(tweet_texts)
print("number of words = ", len(cleaned_tweets))

In [None]:
# Preview only the beginning of the corpus: printing every word floods the
# cell output and bloats the saved notebook.
PREVIEW_WORDS = 200
print(" ".join(cleaned_tweets[:PREVIEW_WORDS]))

In [None]:
def make_markov_model(cleaned_tweets, n_gram=2):
    """Build a Markov transition table over blocks of ``n_gram`` words.

    A state is ``n_gram`` consecutive words joined by single spaces. For every
    position in the corpus, the state starting there is linked to the state
    made of the ``n_gram`` words that immediately follow it, so chaining
    states yields text in non-overlapping blocks.

    Parameters
    ----------
    cleaned_tweets : iterable of str
        Word tokens in corpus order.
    n_gram : int, default 2
        Number of words per state; must be at least 1.

    Returns
    -------
    dict
        ``model[state][next_state]`` is the probability of moving from
        ``state`` to ``next_state``; the probabilities of each state sum to 1
        up to floating-point rounding. The result is empty when the corpus
        holds fewer than ``2 * n_gram`` words.

    Raises
    ------
    ValueError
        If ``n_gram`` is smaller than 1.
    """
    if n_gram < 1:
        raise ValueError("n_gram must be at least 1")

    words = list(cleaned_tweets)
    transition_counts = {}
    # A complete transition needs 2 * n_gram words, so the last valid start
    # index is len(words) - 2 * n_gram. This keeps every state full-length
    # for any n_gram, not only for the default of 2.
    for i in range(len(words) - 2 * n_gram + 1):
        curr_state = " ".join(words[i:i + n_gram])
        next_state = " ".join(words[i + n_gram:i + 2 * n_gram])
        followers = transition_counts.setdefault(curr_state, {})
        followers[next_state] = followers.get(next_state, 0) + 1

    # Convert raw counts into transition probabilities.
    markov_model = {}
    for curr_state, followers in transition_counts.items():
        total = sum(followers.values())
        markov_model[curr_state] = {
            state: count / total for state, count in followers.items()
        }
    return markov_model

# TRAINING OUR MODEL

In [None]:
# Measure the wall-clock time spent building the transition table.
start_time = time()
markov_model = make_markov_model(cleaned_tweets)
elapsed_seconds = round(time() - start_time, 2)
print(f"Run time for training the generator : {elapsed_seconds} seconds")

# IMPORTING OUR TRAINED MODEL

In [None]:
# Load a previously trained model from disk; this replaces any model currently
# bound to `markov_model`. The context manager guarantees the file is closed.
with open("./model/data_small.json") as a_file:
    markov_model = json.load(a_file)

In [None]:
# Report the size of the model and show a small sample of its entries.
state_total = len(markov_model)
print("number of states = ", state_total)
# Debugging aid: list only the states containing a given phrase.
#print("First 10 states are", [state for state in list(markov_model.items()) if "russia is" in state[0] ])
sample_states = list(markov_model.items())[:10]
print("First 10 states are", sample_states)

# SAVING OUR TRAINED MODEL

In [None]:
# Persist the model. The target directory is created when missing, and the
# context manager closes the file even if serialisation fails.
os.makedirs("./model", exist_ok=True)
with open("./model/data_modified.json", "w") as a_file:
    json.dump(markov_model, a_file)

# GENERATING TWEET

In [None]:
def generate_tweet(markov_model, limit=100, start="russia is"):
    """Generate text by walking the Markov model from a starting state.

    Parameters
    ----------
    markov_model : dict
        Mapping ``state -> {next_state: probability}``.
    limit : int, default 100
        Maximum number of transitions to take after the starting state.
    start : str, default "russia is"
        State from which generation begins.

    Returns
    -------
    str
        The visited states joined by single spaces, followed by one trailing
        space. When ``start`` is not a state of the model a message is printed
        and only ``start`` (plus the trailing space) is returned. Generation
        stops early when it reaches a state with no outgoing transitions.
    """
    if start not in markov_model:
        # Previously this printed the message and then crashed on None[0].
        print("Starting Word Is Not Present")
        return start + " "

    curr_state = start
    pieces = [curr_state]
    n = 0
    while n < limit:
        transitions = markov_model.get(curr_state)
        if not transitions:
            # Dead end: the state has no recorded successor, so stop instead
            # of repeating the previous state until the limit is reached.
            break
        curr_state = random.choices(
            list(transitions.keys()), list(transitions.values())
        )[0]
        pieces.append(curr_state)
        n += 1
    return " ".join(pieces) + " "

In [None]:
# Print 19 numbered sample tweets, each with a random length of 10-20 transitions.
for tweet_no in range(1, 20):
    label = str(tweet_no)+". "
    tweet_limit = random.randint(10,20)
    tweet = generate_tweet(markov_model, start="usa is", limit=tweet_limit)
    print(label, tweet)