# Exercise 3

### Necessary imports

In [6]:
import pandas as pd
import nltk
from nltk.util import ngrams
import random

### Dataset loading

In [7]:
# The file is parsed by hand instead of with pandas because, per the original
# author's note, pandas skipped some of its lines.
#
# The tweet text is taken to be the second comma-separated field of each line;
# the HTML entity '&amp;' is decoded back to '&'.
# NOTE(review): only the text between the first and second comma is kept, so a
# tweet that itself contains a comma is cut short — confirm this is acceptable.
# NOTE(review): if the file starts with a header row, that row's second column
# name is loaded as a tweet too — confirm against the dataset.
sentences = []
with open("./dataset/tweets.csv", 'r', encoding="utf8") as file:
    for line in file:
        fields = line.split(',', 2)
        if len(fields) < 2:
            # No second field (e.g. a blank line): nothing to extract, and
            # indexing fields[1] would raise IndexError.
            continue
        sentences.append(fields[1].replace('&amp;', '&'))

# Build each frame once from a list; appending with df.loc[len(df)] copies the
# frame on every row and makes loading quadratic in the number of rows.
df = pd.DataFrame({'sentences': sentences})

# One row per whitespace-separated token, in corpus order.
all_tokens = [token for text in sentences for token in text.split()]
df_tokenized = pd.DataFrame({'word': all_tokens})

# Series mapping each distinct token to its number of occurrences.
words_count = df_tokenized['word'].value_counts()

### Creating the bi-gram and tri-gram models

In [8]:
def generate_grams(text, n):
    """Return the padded n-grams of ``text`` as a list.

    ``text`` is split on whitespace and the resulting words are handed to
    ``ngrams`` with padding requested on both sides: '<s>' is the left pad
    symbol and '</s>' the right pad symbol.

    Parameters:
        text: the string to tokenize.
        n: the n-gram order passed to ``ngrams``.
    """
    padding = {
        'pad_left': True,
        'pad_right': True,
        'left_pad_symbol': '<s>',
        'right_pad_symbol': '</s>',
    }
    words = text.split()
    return [gram for gram in ngrams(words, n, **padding)]

### Calculating the counts of the bi-gram and tri-gram models

In [9]:
# Flat lists holding every bi-gram / tri-gram of the corpus, in tweet order.
bigram = []
trigram = []
for tweet_text in df['sentences']:
    bigram.extend(generate_grams(tweet_text, 2))
    trigram.extend(generate_grams(tweet_text, 3))

# Frequency tables: n-gram tuple -> number of occurrences in the corpus.
bigram_count = nltk.FreqDist(bigram)
trigram_count = nltk.FreqDist(trigram)

### Calculating the probabilities of the bi-gram and tri-gram models

In [10]:
# Turn the raw n-gram counts into conditional probabilities:
#     P(last token | preceding tokens) = count(n-gram) / count(context)
# Both results are lists of (n-gram tuple, probability) pairs.
bigram_prob = []
trigram_prob = []

# Sentence-start contexts ('<s>' for bi-grams, ('<s>', '<s>') for tri-grams)
# are not counted in words_count / bigram_count, so the number of tweets is
# used as their count instead.
sentence_total = len(df)

# The loop variables are deliberately NOT called `bigram` / `trigram`: those
# names hold the flat n-gram lists built in the counting cell, and reusing them
# here would silently overwrite those lists.
for gram, gram_count in bigram_count.items():
    context_total = sentence_total if gram[0] == '<s>' else words_count[gram[0]]
    bigram_prob.append((gram, gram_count / context_total))

for gram, gram_count in trigram_count.items():
    if gram[0] == '<s>' and gram[1] == '<s>':
        context_total = sentence_total
    else:
        # The context of a tri-gram is its leading bi-gram.
        context_total = bigram_count[gram[:2]]
    trigram_prob.append((gram, gram_count / context_total))

### Using the two models to generate 50 tweets each

In [11]:
# Generate tweets with the bigram and the trigram model, then print them.

# Number of tweets sampled from each model.
N_TWEETS = 50
# Set to an int to make the generated tweets reproducible. None keeps the
# generation non-deterministic (different tweets on every run).
GENERATION_SEED = None


def _build_transition_table(ngram_probs):
    """Index (n-gram, probability) pairs by context.

    The context of an n-gram is every token but its last one. The result maps
    each context tuple to a pair of parallel lists ``(candidates, weights)``,
    kept in the order the pairs appear in ``ngram_probs``. Building this once
    replaces a full scan of the probability list for every generated word.
    """
    table = {}
    for gram, prob in ngram_probs:
        candidates, weights = table.setdefault(gram[:-1], ([], []))
        candidates.append(gram)
        weights.append(prob)
    return table


def _generate_tweet(table, order, rng):
    """Sample one tweet as a list of n-gram tuples.

    Sampling starts from the all-'<s>' context of length ``order - 1`` and
    repeatedly draws a weighted n-gram for the current context until the drawn
    n-gram ends with '</s>'. The next context is the drawn n-gram without its
    first token.

    Raises:
        KeyError: if ``table`` has no entry for a context that is reached
            (for instance when the model is empty).
    """
    context = ('<s>',) * (order - 1)
    grams = []
    while True:
        candidates, weights = table[context]
        gram = rng.choices(candidates, weights)[0]
        grams.append(gram)
        if gram[-1] == '</s>':
            return grams
        context = gram[1:]


def _tweet_text(grams):
    """Render sampled n-grams as text.

    The last token of each n-gram is the word generated at that step, so the
    tweet is the sequence of last tokens without the '</s>' end marker. An
    empty tweet therefore renders as an empty string rather than leaking a
    '<s>' / '</s>' marker into the output. Every word is followed by a single
    space, so non-empty tweets end with a trailing space.
    """
    return "".join(gram[-1] + " " for gram in grams if gram[-1] != '</s>')


rng = random.Random(GENERATION_SEED)

# Bigram model
bigram_table = _build_transition_table(bigram_prob)
bigram_tweets = [_generate_tweet(bigram_table, 2, rng) for _ in range(N_TWEETS)]

# Print the bigram model tweets
print("-------- Bigram model tweets --------")
for tweet in bigram_tweets:
    print(_tweet_text(tweet))
print("-------- End of bigram model tweets --------")

# Trigram model
trigram_table = _build_transition_table(trigram_prob)
trigram_tweets = [_generate_tweet(trigram_table, 3, rng) for _ in range(N_TWEETS)]

# Print the trigram model tweets
print("-------- Trigram model tweets --------")
for tweet in trigram_tweets:
    print(_tweet_text(tweet))
print("-------- End of trigram model tweets --------")

# Kept for backward compatibility: this name holds the trigram tweets once the
# cell has finished.
final_tweets = trigram_tweets


-------- Bigram model tweets --------
What happened to you hate him on America. Get some DJT ties and much more. I feel sorry for PRESIDENT will fire @DannyZuker @realDonaldTrump how a person reacts to stop calling ISIS leaders "MASTERMINDS." Call them Mr. Trump. You really bad. John Weaver lost for them." LOSERS! 
“Nancy Pelosi cares more dangerous Lottery. 
Why do with dopey Jon Stewart is as one bit haha. Do Nothing Dems are win… 
Congratulations to get near their real name or get out for mentions! 
While Jon Stewart is a husband and losers! 
Mini Mike Bloomberg called me too stupid to admire genius! 
"@PureManhattan: @DannyZuker on. A.) Not famous enough. C) Hates losers. 
Lightweight @AGSchneiderman’s phony last us Barack Obama. Loser. When he was incredible - a Marine nor ever RT“@realDonaldTrump: I love whose parents are not successful like @secupp a person reacts to you in London not done in coal now will miss him for his own account/lawyer who went around for the haters and tr