# Exercise 3

### Necessary imports

In [1]:
import pandas as pd
import nltk
from nltk.util import ngrams
import random

### Dataset loading

In [2]:
# The dataset is read by hand instead of with pandas because the pandas
# reader skipped some lines.
#
# Each line is split on its first two commas only and the second field is
# kept as the tweet text, with the HTML entity '&amp;' decoded back to '&'.
# NOTE(review): text that itself contains a comma is cut at that comma, and
# CSV quoting is not interpreted -- confirm this is intended.
# NOTE(review): a header row, if the file has one, is kept as a sentence --
# confirm against the dataset.
tweet_texts = []
with open("./dataset/tweets.csv", 'r', encoding="utf8") as file:
    for line in file:
        fields = line.split(',', 2)
        if len(fields) < 2:
            # No second field (e.g. a blank line): there is no text to keep.
            continue
        tweet_texts.append(fields[1].replace('&amp;', '&'))

# Build each frame once from a list; appending rows one at a time with
# `df.loc[len(df)] = ...` reallocates the frame on every row.
df = pd.DataFrame({'sentences': tweet_texts}, dtype=object)

# One row per whitespace-separated token, in corpus order.
all_tokens = [token for text in tweet_texts for token in text.split()]
df_tokenized = pd.DataFrame({'word': all_tokens}, dtype=object)

# Number of occurrences of every token, indexed by the token itself.
words_count = df_tokenized.stack().value_counts()

### Function for creating the n-grams model

We also pad each sentence with start (`<s>`) and end (`</s>`) symbols, so that the model can better estimate the probabilities of the first and last words of a sentence.

In [3]:
def generate_grams(text, n):
    """Return the padded n-grams of a sentence as a list.

    The sentence is tokenized on whitespace. It is padded on the left with
    '<s>' and on the right with '</s>' before the n-grams are extracted, so
    the first and last words also appear with a sentence-boundary context.

    Args:
        text: The sentence to split into n-grams.
        n: The size of each n-gram (2 for bi-grams, 3 for tri-grams).

    Returns:
        A list with every n-gram produced by nltk's `ngrams` for the padded
        token sequence.
    """
    words = text.split()
    padded_grams = ngrams(
        words,
        n,
        pad_left=True,
        pad_right=True,
        left_pad_symbol='<s>',
        right_pad_symbol='</s>',
    )
    return list(padded_grams)

### Calculating the counts of the bi-gram and tri-gram models

We generate the bi-gram and tri-gram models, for each sentence in the dataset.

Then we obtain the list of all bi-grams and tri-grams.

After that, we calculate the counts of each bi-gram and tri-gram.

In [4]:
# Flat lists holding every padded bi-gram / tri-gram of the corpus, sentence
# by sentence in dataset order.
bigram = [
    gram
    for sentence in df['sentences']
    for gram in generate_grams(sentence, 2)
]
trigram = [
    gram
    for sentence in df['sentences']
    for gram in generate_grams(sentence, 3)
]

# Frequency tables: how many times each distinct n-gram occurs in the corpus.
bigram_count = nltk.FreqDist(bigram)
trigram_count = nltk.FreqDist(trigram)

### Calculating the probabilities of the bi-gram and tri-gram models

We calculate the probabilities of each bi-gram and tri-gram, using the counts of the previous step.

For a bi-gram, we normalize by dividing its count by the count of its first word.

For a tri-gram, we normalize by dividing its count by the count of the bi-gram formed by its first two words.

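For n-grams at the start of a sentence (those whose context consists only of the start token `<s>`), we normalize by dividing by the number of sentences, because every sentence contributes exactly one start context. N-grams that end with the end token `</s>` are normalized like any other n-gram, by the count of their context.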

In [5]:
# Conditional probability of the last token of every n-gram given its
# context (the tokens before it), stored as (gram, probability) pairs in the
# iteration order of the frequency tables.
#
# The loop variables are deliberately NOT called `bigram` / `trigram`: those
# names hold the n-gram lists built in the previous cell and would be
# overwritten by the loops.

# Every sentence is padded exactly once, so a start-of-sentence context
# occurs once per sentence.
sentence_count = len(df)

# Bi-grams: count(w1, w2) / count(w1).
bigram_prob = []
for gram, count in bigram_count.items():
    if gram[0] == '<s>':
        # The '<s>' padding is added by generate_grams only, so it is not
        # one of the tokens counted in words_count.
        context_count = sentence_count
    else:
        context_count = words_count[gram[0]]
    bigram_prob.append((gram, count / context_count))

# Tri-grams: count(w1, w2, w3) / count(w1, w2).
trigram_prob = []
for gram, count in trigram_count.items():
    if gram[0] == '<s>' and gram[1] == '<s>':
        # The ('<s>', '<s>') context is not a bi-gram of the bi-gram model
        # (which pads with a single '<s>'), so it is counted per sentence.
        context_count = sentence_count
    else:
        context_count = bigram_count[gram[:2]]
    trigram_prob.append((gram, count / context_count))

### I use the two models to generate 50 tweets

First of all, we select the initial bigram with a random choice weighted by the probabilities of each bigram that is a start bigram.

Then we select the next bigram with a weighted random choice among the bigrams whose first word is the last word of the previous bigram, using the bigram probabilities as weights. The last word of the selected bigram is appended to the sentence.

We repeat this process until we find a bigram that has the end token as last word.

In [6]:
# I use the two models to generate 50 tweets
N_TWEETS = 50
START_SYMBOL = '<s>'
END_SYMBOL = '</s>'


def _index_by_context(gram_probs):
    """Group (gram, probability) pairs by the gram's context.

    The context of an n-gram is the tuple of all its tokens except the last.
    Building this index once replaces a scan of the whole probability list
    for every generated word.

    Args:
        gram_probs: Iterable of (gram, probability) pairs.

    Returns:
        A dict mapping each context tuple to a pair of parallel lists
        (grams, probabilities), in the order the pairs were given.
    """
    transitions = {}
    for gram, prob in gram_probs:
        grams, weights = transitions.setdefault(gram[:-1], ([], []))
        grams.append(gram)
        weights.append(prob)
    return transitions


def _sample_tweet(transitions, order):
    """Sample one chain of n-grams, from sentence start to the end symbol.

    Args:
        transitions: Index produced by `_index_by_context`.
        order: Size of the n-grams (2 for the bigram model, 3 for trigram).

    Returns:
        The list of sampled n-grams; the last one ends with the end symbol.

    Raises:
        KeyError: If a context reached during generation has no continuation
            in `transitions`.
    """
    # A sentence starts from a context made only of start padding.
    context = (START_SYMBOL,) * (order - 1)
    chain = []
    while True:
        grams, weights = transitions[context]
        gram = random.choices(grams, weights)[0]
        chain.append(gram)
        if gram[-1] == END_SYMBOL:
            return chain
        # The next gram must begin with the tail of the one just chosen.
        context = gram[1:]


def _render_tweet(chain):
    """Turn a sampled chain into text.

    Every gram contributes its last token (the newly generated word); the
    end symbol is dropped, so padding never appears in the text, even for an
    empty sentence. Words are joined by single spaces without a trailing one.
    """
    return " ".join(gram[-1] for gram in chain if gram[-1] != END_SYMBOL)


# Bigram model
bigram_transitions = _index_by_context(bigram_prob)
bigram_tweets = [_sample_tweet(bigram_transitions, 2) for _ in range(N_TWEETS)]

# Print the bigram model tweets
print("-------- Bigram model tweets --------")
for tweet in bigram_tweets:
    print(_render_tweet(tweet))
print("-------- End of bigram model tweets --------")

# Trigram model
trigram_transitions = _index_by_context(trigram_prob)
trigram_tweets = [_sample_tweet(trigram_transitions, 3) for _ in range(N_TWEETS)]

# Print the trigram model tweets
print("-------- Trigram model tweets --------")
for tweet in trigram_tweets:
    print(_render_tweet(tweet))
print("-------- End of trigram model tweets --------")

# Kept for compatibility: this name held the trigram tweets at the end of the
# original cell.
final_tweets = trigram_tweets


-------- Bigram model tweets --------
....because they are w… 
The WWE thought it was sham." 
“Donald Trump: John McCain Is A total losers who I was forced to Houston - also in particular to extend my wildest thoughts! Thanks @JamersonHayes they know that--thanks. 
I went bankrupt (I didn't) say I do. Even the haters never have never attacked dopey Jon Stewart is how lucky they still succeed! 
"@Lumberportal: @realDonaldTrump @joooooojaah Shut up loser who sadly plays right into our rapidly rebuilding Military Vets (Choice!) & @FoxNews get it to The Don than that his own @DannyZuker !" He never give it will self-destruct just some losers I only get the haters losers! 
Hope everyone knew he is a loser! 
I don’t know I like to get rid of control their jobs http://t.co/W6d60Oiecc She's not choosing great nominee gas prices and losers) goes into our true loser. (Donald Trump)" 
"@rodmonium91: @realDonaldTrump The Corrupt News @CNN. A made up phony lawsuit against shooting massive doses int