# Language Modeling
## Principal Assignment

Students:
- Victor Xesús Barreiro Domínguez
- Maximiliano Manuel Hormazábal Lagos

In [206]:
# Read data

import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from nltk.util import ngrams
from collections import Counter


df_train = pd.read_fwf("./HerMajestySpeechesDataset/train.txt",delimiter='\n',header=None,names=["text"])
df_dev = pd.read_fwf("./HerMajestySpeechesDataset/dev.txt",delimiter='\n',header=None,names=["text"])

In [208]:
df_train.iloc[3][0]

'The birth of a baby brings great happiness - but then the business of growing up begins .'

In [189]:
tokenizer.sequences_to_texts([[1713]])

['begins.']

In [184]:
tokenizer.texts_to_sequences([df_train.iloc[3][0]])

[[18, 428, 2, 6, 1401, 776, 53, 641, 54, 247, 1, 373, 2, 642, 216, 1713]]

# Frequencies of n-gram and next-word

We have a .txt file that contains one text per line. The following python code train a Keras Tokenizer with the train file and tokenize every single line in order to have numbers instead of words. With the tokenized dataset we created subsets of n+1 words chosing a n-gram size "n" to predict the "next_word" after this n-gram tupple. Finally the frequency dataset has the count of each n-gram tupple depending of the next_word.

In [220]:
def trainTokenizer(df, use_dot = False):
    if use_dot:
        df['text'] = list(map(lambda x: x.replace(".", " ."),df['text']))
        tokenizer = Tokenizer(filters='!"#$%&()*+,-/:;<=>?@[\\]^_`{|}~\t\n',lower=False)
    else:
        tokenizer = Tokenizer()
    tokenizer.fit_on_texts(df['text'])

    sequences = tokenizer.texts_to_sequences(df['text'])
    return (tokenizer,sequences)

def countNGrams(ngram_size,sequences):
    n_grams = []
    for sequence in sequences:
        for i in range(ngram_size, len(sequence)):
            n_gram = tuple(sequence[i-ngram_size:i])
            n_grams.append(n_gram)

    n_gram_counts = Counter(n_grams)

    data = []
    for n_gram, count in n_gram_counts.items():
        data.append(((n_gram[:-1]),n_gram[-1],count))

    df_freq = pd.DataFrame(data, columns=['context', 'next_word', 'freq'])
    df_freq['total_freq'] = df_freq.groupby('context')['freq'].transform('sum')
    df_freq['prob'] = df_freq['freq'] / df_freq['total_freq']
    return df_freq[['context','next_word','prob']]

In [221]:
tokenizer,train_sequences = trainTokenizer(df_train)
ngram_size = 3
df_freq = countNGrams(ngram_size,train_sequences)
df_freq

Unnamed: 0,context,next_word,prob
0,"(1, 176)",993,0.028571
1,"(176, 993)",4,1.000000
2,"(993, 4)",38,1.000000
3,"(4, 38)",580,0.055556
4,"(38, 580)",40,1.000000
...,...,...,...
37207,"(685, 49)",5605,0.250000
37208,"(49, 5605)",20,1.000000
37209,"(5605, 20)",1,1.000000
37210,"(20, 1)",526,0.013889


In [222]:
tokenizer,train_sequences = trainTokenizer(df_train,True)
ngram_size = 3
df_freq = countNGrams(ngram_size,train_sequences)
df_freq

Unnamed: 0,context,next_word,prob
0,"(19, 176)",1023,0.029412
1,"(176, 1023)",5,1.000000
2,"(1023, 5)",40,1.000000
3,"(5, 40)",663,0.055556
4,"(40, 663)",41,1.000000
...,...,...,...
39308,"(49, 6132)",23,1.000000
39309,"(6132, 23)",1,1.000000
39310,"(23, 1)",541,0.013514
39311,"(1, 541)",2,1.000000


In [217]:
df_freq.loc[df_freq['next_word']==341]

Unnamed: 0,context,next_word,prob
2768,"(30, 1)",341,0.128571
12269,"(5, 1)",341,0.004505
14530,"(123, 30)",341,1.0
22959,"(6, 1)",341,0.011194
27738,"(6, 13)",341,0.023256
31139,"(1, 108)",341,0.153846
37445,"(12, 215)",341,0.055556
37701,"(1383, 129)",341,1.0


In [166]:
tokenizer.texts_to_sequences(['.'])

[[341]]

In [170]:
tokenizer.word_index['.']

341