In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**N-grams Corpus Preprocessing**

In [2]:
import nltk
import re
from nltk.tokenize import word_tokenize

In [3]:
# Sample text with mixed case, stray symbols and an emoticon; lowercase it up front.
corpus = "Learning% makes 'me' happy. I am happy be-cause I am learning! :)".lower()
corpus

"learning% makes 'me' happy. i am happy be-cause i am learning! :)"

In [4]:
# Remove every run of characters other than letters, digits, '.', '?', '!' and spaces.
non_text_pattern = re.compile(r"[^a-zA-Z0-9.?! ]+")
corpus = non_text_pattern.sub("", corpus)
corpus

'learning makes me happy. i am happy because i am learning! '

In [5]:
# Raw timestamp; the day of month is space-padded, hence the double space.
ip_date = "Sat May  9 07:33:35 CEST 2020"
# Splitting on a single space keeps an empty string where the padding was.
date_parts = ip_date.split(" ")
# Take the time from a whitespace-collapsing split() so its position does not
# depend on the padding (with a two-digit day, date_parts[4] is the timezone).
time_parts = ip_date.split()[3].split(":")
print(date_parts)
time_parts


['Sat', 'May', '', '9', '07:33:35', 'CEST', '2020']


['07', '33', '35']

In [6]:
# Split an already-cleaned sentence into tokens with NLTK; the output below
# shows the final '.' coming back as its own token.
sentence = 'i am happy because i am learning.'
tokenized = nltk.word_tokenize(sentence)
tokenized

['i', 'am', 'happy', 'because', 'i', 'am', 'learning', '.']

In [7]:
# Character length of each token, in token order.
word_lens = list(map(len, tokenized))
word_lens

[1, 2, 5, 7, 1, 2, 8, 1]

In [8]:
def trigram(tokenized):
    """Return every window of three consecutive items from ``tokenized``.

    Each window is a slice of the input, so a list input yields a list of
    three-element lists. Inputs with fewer than three items yield ``[]``.
    """
    windows = []
    window_count = len(tokenized) - 2
    for start in range(window_count):
        windows.append(tokenized[start:start + 3])
    return windows
# Build the trigrams with the hand-written helper. Note: the name `trigrams`
# is rebound to the nltk.util function by the import in the next cell.
trigrams = trigram(tokenized)
trigrams

[['i', 'am', 'happy'],
 ['am', 'happy', 'because'],
 ['happy', 'because', 'i'],
 ['because', 'i', 'am'],
 ['i', 'am', 'learning'],
 ['am', 'learning', '.']]

In [9]:
from nltk.util import trigrams
# Materialise the result of nltk's trigrams() with list(); as the output below
# shows, each trigram is a tuple rather than the lists the manual helper built.
trigramss = list(trigrams(tokenized))
trigramss

[('i', 'am', 'happy'),
 ('am', 'happy', 'because'),
 ('happy', 'because', 'i'),
 ('because', 'i', 'am'),
 ('i', 'am', 'learning'),
 ('am', 'learning', '.')]

In [10]:
# The prefix of a four-gram is everything except its last word. It is stored
# under its own name so the `trigrams` function imported from nltk.util is not
# overwritten by a plain list.
fourgram = ['i', 'am', 'happy','because']
trigram_prefix = fourgram[:-1]
trigram_prefix

['i', 'am', 'happy']

In [11]:
# Order of the n-gram model: n-1 start markers are prepended, one end marker appended.
n=3
# Drop markers left by a previous run first, so re-executing this cell does not
# stack extra padding onto `tokenized`.
unpadded = [token for token in tokenized if token not in ("<s>", "</s>")]
tokenized = ["<s>"]*(n-1) + unpadded + ["</s>"]
tokenized

['<s>',
 '<s>',
 'i',
 'am',
 'happy',
 'because',
 'i',
 'am',
 'learning',
 '.',
 '</s>']

**Building the language model**

In [12]:
# Extend a three-word prefix with the next word to form a four-gram tuple.
prefix = ('i', 'am', 'happy')
word = 'because'
n_gram = (*prefix, word)
n_gram

('i', 'am', 'happy', 'because')

In [13]:
from collections import defaultdict
from nltk.util import trigrams,bigrams
import pandas as pd
def singpass_trigram_countmat(corpus):
    """Build a trigram count matrix from a tokenized corpus in one pass.

    Rows are the bigram prefixes ``(w1, w2)`` that are followed by at least
    one word; columns are the distinct words found from position 2 onwards.
    Both are ordered by first appearance. Cell ``[(w1, w2), w3]`` holds the
    number of times the trigram ``(w1, w2, w3)`` occurs in ``corpus``.

    Note: a later cell of this notebook defines another function with the
    same name that returns a 3-tuple; whichever definition ran last is the
    one in effect.

    Args:
        corpus: Sequence of hashable tokens (e.g. a list of words).

    Returns:
        pandas.DataFrame of float counts with one row per bigram prefix. The
        frame is empty when the corpus has fewer than three tokens.
    """
    tokens = list(corpus)
    counts = defaultdict(float)
    prefixes = {}  # dict used as an insertion-ordered set of bigram prefixes

    # Sliding a three-token window over the corpus visits each trigram
    # occurrence exactly once, so the tally is the true trigram frequency
    # (not the frequency of the last word on its own).
    for w1, w2, w3 in zip(tokens, tokens[1:], tokens[2:]):
        counts[((w1, w2), w3)] += 1
        prefixes[(w1, w2)] = None

    # Candidate next words: every distinct token from position 2 onwards.
    following_words = list(dict.fromkeys(tokens[2:]))

    # Tuple keys become columns first; transposing turns the prefixes into rows.
    count_matrix = pd.DataFrame(
        {
            prefix: {w3: counts.get((prefix, w3), 0.0) for w3 in following_words}
            for prefix in prefixes
        }
    ).T
    return count_matrix

In [14]:
from collections import defaultdict
from nltk.util import trigrams,bigrams
import pandas as pd

def singpass_trigram_countmat(corpus):
    """Count trigrams in a tokenized corpus with a single pass.

    Args:
        corpus: Sequence of hashable tokens (e.g. a list of words).

    Returns:
        A 3-tuple ``(bigrams, vocabulary, count_mat)``:
        * ``bigrams`` - list of distinct two-token prefixes, in order of
          first appearance;
        * ``vocabulary`` - list of distinct trigram-final tokens, in order of
          first appearance;
        * ``count_mat`` - pandas.DataFrame of float counts indexed by
          ``bigrams`` with ``vocabulary`` as columns.
        For a corpus shorter than three tokens both lists are empty and the
        frame has shape ``(0, 0)``.
    """
    # Dicts map each key to its row/column position: they keep insertion
    # order and give constant-time lookup, unlike list membership/.index().
    prefix_rows = {}
    word_cols = {}
    trigram_counts = defaultdict(int)

    for i in range(len(corpus)-3+1):
        window = tuple(corpus[i:i+3])
        prefix, last_word = window[:2], window[-1]

        prefix_rows.setdefault(prefix, len(prefix_rows))
        word_cols.setdefault(last_word, len(word_cols))
        trigram_counts[prefix, last_word] += 1

    count_mat = np.zeros((len(prefix_rows), len(word_cols)))
    for (prefix, last_word), trigram_count in trigram_counts.items():
        count_mat[prefix_rows[prefix], word_cols[last_word]] = trigram_count

    # Local names avoid shadowing the `bigrams` function imported from nltk.util.
    bigram_list = list(prefix_rows)
    vocabulary = list(word_cols)
    count_mat = pd.DataFrame(count_mat, index=bigram_list, columns=vocabulary)
    return bigram_list, vocabulary, count_mat

In [15]:

# Tokenized toy corpus. This rebinds `corpus`, which held a string in earlier cells.
corpus = ['i', 'am', 'happy', 'because', 'i', 'am', 'learning', '.']
# NOTE: unpacking into `bigrams` overwrites the nltk.util `bigrams` function
# imported in an earlier cell with a plain list.
bigrams,vocabulary, count_mat = singpass_trigram_countmat(corpus)
count_mat

Unnamed: 0,happy,because,i,am,learning,.
"(i, am)",1.0,0.0,0.0,0.0,1.0,0.0
"(am, happy)",0.0,1.0,0.0,0.0,0.0,0.0
"(happy, because)",0.0,0.0,1.0,0.0,0.0,0.0
"(because, i)",0.0,0.0,0.0,1.0,0.0,0.0
"(am, learning)",0.0,0.0,0.0,0.0,0.0,1.0


In [16]:
# Total count per row (one row per bigram prefix).
row_sums = count_mat.sum(axis=1)
# Divide each row by its own total so the counts in a row become fractions of that row.
prob_mat = count_mat.div(row_sums, axis=0)
prob_mat

Unnamed: 0,happy,because,i,am,learning,.
"(i, am)",0.5,0.0,0.0,0.0,0.5,0.0
"(am, happy)",0.0,1.0,0.0,0.0,0.0,0.0
"(happy, because)",0.0,0.0,1.0,0.0,0.0,0.0
"(because, i)",0.0,0.0,0.0,1.0,0.0,0.0
"(am, learning)",0.0,0.0,0.0,0.0,0.0,1.0


In [17]:
# NOTE: rebinding `trigram` here replaces the trigram() helper function defined
# in an earlier cell with a tuple.
trigram = ('i', 'am', 'happy')
bigram = trigram[:-1]
word = trigram[-1]
# Select the column for the last word first, then the row for the bigram prefix.
tri_prob = prob_mat[word][bigram]
print("Trigram Probability: ",tri_prob)

Trigram Probability:  0.5


In [18]:
# Print each vocabulary entry that begins with the given prefix, in list order.
vocabulary = ['i', 'am', 'happy', 'because', 'learning', '.', 'have', 'you', 'seen','it', '?']
starts_with = 'ha'
for w in filter(lambda entry: entry.startswith(starts_with), vocabulary):
    print(w)

happy
have


In [19]:
import random

def train_val_test_split(data,train_per,val_per,seed=87):
    """Shuffle ``data`` reproducibly and split it into train/validation/test.

    The input is copied before shuffling, so the caller's sequence keeps its
    order, and a private ``random.Random(seed)`` is used so the module-level
    random state is left untouched. With the default seed the permutation is
    the same as ``random.seed(87); random.shuffle(data)``.

    Args:
        data: Iterable of items to split.
        train_per: Percentage (0-100) of items for the training set.
        val_per: Percentage (0-100) of items for the validation set.
        seed: Seed for the shuffle; defaults to 87.

    Returns:
        Tuple ``(train_data, val_data, test_data)`` of lists. The test set
        receives whatever remains after the other two are taken.

    Raises:
        ValueError: If a percentage is negative or the two exceed 100 in total.
    """
    if train_per < 0 or val_per < 0 or train_per + val_per > 100:
        raise ValueError(
            "train_per and val_per must be non-negative and sum to at most 100"
        )

    shuffled = list(data)
    random.Random(seed).shuffle(shuffled)

    total = len(shuffled)
    train_len = int((total*train_per)/100)
    val_len = int((total*val_per)/100)

    train_data = shuffled[:train_len]
    val_data = shuffled[train_len:train_len+val_len]
    test_data = shuffled[train_len+val_len:]

    return train_data,val_data,test_data

In [20]:
# Toy dataset of 100 integers, split 80/10 with the remainder going to the test set.
data = list(range(100))
train_data, val_data, test_data = train_val_test_split(data, 80, 10)
# train_data