In [1]:
import pickle
import random
import re
from collections import Counter

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch import nn
from tqdm import tqdm_notebook

In [2]:
# Reproducibility: seed every RNG imported by this notebook, not just torch.
# The preview cells call random.sample(...), which draws from the stdlib
# generator, so seeding torch alone leaves those outputs changing run to run.
SEED = 42
random.seed(SEED)     # stdlib `random` (used by the random.sample previews)
np.random.seed(SEED)  # NumPy's legacy global generator
# kept as the last expression so the cell still displays the torch.Generator
torch.manual_seed(SEED)

<torch._C.Generator at 0x18d92ad3c90>

# Load Dataset

In [5]:
# Read the training file into a list with one entry per line (readlines keeps
# the trailing newline on each entry).
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# default locale encoding, which can mis-decode or reject non-ASCII bytes.
with open("wikitext-2/wiki-train.txt", "r", encoding="utf-8") as f:
    text = f.readlines()

In [6]:
# number of raw lines read from the training file
len(text)

36718

In [8]:
# preview 10 raw lines picked at random (drawn from the `random` module's global generator)
random.sample(text, 10)

[' = = = Publication of Max and Moritz = = = \n',
 ' \n',
 ' \n',
 ' After the Early Neolithic , the long barrow fell into a state of ruined <unk> , perhaps experiencing deliberate deposition in the late medieval period , either by Christian <unk> or treasure hunters . Local folklore grew up around the site , associating it with the burial of a prince and the countless stones motif . The ruin attracted the interest of <unk> in the 19th century , while archaeological excavation took place in the early 20th . After limited reconstruction , in 1926 ownership was transferred to heritage charity The National Trust . It is open without charge to visitors all year around . \n',
 ' Changes from the standard Tu @-@ 2 were <unk> to speed production and they consisted of the following : \n',
 ' \n',
 ' The book has been reviewed in USA Today , Business Week , Energy Priorities , Sustainability Investment News and several other magazines , and has been translated into seven languages . Clean Tech 

# Preprocessing Data
## Text Cleaning

In [9]:
# text cleaning
# Matches every character that is NOT an ASCII letter, an apostrophe or a
# space; digits, punctuation and the trailing newline all fall in this set.
disallowed_chars = re.compile("[^a-zA-Z' ]")

# strip the disallowed characters from each raw line, then lowercase it
text_clean = [disallowed_chars.sub("", raw_line).lower() for raw_line in text]

In [13]:
# preview 10 cleaned lines picked at random (drawn from the `random` module's global generator)
random.sample(text_clean, 10)

[' ',
 " creator matthew weiner said  far away places  was inspired by  unk french films  with  lots of short stories in them   with all three short stories linked by a thematic  desire to go away   he further explained that  peggy has this moment where she tries to be don and fails and then goes on peggy 's version of don  sexually irresponsible  and drunk  and working   elisabeth moss said the unk peggy gives a stranger in the theatre is a  moment of forgetting  after the frustrating heinz pitch  ",
 ' in mozambique  workers assisted people in moving to higher grounds following flooding  members of the mozambique red cross helped distribute food and clothing to the affected residents  and planes helped drop off supplies to residents in isolated areas  ',
 " in the th century this image begun to unk  as a new wave in historiography begun to unk his life  and as the era of unk in poland put more value on builders  and less on warriors  further  at that time the polish historians begun 

## Find word counts

In [14]:
# get list of all the words: join the cleaned lines with spaces, then split on
# any run of whitespace so empty lines and double spaces produce no tokens
all_words = " ".join(text_clean).split()

# word -> number of occurrences in the corpus.
# Counter tallies in a single pass and keeps keys in first-seen order;
# dict(...) hands the following cells a plain dict, as before.
words_dict = dict(Counter(all_words))

In [15]:
# prepare a dataframe with one row per distinct word: (word, count)
word_count_pairs = list(words_dict.items())

# sort by count in increasing order, then renumber the rows 0..n-1 so the
# index reflects the sorted position instead of the original insertion order
words_df = (
    pd.DataFrame(word_count_pairs, columns=['word', 'count'])
    .sort_values(by='count')
    .reset_index(drop=True)
)

In [16]:
# vocabulary size: words_df holds one row per distinct word
len(words_df)

27306

In [20]:
# first rows of the count-sorted table, i.e. the least frequent words
words_df.head()

Unnamed: 0,word,count
0,gallinae,3
1,cabins,3
2,baroness,3
3,genoa,3
4,spotting,3


In [18]:
# last rows of the count-sorted table, i.e. the most frequent words
words_df.tail()

Unnamed: 0,word,count
27301,in,44982
27302,and,50735
27303,unk,54625
27304,of,57030
27305,the,130768


In [None]:
# user specified threshold value: a word is "rare" when its count is below this
rare_thresh = 4

# vocabulary rows whose count falls under the threshold (computed once, reused below)
rare_rows = words_df.loc[words_df['count'] < rare_thresh]

# share of the vocabulary that is rare, as a fraction in [0, 1] (not scaled to 100)
rare_words_count = len(rare_rows['word'])
total_words = len(words_df)  # number of distinct words, i.e. the vocabulary size
rare_dist = rare_words_count / total_words

# share of all word occurrences in the corpus that belong to rare words (fraction)
rare_cover = rare_rows['count'].sum() / words_df['count'].sum()