# This is quite important for practice in ML/DL model building, always start from a sample set from the large data set.

In [44]:
import os
import math
import copy
import time
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

import jieba
from nltk import word_tokenize
from collections import Counter
from torch.autograd import Variable

import seaborn as sns
import matplotlib.pyplot as plt

In [19]:
# NLTK resource https://www.nltk.org/data.html
# Download resources for part-of-speech tagging
nltk.download('averaged_perceptron_tagger')

# Download WordNet resources (for tasks like synonym extraction)
nltk.download('nltk_data/corpora/wordnet')

# Download Open Multilingual WordNet resource
nltk.download('nltk_data/corpora/omw-1.4')

# Download pre-trained model for sentence tokenization (especially for English)
nltk.download('punkt')


#################### Corpora ###############################
# Download the Brown Corpus
nltk.download('brown')

# Download a collection of English texts from Project Gutenberg
nltk.download('gutenberg')

# Download other corpora (replace names with desired ones)
nltk.download('shakespeare')
nltk.download('cmudict')
nltk.download('cess_cat')

############## Stop words ######################################
# Download stopwords for a specific language (replace 'english' with the code)
nltk.download('stopwords/english')

# Download stopwords for other languages (e.g., 'french', 'german')
nltk.download('stopwords/<language_name>')

############ Additional resource#################
# Download gazetteers (geographical name lists)
nltk.download('gazetteers')

# Download names (personal name lists)
nltk.download('names')

# Download data for Snowball stemmers
nltk.download('snowball_data')

# Download Wall Street Journal parsed corpus (for advanced tasks)
nltk.download('treebank')

# Download sample tweets from Twitter
nltk.download('twitter_samples')




[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/loveplay1983/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Error loading nltk_data/corpora/wordnet: Package
[nltk_data]     'nltk_data/corpora/wordnet' not found in index
[nltk_data] Error loading nltk_data/corpora/omw-1.4: Package
[nltk_data]     'nltk_data/corpora/omw-1.4' not found in index
[nltk_data] Downloading package punkt to
[nltk_data]     /home/loveplay1983/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     /home/loveplay1983/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package gutenberg to
[nltk_data]     /home/loveplay1983/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
[nltk_data] Downloading package shakespeare to
[nltk_data]     /home/loveplay1983/nltk_data...
[nltk_data]   Unzipping corpora/shakespeare.zip.
[nltk_data] Downloadin

True

In [5]:
#  Init parameters

UNK = 0 # unknow word-id
PAD = 1 # padding word-id
BATCH_SIZE = 64

DEBUG = True
# DEBUG = False # model building, GPU CUDA is preferred

if DEBUG:
    EPOCHS = 2
    LAYERS = 3
    H_NUM = 8
    D_MODEL = 128
    D_FF = 256
    DROPOUT = 0.1
    MAX_LENGTH = 60
    TRAIN_FILE = "./data/nmt/en-cn/train_mini.txt"
    DEV_FILE = "./data/nmt/en-cn/dev_mini.txt"
    SAVE_FILE = "./save/models/model.pt"

else:
    EPOCHS = 20
    LAYERS = 6
    H_NUM = 8
    D_MODEL = 256
    D_FF = 1024
    DROPUT = .1
    MAX_LENGTH = 60
    TRAIN_FILE = "./data/nmt/en-cn/train.txt"
    DEV_FILE = "./data/nmt/en-cn/dev.txt"
    SAVE_FILE = "./save/models/large_model.pt"

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Data Preprocessing 
1. Load the sentence and tokenize the sentence and add start/end marks(Begin of Sentence /End of Sentence vs BOS/ EOS).
2. Build dictionaries including ‘word-to-id’ and inverted dictionary ‘id-to-word’: English and Chinese, ‘word: index}, i.e, {‘english’: 1234}, {1234: ‘english’}.
3. Sort the dictionaries to reduce padding.
4. Split the dataset into patches for training and validation.

In [6]:
def seq_padding(X, padding=0):
    """
    Add padding to a batch of data
    """
    L = [len(x) for x in X]
    ML = max(L)
    return np.array([
        np.concatenate([
            x, [padding] * (ML - len(x))
        ]) if len(x) < ML else x for x in X
    ])

In [8]:
class PrepareData:
    def __init__(self, train_file, dev_file):
        # 1. Read the data and tokenize
        self.train_en, self.train_cn = self.load_data(train_file)
        self.dev_en, self.dev_cn = self.load_data(dev_file)

        # 2. build dictionary: En and CN
        self.en_word_dict, self.en_total_words, self.en_index_dict = self.build_dict(self.train_en)
        self.cn_word_dict, self.cn_total_words, self.cn_index_dict = self.build_dict(self.train_cn)

        # 3. word to id by dictionary
        self.train_en, self.train_cn = self.wordToID(self.train_en, self.train_cn, 
                                                     self.en_word_dict, self.cn_word_dict)
        self.dev_en, self.dev_cn = self.wordToID(self.dev_en, self.dev_cn, 
                                                 self.en_word_dict, self.cn_word_dict)

        # 4. batch, padding, and masking
        self.train_data = self.splitBatch(self.train_en, self.train_cn, BATCH_SIZE)
        self.dev_data = self.splitBatch(self.dev_en, self.dev_cn, BATCH_SIZE)

    # Utility functions
    def load_data(self, path):
        """
        read data, tokenize the seence and add start and end marks(bos, eos)
        for example:
        en = [
            ["BOS", "i", "love", "you", "EOS"],
            ["BOS", "me", "too", "EOS"],
            ...
        ]
        cn = [
            ["BOS", "我", "爱", "你", "EOS"],
            ["BOS", "我", "也", ,"是", "EOS"],
            ...
        ]
        """
        en = []
        cn = []
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip().split("\t")
                en.append(["BOS"] + word_tokenize(line[0].lower()) + ["EOS"])
                cn.append(["BOS"] + word_tokenize())
        

In [40]:
test = ["I wonder if I hurt Tom's feelings.	我不知道我是不是伤害了汤姆的感情。", "Every one of her songs was a hit.	她的每首歌都长期备受欢迎。"]

In [46]:
test_en = []
test_cn = []
for i in test:
    i = i.strip().split("\t")
    print(i[1])
    test_en.append(["BOS"] + word_tokenize(i[0].lower()) + ["EOS"])
    test_cn.append(["BOS"] + jieba.cut(i[1]) + ["EOS"])

print(test_en)
print(test_cn)


我不知道我是不是伤害了汤姆的感情。
她的每首歌都长期备受欢迎。


In [None]:
seg_list = jieba.cut(test)

In [48]:
# Define a Chinese sentence
text = "我不知道我是不是伤害了汤姆的感情。"

# Segment the sentence using Jieba
seg_list = jieba.cut(text)

# Print the tokenized words (separated by spaces)
print(" ".join(seg_list))

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.509 seconds.
Prefix dict has been built successfully.


我 不 知道 我 是不是 伤害 了 汤姆 的 感情 。
