# Colab

In [1]:
# from google.colab import drive
# drive.mount('/content/drive', force_remount=True)

In [2]:
# %cd /content/drive/MyDrive/uni_bonn/nlp/final_project

# Install modules

In [3]:
!pip install datasets
!pip install 'transformers[torch]'
!pip install fasttext
!pip install PyDictionary
!pip install validators



# Import modules

In [4]:
from datasets import load_dataset
import spacy
import nltk
import string
import re
from PyDictionary import PyDictionary
from collections import Counter

english_vocab_2 = PyDictionary()
nltk.download('words')
english_vocab = set(w.lower() for w in nltk.corpus.words.words())

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package words to /Users/sg/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [5]:
import time

In [6]:
import pickle

In [7]:
nlp = spacy.load("en_core_web_sm")

# Data Exploration

In [8]:
def add_to_dict(x, d):
  if x not in d:
    d[x] = 0
  d[x] += 1

In [9]:
def produce_stat_of_file(filepath):
  current_corpus_freq = dict()
  oov_vocab = set()
  num_of_token = 0
  num_of_line = 0
  num_of_oov_token = 0

  with open(filepath, 'r') as f:
    line = None
    while True:
      line = f.readline().rstrip()
      if not line:
        break
      num_of_line += 1

      doc = nlp(line)
      for token in doc:
        num_of_token += 1
        current_lemma = token.lemma_
        current_text = token.text.lower()

        add_to_dict(current_text, current_corpus_freq)

        if current_lemma.lower() not in english_vocab:
          oov_vocab.add(current_text)
          num_of_oov_token += 1

  print(filepath)
  print("Number of lines:", num_of_line)
  print("Number of tokens:", num_of_token)
  print("Average tokens per line", num_of_token / num_of_line)
  print("Number of out-of-vocabulary token", num_of_oov_token)
  print("\n")

  stats = dict(corpus_freq=current_corpus_freq, oov_vocab=oov_vocab, num_of_token=num_of_token, num_of_line=num_of_line, num_of_oov_token=num_of_oov_token)

  return stats

In [10]:
def produce_total_stats(list_of_stats_dict):
  current_corpus_freq = dict()
  oov_vocab = set()
  num_of_token = 0
  num_of_line = 0
  num_of_oov_token = 0

  for stats_dict in list_of_stats_dict:
    current_corpus_freq = dict(Counter(current_corpus_freq) + Counter(stats_dict['corpus_freq']))
    oov_vocab = oov_vocab.union(stats_dict['oov_vocab'])
    num_of_token += stats_dict['num_of_token']
    num_of_line += stats_dict['num_of_line']
    num_of_oov_token += stats_dict['num_of_oov_token']

  print("Number of lines:", num_of_line)
  print("Number of tokens:", num_of_token)
  print("Average tokens per line", num_of_token / num_of_line)
  print("Number of out-of-vocabulary token", num_of_oov_token)
  print("\n")

  stats = dict(corpus_freq=current_corpus_freq, oov_vocab=oov_vocab, num_of_token=num_of_token, num_of_line=num_of_line, num_of_oov_token=num_of_oov_token)
  return stats

## Formal domain (target)

In [11]:
fr_formal_train = produce_stat_of_file("../GYAFC_Corpus/Family_Relationships/train/formal")
fr_formal_tune_ref0 = produce_stat_of_file("../GYAFC_Corpus/Family_Relationships/tune/formal.ref0")
fr_formal_tune_ref1 = produce_stat_of_file("../GYAFC_Corpus/Family_Relationships/tune/formal.ref1")
fr_formal_tune_ref2 = produce_stat_of_file("../GYAFC_Corpus/Family_Relationships/tune/formal.ref2")
fr_formal_tune_ref3 = produce_stat_of_file("../GYAFC_Corpus/Family_Relationships/tune/formal.ref3")
fr_formal_test_ref0 = produce_stat_of_file("../GYAFC_Corpus/Family_Relationships/test/formal.ref0")
fr_formal_test_ref1 = produce_stat_of_file("../GYAFC_Corpus/Family_Relationships/test/formal.ref1")
fr_formal_test_ref2 = produce_stat_of_file("../GYAFC_Corpus/Family_Relationships/test/formal.ref2")
fr_formal_test_ref3 = produce_stat_of_file("../GYAFC_Corpus/Family_Relationships/test/formal.ref3")

../GYAFC_Corpus/Family_Relationships/train/formal
Number of lines: 51967
Number of tokens: 681466
Average tokens per line 13.113437373717936
Number of out-of-vocabulary token 94472


../GYAFC_Corpus/Family_Relationships/tune/formal.ref0
Number of lines: 2788
Number of tokens: 34369
Average tokens per line 12.327474892395983
Number of out-of-vocabulary token 4861


../GYAFC_Corpus/Family_Relationships/tune/formal.ref1
Number of lines: 2788
Number of tokens: 34050
Average tokens per line 12.213055954088953
Number of out-of-vocabulary token 4777


../GYAFC_Corpus/Family_Relationships/tune/formal.ref2
Number of lines: 2788
Number of tokens: 34288
Average tokens per line 12.29842180774749
Number of out-of-vocabulary token 4855


../GYAFC_Corpus/Family_Relationships/tune/formal.ref3
Number of lines: 2788
Number of tokens: 34075
Average tokens per line 12.222022955523673
Number of out-of-vocabulary token 4718


../GYAFC_Corpus/Family_Relationships/test/formal.ref0
Number of lines: 1332
Number

In [12]:
em_formal_train = produce_stat_of_file("../GYAFC_Corpus/Entertainment_Music/train/formal")
em_formal_tune_ref0 = produce_stat_of_file("../GYAFC_Corpus/Entertainment_Music/tune/formal.ref0")
em_formal_tune_ref1 = produce_stat_of_file("../GYAFC_Corpus/Entertainment_Music/tune/formal.ref1")
em_formal_tune_ref2 = produce_stat_of_file("../GYAFC_Corpus/Entertainment_Music/tune/formal.ref2")
em_formal_tune_ref3 = produce_stat_of_file("../GYAFC_Corpus/Entertainment_Music/tune/formal.ref3")
em_formal_test_ref0 = produce_stat_of_file("../GYAFC_Corpus/Entertainment_Music/test/formal.ref0")
em_formal_test_ref1 = produce_stat_of_file("../GYAFC_Corpus/Entertainment_Music/test/formal.ref1")
em_formal_test_ref2 = produce_stat_of_file("../GYAFC_Corpus/Entertainment_Music/test/formal.ref2")
em_formal_test_ref3 = produce_stat_of_file("../GYAFC_Corpus/Entertainment_Music/test/formal.ref3")

../GYAFC_Corpus/Entertainment_Music/train/formal
Number of lines: 52595
Number of tokens: 664090
Average tokens per line 12.626485407358114
Number of out-of-vocabulary token 125525


../GYAFC_Corpus/Entertainment_Music/tune/formal.ref0
Number of lines: 2877
Number of tokens: 36262
Average tokens per line 12.604101494612443
Number of out-of-vocabulary token 6882


../GYAFC_Corpus/Entertainment_Music/tune/formal.ref1
Number of lines: 2877
Number of tokens: 36139
Average tokens per line 12.561348627042058
Number of out-of-vocabulary token 6861


../GYAFC_Corpus/Entertainment_Music/tune/formal.ref2
Number of lines: 2877
Number of tokens: 36129
Average tokens per line 12.557872784150156
Number of out-of-vocabulary token 6814


../GYAFC_Corpus/Entertainment_Music/tune/formal.ref3
Number of lines: 2877
Number of tokens: 36317
Average tokens per line 12.6232186305179
Number of out-of-vocabulary token 6841


../GYAFC_Corpus/Entertainment_Music/test/formal.ref0
Number of lines: 1416
Number of to

In [13]:
fr_formal_stats = produce_total_stats([
    fr_formal_train,
    fr_formal_tune_ref0,
    fr_formal_tune_ref1,
    fr_formal_tune_ref2,
    fr_formal_tune_ref3,
    fr_formal_test_ref0,
    fr_formal_test_ref1,
    fr_formal_test_ref2,
    fr_formal_test_ref3,
])

Number of lines: 68447
Number of tokens: 887437
Average tokens per line 12.965316230075825
Number of out-of-vocabulary token 123636




In [14]:
em_formal_stats = produce_total_stats([
    em_formal_train,
    em_formal_tune_ref0,
    em_formal_tune_ref1,
    em_formal_tune_ref2,
    em_formal_tune_ref3,
    em_formal_test_ref0,
    em_formal_test_ref1,
    em_formal_test_ref2,
    em_formal_test_ref3,
])

Number of lines: 69767
Number of tokens: 881052
Average tokens per line 12.62849198044921
Number of out-of-vocabulary token 166438




In [15]:
formal_stats = produce_total_stats([
    em_formal_stats,
    fr_formal_stats
])

Number of lines: 138214
Number of tokens: 1768489
Average tokens per line 12.795295700869666
Number of out-of-vocabulary token 290074




In [16]:
stats = dict(
    formal_stats=formal_stats,
    em_formal_stats=em_formal_stats,
    fr_formal_stats=fr_formal_stats,
    fr_formal_train=fr_formal_train,
    fr_formal_tune_ref0=fr_formal_tune_ref0,
    fr_formal_tune_ref1=fr_formal_tune_ref1,
    fr_formal_tune_ref2=fr_formal_tune_ref2,
    fr_formal_tune_ref3=fr_formal_tune_ref3,
    fr_formal_test_ref0=fr_formal_test_ref0,
    fr_formal_test_ref1=fr_formal_test_ref1,
    fr_formal_test_ref2=fr_formal_test_ref2,
    fr_formal_test_ref3=fr_formal_test_ref3,
    em_formal_train=em_formal_train,
    em_formal_tune_ref0=em_formal_tune_ref0,
    em_formal_tune_ref1=em_formal_tune_ref1,
    em_formal_tune_ref2=em_formal_tune_ref2,
    em_formal_tune_ref3=em_formal_tune_ref3,
    em_formal_test_ref0=em_formal_test_ref0,
    em_formal_test_ref1=em_formal_test_ref1,
    em_formal_test_ref2=em_formal_test_ref2,
    em_formal_test_ref3=em_formal_test_ref3,
)

In [17]:
with open("./formal_stats.pkl", 'wb') as file:
    pickle.dump(stats, file)

## Informal domain (source)

In [18]:
fr_informal_train = produce_stat_of_file("../GYAFC_Corpus/Family_Relationships/train/informal")
fr_informal_tune = produce_stat_of_file("../GYAFC_Corpus/Family_Relationships/tune/informal")
fr_informal_test = produce_stat_of_file("../GYAFC_Corpus/Family_Relationships/test/informal")

../GYAFC_Corpus/Family_Relationships/train/informal
Number of lines: 51967
Number of tokens: 656066
Average tokens per line 12.624665653202994
Number of out-of-vocabulary token 106864


../GYAFC_Corpus/Family_Relationships/tune/informal
Number of lines: 2788
Number of tokens: 34191
Average tokens per line 12.263629842180775
Number of out-of-vocabulary token 5511


../GYAFC_Corpus/Family_Relationships/test/informal
Number of lines: 1332
Number of tokens: 16951
Average tokens per line 12.725975975975976
Number of out-of-vocabulary token 2727




In [19]:
em_informal_train = produce_stat_of_file("../GYAFC_Corpus/Entertainment_Music/train/informal")
em_informal_tune = produce_stat_of_file("../GYAFC_Corpus/Entertainment_Music/tune/informal")
em_informal_test = produce_stat_of_file("../GYAFC_Corpus/Entertainment_Music/test/informal")

../GYAFC_Corpus/Entertainment_Music/train/informal
Number of lines: 52595
Number of tokens: 638538
Average tokens per line 12.14065975853218
Number of out-of-vocabulary token 145564


../GYAFC_Corpus/Entertainment_Music/tune/informal
Number of lines: 2877
Number of tokens: 34862
Average tokens per line 12.117483489746263
Number of out-of-vocabulary token 7294


../GYAFC_Corpus/Entertainment_Music/test/informal
Number of lines: 1416
Number of tokens: 17447
Average tokens per line 12.32132768361582
Number of out-of-vocabulary token 3675




In [20]:
fr_informal_stats = produce_total_stats([
    fr_informal_train,
    fr_informal_tune,
    fr_informal_test,
])

Number of lines: 56087
Number of tokens: 707208
Average tokens per line 12.609125109205342
Number of out-of-vocabulary token 115102




In [21]:
em_informal_stats = produce_total_stats([
    em_informal_train,
    em_informal_tune,
    em_informal_test,
])

Number of lines: 56888
Number of tokens: 690847
Average tokens per line 12.143984671635494
Number of out-of-vocabulary token 156533




In [22]:
informal_stats = produce_total_stats([
    em_informal_stats,
    fr_informal_stats
])

Number of lines: 112975
Number of tokens: 1398055
Average tokens per line 12.37490595264439
Number of out-of-vocabulary token 271635




In [23]:
stats = dict(
    informal_stats=informal_stats,
    em_informal_stats=em_informal_stats,
    fr_informal_stats=fr_informal_stats,
    fr_informal_train=fr_informal_train,
    fr_informal_tune=fr_informal_tune,
    fr_informal_test=fr_informal_test,
    em_informal_train=em_informal_train,
    em_informal_tune=em_informal_tune,
    em_informal_test=em_informal_test,
)

In [24]:
with open("./informal_stats.pkl", 'wb') as file:
    pickle.dump(stats, file)

In [25]:
with open("./informal_stats.pkl", "rb") as file:
    test = pickle.load(file)

In [26]:
print(test['informal_stats'])



# Play with dataset

In [27]:
cnn_dailymail = load_dataset('cnn_dailymail', '3.0.0')

In [28]:
wmt14 = load_dataset('wmt14', 'de-en')

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [29]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [30]:
text = wmt14['train'][20]['translation']['en']

In [31]:
text = r"Did you meet my parents?"

In [32]:
print(text)

Did you meet my parents?


In [33]:
text = "he spends 40$ on food"

In [34]:
doc = nlp(text)
for token in doc:
    print(token.lemma_)

he
spend
40
$
on
food


In [35]:
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_pretrained("bert-base-cased")

In [36]:
output = tokenizer.encode("Yoooooooo! What's up?")
print(output.tokens)

['[CLS]', 'Yo', '##oo', '##oo', '##oo', '##o', '!', 'What', "'", 's', 'up', '?', '[SEP]']


In [37]:
!pip install nltk



In [38]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

In [40]:
lemmatizer.lemmatize(r"gooddddddd")

LookupError: 
**********************************************************************
  Resource [93mwordnet[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('wordnet')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/wordnet[0m

  Searched in:
    - '/Users/sg/nltk_data'
    - '/Users/sg/anaconda3/envs/myenv/nltk_data'
    - '/Users/sg/anaconda3/envs/myenv/share/nltk_data'
    - '/Users/sg/anaconda3/envs/myenv/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
lemmatizer = nlp.get_pipe("lemmatizer")

In [None]:
doc = nlp("DONT FORGET TO CHOOSE AND VOTE FOR BEST ANSWER!!!")
print([token.lemma_ for token in doc])

In [None]:
nltk.download('words')

In [None]:
import nltk
english_vocab = set(w.lower() for w in nltk.corpus.words.words())

In [None]:
"fond" in english_vocab

# Word embedding

In [None]:
import fasttext.util
fasttext.util.download_model('en', if_exists='ignore')  # English
ft = fasttext.load_model('cc.en.300.bin')

In [None]:
# ft.get_word_vector("n't")
ft.get_nearest_neighbors(r"hooot", k=100)

In [None]:
import numpy as np

In [None]:
v1 = ft.get_word_vector(r"hot")
v2 = ft.get_word_vector(r"hoooot")

print(np.linalg.norm(v1 - v2, ord=2))

In [None]:
import gc

In [None]:
gc.collect()

In [None]:
del ft

In [None]:
!gdown https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?resourcekey=0-wjGZdNAUop6WykTtMip30g

In [None]:
!pip install gensim

In [None]:
import gensim.downloader
# Show all available models in gensim-data
print(list(gensim.downloader.info()['models'].keys()))

In [None]:
glove_vectors = gensim.downloader.load('word2vec-google-news-300')

In [None]:
from gensim.models import KeyedVectors

word2vec_path = '/content/drive/MyDrive/uni_bonn/nlp/final_project/GoogleNews-vectors-negative300.bin.gz'
w2v_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [None]:
#w2v_model.get_vector(r"n't")
w2v_model.most_similar(r"obama", topn=50)

In [None]:
a = ""
print(type(a))