<a href="https://colab.research.google.com/github/mjag7682/CS9-1-NLP-for-Twitter-Data-for-predicting-stocks/blob/FinALBERT/FinALBERT_Pretraining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setting GPU

In [None]:
import tensorflow as tf

# Ask TensorFlow for the name of its GPU device.
device_name = tf.test.gpu_device_name()

# Anything other than the first GPU's canonical name is treated as "no GPU": stop here.
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [None]:
import torch

# Choose the device PyTorch will use: the CPU when CUDA is unavailable, otherwise the GPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")

    # Report how many GPUs are visible and the name of the first one.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


In [None]:
# Mount Google Drive at /content/drive so later cells can read and write files under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Load Reuters Data

In [None]:
from bs4 import BeautifulSoup
import os
import re
from nltk.util import ngrams

import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')

In [None]:
# Directory on the mounted Drive that holds the Reuters .sgm files.
reuters_dir = "/content/drive/My Drive/ALBERT/Reuters_Dataset"

# Text of every <body> element found in the .sgm files, in file order.
documents = []

# os.listdir() returns entries in arbitrary order; sorting keeps the document
# order (and therefore documents[0] and everything written later) reproducible.
for file in sorted(os.listdir(reuters_dir)):
    if not file.endswith(".sgm"):
        continue

    # Read the whole file. The context manager closes the handle as soon as the
    # content is in memory; bytes that are not valid UTF-8 are dropped.
    filename = os.path.join(reuters_dir, file)
    with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
        dataFile = f.read()

    # Parse the markup and keep the text of each body tag.
    soup = BeautifulSoup(dataFile, 'html.parser')
    documents.extend(content.text for content in soup.find_all('body'))

print('We have {} documents'.format(len(documents)))

# Show the first document as a sanity check; skipped when nothing was loaded so
# an empty or wrong folder reports "0 documents" instead of raising IndexError.
if documents:
    print(documents[0])

We have 19043 documents
Showers continued throughout the week in
the Bahia cocoa zone, alleviating the drought since early
January and improving prospects for the coming temporao,
although normal humidity levels have not been restored,
Comissaria Smith said in its weekly review.
    The dry period means the temporao will be late this year.
    Arrivals for the week ended February 22 were 155,221 bags
of 60 kilos making a cumulative total for the season of 5.93
mln against 5.81 at the same stage last year. Again it seems
that cocoa delivered earlier on consignment was included in the
arrivals figures.
    Comissaria Smith said there is still some doubt as to how
much old crop cocoa is still available as harvesting has
practically come to an end. With total Bahia crop estimates
around 6.4 mln bags and sales standing at almost 6.2 mln there
are a few hundred thousand bags still in the hands of farmers,
middlemen, exporters and processors.
    There are doubts as to how much of this cocoa 

### Pre-process

In [None]:
def removeLinks(text):
    """Return `text` with every ``http...`` run (up to the next whitespace) deleted."""
    link_pattern = re.compile(r'http\S+', flags=re.MULTILINE)
    return link_pattern.sub('', text)

# Strip links from every document.
documents = [removeLinks(doc) for doc in documents]

In [None]:
def removeHTMLTags(text):
    """Return `text` with each ``<...>`` span that opens and closes on one line deleted."""
    tag_pattern = re.compile(r'<.*?>', flags=re.MULTILINE)
    return tag_pattern.sub('', text)

# Strip tag-like spans from every document.
documents = [removeHTMLTags(doc) for doc in documents]

In [None]:
def decontracted(phrase):
    """Expand common English contractions in `phrase`.

    The irregular forms "won't" and "can't" are handled first and matched
    case-insensitively, so a capitalised "Won't" becomes "Will not" rather than
    being split into "Wo not" by the generic "n't" rule. The remaining suffix
    rules ("n't", "'re", "'s", "'d", "'ll", "'t", "'ve", "'m") are applied in
    that order and match lower-case suffixes only.

    Note that "'s" is always expanded to " is", so possessives are rewritten
    as well.

    Args:
        phrase: Text to process.

    Returns:
        The text with the contractions above expanded.
    """
    def _match_case(expansion):
        # Build a replacement callback that capitalises the expansion when the
        # matched contraction began with an upper-case letter.
        def _replace(match):
            if match.group(0)[0].isupper():
                return expansion[0].upper() + expansion[1:]
            return expansion
        return _replace

    # specific
    irregular = (
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
    )
    for pattern, expansion in irregular:
        phrase = re.sub(pattern, _match_case(expansion), phrase, flags=re.IGNORECASE)

    # general (order matters: "n't" must run before the bare "'t" rule)
    regular = (
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, expansion in regular:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase

# Expand contractions in every document.
documents = [decontracted(doc) for doc in documents]

In [None]:
def convertToLowerCase(text):
    """Return a lower-cased copy of `text`."""
    lowered = text.lower()
    return lowered

# Lower-case every document.
documents = [convertToLowerCase(doc) for doc in documents]

In [None]:
def removeWordsWithNumbers(text):
    """Delete every whitespace-delimited token that contains a digit, then trim the ends.

    Only the token is removed; the whitespace around it stays, so interior
    double spaces can remain.
    """
    digit_token = re.compile(r"\S*\d\S*")
    without_numbers = digit_token.sub("", text)
    return without_numbers.strip()

# Drop digit-bearing tokens from every document.
documents = [removeWordsWithNumbers(doc) for doc in documents]

In [None]:
# Preview the first five documents after the cleaning steps applied so far.
documents[:5]

['showers continued throughout the week in\nthe bahia cocoa zone, alleviating the drought since early\njanuary and improving prospects for the coming temporao,\nalthough normal humidity levels have not been restored,\ncomissaria smith said in its weekly review.\n    the dry period means the temporao will be late this year.\n    arrivals for the week ended february  were  bags\nof  kilos making a cumulative total for the season of \nmln against  at the same stage last year. again it seems\nthat cocoa delivered earlier on consignment was included in the\narrivals figures.\n    comissaria smith said there is still some doubt as to how\nmuch old crop cocoa is still available as harvesting has\npractically come to an end. with total bahia crop estimates\naround  mln bags and sales standing at almost  mln there\nare a few hundred thousand bags still in the hands of farmers,\nmiddlemen, exporters and processors.\n    there are doubts as to how much of this cocoa would be fit\nfor export as sh

In [None]:
def removePuctuations(text):
    """Collapse every run of characters other than ASCII letters and digits into one space.

    Whitespace (including newlines) counts as such a run, so the result is a
    single-line string of alphanumeric tokens separated by single spaces.
    """
    non_alnum_run = re.compile(r'[^A-Za-z0-9]+')
    return non_alnum_run.sub(' ', text)

# Replace punctuation and whitespace runs with single spaces in every document.
documents = [removePuctuations(doc) for doc in documents]

def removeWhiteSpaces(text):
    """Return `text` without leading or trailing whitespace (interior whitespace is kept)."""
    trimmed = text.strip()
    return trimmed

# Trim leading and trailing whitespace from every document.
documents = [removeWhiteSpaces(doc) for doc in documents]


In [None]:
# Preview the first ten fully cleaned documents.
documents[0:10]

['showers continued throughout the week in the bahia cocoa zone alleviating the drought since early january and improving prospects for the coming temporao although normal humidity levels have not been restored comissaria smith said in its weekly review the dry period means the temporao will be late this year arrivals for the week ended february were bags of kilos making a cumulative total for the season of mln against at the same stage last year again it seems that cocoa delivered earlier on consignment was included in the arrivals figures comissaria smith said there is still some doubt as to how much old crop cocoa is still available as harvesting has practically come to an end with total bahia crop estimates around mln bags and sales standing at almost mln there are a few hundred thousand bags still in the hands of farmers middlemen exporters and processors there are doubts as to how much of this cocoa would be fit for export as shippers are now experiencing dificulties in obtaining

## Build Vocab

## Create file for vocab

In [None]:
# Write the cleaned corpus to Drive: each document on its own line, followed by
# a blank line. Mode "w" creates the file or overwrites an existing one. The
# context manager flushes and closes the file once the loop finishes, so the
# data is fully on disk before later cells read it.
with open("/content/drive/My Drive/Reuters_Dataset/Reuters_model/reuters_docs.txt", "w", encoding="utf-8") as f:
    for doc in documents:
        f.write(doc)
        f.write('\n')
        f.write('\n')

In [None]:
# Pin TensorFlow to 1.15.0.
# NOTE(review): tensorflow was already imported in the first code cell, so the runtime
# presumably has to be restarted before this version is the one in use — confirm.
!pip install tensorflow==1.15.0

In [None]:
# Code to download file into Colaboratory:
# Installs PyDrive, authenticates the Colab user, and builds a PyDrive client.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE: this rebinds `drive`, which the Drive-mounting cell bound to the
# google.colab `drive` module; from here on `drive` is the PyDrive client.
drive = GoogleDrive(gauth)

In [None]:
# Download the Drive file with this ID into the working directory as reuters_docs.txt.
# The ID is held in `file_id` rather than `id` so the built-in id() stays usable.
file_id = '1-5iUm7evh54KFjqbNGJ-E5uclVInGOhK'
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('reuters_docs.txt')

In [None]:
# Install SentencePiece and fetch the ALBERT repository into ./ALBERT
# (its scripts and requirements.txt are used by later cells).
!pip install sentencepiece
!git clone https://github.com/google-research/ALBERT


In [None]:
import sentencepiece as spm
# Train a SentencePiece model on reuters_docs.txt. The flags are listed one per
# entry and joined with single spaces into the argument string passed to Train().
# The "30k-clean" prefix is only the output file name; the vocabulary size is 20000.
spm_train_flags = [
    '--input=reuters_docs.txt',
    '--vocab_size=20000',
    '--model_prefix=30k-clean',
    '--pad_id=0',
    '--unk_id=1',
    '--pad_piece=<pad>',
    '--unk_piece=<unk>',
    '--bos_id=-1',
    '--eos_id=-1',
    '--control_symbols=[CLS],[SEP],[MASK],[UNK],<pad>',
    '--user_defined_symbols="(,),",-,.,–,£,€"',
]
spm.SentencePieceTrainer.Train(' '.join(spm_train_flags))

In [None]:
# Load the trained model and collect its pieces in id order (index i holds piece id i).
sp_user = spm.SentencePieceProcessor()
sp_user.load('30k-clean.model')
piece_count = sp_user.GetPieceSize()
vocabs = list(map(sp_user.IdToPiece, range(piece_count)))

In [None]:
# Print the complete piece list (one entry per vocabulary id, so the output is long).
print(vocabs)



In [None]:
# Write the vocabulary to Drive, one piece per line in id order. Mode "w"
# creates the file or overwrites an existing one. UTF-8 is requested explicitly
# because the pieces include non-ASCII symbols (e.g. –, £, €), and the context
# manager flushes and closes the file when the loop ends.
with open("/content/drive/My Drive/Reuters_Dataset/Reuters_model/model-fine/vocab.txt", "w", encoding="utf-8") as f:
    for vocab in vocabs:
        f.write(vocab)
        f.write('\n')

In [None]:
# Smoke test: encode a sample string into piece ids with the trained model.
sp_user.EncodeAsIds('Hello world')

[20, 1, 14793, 171]

In [None]:
# Install the albert-tensorflow package from PyPI (version not pinned).
!pip3 install albert-tensorflow

Collecting albert-tensorflow
[?25l  Downloading https://files.pythonhosted.org/packages/ba/1e/e776bb23e6f89a1f1d7d33b50d0bd9c2c7b24b39aa548f041827a9c00d73/albert_tensorflow-1.1-py3-none-any.whl (81kB)
[K     |████                            | 10kB 14.7MB/s eta 0:00:01[K     |████████                        | 20kB 4.8MB/s eta 0:00:01[K     |████████████                    | 30kB 5.6MB/s eta 0:00:01[K     |████████████████                | 40kB 6.0MB/s eta 0:00:01[K     |████████████████████            | 51kB 5.3MB/s eta 0:00:01[K     |████████████████████████        | 61kB 5.7MB/s eta 0:00:01[K     |████████████████████████████▏   | 71kB 6.2MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 4.1MB/s 
Installing collected packages: albert-tensorflow
Successfully installed albert-tensorflow-1.1


In [None]:
# Install the dependencies listed by the cloned ALBERT repository.
!pip install -r /content/ALBERT/requirements.txt

In [None]:
cd ./ALBERT

/content/ALBERT


## Create Pre-training Data

In [None]:
# Build the pre-training examples from the corpus text file and vocabulary, with
# sequences capped at 64 tokens (the same max_seq_length is passed to run_pretraining.py).
# NOTE(review): the input and vocab paths here are under "My Drive/ALBERT/Reuters_Dataset",
# whereas the cells that write reuters_docs.txt and vocab.txt use
# "My Drive/Reuters_Dataset/Reuters_model/..." — confirm these point at the intended files.
!python create_pretraining_data.py --input_file "/content/drive/My Drive/ALBERT/Reuters_Dataset/reuters_docs.txt" --output_file "/content/drive/My Drive/ALBERT/Reuters_Dataset/reuters_docs" --vocab_file "/content/drive/My Drive/ALBERT/Reuters_Dataset/vocab.txt" --max_seq_length=64

In [None]:
# Install transformers and tfrecord (versions not pinned).
!pip install transformers
!pip install tfrecord

## Run Pre-training

In [None]:
# Run ALBERT pre-training and evaluation: LAMB optimizer, 300 training steps with
# 100 warm-up steps, and a checkpoint saved every 50 steps into output_dir.
# NOTE(review): input_file is under "My Drive/Reuters_Dataset/" while
# create_pretraining_data.py writes its output under "My Drive/ALBERT/Reuters_Dataset/",
# and albert_config_file is read from the ALBERT/ tree — confirm the paths are intended.
!python run_pretraining.py \
    --input_file="/content/drive/My Drive/Reuters_Dataset/reuters_docs" \
    --output_dir="/content/drive/My Drive/Reuters_Dataset/Reuters_model" \
    --albert_config_file="/content/drive/My Drive/ALBERT/Reuters_Dataset/Reuters_model/albert_config.json" \
    --do_train \
    --do_eval \
    --train_batch_size=128 \
    --eval_batch_size=64 \
    --max_seq_length=64 \
    --max_predictions_per_seq=20 \
    --optimizer='lamb' \
    --learning_rate=.00176 \
    --num_train_steps=300 \
    --num_warmup_steps=100 \
    --save_checkpoints_steps=50



W1013 12:13:41.674993 139711771125632 module_wrapper.py:139] From /usr/local/lib/python3.6/dist-packages/albert/modeling.py:116: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.

INFO:tensorflow:*** Input Files ***
I1013 12:13:42.242640 139711771125632 run_pretraining.py:484] *** Input Files ***
INFO:tensorflow:  /content/drive/My Drive/Reuters_Dataset/reuters_docs
I1013 12:13:42.242915 139711771125632 run_pretraining.py:486]   /content/drive/My Drive/Reuters_Dataset/reuters_docs
W1013 12:13:42.243470 139711771125632 estimator.py:1994] Estimator's model_fn (<function model_fn_builder.<locals>.model_fn at 0x7f10f52e7488>) includes params argument, but params are not passed to Estimator.
INFO:tensorflow:Using config: {'_model_dir': '/content/drive/My Drive/Reuters_Dataset/Reuters_model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 50, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_optio

In [None]:
# Install the packages used by the conversion cell below.
# NOTE(review): transformers is also installed in an earlier cell, and
# `--upgrade tensorflow` replaces the 1.15.0 pin made earlier — presumably
# intentional for the conversion step; confirm.
!pip install transformers
!pip install --upgrade tensorflow
!pip install torch

## Make Bin File

In [None]:
import logging
import torch
# from transformers import AlbertConfig, AlbertModel
from transformers import AlbertConfig, AlbertForPreTraining, load_tf_weights_in_albert


# Configure the root logger to show messages at INFO level and above.
logging.basicConfig(level=logging.INFO)


def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path):
    """Convert a TensorFlow ALBERT checkpoint into a saved PyTorch state dict.

    Args:
        tf_checkpoint_path: Path of the TensorFlow checkpoint to read weights from.
        albert_config_file: Path of the JSON file describing the ALBERT architecture.
        pytorch_dump_path: Destination file for the PyTorch weights.

    Returns:
        None. The weights are written to `pytorch_dump_path`.
    """
    # Build a PyTorch model whose architecture comes from the JSON config.
    albert_config = AlbertConfig.from_json_file(albert_config_file)
    build_message = "Building PyTorch model from configuration: {}".format(str(albert_config))
    print(build_message)
    pytorch_model = AlbertForPreTraining(albert_config)

    # Copy the weights stored in the TensorFlow checkpoint into the PyTorch model.
    load_tf_weights_in_albert(pytorch_model, albert_config, tf_checkpoint_path)

    # Persist the model's state dict (its weights) rather than the model object.
    save_message = "Save PyTorch model to {}".format(pytorch_dump_path)
    print(save_message)
    torch.save(pytorch_model.state_dict(), pytorch_dump_path)



# Convert the model.ckpt-best checkpoint on Drive into model-fine/pytorch_model.bin.
convert_tf_checkpoint_to_pytorch(
    "/content/drive/My Drive/Reuters_Dataset/Reuters_model/model.ckpt-best.index",
    "/content/drive/My Drive/Reuters_Dataset/Reuters_model/albert_config.json",
    "/content/drive/My Drive/Reuters_Dataset/Reuters_model/model-fine/pytorch_model.bin",
)

Building PyTorch model from configuration: AlbertConfig {
  "attention_probs_dropout_prob": 0,
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "down_scale_factor": 1,
  "embedding_size": 128,
  "eos_token_id": 3,
  "gap_size": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "inner_group_num": 1,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "albert",
  "net_structure_type": 0,
  "num_attention_heads": 12,
  "num_hidden_groups": 1,
  "num_hidden_layers": 12,
  "num_memory_blocks": 0,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 20000
}

Save PyTorch model to /content/drive/My Drive/Reuters_Dataset/Reuters_model/model-fine/pytorch_model.bin
