# Google Drive code

In [0]:
# google.colab is only importable inside a Colab runtime.
from google.colab import drive

# Mount Google Drive at /content/drive. The `root` path configured further down
# in this notebook ("/content/drive/My Drive") lives under this mount point.
drive.mount('/content/drive')

In [0]:
# These variables define the path and filename of the training corpus.
# get_lines() joins them by plain string concatenation (root + path + filename),
# so `path` and `filename` must each begin with "/" and must not end with one.

root = "/content/drive/My Drive" # Do not edit this one! It must stay under the '/content/drive' mount point.
# Folder that holds the corpus, relative to `root`.
path = "/Shared/FYP/Data"
# Name of the corpus file inside `path`.
filename = "/urdu_fixed.txt"

# Loading and processing the lines

In [0]:
def get_lines(root, path, filename):
    """Read a UTF-8 text file and return each line as a list of tokens.

    The file location is the plain concatenation ``root + path + filename``;
    no separators are inserted, so each piece must carry its own leading "/".

    Args:
        root: Base directory (e.g. the mounted Drive folder).
        path: Sub-directory appended to ``root``.
        filename: File name appended to ``root + path``.

    Returns:
        A list with one entry per line of the file. Each entry is the list of
        whitespace-separated tokens on that line; blank or whitespace-only
        lines yield an empty list.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    full_path = root + path + filename

    # The file is read in binary mode and decoded explicitly so that bytes
    # which are not valid UTF-8 are dropped ("ignore") instead of raising.
    # Iterating the handle streams one line at a time, and the `with` block
    # closes the file on exit. split() with no argument already discards
    # leading/trailing whitespace, so no separate strip() is required.
    with open(full_path, "rb") as text_file:
        return [raw_line.decode("utf-8", "ignore").split() for raw_line in text_file]

# Tokenised corpus: one list of tokens per line of the input file.
lines = get_lines(root=root, path=path, filename=filename)

# Training a word2vec model

## Usage
```
Word2Vec(sentences=None, corpus_file=None, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None,
         sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1,
         hashfxn=<built-in function hash>, iter=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=10000,
         compute_loss=False, callbacks=(), max_final_vocab=None)
```

## Important parameters
1. **min_count** = int - Ignores all words with total absolute frequency lower than this - (2, 100)
2. **window** = int - The maximum distance between the current and predicted word within a sentence, i.e. up to `window` words to the left and `window` words to the right of the target word - (2, 10)
3. **size** = int - Dimensionality of the feature vectors. - (50, 300)
4. **sample** = float - The threshold for configuring which higher-frequency words are randomly downsampled. Highly influential. - (0, 1e-5)
5. **alpha** = float - The initial learning rate - (0.01, 0.05)
6. **min_alpha** = float - Learning rate will linearly drop to min_alpha as training progresses. To set it: alpha - (min_alpha * epochs) ~ 0.00
7. **negative** = int - If > 0, negative sampling will be used; the value specifies how many "noise words" should be drawn. If set to 0, no negative sampling is used. - (5, 20)
8. **workers** = int - Use these many worker threads to train the model (= faster training with multicore machines)


In [0]:
from gensim.models import Word2Vec

# Train on the tokenised corpus with sg=1; every other hyper-parameter is left
# at the library default.
model = Word2Vec(sentences=lines, sg=1)

# Saving the model

In [0]:
# File name for the trained model; the download cell later in the notebook
# reuses this variable.
model_name = 'word2vec_urdu_sg'
# The name is relative, so the file is written to the current working directory.
# NOTE(review): gensim may store large arrays in extra "<model_name>.*.npy"
# files next to the main file - confirm which files are actually written.
model.save(model_name)

# Downloading the saved model

In [0]:
from google.colab import files

import glob

# gensim's save() can move large arrays out of the main file into companion
# files named "<model_name>.<attribute>.npy". The model cannot be loaded
# without them, so they are downloaded together with the main file.
try:
    # Main file first, then any companions. glob.escape() keeps characters
    # such as "[" or "*" in the model name from being treated as wildcards.
    model_files = [model_name] + sorted(glob.glob(glob.escape(model_name) + ".*"))
    for model_file in model_files:
        files.download(model_file)
except ConnectionResetError:
    # The browser connection dropped part-way through a transfer.
    print("Encountered ConnectionResetError! The file download is incomplete")
except TypeError:
    print("Encountered TypeError!")
except NameError:
    # `model_name` is undefined, i.e. the save cell has not been run.
    print("Encountered NameError!")