In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
!cp -r "/content/gdrive/My Drive/eKantipurCSV.zip" .

In [0]:
!unzip -q /content/eKantipurCSV.zip

In [0]:
import pandas as pd
import glob
import string
import re

In [0]:
files = glob.glob('/content/CSV/*')

In [0]:
files

['/content/CSV/2018-01-01 to 2018-12-31.csv',
 '/content/CSV/2017-12-01 to 2017-12-31.csv',
 '/content/CSV/2017-07-01 to 2017-07-31.csv',
 '/content/CSV/2017-06-01 to 2017-06-30.csv',
 '/content/CSV/2017-02-01 to 2017-03-30.csv',
 '/content/CSV/2017-09-01 to 2017-09-30.csv',
 '/content/CSV/2017-11-01 to 2017-11-30.csv',
 '/content/CSV/2017-01-01 to 2017-01-31.csv',
 '/content/CSV/2017-08-01 to 2017-08-31.csv',
 '/content/CSV/2019-07-01 to 2019-09-30.csv',
 '/content/CSV/2019-04-01 to 2019-06-30.csv',
 '/content/CSV/2019-10-01 to 2019-12-20.csv',
 '/content/CSV/2017-10-01 to 2017-10-31.csv',
 '/content/CSV/2019-01-01 to 2019-03-31.csv',
 '/content/CSV/2017-04-01 to 2017-05-31.csv']

In [0]:
def clean_engchar(line):
  line = re.sub(r"[0-9a-zA-Z]+", '', line)
  return line

In [0]:
def clean_punctuation(line):
  line = line.translate(str.maketrans('', '', string.punctuation))
  line = re.sub('‘|’', '', line)
  return line

In [0]:
def clean_eKantipur_start(line):
  line = re.sub(r"^.+\s—\s", '', line)
  return line

In [0]:
def clean_numbers(line):
  if bool(re.search(r'[०१२३४५६७८९]+',line)):
    matches = re.finditer(r'[०१२३४५६७८९]+', line)
    for match in matches:
      line = line.replace(line[match.span()[0]:match.span()[1]], '#'*(match.span()[1]-match.span()[0]))
    return line
  return line

In [0]:
import unicodedata
def clean_unicodes(line):
  line = unicodedata.normalize("NFKD", line)
  return line

In [0]:
def clean_text(line):
  return clean_unicodes(clean_numbers(clean_punctuation(clean_engchar(clean_eKantipur_start(line)))))

In [0]:
clean_unicodes('तीन दिनभन्दा लामो बिदा हुँदा सर्वाेच्चले नै जिल्ला अदालतहरूमा न्यायाधीश व्यवस्था गर्ने गरे पनि\xa0केही दिनदेखि झापा मोरङ र सुनसरी जिल्ला अदालतमा न्यायाधीश नभएको काफ्लेले बताए नेपाल क्याटेगोरीको नाममा हामी कसैको सती किन जानु\u202f उनले त्यो बेलाको छलफलमा नेपाल क्याटेगोरी लाई सामान्य ढंगले लिएर ताली बजाए पनि विवाद निम्तिन सक्ने जोखिमलाई ध्यानमा राखेर हटाउन सकिने बताए\n')

'तीन दिनभन्दा लामो बिदा हुँदा सर्वाेच्चले नै जिल्ला अदालतहरूमा न्यायाधीश व्यवस्था गर्ने गरे पनि केही दिनदेखि झापा मोरङ र सुनसरी जिल्ला अदालतमा न्यायाधीश नभएको काफ्लेले बताए नेपाल क्याटेगोरीको नाममा हामी कसैको सती किन जानु  उनले त्यो बेलाको छलफलमा नेपाल क्याटेगोरी लाई सामान्य ढंगले लिएर ताली बजाए पनि विवाद निम्तिन सक्ने जोखिमलाई ध्यानमा राखेर हटाउन सकिने बताए\n'

In [0]:
import re
for file in files:
  df = pd.read_csv(file)
  descriptions = df['description'].unique()
  for des in descriptions:
    lines = re.split('।|\n|।\n', des)
    with open('NepText.txt', 'a') as f:
      for line in lines:
        line = clean_text(line)
        if line == '':
          pass
        else:
          f.write(line.strip() + '\n')
  f.close()

In [0]:
# saving the preprocessed text to gdrive
!cp /content/NepText.txt "/content/gdrive/My Drive/NLP/"

In [0]:
# copying preprocessed text to colab from gdrive
!cp "/content/gdrive/My Drive/NLP/NepText.txt" /content/

In [0]:
with open('/content/NepText.txt') as f:
  data = list(set(f.readlines()))
  f.close()

In [0]:
from tqdm import tqdm

In [0]:
# # check nepali numbers
# import re
# matches = []
# for line in data:
#   match = re.finditer(r"[०१२३४५६७८९]+", line)
#   matches.extend(match)

In [0]:
# # Check english characters
# import re
# matches = []
# lines = []
# for i,line in enumerate(data):
#   match = re.finditer(r"[0-9a-zA-Z]+", line)
#   lines.append(i)
#   matches.extend(match)

In [0]:
# #data checking
# for line in data:
#   if 'भएन—' in line:
#     print(line)

In [0]:
from keras.preprocessing.text import Tokenizer

In [0]:
vocab_size = 62549 + 1 # This vocab size is selected so there are atleast 3 times repeated words
tokenizer = Tokenizer(num_words=vocab_size, filters='\n')

In [0]:
tokenizer.fit_on_texts(data)

In [0]:
import pickle

In [0]:
# saving tokenizer for further usage
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [0]:
# saving tokenizer data to gdrive
!cp /content/tokenizer.pickle "/content/gdrive/My Drive/NLP/"

In [0]:
# loading tokenizer from gdrive
!cp "/content/gdrive/My Drive/NLP/tokenizer.pickle" .

In [0]:
# loading tokenizer from pickle file
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [0]:
from keras.preprocessing.sequence import make_sampling_table, skipgrams

In [0]:
import numpy as np

In [0]:
train_data = np.array(tokenizer.texts_to_sequences(data))

In [0]:
sampling_table = make_sampling_table(size=vocab_size)

In [0]:
pairs = []
labels = []
for row in tqdm(train_data):
  pair, label = skipgrams(row,vocabulary_size=vocab_size,window_size=5,negative_samples=15,sampling_table=sampling_table)
  pairs.extend(pair)
  labels.extend(label)

100%|██████████| 229441/229441 [05:57<00:00, 641.68it/s]


In [0]:
len(labels)

134318704

In [0]:
pairs = np.array(pairs)
labels = np.array(labels)

In [0]:
labels = labels.reshape(len(labels),1)

In [0]:
print(pairs.shape)
print(labels.shape)

(134318704, 2)
(134318704, 1)


In [0]:
training_data = np.concatenate((pairs, labels), axis=1)

In [0]:
# saving the dataset for training
np.save('NepWord2VecTrainingData.npy', training_data)

In [0]:
# saving training data to gdrive
!cp /content/NepWord2VecTrainingData.npy "/content/gdrive/My Drive/NLP/"

In [0]:
# loading training data from gdrive
!cp "/content/gdrive/My Drive/NLP/NepWord2VecTrainingData.npy" .

cp: cannot stat '/content/gdrive/My Drive/NLP/NepWord2VecTrainingData.npy': No such file or directory
