# DATASET PREPARATION

In [2]:
'''
    Nepali NLP preprocessing
    Removing punctuation/symbols/digits
    except '- and :'
    
    Coverting the records into [label, data]
    
    # Removing BOM, newline, tab characters from dataset
    # Remove punctation and nepali digits except '-:'
    # P - any kinds of punctuation
    # N - any kinds of number
    # S - any kinds of symbol
    # Cf - Other, Format (ZERO WIDTH SPACE, ZERO WIDTH NON-JOINER)
    # Cn - Not assigned, Format
    # Cc - Other, Control category (tab)
    # Pd - Punctuation dash (hyphen-minus)
    

    # References
        - https://www.fileformat.info/info/unicode/category/index.htm
        - http://www.nepalinlp.com/pre-processing/
    
'''

import os
import sys
import time
import re
import unicodecsv as csv
import unicodedata as un

in_path = './data/raw_backup/'
out_path= './data/raw/raw.csv'

head = 'label', 'data'

counter = 0
error_counter = 0

# Start timer
start_time = time.time()
label = -1

# Prepare dictionary of necessary unicode
# Thanks to https://stackoverflow.com/a/11066687/4595807
# We want to protect '-'
# HYPHEN-MINUS = UNICODE DECIMAL VALUE = 45
table = dict.fromkeys(i for i in range(sys.maxunicode) 
                        if un.category(chr(i)).startswith(('P','N','S','Cf','Cn','Cc'))
                        and i != 45)
#                         and not un.category(chr(i)).startswith('Pd'))

with open(out_path, 'wb') as out_file:
    writer = csv.writer(out_file, encoding='utf-8')
    writer.writerow(head)
    
    for root, dirs, files in os.walk(in_path, topdown=True):
        for name in files:
            curr_file = os.path.join(root, name)
            try:
                # Read current file and remove BOM and newline characters
                # uf03c and uf03c are invalid unicode characters and don't have any category
                # so had to remove manually
                fp = open(curr_file, encoding='utf-8-sig').read()
                fp = fp.translate(table)
                
                # Remove extra spaces and hyphens but does not remove that's between words आ-आफ्नो
                # Conditional removal of HYPHEN-MINUS
                # Looks for space before/after hyphen, if present remove it
                fp = re.sub(r"(?<!\w)[-]|[-](?!\w)",'',fp)
                
                # Normalize the unicode so that
                # canonical-equivalent ones will also have precisely the same binary representation
                final_msg = label, un.normalize('NFC', fp)
                
                # Write into CSV file format - label, data
                writer.writerow(final_msg)

                # Counter setup to count file processed
                counter = counter + 1

            except IOError as e:
                print ("I/O error({0}): {1}".format(e.errno, e.strerror))
                error_counter = error_counter + 1
                
        label += 1

out_file.close()
    
end_time = time.time()

print('Number of files processed: ',counter)
print('Number of files error: ',error_counter)

print('Time taken in seconds:',(end_time - start_time))

In [60]:
'''
    Main description:
        Create a file to be used in dataturk purpose
        Collect all the files from raw folder hierarchy
        Break down into sentences
        Shuffle it randomly
        Create a file of required number of samples
'''

import os
import sys
import time

in_path = '../data/raw_backup/'
out_path= '../data/raw/raw_unproc.csv'

counter = 0
error_counter = 0

# Start timer
start_time = time.time()

lines = []
number_of_sample= 5000

for root, dirs, files in os.walk(in_path, topdown=True):
    for name in files:
        curr_file = os.path.join(root, name)
        try:
            fp = open(curr_file, encoding='utf-8-sig').read()
            fp = fp.replace('\n', '').replace('\t','').split()
            fp = ' '.join(fp).replace('।', '।\n')
            fp = fp.splitlines()
            for each_line in fp:
                length = len(each_line.split())
                # This condition is to remove sentence like ﻿१४ मंसिर, काठमाडौं ।
                # It is common on the starting of news document
                if length > 4:
                    final_data = each_line.lstrip()+'\n'
                    lines.append(final_data)


            # Counter setup to count file processed
            counter = counter + 1

        except IOError as e:
            print ("I/O error({0}): {1}".format(e.errno, e.strerror))
            error_counter = error_counter + 1
            

with open(out_path, 'w', encoding='utf-8') as out_file:    
    lines = random.sample(lines, number_of_sample)
    for line in lines:
        out_file.write(line)
    
    
out_file.close()
    
end_time = time.time()

print('Number of files processed: ',counter)
print('Number of files error: ',error_counter)

print('Time taken in seconds:',(end_time - start_time))

Number of files processed:  14125
Number of files error:  0
Time taken in seconds: 5.093601942062378


In [4]:
'''
    How to find unicodedata category of all punctuation
'''

tbl = "-!\"#$%&'()*+,./:;<=>?@[\]^_`{}~१२३४५६७८९०"

exclude_list = ["COLON","HYPHEN-MINUS"]

for each in tbl:
    print(each + ":" + un.name(each))

-:HYPHEN-MINUS
!:EXCLAMATION MARK
":QUOTATION MARK
#:NUMBER SIGN
$:DOLLAR SIGN
%:PERCENT SIGN
&:AMPERSAND
':APOSTROPHE
(:LEFT PARENTHESIS
):RIGHT PARENTHESIS
*:ASTERISK
+:PLUS SIGN
,:COMMA
.:FULL STOP
/:SOLIDUS
::COLON
;:SEMICOLON
<:LESS-THAN SIGN
=:EQUALS SIGN
>:GREATER-THAN SIGN
?:QUESTION MARK
@:COMMERCIAL AT
[:LEFT SQUARE BRACKET
\:REVERSE SOLIDUS
]:RIGHT SQUARE BRACKET
^:CIRCUMFLEX ACCENT
_:LOW LINE
`:GRAVE ACCENT
{:LEFT CURLY BRACKET
}:RIGHT CURLY BRACKET
~:TILDE
१:DEVANAGARI DIGIT ONE
२:DEVANAGARI DIGIT TWO
३:DEVANAGARI DIGIT THREE
४:DEVANAGARI DIGIT FOUR
५:DEVANAGARI DIGIT FIVE
६:DEVANAGARI DIGIT SIX
७:DEVANAGARI DIGIT SEVEN
८:DEVANAGARI DIGIT EIGHT
९:DEVANAGARI DIGIT NINE
०:DEVANAGARI DIGIT ZERO


# CORPUS PREPARATION

In [1]:
'''
    Removing UTF-8-BOM completely
    and preparing for corpus reader
    
    Description:
    Walks along every file inside subdirectories
    Read and removes UTF-8-BOM and new line from every file
    Write the file content in UTF-8 encode
    If the file is not encodeable, then removes it
    
    Store all the file content into a txt file
    Naming based on its parent directory
    
    Author - Oyesh Mann Singh
    Date - 10/25/2018
    
    Dataset folder structure:
        ./raw
            /Auto
                /0.txt
                /1.txt
            /Blog
                /0.txt
                1.txt
            /Sports
                /0.txt
                /1.txt
            
    Dataset:
    https://github.com/sndsabin/Nepali-News-Classifier
'''

import nltk
import sys
import unicodedata
import re
import string
import os
import time
from pathlib import Path

in_path = './data/raw_backup/'
out_path = './data/corpus/'

counter = 0
del_counter = 0

# Prepare dictionary to remove unnecessary unicode
table = dict.fromkeys(i for i in range(sys.maxunicode) 
                        if unicodedata.category(chr(i)).startswith(('P','N','S','Cf','Cn','Cc'))
                        and i != 45 and i!= 2404)

# Start timer
start_time = time.time()

for dname in os.listdir(in_path):    
    out_fname = dname + '_corpus.txt'
    full_out_path = os.path.join(out_path, out_fname) 
    
    if os.path.isfile(full_out_path):
        os.remove(full_out_path)
    
    out_file = open(full_out_path, 'w', encoding='utf8')
    
    full_in_path = os.path.join(in_path,dname)
    for fname in os.listdir(full_in_path):
        try:
            curr_file = os.path.join(full_in_path,fname)
            fp = open(curr_file,encoding='utf-8-sig').read()
    
            # Remove unnecessary characters
            fp = fp.translate(table)

            # Replace newline instead of DANDA
            fp = fp.replace("।",'\n')
            
            # Remove extra spaces and hyphens but does not remove that's between words आ-आफ्नो
            # Conditional removal of HYPHEN-MINUS
            # Looks for space before/after hyphen, if present remove it
            fp = re.sub(r"(?<!\w)[-]|[-](?!\w)",'',fp)
            
            out_file.write(fp)
            counter = counter + 1
            
        except:
            os.remove(curr_file)
            del_counter = del_counter + 1
            
    out_file.close()

end_time = time.time()

print('Number of files processed: ', counter)
print('Number of files removed: ', del_counter)

print('Time taken to create corpus (seconds):',(end_time - start_time))

Number of files processed:  14125
Number of files removed:  0
Time taken to create corpus (seconds): 9.28983211517334


In [2]:
'''
    Create train/test dataset
'''

import pandas as pd

raw_data = pd.read_csv('./data/raw/raw.csv')
raw_data = raw_data.sample(frac=1).reset_index(drop=True)
raw_data.head()

Unnamed: 0,label,data
0,10,राजविराज फागुन विगत महिनायताको लामो मधेश आन...
1,10,माघ काठमाडौं विद्युत व्यापार सम्झौता पीटीए क...
2,10,मंसिर रौतहट केहि दिनदेखि मौसममा आएको स्वभावि...
3,3,प्रा डा गोविन्द नेपालप्रमुख आर्थिक सल्लाहकार अ...
4,10,तर संविधानमा समाजवाद किन लेखियो असोज काठमाडौं...


In [3]:
train = raw_data.sample(frac=0.75, random_state=99).reset_index(drop=True)

In [4]:
train.head()

Unnamed: 0,label,data
0,10,थारु सभासदहरुले कैलालीको टीकापुरमा भएको झडपमा ...
1,10,राजविराज चैत सप्तरी प्रहरीले पातो गाविसबाट स...
2,1,गजेन्द्र बुढाथोकीकाठमाडौं माघ बैंकिङ क्षेत्र...
3,10,उपप्रधानमन्त्री कमल थापाले संविधानमा असहमति हु...
4,10,असोज काठमाडौं अमेरिकी अधिकारीहरुले आधिकारिक ...


In [5]:
train.to_csv('./data/raw/train.csv', index=False)

In [6]:
test = raw_data.loc[~raw_data.index.isin(train.index), :].reset_index(drop=True)

In [7]:
test.shape

(3531, 2)

In [8]:
test.head()

Unnamed: 0,label,data
0,12,भारत भ्रमण सकेर फर्किएपछि पूर्वाराजा ज्ञानेन्द...
1,10,काठमाडौं भूकम्प पीडितको राहत र उद्धारको नाममा...
2,10,रवीन्द्र घिमिरे माघ काठमाडौं सरकारले नेपाली य...
3,10,श्यामसुन्दर पासवानकञ्चनपुर वैशाख बिराटनगरमा ...
4,12,वैशाख काठमाडौं इण्डियन प्रिमियर लिग क्रिकेट ...


In [9]:
test.to_csv('./data/raw/test.csv', index=False)

In [8]:
import pandas as pd

In [9]:
d = {'data':[]}

df = pd.DataFrame(data=d)

In [16]:
df.data.add('Oyesh')

Series([], Name: data, dtype: object)

In [17]:
print(df['data'])

Series([], Name: data, dtype: object)
