In [58]:
import torch
from torchtext import data, vocab, datasets
from tqdm import tqdm, tqdm_notebook, tnrange
tqdm.pandas(desc='Progress')

SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = data.Field(tokenize=list, use_vocab=True, unk_token='<unk>', batch_first=True)
TITLE = data.Field(tokenize=list, use_vocab=True, unk_token='<unk>', batch_first=True)
LABEL = data.Field(unk_token=None, batch_first=True)

fields = (('LABEL', LABEL), ('TITLE', TITLE), ('TEXT', TEXT))

train_ds, test_ds = data.TabularDataset.splits(path='./data/ag_news_csv/kfold/1/', 
                                            format='csv', 
                                            train='train.txt',
                                            test = 'test.txt',
                                            fields=fields)

In [49]:
vec = vocab.Vectors(name='nep_english.vec', cache='./data/embeddings')

TEXT.build_vocab(train_ds, test_ds, max_size=None, vectors=vec)
TITLE.build_vocab(train_ds, test_ds, max_size=None, vectors=vec)
LABEL.build_vocab(train_ds.LABEL, test_ds.LABEL, max_size=None, vectors=vec)

In [50]:
batch_size = 1
train_iter, test_iter = data.BucketIterator.splits(datasets=(train_ds, test_ds), 
                                    batch_sizes=(batch_size, batch_size), 
                                    sort_key=lambda x: len(x.TEXT), 
                                    device='cpu', 
                                    sort_within_batch=True, 
                                    repeat=False,
                                    shuffle=True)

In [51]:
sample = next(iter(train_iter))

In [63]:
ex = train_ds[0]
print(''.join(ex.TEXT).split())
print(''.join(ex.TITLE).split())
print(''.join(ex.LABEL))

['Microsoft', 'is', 'offering', 'a', 'patch', 'for', 'a', 'critical', 'security', 'flaw', 'related', 'to', 'the', 'processing', 'of', 'JPEG', 'images', 'by', 'its', 'operating', 'systems', 'and', 'other', 'applications.']
['Network', 'Security', 'Microsoft', 'Image', 'Flaw', 'Opens', 'Door', 'to', 'Hackers']
4


In [67]:
train_dlen = len(train_iter)
t = tqdm(iter(train_iter), leave=False, total=train_dlen)
for (k, v) in t:
    (y, tl, X) = k
    print(y)
    print(tl)
    print(X)
    break









  0%|          | 0/96035 [00:00<?, ?it/s][A[A[A[A[A[A





[A[A[A[A[A[A                       




[A[A[A[A[A

tensor([[ 2]])
tensor([[ 34,  30,  29,   2,   2,   4,   5,  25,   5,   2,   2,  19,
           6,   9,   2,   2,  43,   8,  10,  13,  53,   2,   2,  22,
           4,  10,   2,   2,   6,  10,   2,   2,  20,  27,  39,   2]])
tensor([[ 32,   6,   2,  25,   2,   2,   5,  12,   3,   2,   2,  28,
          31,  39,   2,   2,  53,   9,   6,  15,  17,   2,   2,   7,
          10,   2,   2,   5,  12,   3,   2,   2,  14,   6,  17,  20,
           9,   7,  18,  12,   5,   2,   2,  24,   7,   6,  11,   4,
           5,   6,   9,   2,  25,   2,   2,   4,  14,  14,   6,   9,
          13,   7,   8,  18,   2,   2,   5,   6,   2,   2,  38,   7,
          18,   2,   2,  38,  11,  15,   3,  59,  10,   2,   2,  11,
           4,   5,   3,  10,   5,   2,   2,  19,   7,  11,   7,   8,
          18,  23]])


In [71]:
import os
import argparse
import pandas as pd
import numpy as np
from utility.dataloader import Dataloader

from tqdm import tqdm, tqdm_notebook, tnrange
tqdm.pandas(desc='Progress')

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from uniseg.graphemecluster import grapheme_clusters

class LSTMTagger(nn.Module):
    def __init__(self, config, dataloader):
        super(LSTMTagger, self).__init__()
        self.bidirectional = config.bidirection
        self.num_layers = config.num_layers
        self.batch_size = config.batch_size
        self.hidden_dim = config.hidden_dim
        self.vocab_size = dataloader.vocab_size
        self.tagset_size = dataloader.tagset_size
        self.embedding_dim = config.embedding_dim
        self.device = config.device
#         self.use_pos = config.use_pos
        
#         if self.use_pos:
#             self.pos_size = dataloader.pos_size
#             self.embedding_dim = config.embedding_dim + self.pos_size
#             pos_one_hot = np.eye(self.pos_size)
#             one_hot_weight = torch.from_numpy(pos_one_hot).float()
#             self.one_hot_embeddings = nn.Embedding(self.pos_size, self.pos_size, _weight=one_hot_weight)
        
        if config.pretrained:
            self.word_embeddings = nn.Embedding.from_pretrained(dataloader.weights)
        else:
            self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim)
        
        self.lstm = nn.LSTM(self.embedding_dim, self.hidden_dim, 
                            bidirectional=self.bidirectional, 
                            num_layers=self.num_layers)
        
        if self.bidirectional:
            self.hidden2tag = nn.Linear(self.hidden_dim * 2, self.tagset_size)
        else:
            self.hidden2tag = nn.Linear(self.hidden_dim, self.tagset_size)      
        
        self.dropout = nn.Dropout(config.dropout)
#         self.dropout_embed = nn.Dropout(config.dropout_embed)
        
#         self.extra_layer = nn.Linear(2*self.hidden_dim, self.hidden_dim)
#         self.final_layer = nn.Linear(self.hidden_dim, self.tagset_size)

    def init_hidden(self, tensor_size):
        if self.bidirectional:
            h0 = torch.zeros(2 * self.num_layers, tensor_size[1], self.hidden_dim)
            c0 = torch.zeros(2 * self.num_layers, tensor_size[1], self.hidden_dim)         
        else:
            h0 = torch.zeros(self.num_layers, tensor_size[1], self.hidden_dim)
            c0 = torch.zeros(self.num_layers, tensor_size[1], self.hidden_dim)         
        if self.device:
            h0 = h0.to(self.device)
            c0 = c0.to(self.device)
        return (h0, c0)
    
    
    def forward(self, X, at):
        X = self.word_embeddings(X)
        at = self.word_embeddings(y)
        
        X = torch.cat((X, at), dim=-1)
        
        X, _ = self.lstm(self.dropout(X))
        
        tag_space = self.hidden2tag(X.view(-1, X.shape[2]))
        tag_scores = F.log_softmax(tag_space, dim=1)

        return tag_scores

In [69]:
input_file = './data/dataset/total.conll'
output_file = './data/dataset/total_clean.conll'

with open(input_file,'r', encoding='utf-8') as in_file, open(output_file,'w', encoding='utf-8') as out_f:
    sentence = []
    tag = []
    max_length=0
    max_sentence=''
    max_counter=0
    min_counter=0
    sent_counter=0
    line_num=0
    j=0
    for i,row in enumerate(in_file):
        #To know which line is defunct in file
        #print(i+1)
        row = row.strip().split()

        if len(row)>=4:
            out_f.write("{}\t{}\t{}\t{}\n".format(row[0], row[1], row[2], row[3]))
        elif len(row) == 0:
            out_f.write("\n")

In [57]:
x = torch.rand(2, 5)
x


tensor([[ 0.8176,  0.4855,  0.8950,  0.5104,  0.2835],
        [ 0.2434,  0.5992,  0.0913,  0.2088,  0.6545]])

In [58]:
y = torch.tensor([[0, 1, 2, 0, 0], [2, 0, 0, 1, 2]])
y.shape

torch.Size([2, 5])

In [59]:
torch.zeros(3, 5).scatter_(0, y, x)

tensor([[ 0.8176,  0.5992,  0.0913,  0.5104,  0.2835],
        [ 0.0000,  0.4855,  0.0000,  0.2088,  0.0000],
        [ 0.2434,  0.0000,  0.8950,  0.0000,  0.6545]])

In [49]:
torch.tensor([[0, 1, 2, 0, 0], [2, 0, 0, 1, 2]]).shape

torch.Size([2, 5])

In [70]:
z = torch.zeros(3, 4).scatter_(1, torch.tensor([[1], [2], [3]]), 1)
z

tensor([[ 0.,  1.,  0.,  0.],
        [ 0.,  0.,  1.,  0.],
        [ 0.,  0.,  0.,  1.]])

In [71]:
torch.tensor([[2], [3]])

tensor([[ 2],
        [ 3]])

In [79]:
word_emb = torch.randn(1,5,5)
word_emb

tensor([[[ 0.1852, -0.3144, -0.1508, -0.7691, -0.0778],
         [-0.6281, -1.0593, -1.2597,  1.0248, -1.0446],
         [ 0.7676,  1.0655,  0.6993, -0.3534,  0.3386],
         [-0.9379,  0.5636, -0.9402,  1.4113, -0.2798],
         [-0.2377,  1.2964, -0.0705,  0.6690,  0.9116]]])

In [82]:
import torch.nn.functional as F

In [84]:
torch.nn.functional.one_hot(torch.arange(0, 5) % 3)

AttributeError: module 'torch.nn.functional' has no attribute 'one_hot'

In [7]:
import numpy as np
from sklearn.metrics import roc_auc_score

y_true = np.array([[0,0,1,1], [0,0,0,1]])
y_scores = np.array([[0,1,1,1], [0,0,1,1]])
roc_auc_score(y_true, y_scores, average='micro')

0.8

In [8]:
y_true.shape

(2, 4)

In [60]:
gold = [1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 2, 1, 3, 2, 1, 1, 1, 1, 1, 3, 1, 1, 2, 3, 1, 3, 1, 2, 2, 3, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 1, 4, 1, 1, 1, 2, 1, 2, 1, 1, 1, 4, 1, 1, 3, 1, 1, 4, 1, 3, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 3, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 4, 1, 2, 1, 2, 1, 1, 4, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 3, 1, 1, 3, 2, 3, 2, 2, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 4, 4, 1, 2, 3, 1, 1, 4, 2, 1, 1, 1, 4, 2, 1, 1, 1, 1, 4, 1, 1, 1, 4, 1, 1, 4, 2, 1, 1, 2, 2, 1, 2, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 1, 1, 3, 1, 1, 3, 1, 1, 1, 1, 1, 3, 1, 1, 4, 1, 1, 1, 1, 4, 2, 3, 1, 1, 1, 4, 1, 4, 1, 1, 1, 1, 3, 2, 3, 1, 2, 1, 1, 1, 2, 4, 1, 3, 1, 1, 2, 4, 1, 2, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 2, 4, 4, 1, 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 4, 1, 2, 1, 1, 3, 1, 1, 4, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 4, 1, 1, 4, 1, 4, 1, 2, 1, 1, 1, 1, 1, 1, 4, 4, 1, 3, 1, 1, 1, 1, 2, 3, 1, 1, 2, 1, 1, 2, 1, 1, 1, 4, 1, 1, 1, 4, 2, 1, 3, 1, 3, 1, 1, 1, 1, 1, 4, 1, 2, 1, 1, 3, 1, 2, 4, 1, 1, 1, 4, 1, 3, 3, 1, 1, 2, 4, 1, 1, 1]
pred = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [61]:
n_values = np.max(gold) + 1
gold_onehot = np.eye(n_values)[gold]
pred_onehot = np.eye(n_values)[pred]

In [62]:
gold_onehot.shape

(411, 5)

In [63]:
gold_onehot = np.delete(gold_onehot,0,1)
pred_onehot = np.delete(pred_onehot,0,1)

In [64]:
pred_onehot

array([[1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       ...,
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.]])

In [65]:
roc_auc_score(gold_onehot, pred_onehot, average='micro')

0.7956204379562045

## Aspect Based Sentiment Classification

### Modify the result file

In [58]:
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
import csv
import glob

y_pred = []
y_true = []

y_pred_aspect = []
y_true_aspect = []

y_pred_feed = []
y_true_feed = []

for file in glob.glob("./results/ss_ac_at_txt_unbal_lstm_3*.txt"):
    with open(file, newline='') as tsvin:
        tsvin = csv.reader(tsvin, delimiter='\t')
        for row in tsvin:
            if row:
                aspect = row[-3]
                gold = row[-2]
                pred = row[-1]
                y_true.append(gold)
                y_pred.append(pred)                    
                y_true_aspect.append(aspect+'_'+gold)
                y_pred_aspect.append(aspect+'_'+pred)                     
                
# print("Classification Report without Aspect")                
print(classification_report(y_true, y_pred, digits=3))
# 
# print("Classification Report with Aspect")                
print(classification_report(y_true_aspect, y_pred_aspect, digits=3))

# print("Classification Report with Aspect")                
print(classification_report(y_true_aspect, y_pred_aspect, digits=3, labels=['FEEDBACK_0','FEEDBACK_1']))

print(classification_report(y_true_aspect, y_pred_aspect, digits=3, labels=['GENERAL_0','GENERAL_1']))

print(classification_report(y_true_aspect, y_pred_aspect, digits=3, labels=['PROFANITY_0','PROFANITY_1']))

print(classification_report(y_true_aspect, y_pred_aspect, digits=3, labels=['VIOLENCE_0','VIOLENCE_1']))

              precision    recall  f1-score   support

           0      0.812     0.790     0.801       945
           1      0.819     0.838     0.829      1070

    accuracy                          0.816      2015
   macro avg      0.816     0.814     0.815      2015
weighted avg      0.816     0.816     0.816      2015

              precision    recall  f1-score   support

  FEEDBACK_0      0.837     0.979     0.902       189
  FEEDBACK_1      0.333     0.053     0.091        38
   GENERAL_0      0.826     0.753     0.788       530
   GENERAL_1      0.860     0.906     0.883       892
 PROFANITY_0      0.801     0.865     0.832       163
 PROFANITY_1      0.542     0.426     0.477        61
  VIOLENCE_0      0.550     0.349     0.427        63
  VIOLENCE_1      0.598     0.772     0.674        79

    accuracy                          0.816      2015
   macro avg      0.668     0.638     0.634      2015
weighted avg      0.805     0.816     0.806      2015

              precisio

## Count the unigrams, bigrams of each aspect

In [55]:
import pandas as pd

df = pd.read_csv("./playground/file.txt", names=['polarity', 'aspect', 'aspect_term', 'text'])
df = df.drop(columns=['text'])
df.head()

Unnamed: 0,polarity,aspect,aspect_term
0,0,FEEDBACK,अन्त्य गर्नुपर्छ
1,0,PROFANITY,चोर
2,1,GENERAL,लुटे को रहेछ
3,0,GENERAL,खुसि को दिन
4,1,GENERAL,पाजी


In [56]:
df['at_length'] = df['aspect_term'].apply(lambda x: len(x.split()))

In [57]:
df.head()

Unnamed: 0,polarity,aspect,aspect_term,at_length
0,0,FEEDBACK,अन्त्य गर्नुपर्छ,2
1,0,PROFANITY,चोर,1
2,1,GENERAL,लुटे को रहेछ,3
3,0,GENERAL,खुसि को दिन,3
4,1,GENERAL,पाजी,1


In [58]:
df.groupby(['aspect','at_length']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,polarity,aspect_term
aspect,at_length,Unnamed: 2_level_1,Unnamed: 3_level_1
FEEDBACK,1,75,75
FEEDBACK,2,324,324
FEEDBACK,3,85,85
FEEDBACK,4,21,21
FEEDBACK,5,3,3
GENERAL,1,1272,1272
GENERAL,2,931,931
GENERAL,3,422,422
GENERAL,4,139,139
GENERAL,5,40,40


In [73]:
# df.groupby(['aspect', 'polarity'])['aspect_term'].nunique()
df.groupby(['aspect'])['aspect_term'].nunique()

aspect
FEEDBACK       463
GENERAL       2012
PROFANITY      221
VIOLENCE       257
Name: aspect_term, dtype: int64

In [74]:
# df.groupby(['aspect', 'polarity'])['aspect_term'].count()
df.groupby(['aspect'])['aspect_term'].count()

aspect
FEEDBACK       508
GENERAL       2835
PROFANITY      407
VIOLENCE       285
Name: aspect_term, dtype: int64

### Split into 80/10/10 by group

In [224]:
import pandas as pd

df_txt = pd.read_csv('./data/nepcls/ss_ac_at_txt_unbal.csv', delimiter=',', encoding='utf-8', 
                         skip_blank_lines=True, header=None, names=['ss', 'ac', 'at', 'text'])

df_txt.head()

Unnamed: 0,ss,ac,at,text
0,0,GENERAL,जोगाउन को लागि,गुठी विधेक ल्याएर ठमेल मा राज गुठि को जग्गा मा...
1,1,GENERAL,लखेटनु पछ,दले ले देश सकेछन सबै बेचे र खान सुरू गरेछन अब ...
2,1,GENERAL,ससकृती ध्वस्त पार्ने,नेपाल को ससकृती ध्वस्त पार्ने योजना हो यो !
3,1,GENERAL,भुमाफिया,मठ मन्दिर गुम्बा का जग्गा हरु मा भुमाफिया को न...
4,1,GENERAL,बेची सके,नेपाल का कल कर्खाना र नदि नाला बेची सके अब मठ ...


In [225]:
df_txt.groupby(['ac','ss']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,at,text
ac,ss,Unnamed: 2_level_1,Unnamed: 3_level_1
FEEDBACK,0,426,426
FEEDBACK,1,82,82
GENERAL,0,1052,1052
GENERAL,1,1783,1783
PROFANITY,0,302,302
PROFANITY,1,105,105
VIOLENCE,0,114,114
VIOLENCE,1,171,171


In [226]:
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import train_test_split


df_txt = pd.read_csv('./data/nepcls/ss_ac_at_txt_unbal.csv', delimiter=',', encoding='utf-8', 
                         skip_blank_lines=True, header=None, names=['ss', 'ac', 'at', 'text'])

gss = GroupShuffleSplit(test_size=.20, n_splits=1, random_state = 163).split(df_txt, groups=df_txt['ss'])

for positive_df, negative_df in gss:

    negative = df_txt.iloc[negative_df]
    positive = df_txt.iloc[positive_df]

    train_neg, test_val_neg = train_test_split(negative, test_size=0.2)
    train_pos, test_val_pos = train_test_split(positive, test_size=0.2)
    test_neg, val_neg = train_test_split(test_val_neg, test_size=0.5)
    test_pos, val_pos = train_test_split(test_val_pos, test_size=0.5)

    train_df = pd.concat([train_pos, train_neg], ignore_index=True).sample(frac=1).reset_index(drop=True)
    test_df = pd.concat([test_pos, test_neg], ignore_index=True).sample(frac=1).reset_index(drop=True)
    val_df = pd.concat([val_pos, val_neg], ignore_index=True).sample(frac=1).reset_index(drop=True)

    train_df.to_csv('train_fname.csv', header=False, index=False, quoting=csv.QUOTE_MINIMAL, quotechar="",  escapechar=" ", encoding='utf-8')
    test_df.to_csv('test_fname.csv', header=False, index=False, quoting=csv.QUOTE_MINIMAL, quotechar="",  escapechar=" ", encoding='utf-8')
    val_df.to_csv('val_fname.csv', header=False, index=False, quoting=csv.QUOTE_MINIMAL, quotechar="",  escapechar=" ", encoding='utf-8')

# for i,(positive_df, negative_df) in enumerate(gss):
#     negative = df_txt.iloc[negative_df]
#     positive = df_txt.iloc[positive_df]
    
#     train_neg, test_val_neg = train_test_split(negative, test_size=0.2)
#     train_pos, test_val_pos = train_test_split(positive, test_size=0.2)
#     test_neg, val_neg = train_test_split(test_val_neg, test_size=0.5)
#     test_pos, val_pos = train_test_split(test_val_pos, test_size=0.5)
    
#     train_df = pd.concat([train_pos, train_neg], ignore_index=True).sample(frac=1).reset_index(drop=True)
#     test_df = pd.concat([test_pos, test_neg], ignore_index=True).sample(frac=1).reset_index(drop=True)
#     val_df = pd.concat([val_pos, val_neg], ignore_index=True).sample(frac=1).reset_index(drop=True)
    
#     train_df.to_csv(train_fname, header=False, index=False, quoting=csv.QUOTE_NONE, quotechar="",  escapechar=" ", encoding='utf-8')
#     test_df.to_csv(test_fname, header=False, index=False, quoting=csv.QUOTE_NONE, quotechar="",  escapechar=" ", encoding='utf-8')
#     val_df.to_csv(val_fname, header=False, index=False, quoting=csv.QUOTE_NONE, quotechar="",  escapechar=" ", encoding='utf-8')
    

In [229]:
pd_test = pd.read_csv('./test_fname.csv', delimiter=',', encoding='utf-8', 
                         skip_blank_lines=True, header=None, 
                         quoting=csv.QUOTE_MINIMAL, names=['ss', 'ac', 'at', 'text'])
pd_test.describe()

Unnamed: 0,ss
count,403.0
mean,0.531017
std,0.499657
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [213]:
train_df.groupby('ac').count()

Unnamed: 0_level_0,ss,at,text
ac,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
FEEDBACK,428,428,428
GENERAL,2252,2252,2252
PROFANITY,319,319,319
VIOLENCE,228,228,228


In [205]:
test_df.groupby('ac').count()

Unnamed: 0_level_0,ss,at,text
ac,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
FEEDBACK,49,49,49
GENERAL,278,278,278
PROFANITY,41,41,41
VIOLENCE,35,35,35


In [206]:
# test_pos.groupby('ac').count()
val_df.groupby('ac').count()

Unnamed: 0_level_0,ss,at,text
ac,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
FEEDBACK,58,58,58
GENERAL,281,281,281
PROFANITY,33,33,33
VIOLENCE,33,33,33


In [209]:
train_df

Unnamed: 0,ss,ac,at,text
0,0,GENERAL,महान,"मेयर ज्यु , तपाईं जस्तो महान मेयर हाम्रो नेपाल..."
1,1,PROFANITY,मुजि,यो माचिक्ने kp र प्रचन्द लाइ सिदै गोलि हान्छु ...
2,0,GENERAL,बबाल,ल बबाल भए छ 🤔 ️ ।
3,0,PROFANITY,साला,ओय साला पोखरेल तं साला कति झूठबोलना सकेहोला ।
4,1,GENERAL,जेल हाल्दिनु पर्छ,"सबै चोर हो , सबै जना लाई जेल हाल्दिनु पर्छ ।"
...,...,...,...,...
3222,1,GENERAL,तत्व,"रबि को रिहाईसँगै प्रचण्डे , रेणुदाहाल , किशोरश..."
3223,1,GENERAL,उखाने र गुखाने,अझ ठुलो नाइके त उखाने र गुखाने नै हुन ।
3224,0,FEEDBACK,सिक्न पर्यो,केसि पनि जिल्ल परे बरै कहा को प्रस्न कहाँ लगेर...
3225,1,GENERAL,अाफ्नाे खुट्टा मा अाफै बन्चरो,याे सरकार ले अाफ्नाे खुट्टा मा अाफै बन्चरो हान...


### Count English words vs Nepali words in dataset

In [108]:
file="./data/dataset/text_tag_only/text_only.txt"

word_count=0
word_num=0
sentences=[]
unique_word=set()
full_english_words = []
unique_english_words = []
pattern=r"^[a-zA-Z0-9]+$"

with open(file, newline='') as in_file:
    reader=in_file.readlines()
    for sent in reader:
        splitted=sent.rstrip().split()
        sentences.append(splitted)
        for each in splitted:
            if re.match(pattern, each):
                full_english_words.append(each)            
            unique_word.add(each)
        word_num+=len(splitted)

In [109]:
print("Count of unique word", len(unique_word))
print("Count of total words", word_num)
print("Count of total english words", len(full_english_words))

Count of unique word 11465
Count of total words 62163
Count of total english words 1133


In [110]:
print(len(full_english_words))

1133


In [111]:
import re
unique_english_words = []
for each in unique_word:
    if re.match(pattern, each):
        unique_english_words.append(each)
        
print("Unique english words count = ",len(unique_english_words))

Unique english words count =  671


In [113]:
total_english_num=len(full_english_words)
print("Ratio of english words to nepali words = ", total_english_num/(word_num-total_english_num))

Ratio of english words to nepali words =  0.018564640340815994
