In [4]:
!pip install --upgrade --no-cache-dir gdown -q
!gdown --id 1E8NPHI5lgY6RJpWau5WReG8woQnFZvXJ -q
!unrar x News.rar


UNRAR 5.50 freeware      Copyright (c) 1993-2017 Alexander Roshal


Extracting from News.rar


Would you like to replace the existing file News/train.csv
343500458 bytes, modified on 2020-10-21 18:37
with a new one
343500458 bytes, modified on 2020-10-21 18:37

[Y]es, [N]o, [A]ll, n[E]ver, [R]ename, [Q]uit n


Would you like to replace the existing file News/test.csv
62071897 bytes, modified on 2020-10-21 18:37
with a new one
62071897 bytes, modified on 2020-10-21 18:37

[Y]es, [N]o, [A]ll, n[E]ver, [R]ename, [Q]uit n

All OK


In [1]:
from collections import Counter, OrderedDict
import json
import operator
import pandas as pd
import numpy as np
import itertools
import random
import glob
import re
import matplotlib.pyplot as plt
import seaborn as sn
from pathlib import Path
import heapq
import scipy
import math
import torch
from torch import nn
import torch.nn.functional as F

# Auxiliary functions

In [2]:
def remove(text):
  text = text.replace('ھ','ه')
  text = text.replace('ئ','ی')
  text = text.replace('ؤ','و')
  text = text.replace('إ','ا')
  text = text.replace('أ','ا')
  text = text.replace('َ','')
  text = text.replace('ُ','')
  text = text.replace('ِ','')
  text = text.replace('ّ','')
  text = text.replace('ء','')
  text = text.replace('ـ','')
  text = text.replace('،',' ')
  text = re.sub('[^\u0621-\u0628\u062A-\u063A\u0641-\u0642\u0644-\u0648\u064E-\u0651\u0655\u067E\u0686\u0698\u06A9\u06AF\u06BE\u06CC\u06F0-\u06F9 ]', '', text)
  return re.sub("[^0-9\u0600-\u06FF]+", " ", text).strip()
  
def control_numeric(text):
  text = text
  pattern_integer = r'\d+'
  pattern_float = "\d+\.\d+"
  text = re.sub(pattern_float, 'N', text)
  text = re.sub(pattern_integer, 'N', text)
  return text

def add_tag(text):
  string = 's ' + text
  string = string + ' e'
  return string

def remove_nan(dataset):
  for col in dataset.columns:
    dataset = dataset[dataset[col].notna()]
  return dataset


def tokenizer(data):
    data = [x.split(' ') for x in data]
    return data

def get_max(predictions):
  maxs = []
  for dc in predictions:
    maxs.append(max(dc, key=lambda k: dc[k]))
  return maxs

def text2char(text):
  return list(text)


def windowing(text,length):
  sequences = []
  for i in range(length,len(text)):
    sequences.append(text[i-length:i+1])
  return sequences

def encoding(sequence,char2index):
  encoded_sequences = list()
  for text in sequence:
    encode_seq = [char2index[char] for char in text]
    encoded_sequences.append(encode_seq)  
  return encoded_sequences

def label_encoder(encoded_sequences):
  encoded_sequences = np.array(encoded_sequences)
  return encoded_sequences[:,:-1], encoded_sequences[:,-1]


def get_word_index(s, idx=5):
    words = re.findall(r'\s*\S+\s*', s)
    return sum(map(len, words[:idx])) + len(words[idx]) - len(words[idx].lstrip())

def dataset_labeling(data_array, batch_size, window_size=100):
  total_batch_size = batch_size * window_size
  n_batches = len(data_array)//total_batch_size
  
  data_array = data_array[:n_batches * total_batch_size]
  data_array = data_array.reshape((batch_size, -1))
  
  for i in range(0, data_array.shape[1], window_size):
      x = data_array[:, i:i+window_size]
      y = np.zeros_like(x)
      try:
          y[:, :-1], y[:, -1] = x[:, 1:], data_array[:, i+window_size]
      except IndexError:
          y[:, :-1], y[:, -1] = x[:, 1:], data_array[:, 0]
      yield x, y 

def one_hot_encode(data_array, n_labels):
  one_hot = np.zeros((np.multiply(*data_array.shape), n_labels), dtype=np.float32)    
  one_hot[np.arange(one_hot.shape[0]), data_array.flatten()] = 1.
  return one_hot.reshape((*data_array.shape, n_labels))
  
def edit_ditsance(sentence, generated):
    m=len(sentence)+1
    n=len(generated)+1

    tbl = {}
    for i in range(m): tbl[i,0]=i
    for j in range(n): tbl[0,j]=j
    for i in range(1, m):
        for j in range(1, n):
            cost = 0 if sentence[i-1] == generated[j-1] else 1
            tbl[i,j] = min(tbl[i, j-1]+1, tbl[i-1, j]+1, tbl[i-1, j-1]+cost)

    return tbl[i,j]


def calculate_CER(sentence,generated):
  return 100 * (edit_ditsance(sentence, generated)/ len(sentence))

def get_key_by_value(dictionary, char):
  for key, value in dictionary.items():
    if value == char:
        return key


## DataProcessor

In [3]:
class DataProcessor:
  def __init__(self, 
               news_path,
               start=0,
               end=100):
        self.news_path = news_path
        self.read_data(start,end)
        print('-------------- read_data() Done --------------')
        self.clean_text()
        print('-------------- clean_text() Done --------------')
        self.count_chars()
        print('-------------- count_chars() Done --------------')
        self.tokenize()
        print('-------------- tokenize() Done --------------')
        self.encoding()
        print('-------------- encoding() Done --------------')


  def read_data(self,start,end):
    data = pd.read_csv(
        self.news_path,sep='\t', encoding = "utf-8-sig",on_bad_lines='skip')[['text']]
    self.data = data[(data.index>np.percentile(data.index, start)) & (data.index<=np.percentile(data.index, end))]

  def clean_text(self):
    data = self.data
    data = remove_nan(data)
    data['text'] = data['text'].apply(remove)
    data['text'] = data['text'].apply(control_numeric)
    data['text'] = data['text'].apply(add_tag)
    self.data = data

  def count_chars(self):
    self.all_chars = []
    for text in self.data['text']:
      self.all_chars.append(text2char(text))
    
    flat = [x for sublist in self.all_chars for x in sublist]
    frequencies = dict(Counter(flat))
    frequencies = dict(sorted(frequencies.items(),key=operator.itemgetter(1),reverse=True))

    self.n_unique_chars = len(frequencies.keys())
    self.n_chars = sum(frequencies.values())
    self.frequencies_dict = frequencies

    with open('frequent.txt', 'w') as file:
      i = 0
      for e in frequencies.keys():
        file.write(e + ' ' + str(frequencies[e])+ '\n')
        i+=1
        if i==200:
          break

    print('Number of all characters: ',self.n_chars)
    print('Number of unique characters: ',self.n_unique_chars)

  def tokenize(self):
    frequencies = self.frequencies_dict
    self.index2char = {}
    self.char2Index = {}
    i = 0
    
    for char in frequencies.keys():
      self.index2char[i] = char
      self.char2Index[char] = i
      i += 1
      if i == len(frequencies):
        break
    
    with open('index2Char.json', 'w') as fp:
      json.dump(self.index2char, fp)
    with open('char2Index.json', 'w') as fp:
      json.dump(self.char2Index, fp)
    
  def encoding(self):
    self.all_text = ''
    for row in self.data.itertuples():
        self.all_text = self.all_text + ' ' + str(row.text[:101])
    self.all_text = re.sub('s+', ' ', self.all_text)
    self.all_text.strip()
    self.encoded_data = np.array([self.char2Index[char] for char in self.all_text])


## LanguageModel

In [4]:
class LanguageModel(nn.Module):
  def __init__(self, encoded_data, unique_chars, n_hidden=512, n_layers=4, drop_prob=0.5, lr=0.001):
      super().__init__()
      self.drop_prob = drop_prob
      self.n_layers = n_layers
      self.n_hidden = n_hidden
      self.lr = lr
      
      self.unique_chars = unique_chars
      self.index2char = dict(enumerate(self.unique_chars))
      self.char2Index = {ch: ii for ii, ch in self.index2char.items()}
      self.data = encoded_data

  def init_weights(self):

    initrange = 0.1
    
    self.fc.bias.data.fill_(0)

    self.fc.weight.data.uniform_(-1, 1)

  def forward(self, x, hc):
      
      ## Get x, and the new hidden state (h, c) from the lstm
      x, (h, c) = self.lstm(x, hc)
      
      ## Ppass x through the dropout layer
      x = self.dropout(x)
      
      # Stack up LSTM outputs using view
      x = x.contiguous().view(x.size()[0]*x.size()[1], self.n_hidden)
      
      ## Put x through the fully-connected layer
      x = self.fc(x)
      
      # Return x and the hidden state (h, c)
      return x, (h, c)

  def convert_to_hiddens(self, batch_size):
    weight = next(self.parameters()).data
    hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
              weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
    
    return hidden

  def define_model(self):
      self.lstm = nn.LSTM(len(self.unique_chars), self.n_hidden, self.n_layers, dropout=self.drop_prob, batch_first=True)
      self.dropout = nn.Dropout(self.drop_prob)
      self.fc = nn.Linear(self.n_hidden, len(self.unique_chars))
      self.init_weights()

  def train_char_LSTM(self, net, data, epochs=10, batch_size=32, window_size=100, lr=0.001, clip=5, val_frac=0.1, cuda=False, print_every=10):

      net.train()
      opt = torch.optim.Adam(net.parameters(), lr=lr)
      criterion = nn.CrossEntropyLoss()
     
    
      val_idx = int(len(self.data)*(1-val_frac))
      train_dataset, validation_dataset = self.data[:val_idx], self.data[val_idx:]
      
      net.cuda()
      counter = 0
      n_chars = len(self.unique_chars)

      for e in range(epochs):
          h = self.convert_to_hiddens(batch_size)
          
          for x, y in dataset_labeling(train_dataset, batch_size, window_size):
              counter += 1

              x = one_hot_encode(x, n_chars)
              inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
              inputs, targets = inputs.cuda(), targets.cuda()

              data=self.data
              h = tuple([each.data for each in h])

              net.zero_grad()
              output, h = net.forward(inputs, h)
              
              loss = criterion(output, targets.view(batch_size*window_size).type(torch.cuda.LongTensor))
              loss.backward()

              nn.utils.clip_grad_norm_(net.parameters(), clip)
              opt.step()
              
              if counter % print_every == 0:
                  val_h = self.convert_to_hiddens(batch_size)
                  val_losses = []
                  net.eval()
                  
                  for x, y in dataset_labeling(validation_dataset, batch_size, window_size):
                      
                      x = one_hot_encode(x, n_chars)
                      x, y = torch.from_numpy(x), torch.from_numpy(y)
                      
                      
                      val_h = tuple([each.data for each in h])
                      
                      inputs, targets = x, y
                      inputs, targets = inputs.cuda(), targets.cuda()

                      output, val_h = net.forward(inputs, val_h)
                      val_loss = criterion(output, targets.view(batch_size*window_size).type(torch.cuda.LongTensor))
                  
                      val_losses.append(val_loss.item())
                  
                  net.train()
                  
                  print("Epoch: {}/{}...".format(e+1, epochs),
                        "Step: {}...".format(counter),
                        "Loss: {:.4f}...".format(loss.item()),
                        "Val Loss: {:.4f}".format(np.mean(val_losses)))


  def get_next_state_and_output(self,net, char, hidden):
    x = np.array([[net.char2Index[char]]])
    x = one_hot_encode(x, len(net.unique_chars))
    X = torch.from_numpy(x)

    h = tuple([each.data for each in hidden])
    out, h = net(X.cuda(), h)
    return out, h

  def convert_prefix_to_hiddens(self,net,prefix):
    current_hidden = self.convert_to_hiddens(batch_size=1)
    out, h = self.get_next_state_and_output(net, prefix[0], current_hidden)
    for char in prefix[1:]:
      x = np.array([[net.char2Index[char]]])
      x = one_hot_encode(x, len(net.unique_chars))
      inputs = torch.from_numpy(x)
      h = tuple([each.data for each in h])
      out, h = net(inputs.cuda(), h)
    return out, h

  def get_probs(self, net, prefix):
    out, h = self.convert_prefix_to_hiddens(net, prefix)
    p = F.softmax(out, dim=1).data
    keys = (p[0].cpu().numpy())
    values = list(net.char2Index.keys())
    return dict(zip(keys, values))    

  
  def get_next_char(self, net, prefix):
    all_probs = self.get_probs(net, prefix)
    max_prob = random.choice(heapq.nlargest(2, list(all_probs.keys())))
    return all_probs[max_prob]


  def generate_text(self, net, prefix, k):
        char=""
        while ((char!='e') and (len(prefix) <= k)) :
            char = self.get_next_char(net, prefix)
            prefix += char
        if char != 'e':
            prefix = prefix + 'e'
        return prefix
                

  def get_overall_prob(self, net, sentence):
        log_prob = 0
        for i in range(1,len(sentence)-1):
            next_char = sentence[i+1]
            all_probs = self.get_probs(net, sentence[:i])
            log_prob += np.log(get_key_by_value(all_probs, next_char))
        return log_prob


  def evaluate(self, net, test):
    error_rate = 0.0
    log_likelihood = 0.0

    for i in range(len(test)):
      x = get_word_index(test['text'].iloc[i], idx=5)
      generated_text = self.generate_text(net, test['text'].iloc[i][:x], 50)
      error_rate += calculate_CER(test['text'].iloc[i],generated_text)    
      log_likelihood += self.get_overall_prob(net, generated_text)/len(self.unique_chars)
    
    character_error_rate = (error_rate / len(test)) * 100
    perplexity = 1 / (1 + np.exp((-1)*log_likelihood))
    return character_error_rate, perplexity


## Performance

In [5]:
Train_dp = DataProcessor('News/train.csv',start=0,end=40)

-------------- read_data() Done --------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


-------------- clean_text() Done --------------
Number of all characters:  63671693
Number of unique characters:  37
-------------- count_chars() Done --------------
-------------- tokenize() Done --------------
-------------- encoding() Done --------------


In [6]:
LM = LanguageModel(encoded_data=Train_dp.encoded_data, unique_chars=tuple(Train_dp.frequencies_dict.keys()))
LM.define_model()
LM

LanguageModel(
  (lstm): LSTM(37, 512, num_layers=4, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=512, out_features=37, bias=True)
)

In [None]:

batch_size = 64
window_size = 50
n_epochs = 1

LM.train_char_LSTM(LM,Train_dp.encoded_data, epochs=n_epochs,
                   batch_size=batch_size, window_size=window_size, print_every=1)



Epoch: 1/1... Step: 1... Loss: 3.6457... Val Loss: 3.1134
Epoch: 1/1... Step: 2... Loss: 3.1510... Val Loss: 3.0380
Epoch: 1/1... Step: 3... Loss: 3.1192... Val Loss: 2.9391
Epoch: 1/1... Step: 4... Loss: 3.0158... Val Loss: 2.9584
Epoch: 1/1... Step: 5... Loss: 3.0407... Val Loss: 2.9346
Epoch: 1/1... Step: 6... Loss: 3.0069... Val Loss: 2.9010
Epoch: 1/1... Step: 7... Loss: 2.9707... Val Loss: 2.8782
Epoch: 1/1... Step: 8... Loss: 2.9364... Val Loss: 2.8899
Epoch: 1/1... Step: 9... Loss: 2.9204... Val Loss: 2.8876
Epoch: 1/1... Step: 10... Loss: 2.9353... Val Loss: 2.8730
Epoch: 1/1... Step: 11... Loss: 2.9421... Val Loss: 2.8622
Epoch: 1/1... Step: 12... Loss: 2.9107... Val Loss: 2.8606
Epoch: 1/1... Step: 13... Loss: 2.9249... Val Loss: 2.8679
Epoch: 1/1... Step: 14... Loss: 2.9448... Val Loss: 2.8724
Epoch: 1/1... Step: 15... Loss: 2.9437... Val Loss: 2.8689
Epoch: 1/1... Step: 16... Loss: 2.9327... Val Loss: 2.8634
Epoch: 1/1... Step: 17... Loss: 2.9095... Val Loss: 2.8618
Epoch:

## Get_probs() Sample

In [8]:
LM.get_probs(LM, prefix="انتخا")


{0.00032162736: 'e',
 0.00040367394: 'N',
 0.0005101248: 'ژ',
 0.0006548689: 's',
 0.0008097274: 'آ',
 0.0008275509: 'چ',
 0.0009423145: 'ظ',
 0.0012082964: 'ذ',
 0.0013955449: 'گ',
 0.0016241978: 'ض',
 0.0017000367: 'ث',
 0.0017614838: 'غ',
 0.0018540353: 'و',
 0.0020526147: 'ا',
 0.0023990504: 'ق',
 0.0034291463: 'ح',
 0.0043661967: 'ج',
 0.0049889763: 'پ',
 0.005061446: 'ک',
 0.0061951657: 'ت',
 0.0073631895: 'ف',
 0.008025317: 'ط',
 0.008182397: 'ع',
 0.008707118: 'ص',
 0.00946943: 'خ',
 0.016344521: ' ',
 0.019773483: 'ز',
 0.019938922: 'ب',
 0.021542968: 'ش',
 0.021917198: 'ه',
 0.023191046: 'س',
 0.026945792: 'ل',
 0.029691327: 'د',
 0.043907925: 'م',
 0.060801726: 'ی',
 0.30166188: 'ن',
 0.33002967: 'ر'}

In [9]:
LM.get_probs(LM, prefix="سلا")


{0.00015371614: 's',
 0.00018961634: 'e',
 0.0002590321: 'ا',
 0.00026481587: 'چ',
 0.00029250453: 'N',
 0.00032283276: 'ذ',
 0.00040296314: 'ژ',
 0.0004934624: 'غ',
 0.00057173835: 'آ',
 0.0006586413: 'ظ',
 0.0007182893: 'ش',
 0.0008499607: 'ص',
 0.0009408724: 'ض',
 0.00096357823: 'گ',
 0.0010058681: 'پ',
 0.0011666031: 'ط',
 0.0012120103: 'ث',
 0.0013168021: 'خ',
 0.0014200592: 'ر',
 0.0014988626: 'ج',
 0.001524645: 'و',
 0.0017630483: 'ح',
 0.0030081237: 'ک',
 0.0032516997: 'ع',
 0.0045482498: 'ف',
 0.00480885: 'ق',
 0.006762034: 'ه',
 0.008542996: 'ز',
 0.009105222: 'ب',
 0.014887797: 'س',
 0.036302984: 'ت',
 0.037912138: 'ی',
 0.04678705: 'ل',
 0.060584355: ' ',
 0.06868335: 'ن',
 0.06993459: 'د',
 0.6068907: 'م'}

In [10]:
LM.get_probs(LM, prefix="فوتبالیس")

{0.0002243729: 'e',
 0.00034124768: 'N',
 0.00040384138: 'ث',
 0.00040544922: 's',
 0.00045313936: 'ظ',
 0.0004721463: 'ژ',
 0.00050417363: 'ذ',
 0.0005377796: 'غ',
 0.00067939673: 'آ',
 0.0007971036: 'ص',
 0.0011355467: 'ط',
 0.0014944783: 'چ',
 0.0015656585: 'ح',
 0.0019000223: 'ق',
 0.0019904638: 'ض',
 0.002163194: 'خ',
 0.0023262848: 'پ',
 0.0029464588: 'ج',
 0.0030250966: 'ش',
 0.0030842773: 'ب',
 0.0036089234: 'گ',
 0.0038250214: 'ف',
 0.0041364436: 'ع',
 0.0062509147: 'س',
 0.007384671: 'د',
 0.0076091113: 'ز',
 0.010452363: 'و',
 0.011546834: 'ک',
 0.017539252: 'ه',
 0.019256664: 'م',
 0.022767002: 'ن',
 0.026755732: 'ر',
 0.029261164: 'ل',
 0.11778881: 'ا',
 0.12442104: 'ی',
 0.25396848: 'ت',
 0.3069774: ' '}

## Get_Next_Char() Sample

In [11]:
LM.get_next_char(LM, prefix="انتخا")

'ن'

In [12]:
LM.get_next_char(LM, prefix="سلا")

'د'

In [13]:
LM.get_next_char(LM, prefix="فوتبالیس")

'ت'

## Generate_text() Sample

In [14]:
LM.generate_text(LM, prefix="انتخا",k=50)

'انتخار از ایر    بهگزارش خارههایین روز با ان اینکهاe'

In [15]:
LM.generate_text(LM, prefix="سلا",k=50)

'سلاد مرام دار در گزارشییا از مالی ازار از ایرانیه بe'

In [16]:
LM.generate_text(LM, prefix="فوتبالیس",k=50)

'فوتبالیست داد ایر در گفتگفیبه خب    محددات مارو دریe'

## get_overall_prob() Sample

In [17]:
LM.get_overall_prob(LM, sentence="علی با تاکسی به مدرسه رفت e")

-89.25869405269623

In [18]:
LM.get_overall_prob(LM, sentence="سلام آقای اصفهانی حال شما چطوره e")

-116.04135018587112

In [19]:
LM.get_overall_prob(LM, sentence="مجتبی فوتبالیست خیلی خوبی است e")

-111.31501233577728

## Evaluate

In [20]:
test_dp = DataProcessor('News/test.csv',start=0,end=40)

-------------- read_data() Done --------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


-------------- clean_text() Done --------------
Number of all characters:  11835111
Number of unique characters:  37
-------------- count_chars() Done --------------
-------------- tokenize() Done --------------
-------------- encoding() Done --------------


In [21]:
LM.evaluate(LM, test_dp.data[:100])



(9475.158883678117, 0.0)

## Calculate ALL Char

In [None]:
total_char = 0
dp1 = DataProcessor('News/train.csv',start=0,end=10)
total_char += dp1.n_chars
del dp1
dp2 = DataProcessor('News/train.csv',start=10,end=20)
total_char += dp2.n_chars
del dp2
dp3 = DataProcessor('News/train.csv',start=20,end=30)
total_char += dp3.n_chars
del dp3
dp4 = DataProcessor('News/train.csv',start=30,end=40)
total_char += dp4.n_chars
del dp4
dp5 = DataProcessor('News/train.csv',start=40,end=50)
total_char += dp5.n_chars
del dp5
dp6 = DataProcessor('News/train.csv',start=50,end=60)
total_char += dp6.n_chars
del dp6
dp7 = DataProcessor('News/train.csv',start=60,end=70)
total_char += dp7.n_chars
del dp7
dp8 = DataProcessor('News/train.csv',start=70,end=80)
total_char += dp8.n_chars
del dp8
dp9 = DataProcessor('News/train.csv',start=80,end=90)
total_char += dp9.n_chars
del dp9
dp10 = DataProcessor('News/train.csv',start=90,end=100)
total_char += dp10.n_chars



-------------- read_data() Done --------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


-------------- clean_text() Done --------------
Number of all characters:  21135596
Number of unique characters:  37
-------------- count_chars() Done --------------
-------------- read_data() Done --------------
-------------- clean_text() Done --------------
Number of all characters:  15571938
Number of unique characters:  37
-------------- count_chars() Done --------------
-------------- read_data() Done --------------
-------------- clean_text() Done --------------
Number of all characters:  13460989
Number of unique characters:  37
-------------- count_chars() Done --------------
-------------- read_data() Done --------------
-------------- clean_text() Done --------------
Number of all characters:  13503170
Number of unique characters:  37
-------------- count_chars() Done --------------
-------------- read_data() Done --------------
-------------- clean_text() Done --------------
Number of all characters:  18942407
Number of unique characters:  37
-------------- count_chars() Do

In [None]:
print('total number of characters :',total_char)
print('total number of unique characters :',dp10.n_unique_chars)

total number of characters : 156724299
total number of unique characters : 37


In [None]:
total_char = 0
dp1 = DataProcessor('News/test.csv',start=0,end=10)
total_char += dp1.n_chars
del dp1
dp2 = DataProcessor('News/test.csv',start=10,end=20)
total_char += dp2.n_chars
del dp2
dp3 = DataProcessor('News/test.csv',start=20,end=30)
total_char += dp3.n_chars
del dp3
dp4 = DataProcessor('News/test.csv',start=30,end=40)
total_char += dp4.n_chars
del dp4
dp5 = DataProcessor('News/test.csv',start=40,end=50)
total_char += dp5.n_chars
del dp5
dp6 = DataProcessor('News/test.csv',start=50,end=60)
total_char += dp6.n_chars
del dp6
dp7 = DataProcessor('News/test.csv',start=60,end=70)
total_char += dp7.n_chars
del dp7
dp8 = DataProcessor('News/test.csv',start=70,end=80)
total_char += dp8.n_chars
del dp8
dp9 = DataProcessor('News/test.csv',start=80,end=90)
total_char += dp9.n_chars
del dp9
dp10 = DataProcessor('News/test.csv',start=90,end=100)
total_char += dp10.n_chars



-------------- read_data() Done --------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


-------------- clean_text() Done --------------
Number of all characters:  3799445
Number of unique characters:  37
-------------- count_chars() Done --------------
-------------- read_data() Done --------------
-------------- clean_text() Done --------------
Number of all characters:  3075899
Number of unique characters:  37
-------------- count_chars() Done --------------
-------------- read_data() Done --------------
-------------- clean_text() Done --------------
Number of all characters:  2534251
Number of unique characters:  37
-------------- count_chars() Done --------------
-------------- read_data() Done --------------
-------------- clean_text() Done --------------
Number of all characters:  2425516
Number of unique characters:  37
-------------- count_chars() Done --------------
-------------- read_data() Done --------------
-------------- clean_text() Done --------------
Number of all characters:  3316766
Number of unique characters:  37
-------------- count_chars() Done --

In [None]:
print('total number of characters :',total_char)
print('total number of unique characters :',dp10.n_unique_chars)

total number of characters : 28314392
total number of unique characters : 37
