<a href="https://colab.research.google.com/github/modhudeb/Named-Entity-Recognition-LSTM/blob/main/text_entity_p1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
import os
import bz2
import shutil
import re
from zipfile import ZipFile

In [None]:
import pandas as pd
import numpy as np


# Loading data

In [None]:
# Move into the data folder only when the working directory is not already
# inside it. The path is relative, so an unconditional chdir fails when this
# cell is run a second time.
data_dir = os.path.normpath('drive/MyDrive/data/')
if not os.getcwd().endswith(os.sep + data_dir):
  os.chdir(data_dir)

In [None]:

input_file = 'aij-wikiner-en-wp3.bz2'
output_file = 'aij-wikiner-en-wp3.csv'

# Stream the bz2 archive into an uncompressed file; copyfileobj copies between
# the two file objects, and both handles are closed when the block exits.
with bz2.open(input_file, 'rb') as src, open(output_file, 'wb') as dst:
  shutil.copyfileobj(src, dst)

In [None]:
# Read the decompressed dump with a tab separator and no header row, naming the
# single text column 'word'.
# NOTE(review): default CSV quoting is still active here, so a '"' character in
# the text may influence how records are split — confirm against the raw file.
data = pd.read_csv(
    'aij-wikiner-en-wp3.csv',
    sep='\t',
    header=None,
    names=['word'],
)
# Show three random rows as a quick sanity check.
data.sample(3)

Unnamed: 0,word
61865,The|DT|O country|NN|O has|VBZ|O a|DT|O strong|...
48291,Some|DT|O of|IN|O the|DT|O computer-generated|...
68047,On|IN|O the|DT|O top|NN|O of|IN|O the|DT|O Byr...


In [None]:
def cleanTxt(X):
  """Collapse every run of whitespace in ``X`` into a single space.

  Args:
    X: The text to normalise. Non-string values are converted with ``str()``
      before cleaning.

  Returns:
    str: The text with each run of whitespace (spaces, tabs, newlines) replaced
    by one space. Leading or trailing whitespace is reduced to a single space,
    not removed.
  """
  # Coerce up front so the regex below always receives a string.
  text = str(X)
  # r'\s+' already matches "\n", so no separate newline replacement is needed.
  return re.sub(r'\s+', ' ', text)

# Normalise whitespace in every record of the 'word' column, element by element.
data['word'] = data['word'].map(cleanTxt)

In [None]:
# Look at one cleaned record (label 2) to see how tokens are laid out.
data['word'][2]

'From|IN|O this|DT|O climate|NN|O William|NNP|I-PER Godwin|NNP|I-PER developed|VBD|O what|WP|O many|NN|O consider|VBP|O the|DT|O first|JJ|O expression|NN|O of|IN|O modern|JJ|O anarchist|NN|O thought|NN|O .|.|O'

In [None]:
# Break the same record into its space-separated tokens and display them.
x = data['word'][2].split(' ')
x

['From|IN|O',
 'this|DT|O',
 'climate|NN|O',
 'William|NNP|I-PER',
 'Godwin|NNP|I-PER',
 'developed|VBD|O',
 'what|WP|O',
 'many|NN|O',
 'consider|VBP|O',
 'the|DT|O',
 'first|JJ|O',
 'expression|NN|O',
 'of|IN|O',
 'modern|JJ|O',
 'anarchist|NN|O',
 'thought|NN|O',
 '.|.|O']

In [None]:
# Split a single token on '|' to show the individual fields it carries.
x[2].split("|")

['climate', 'NN', 'O']

In [None]:
def _explode_tokens(sentences):
  """Flatten 'word|POS|tag' sentences into ``[sentence_id, word, POS, tag]`` rows.

  Args:
    sentences: Iterable of strings, each holding space-separated
      ``word|POS|tag`` tokens. The position of a string in the iterable is used
      as its sentence id.

  Returns:
    tuple: ``(rows, malformed)`` where ``rows`` is a list of 4-item lists and
    ``malformed`` is the number of tokens that did not yield exactly three
    '|'-separated fields (those tokens are left out of ``rows``).
  """
  rows = []
  malformed = 0
  for sentence_id, sentence in enumerate(sentences):
    # split() with no argument ignores leading, trailing and repeated
    # whitespace, so no empty tokens are produced.
    for token in sentence.split():
      # Split from the right so a '|' inside the word itself stays in the word.
      fields = token.rsplit("|", 2)
      if len(fields) != 3:
        # str.split never raises here, so malformed tokens are counted
        # explicitly rather than relying on an exception handler.
        malformed += 1
        continue
      rows.append([sentence_id] + fields)
  return rows, malformed


dat, err = _explode_tokens(data['word'])
print("Total errors : ", err)
print('Done')

Total errors :  0
Done


In [None]:
# Preview the first five flattened rows.
dat[:5]

[[0, 'The', 'DT', 'I-MISC'],
 [0, 'Oxford', 'NNP', 'I-MISC'],
 [0, 'Companion', 'NNP', 'I-MISC'],
 [0, 'to', 'TO', 'I-MISC'],
 [0, 'Philosophy', 'NNP', 'I-MISC']]

In [None]:
# Turn the flat row list into a table with one named column per row position.
column_names = ['sentences', 'words', 'POS', 'tags']
df = pd.DataFrame(dat, columns=column_names)

In [None]:
# Persist the token table; index=False keeps the row index out of the file.
df.to_csv('Wiki_Cleaned_NER.csv', index = False)

# Working on another dataset

In [None]:
# Extract every member of NER.zip into the current working directory.
with ZipFile('NER.zip') as archive:
  archive.extractall()

In [None]:
# List the working directory to see which files are now present.
os.listdir()

['.ipynb_checkpoints',
 'files',
 'CropDiseaseApp',
 'AnimalsDataset.zip',
 'AnimalsDataset',
 'aij-wikiner-en-wp3.bz2',
 'Wiki_Cleaned_NER.csv',
 'NER.zip',
 'ner_data.csv']

In [None]:
# Load the second dataset, decoding the file as latin1, and preview the first
# two rows.
df = pd.read_csv('ner_data.csv', encoding = 'latin1')
df.head(2)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O


In [None]:
# Rename the columns to the same names used for the Wiki token table so the two
# frames line up column-for-column.
df.columns = ['sentences', 'words', 'POS', 'tags']

In [None]:
# Count missing values per column.
df.isna().sum()

sentences    1000616
words              0
POS                0
tags               0
dtype: int64

In [None]:
# Carry each sentence label forward over the missing entries that follow it.
df['sentences'] = df['sentences'].ffill()

In [None]:
def cleanFunc(X):
  """Convert a sentence label such as ``"Sentence: 12"`` into the integer ``12``.

  Args:
    X: Either a label string (the ``"Sentence: "`` text is removed before
      conversion) or a value that is already numeric, e.g. when the cleaning
      step is run again on an already-converted column.

  Returns:
    int: The sentence number.

  Raises:
    ValueError: If the remaining text is not a valid integer.
  """
  # Only strings carry the textual prefix; numeric values go straight to int().
  if isinstance(X, str):
    X = X.replace("Sentence: ", '')
  return int(X)

In [None]:
# Convert every sentence label to its integer number, element by element.
df['sentences'] = df['sentences'].map(cleanFunc)

In [None]:
# Display the frame to check the cleaned 'sentences' column.
df

Unnamed: 0,sentences,words,POS,tags
0,1,Thousands,NNS,O
1,1,of,IN,O
2,1,demonstrators,NNS,O
3,1,have,VBP,O
4,1,marched,VBN,O
...,...,...,...,...
1048570,47959,they,PRP,O
1048571,47959,responded,VBD,O
1048572,47959,to,TO,O
1048573,47959,the,DT,O


#### We will now merge the two datasets

In [None]:
# Reload the Wiki token table written to 'Wiki_Cleaned_NER.csv'. pandas' default
# NA parsing would turn literal tokens such as "NA", "null" or "None" into NaN,
# so only truly empty fields are treated as missing.
data = pd.read_csv('Wiki_Cleaned_NER.csv', keep_default_na=False, na_values=[''])

In [None]:
# Preview the first two rows of the reloaded table.
data.head(2)

Unnamed: 0,sentences,words,POS,tags
0,0,The,DT,I-MISC
1,0,Oxford,NNP,I-MISC


In [None]:
# Show the final row to read off the last sentence id in the table.
data.iloc[-1]

sentences    141386
words             .
POS               .
tags              O
Name: 3499605, dtype: object

In [None]:
# Shift the sentence ids of 'df' past the largest id already used in 'data', so
# the two tables do not share sentence numbers once 'df' is appended. The offset
# is read from 'data' rather than hard-coded, and applied as one vectorized add.
sentence_offset = int(data['sentences'].max())
df['sentences'] = df['sentences'] + sentence_offset

In [None]:
# Check the shifted sentence ids on the first two rows.
df.head(2)

Unnamed: 0,sentences,words,POS,tags
0,141387,Thousands,NNS,O
1,141387,of,IN,O


In [None]:
# Stack the two token tables, Wiki rows first, and renumber the row index.
frames = [data, df]
data = pd.concat(frames, ignore_index=True)

In [None]:
# Write the combined table to disk; index=False keeps the row index out of the file.
data.to_csv('merged_wiki_ner.csv', index = False)