# Concordance

The aim of this effort is to leverage Tamara's initial concordance exploration and find patterns for terms related to lunatic asylums. Some of this code has been borrowed from her notebook.

In [43]:
# Work around HTTPS certificate errors (presumably for the nltk.download calls
# further down this cell — TODO confirm): when the PYTHONHTTPSVERIFY environment
# variable is unset or empty and this Python build exposes
# ssl._create_unverified_context, install it as the default HTTPS context factory.
# NOTE(review): this switches off certificate verification for anything that
# relies on ssl's default HTTPS context for the rest of the kernel session;
# remove it once the local certificate store is fixed.
import os, ssl 
if (not os.environ.get('PYTHONHTTPSVERIFY', '') and
    getattr(ssl, '_create_unverified_context', None)):
    ssl._create_default_https_context = ssl._create_unverified_context

# Libraries for data loading
import pandas as pd  # For dataframe analysis
import numpy as np  # For dataframe analysis
import string
import re  # Regex to remove punctuation from strings I split

# Libraries for visualization
import altair as alt
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline

# Libraries for text analysis
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('punkt')
from nltk.corpus import PlaintextCorpusReader
nltk.download('wordnet')
from nltk.corpus import wordnet
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.text import Text
from nltk.stem.porter import PorterStemmer
from nltk.probability import FreqDist
nltk.download('averaged_perceptron_tagger')
from nltk.tag import pos_tag
nltk.download('tagsets')  # part of speech tags
from nltk.draw.dispersion import dispersion_plot as displt


#Libraries for advanced text processing
from nltk.util import ngrams
from collections import Counter
nltk.download('averaged_perceptron_tagger')
from shutil import copyfile # For copying clean files
from sklearn.feature_extraction.text import CountVectorizer # For creating document-term matrix & excluding stop words
from sklearn.feature_extraction import text # For getting stop words
from wordcloud import WordCloud # For creating word clouds
from textblob import TextBlob # For sentiment analysis

[nltk_data] Downloading package punkt to /Users/tashfeen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tashfeen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tashfeen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/tashfeen/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     /Users/tashfeen/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/tashfeen/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
!pip install altair

Collecting altair
  Downloading altair-4.1.0-py3-none-any.whl (727 kB)
[K     |████████████████████████████████| 727 kB 4.2 MB/s eta 0:00:01
Installing collected packages: altair
Successfully installed altair-4.1.0


In [4]:
# Show the kernel's working directory; relative paths such as './d' resolve against it.
pwd

'/Users/tashfeen/Documents/Courses/dwdProj/dwd-british-india-papers'

In [8]:
# create a corpus from all texts and tokenize
# The folder is given relative to the notebook's working directory (the same
# './d' folder the file-walk cell reads) so the notebook is not tied to one machine.
corpus_folder = './d'
# Raw string: '\d' in a normal string literal is an invalid escape sequence.
# The pattern selects the files whose id starts with a digit.
wordlists = PlaintextCorpusReader(corpus_folder, r'\d.*', encoding='latin1')
corpus_tokens = wordlists.words()
print(corpus_tokens[:30])

['No', '.', '1111', '(', 'Sanitary', '),', 'dated', 'Ootacamund', ',', 'the', '6th', 'October', '1876', '.', 'From', '-', 'The', 'Honourable', 'W', '.', 'HUDLESTON', ',', 'Chief', 'Secretary', 'to', 'the', 'Govern', '-', 'ment', 'of']


In [16]:
# what are the words around the phrase 'Lunatic Asylum'?
# NOTE(review): the recorded run of this cell printed "no matches" — presumably
# because the whole string is looked up as a single token and no token in the
# corpus equals "Lunatic Asylum" (TODO confirm against the installed NLTK version).
t = Text(corpus_tokens)
t.concordance('Lunatic Asylum', width=110, lines=50)  # by default NLTK's concordance method displays 25 lines

no matches


In [42]:
# reconfigured solution from https://stackoverflow.com/questions/33813405/concordance-for-a-phrase-using-nltk-in-python
def n_concordance_tokenised(text,phrase,left_margin=5,right_margin=5):
    """Return the context around every occurrence of a multi-word phrase.

    Matching is case-insensitive and token based: the phrase is split on
    whitespace and compared against consecutive entries of ``text.tokens``.

    Parameters
    ----------
    text : object with a ``tokens`` sequence of strings (e.g. ``nltk.Text``)
    phrase : str
        Whitespace-separated words to search for.
    left_margin, right_margin : int
        Number of tokens of context to keep before / after each match.
        The left context is clipped at the start of the token list.

    Returns
    -------
    list of str
        One string per match, in order of appearance in ``text.tokens``.
        Each string is the window's tokens, every token followed by a single
        space (so each string ends with a trailing space). An empty or
        whitespace-only phrase yields an empty list.
    """
    # split() without an argument ignores repeated, leading and trailing spaces,
    # so 'Lunatic  Asylum' is treated the same as 'Lunatic Asylum'.
    phrase_tokens = [word.lower() for word in phrase.split()]
    if not phrase_tokens:
        return []

    tokens = text.tokens
    lowered = [token.lower() for token in tokens]
    width = len(phrase_tokens)
    first_word = phrase_tokens[0]

    outputs = []
    # Scanning left to right keeps the matches in corpus order.
    for start in range(len(lowered) - width + 1):
        # Cheap first-word check before comparing the whole phrase.
        if lowered[start] != first_word:
            continue
        if lowered[start:start + width] != phrase_tokens:
            continue
        window_start = max(start - left_margin, 0)
        window = tokens[window_start:start + width + right_margin]
        outputs.append(''.join(token + ' ' for token in window))
    return outputs

def n_concordance(txt,phrase,left_margin=5,right_margin=5):
    """Tokenise a raw string and return the phrase concordance for it.

    Convenience wrapper around ``n_concordance_tokenised`` for callers that
    have plain text rather than an ``nltk.Text``.

    Parameters
    ----------
    txt : str
        Raw text; it is tokenised with ``nltk.word_tokenize``.
    phrase : str
        Space-separated words to search for.
    left_margin, right_margin : int
        Number of context tokens to keep before / after each match.

    Returns
    -------
    list of str
        Whatever ``n_concordance_tokenised`` returns for the tokenised text.
    """
    tokens = nltk.word_tokenize(txt)
    text = nltk.Text(tokens)
    # The original ended in a bare `return`, so the function always returned None.
    return n_concordance_tokenised(text,phrase,left_margin=left_margin,right_margin=right_margin)

In [35]:
# Read every file under ./d into memory, one decoded string per file.
textList = []
for dirname, dirnames, filenames in os.walk('./d'):
    # os.walk lists entries in whatever order the filesystem returns them;
    # sorting (in place for the sub-directories, which os.walk then honours)
    # makes the order of textList reproducible from run to run.
    dirnames.sort()
    for filename in sorted(filenames):
        myfile = os.path.join(dirname, filename)
        with open(myfile, 'rb') as fopen:
            # ISO-8859-1 maps every byte to a character, so decoding cannot fail.
            textList.append(fopen.read().decode('ISO-8859-1'))

In [36]:
# Concatenate all files into one string, separated by single spaces.
# Note: file boundaries are not marked, so the last words of one file and the
# first words of the next end up next to each other.
raw_text = " ".join(textList)

In [37]:
# Tokenise the combined text and wrap the tokens in an nltk.Text.
# NOTE(review): `text` was already bound by the imports cell
# (`from sklearn.feature_extraction import text`); from this cell onwards the
# name refers to the nltk.Text object instead of the sklearn module.
tokens = nltk.word_tokenize(raw_text)
text = nltk.Text(tokens)

In [41]:
# List every occurrence of the two-word phrase with the function's default
# context of five tokens on each side; the match is case-insensitive.
n_concordance_tokenised(text,'Lunatic Asylum')

['PATIENTS ADMITTED INTO THE PUNJAB LUNATIC ASYLUM , LAHORE , DURING THE ',
 'January 1875 . Superintendent , Lunatic Asylum . ( 39 ) No ',
 ', Surgeon-Major , Superintendent , Lunatic Asylum . ( 22 ) No ',
 'of lunatics in the Provincial Lunatic Asylum at Tezpur during the year ',
 'account of Manufactures in the Lunatic Asylum in the Central Provinces during ',
 'Labor of the Lunatics in Lunatic Asylum , at Cuttack for the ',
 '7G Temporary Establishment ( Calicut Lunatic Asylum ) . Third-class Male Attendants ',
 'my annual report on the lunatic asylum at Dacca for the year ',
 '. X.NOTE ON THE RANGOON LUNATIC ASYLUM FOR THE YEAR 1906 RANGOON ',
 ', Surgeon-Major , Superintendent , Lunatic Asylum . ( 5 ) No ',
 'PATIENTS ADMITTED INTO THE PUNJAB LUNATIC ASYLUM , LAHORE , DURING THE ',
 'Criminal Lunatics in the Delhi Lunatic Asylum for the year 1883 and ',
 '. The Sir Cowasji Jehángir Lunatic Asylum at Hyderabad was inspected by ',
 '5 84 2 0 Dacca Lunatic Asylum . 1,732 15 0 ... ',

In [None]:
# Add the spacy-wordnet annotator to a spaCy pipeline, once, right after the tagger.
# NOTE(review): `nlp` is not defined in any cell visible in this notebook and this
# cell has never been executed (In [None]) — a spaCy pipeline must be loaded into
# `nlp` before this can run. TODO confirm where `nlp` is meant to come from.
# NOTE(review): passing a component object to add_pipe looks like the spaCy v2
# calling convention — verify against the installed spaCy version.
from spacy_wordnet.wordnet_annotator import WordnetAnnotator
 
print("before", nlp.pipe_names)
 
# Guard so that re-running the cell does not try to add the component twice.
if "WordnetAnnotator" not in nlp.pipe_names:
    nlp.add_pipe(WordnetAnnotator(nlp.lang), after="tagger")
 
print("after", nlp.pipe_names)