# Processing Raw Text

In [1]:
# Also see:  
## http://www.nltk.org/book/ch03.html, https://docs.python.org/2/howto/urllib2.html

## Download a book from Project Gutenberg with Python:

In [14]:
from urllib2 import Request, urlopen

url="http://www.gutenberg.org/files/54255/54255-0.txt"
response = urlopen(url)
raw = response.read().decode('utf8')
#--------------------------------------------------
# Check types...
print("Type of \'response\' is %s:")% type(response)
print("Type of \'raw\' is %s:")% type(raw)

Type of 'response' is <type 'instance'>:
Type of 'raw' is <type 'unicode'>:


In [19]:
print(raw[:165])

﻿The Project Gutenberg EBook of Narrative of Travels in Europe, Asia, and
Africa, in the Seventeenth Century, Volum, by Evliya Çelebi and Joseph Hammer-Purgstall



In [82]:
from nltk import word_tokenize
t="hey, guys, how is life???!"
tt =word_tokenize(t)
print(tt)

['hey', ',', 'guys', ',', 'how', 'is', 'life', '?', '?', '?', '!']


In [83]:
ttt = pos_tag(tt)
print(ttt)

[('hey', 'NN'), (',', ','), ('guys', 'NNS'), (',', ','), ('how', 'WRB'), ('is', 'VBZ'), ('life', 'NN'), ('?', '.'), ('?', '.'), ('?', '.'), ('!', '.')]


## Tokenize and pos-tag the text:

In [24]:
from nltk import word_tokenize, pos_tag
#------------------------------
tokens = word_tokenize(raw)
print(len(tokens))
tagged=pos_tag(tokens)
print(len(tagged))

144822
144822


In [21]:
print(tokens[:50]) # list of unicode items

[u'\ufeffThe', u'Project', u'Gutenberg', u'EBook', u'of', u'Narrative', u'of', u'Travels', u'in', u'Europe', u',', u'Asia', u',', u'and', u'Africa', u',', u'in', u'the', u'Seventeenth', u'Century', u',', u'Volum', u',', u'by', u'Evliya', u'\xc7elebi', u'and', u'Joseph', u'Hammer-Purgstall', u'This', u'eBook', u'is', u'for', u'the', u'use', u'of', u'anyone', u'anywhere', u'in', u'the', u'United', u'States', u'and', u'most', u'other', u'parts', u'of', u'the', u'world', u'at']


In [25]:
print(tagged[:10]) # list of tuples (word,pos_tag pairs)

[(u'\ufeffThe', 'NN'), (u'Project', 'NNP'), (u'Gutenberg', 'NNP'), (u'EBook', 'NNP'), (u'of', 'IN'), (u'Narrative', 'NNP'), (u'of', 'IN'), (u'Travels', 'NNP'), (u'in', 'IN'), (u'Europe', 'NNP')]


In [85]:
wds=["hello", "hi", "life"]
h_wds= [w for w in wds if w.startswith("h")]


new_words=[]
for w in wds:
    if w.startswith("h"):
        new_words.append(w)
print(new_words)

['hello', 'hi']


In [87]:
pairs=[  ("Alex", "NN"), ("plays", "VBZ")    ]
verbs=[  x[0] for x in pairs  if x[1]=="VBZ"]
print(verbs)

['plays']


##### Note: The pos tagger of course makes mistakes, but it performs reasonably well.

## List comprehension on "tagged"

In [88]:
# Named enitities:
ne=[pair[0] for pair in tagged if pair[-1]=="NNP"]
for e in ne[:50]:
    print(e)

Project
Gutenberg
EBook
Narrative
Travels
Europe
Asia
Africa
Seventeenth
Century
Volum
Evliya
Çelebi
Joseph
Hammer-Purgstall
United
Project
Gutenberg
License
United
Europe
Asia
Africa
Seventeenth
Century
II
Evliya
Çelebi
Evliya
Çelebi
Joseph
Hammer-Purgstall
Release
Date
February
[
EBook
Character
***
START
THIS
PROJECT
GUTENBERG
EBOOK
NARRATIVE
OF
TRAVELS
***
Produced
Turgut


In [93]:
# Adjectives
adjs= set([pair[0] for pair in tagged if pair[-1]=="JJ"]) # we pass the list to set to uniqify
adjs= list(adjs) #Cast to list again so that we access only few in print
# Note: 'set' object has no attribute '__getitem__' and so we cannot do adjs[:15] on a set
for a in adjs[:50]:
    print(a)

breadth
remarkable
ruby-coloured
particular
tombs
gun-shot’s
yellow
rapid
mild
mile
sleep
legal
forty-six
Elephant
dish
follow
abundant
religious
washing-tubs
dreadful
seventy-seven
pardon
hunting
swam
outdated
becas
mosque
young
“Mevlúd-námeh
underwent
answered
tail
foster
obstinate
stable
suite
Precious
farsang’s
worth
orderly
virtuous
Sheikh-ul-islám
amorous
exempt
www.gutenberg.org
perishable
navigable
limpid
fat
father’s


In [37]:
# How many ne?; note these are not uniqified
print(len(ne))

15238


In [38]:
# How many uniqe adjs?
print(len(adjs))

1263


### Get collocations

In [51]:
from nltk import Text
text=Text(tokens)
#print(type(text))
text.collocations()

Project Gutenberg-tm; three hundred; hundred houses; Black Sea;
thousand men; two hundred; one hundred; great number; fifty aspers;
next day; Project Gutenberg; Uzún Hassan; three days; thousand houses;
five hours; Sultán Murad; Ahmed Páshá; Kizil Irmák; five hundred;
Mustafa Páshá


### Accessing webpages/html

In [57]:
from bs4 import BeautifulSoup
url="http://www.bbc.com/news/technology-38892383"
response = urlopen(url)
html = response.read().decode('utf8')
print(html[:200])

<!DOCTYPE html>
<html lang="en" id="responsive-news">
<head  prefix="og: http://ogp.me/ns#">
    <meta charset="utf-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
    <title>Sh


In [65]:
raw = BeautifulSoup(html, "lxml").get_text()
tokens = word_tokenize(raw)
tok=tokens[:10]
for t in tok:
    print(t)

Shopping
robots
on
the
march
in
Ocado
-
BBC
News


### Working with unicode

In [73]:
import codecs
ara_text=codecs.open("sample_concat.tsv", "r", "utf-8").readlines()[0]
print(ara_text[:500])

2167789138	معَ فجر العام الجديد : رجوتُ إلهيَ أن يجعلني ويجعلكمِ من أسعدِ خلقهِ ، و يرزقني ويرزقكم أضعاافَ أمنيآتِكم حتَى ترضون ...صباحكم رضى||$||"@jumana_sj2: ولا اقول عاادي كمان يدخلو اغاني كوريه خنضحك #عشاق_كوريا_يطالبون_قناة_mbc_بفتح_قناه_mbc_korea_مترجمه_بالعربيه_للمعجبين_العرب"||$||"@s_h_osho: نبي قناه كوريه ليش فيه قناه هنديه ومافيه كوريه؟ #عشاق_كوريا_يطالبون_قناة_mbc_بفتح_قناه_mbc_korea_مترجمه_بالعربيه_للمعجبين_العرب"||$||"@LINA_ALADEEB: بعيداً عن خيالات الحب احياناً السعاده تكون عباره ع


In [74]:
def remove_unicode_diac(text):
    """Takes Arabic in utf-8 and returns same text without diac"""
    # Replace diacritics with nothing 
    text = text.replace(u"\u064B", "")# fatHatayn
    text = text.replace(u"\u064C", "") # Dammatayn
    text = text.replace(u"\u064D", "")# kasratayn
    text = text.replace(u"\u064E", "")# fatHa
    text = text.replace(u"\u064F", "") # Damma
    text = text.replace(u"\u0650", "")# kasra
    text = text.replace(u"\u0651", "")# shaddah
    text = text.replace(u"\u0652", "")# sukuun
    text = text.replace(u"\u0670", "`") # dagger 'alif
    return text

ara_text_no_diac =remove_unicode_diac(ara_text)
print(ara_text_no_diac[:500])

2167789138	مع فجر العام الجديد : رجوت إلهي أن يجعلني ويجعلكم من أسعد خلقه ، و يرزقني ويرزقكم أضعااف أمنيآتكم حتى ترضون ...صباحكم رضى||$||"@jumana_sj2: ولا اقول عاادي كمان يدخلو اغاني كوريه خنضحك #عشاق_كوريا_يطالبون_قناة_mbc_بفتح_قناه_mbc_korea_مترجمه_بالعربيه_للمعجبين_العرب"||$||"@s_h_osho: نبي قناه كوريه ليش فيه قناه هنديه ومافيه كوريه؟ #عشاق_كوريا_يطالبون_قناة_mbc_بفتح_قناه_mbc_korea_مترجمه_بالعربيه_للمعجبين_العرب"||$||"@LINA_ALADEEB: بعيدا عن خيالات الحب احيانا السعاده تكون عباره عن - برنامج 


In [80]:
print(type(ara_text_no_diac))

<type 'unicode'>


### Regular expressions preview!

In [75]:
import re
# This will replace the URL "http://www.bbc.com/news/technology-38892383" with a string token "<URL>"
tweet="Hey there, take a look: http://www.bbc.com/news #love_robots!"
tweet = re.sub(r'https?://[^\s<>"]+|www\.[^\s<>"]+', '<URL>',tweet)
print(tweet)

Hey there, take a look: <URL> #love_robots!


In [76]:
e_ending=[w for w in tweet.split() if re.search('e$', w)]
print(e_ending) # Note that "there," ends in ","

['take']


In [78]:
import string
punc = [char for char in string.punctuation]
def clean_punc(punc, text):
    for i in punc:
        text=text.replace(i, "")
    return text

tweet=clean_punc(punc, tweet)
e_ending=[w for w in tweet.split() if re.search('e$', w)]
print(e_ending) # Note that "there," ends in ","

['there', 'take']


In [79]:
print(punc)

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']


In [95]:
alldata=["hey people", "how are you?", "life is good!"]
for line_no, line in enumerate(alldata):
    print(line_no, line)

(0, 'hey people')
(1, 'how are you?')
(2, 'life is good!')


In [99]:
line="""_*0 bromwell high is a cartoon comedy .  it ran at the same time as some other programs about school life ,  such as  " teachers "  .  my 35 years in the teaching profession lead me to believe that bromwell high's satire is much closer to reality than is  " teachers "  .  the scramble to survive financially ,  the insightful students who can see right through their pathetic teachers' pomp ,  the pettiness of the whole situation ,  all remind me of the schools i knew and their students .  when i saw the episode in which a student repeatedly tried to burn down the school ,  i immediately recalled  .  .  .  .  .  .  .  .  .  at  .  .  .  .  .  .  .  .  .  .  high .  a classic line :  inspector :  i'm here to sack one of your teachers .  student :  welcome to bromwell high .  i expect that many adults of my age think that bromwell high is far fetched .  what a pity that it isn't ! """
line.split()[0]
words=line.split()[1:]
print(words)

['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', '.', 'it', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life', ',', 'such', 'as', '"', 'teachers', '"', '.', 'my', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'bromwell', "high's", 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', '"', 'teachers', '"', '.', 'the', 'scramble', 'to', 'survive', 'financially', ',', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', "teachers'", 'pomp', ',', 'the', 'pettiness', 'of', 'the', 'whole', 'situation', ',', 'all', 'remind', 'me', 'of', 'the', 'schools', 'i', 'knew', 'and', 'their', 'students', '.', 'when', 'i', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school', ',', 'i', 'immediately', 'recalled', '.', '.', '.', '.', '.', '.', '.', '.', '.', 'at', '.', '.', '.', '.', '.', '.', 

In [101]:
from collections import defaultdict
space=defaultdict(int)
for w in words:
    space[w]=len(space)


In [102]:
space

defaultdict(int,
            {'!': 96,
             '"': 41,
             ',': 74,
             '.': 93,
             '35': 25,
             ':': 85,
             'a': 94,
             'about': 17,
             'adults': 88,
             'age': 89,
             'all': 59,
             'and': 64,
             'as': 22,
             'at': 76,
             'believe': 33,
             'bromwell': 91,
             'burn': 72,
             'can': 47,
             'cartoon': 4,
             'classic': 76,
             'closer': 38,
             'comedy': 5,
             'down': 73,
             'episode': 67,
             'expect': 86,
             'far': 91,
             'fetched': 92,
             'financially': 43,
             'here': 81,
             'high': 91,
             "high's": 35,
             'i': 86,
             "i'm": 80,
             'immediately': 74,
             'in': 68,
             'insightful': 44,
             'inspector': 79,
             'is': 91,
             "isn

In [105]:
import numpy as np
vec = np.zeros(len(space))
print(vec)

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.]


In [107]:
for w in words:
    vec[space[w]]=1
print(vec)

[ 0.  0.  0.  0.  1.  1.  0.  0.  1.  0.  0.  1.  1.  0.  1.  1.  1.  1.
  0.  1.  0.  1.  1.  0.  0.  1.  1.  0.  1.  1.  1.  0.  0.  1.  0.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  0.  1.  1.  1.  1.  1.  0.  1.  1.
  1.  1.  0.  1.  1.  1.  1.  1.  0.  1.  1.  1.  1.  1.  1.  0.  1.  1.
  1.  1.  1.  1.  1.  1.  0.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.]


In [108]:
x=["a", "ab", "abc", "cd", "xxx"]
for i in x:
    if "c" in i:
        print(i)

abc
cd


In [109]:
c_list=[i for i in x if "c" in i]
print(c_list)

['abc', 'cd']
