# Advanced: Text Processing in Matrices

## Load Natural Language Toolkit for Parsing

In [1]:
! pip install nltk
import nltk

# Enter 'd' for Download, then 'punkt', and then 'q' for quit
nltk.download()


Collecting nltk
  Downloading nltk-3.2.2.tar.gz (1.2MB)
[K    100% |████████████████████████████████| 1.2MB 1.0MB/s eta 0:00:01
Building wheels for collected packages: nltk
  Running setup.py bdist_wheel for nltk ... [?25l- \ | / done
[?25h  Stored in directory: /home/jovyan/.cache/pip/wheels/42/b5/27/718985cd9719e8a44a405d264d98214c7a607fb65f3a006f28
Successfully built nltk
Installing collected packages: nltk
Successfully installed nltk-3.2.2
[33mYou are using pip version 8.1.2, however version 9.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> l
Packages:
  [ ] abc................. Australian Broadcasting Commission 200

True

## Import text files into dictionary

As a "corpus" we fetched some data from Wikipedia, based on topics that were
trending at the time (2/18/2017).  Each topic had multiple interpretations, and we
suspected that some of them would "intersect" across topics in interesting ways
(e.g., Trump/Putin, Cloud/Google, Cloud/Climate).  Other topics simply had many distinct
interpretations of their own (e.g., there are many types of Football).  See _Wikipedia.ipynb_ for the original download code.

Selected topics (for which the top-10 matches were returned by Wikipedia) were:

 * Pennsylvania
 * Trump
 * Apple
 * Google
 * Farm
 * Climate
 * Cloud
 * Football
 * Government
 * Putin

*docs* is a dictionary that maps each file name to the full text of that file.

In [1]:
import os

# Map each file name found in the 'text' directory to that file's contents.
docs = {}

for filename in os.listdir('text'):
    # The context manager closes each file as soon as it has been read,
    # instead of leaving one open handle per document behind.
    with open(os.path.join('text', filename)) as handle:
        docs[filename] = handle.read()
    print ('Loaded',filename)

print ("All files loaded")

Loaded Alpine climate.txt
Loaded American football.txt
Loaded Animal Farm.txt
Loaded Apple Corps.txt
Loaded Apple I.txt
Loaded Apple II series.txt
Loaded Apple III.txt
Loaded Apple Inc..txt
Loaded Apple Store.txt
Loaded Apple TV.txt
Loaded Apple.txt
Loaded Arcus cloud.txt
Loaded Arrest of Vladimir Putin viral video.txt
Loaded Association football.txt
Loaded AtGoogleTalks.txt
Loaded Australian rules football.txt
Loaded Brook Farm.txt
Loaded Calumet Farm.txt
Loaded Century Farm.txt
Loaded Climate change.txt
Loaded Climate classification.txt
Loaded Climate justice.txt
Loaded Climate model.txt
Loaded Climate.txt
Loaded Cloud computing.txt
Loaded CLOUD experiment.txt
Loaded Cloud.txt
Loaded College football.txt
Loaded Cooking apple.txt
Loaded Crimean speech of Vladimir Putin.txt
Loaded Cumulus cloud.txt
Loaded Desert climate.txt
Loaded Donald Trump.txt
Loaded E-government.txt
Loaded Eric Trump.txt
Loaded Family of Donald Trump.txt
Loaded Farm Aid.txt
Loaded Farm.txt
Loaded Flag football.txt

## Other preliminaries to get you started.

The function *has_letter* should be used to filter words based on the presence of a letter.

The set *stopwords* includes words to ignore.

In [7]:
import nltk
from nltk.stem.porter import *
import re
import numpy as np

"""
# Returns True if the input (string) parameter has
# any sort of letter in it, else returns False.
"""
def has_letter(x):
    return re.match('.*[a-zA-Z].*',x) != None

# Stopwords are words we will ignore for search
# purposes, because they are too common to be useful.
# One stopword is read per line of stopwords.txt (surrounding whitespace
# stripped); the context manager makes sure the file gets closed.
with open('stopwords.txt') as stop_file:
    stopwords = {line.strip() for line in stop_file}

# The NLTK parser breaks apostrophe-s into a separate "word"
# so we'll want to add it to the list... Though it's technically
# not a stop word in the traditional sense.
stopwords.add("'s")

# Use this as the maximum number of words we will index
MAX_WORDS = 18220

# Your Code Goes Here!

You may want to read more about TF-IDF (term frequency × inverse document frequency) scoring at:

* http://nlp.stanford.edu/IR-book/html/htmledition/term-frequency-and-weighting-1.html
* https://en.wikipedia.org/wiki/Tf%E2%80%93idf

In [8]:
def count_freq(word, doc):
    """Return the number of occurrences of `word` reported by `doc.count`."""
    occurrences = doc.count(word)
    return occurrences

In [12]:
# Containers filled by doc_vector() and read by later cells.
vector = []           # one term-count row per document
lexicon = {}          # stem -> integer id
inverse_lexicon = {}  # integer id -> stem
doclist = []          # raw document texts, in the order they were processed
docveclist = []       # per-document lists of kept (stemmed) tokens
wordlist = []         # every kept token (lower-cased, unstemmed), corpus-wide


def doc_vector(content, vector, lexicon, inverse_lexicon, stopwords, word_count):
    """Build the lexicon and the per-document term-count rows for a corpus.

    Each document is tokenized with nltk.word_tokenize and lower-cased;
    tokens without a letter (has_letter) or found in `stopwords` are dropped,
    and the rest are Porter-stemmed.

    Parameters
    ----------
    content : dict
        Maps a document name to that document's text.
    vector : list
        Output. Rebuilt in place with one row per document; column j of a row
        is the count of the j-th key of `lexicon` in that document.
    lexicon : dict
        Output. Rebuilt in place as stem -> integer id. Ids are assigned in
        sorted stem order so they are the same on every run.
    inverse_lexicon : dict
        Output. Rebuilt in place as integer id -> stem.
    stopwords : collection of str
        Tokens to ignore.
    word_count : int
        Ignored; kept only so existing calls keep working.

    Returns
    -------
    int
        The number of entries in `lexicon`.

    Side effects
    ------------
    The module-level lists `doclist`, `docveclist` and `wordlist` are cleared
    and refilled on every call, so calling this again does not duplicate
    documents.
    """
    from collections import Counter

    # Start from a clean slate so a second call is idempotent.
    del doclist[:]
    del docveclist[:]
    del wordlist[:]

    doclist.extend(content.values())
    stemmer = PorterStemmer()
    stems_seen = set()

    for doc in doclist:
        stems = []
        for token in nltk.word_tokenize(doc):
            token = token.lower()
            if has_letter(token) and token not in stopwords:
                stem = stemmer.stem(token)  # stem once, reuse below
                stems_seen.add(stem)
                stems.append(stem)
                wordlist.append(token)
        docveclist.append(stems)

    # NOTE(review): this guard looks at the size of the lexicon passed in,
    # i.e. before it is (re)built from this corpus; it does not cap the new
    # vocabulary at MAX_WORDS. Kept as-is - confirm the intended meaning.
    word_count = len(lexicon)
    if word_count <= MAX_WORDS:
        lexicon.clear()
        inverse_lexicon.clear()
        for index, stem in enumerate(sorted(stems_seen)):
            lexicon[stem] = index
            inverse_lexicon[index] = stem
        word_count = len(lexicon)

        # Count every document once, then read the counts off per column,
        # rather than rescanning the document for each lexicon term.
        columns = list(lexicon.keys())
        del vector[:]
        for stems in docveclist:
            counts = Counter(stems)
            vector.append([counts[stem] for stem in columns])

    return word_count

In [13]:
# Fill vector / lexicon / inverse_lexicon from the loaded documents and show
# the returned lexicon size. The trailing 0 is only a placeholder: doc_vector
# overwrites its word_count argument before using it.
doc_vector(docs, vector, lexicon, inverse_lexicon, stopwords, 0)

18220

In [14]:
import math
def compute_idf(docs, dic):
    """Compute an inverse-document-frequency weight for every term in `dic`.

    Parameters
    ----------
    docs : sequence of token sequences
        One entry per document, each holding that document's tokens.
    dic : dict
        The lexicon; its keys are the terms, and the result follows the
        iteration order of those keys.

    Returns
    -------
    list of float
        log10(N / df) per term, where N is len(docs) and df is the number of
        documents containing the term. A term that occurs in no document gets
        0.0 instead of triggering a division by zero.
    """
    # Set membership is O(1); testing against the raw token lists would
    # rescan every document for every term.
    doc_terms = [set(d) for d in docs]
    n_docs = len(doc_terms)

    idf_vector = []
    for term in dic.keys():
        doc_freq = sum(1 for terms in doc_terms if term in terms)
        if doc_freq:
            idf_vector.append(math.log10(n_docs / doc_freq))
        else:
            idf_vector.append(0.0)

    return idf_vector

In [15]:
# One IDF weight per lexicon term, computed over the per-document stem lists
# that doc_vector stored in docveclist.
idf = compute_idf(docveclist, lexicon)

In [16]:
from scipy import linalg, mat, dot
def cosine_similarity(d_j, q):
    """Return the cosine of the angle between vectors `d_j` and `q`.

    Parameters
    ----------
    d_j, q : array-like of numbers
        Two vectors of equal length.

    Returns
    -------
    float
        dot(d_j, q) / (|d_j| * |q|), or 0.0 when either vector has zero
        length (for example a query with no known terms), where the ratio
        would otherwise be undefined and come out as NaN.
    """
    doc_vec = np.asarray(d_j, dtype=float)
    query_vec = np.asarray(q, dtype=float)

    denominator = np.linalg.norm(doc_vec) * np.linalg.norm(query_vec)
    if denominator == 0:
        return 0.0
    return np.dot(doc_vec, query_vec) / denominator

In [17]:
def create_query_vector(query):
    """Turn a free-text query into a 0/1 vector over the lexicon columns.

    The query goes through the same pipeline as the documents: tokenize with
    nltk.word_tokenize, lower-case, drop stopwords and tokens without a
    letter, then Porter-stem. Using the same tokenizer means punctuation in a
    query (e.g. "Apple, Google") no longer prevents a term from matching.

    Parameters
    ----------
    query : str
        The search text.

    Returns
    -------
    numpy.ndarray
        One entry per key of the module-level `lexicon`, in key iteration
        order; 1 where the query contains that stem, 0 elsewhere. Query terms
        that are not in the lexicon are ignored.
    """
    stemmer = PorterStemmer()

    # Position j is the j-th key of `lexicon`, the same column order used for
    # the document rows. A single dict lookup replaces a scan over every
    # lexicon key for every query term.
    column_of = {term: j for j, term in enumerate(lexicon.keys())}
    query_vector = np.zeros(len(column_of))

    for token in nltk.word_tokenize(query):
        token = token.lower()
        if token in stopwords or not has_letter(token):
            continue
        column = column_of.get(stemmer.stem(token))
        if column is not None:
            query_vector[column] = 1

    return query_vector

In [18]:
import pandas as pd
def search(vector, idf, query, num_result):
    """Rank the documents against a query and return the best matches.

    Parameters
    ----------
    vector : sequence of rows
        One term-count row per document.
    idf : sequence of float
        One weight per term, aligned with the row columns.
    query : array-like
        The query vector (e.g. from create_query_vector); it is multiplied
        element-wise by `idf` before scoring. The document rows are scored
        as given.
    num_result : int
        How many top-scoring rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'doc' (row index into `vector`), 'docname' (the matching key
        of the module-level `docs`) and 'score' (cosine similarity), sorted by
        score in descending order and cut to `num_result` rows. An empty
        `vector` gives an empty frame with those same columns.
    """
    weighted_query = np.multiply(query, idf)
    names = list(docs.keys())

    records = [
        {
            'doc': i,
            'docname': names[i],
            'score': cosine_similarity(row, weighted_query),
        }
        for i, row in enumerate(vector)
    ]

    # Naming the columns keeps their order independent of the pandas version
    # and lets sort_values work even when there are no records.
    ranked = pd.DataFrame(records, columns=['doc', 'docname', 'score'])
    return ranked.sort_values(['score'], ascending=False).head(num_result)

# Step 5

In [19]:
# Example query: show the 10 highest-scoring documents for 'Apple Steve jobs'.
query_vec = create_query_vector('Apple Steve jobs')
search(vector, idf, query_vec, 10)

Unnamed: 0,doc,docname,score
24,24,Apple Inc..txt,0.479183
50,50,Apple I.txt,0.457664
47,47,Apple III.txt,0.41615
84,84,Apple.txt,0.396741
8,8,Cooking apple.txt,0.378142
73,73,Apple II series.txt,0.367678
76,76,Apple Corps.txt,0.365709
3,3,Apple Store.txt,0.351709
60,60,Apple TV.txt,0.3244
33,33,Home Farm F.C..txt,0.015758


In [22]:
# Example query spanning two topics: the 5 highest-scoring documents for 'Trump Putin'.
query_vec = create_query_vector('Trump Putin')
search(vector, idf, query_vec, 5)

Unnamed: 0,doc,docname,score
91,91,Donald Trump.txt,0.612954
7,7,Legal affairs of Donald Trump.txt,0.608079
30,30,Family of Donald Trump.txt,0.581884
88,88,The Trump Organization.txt,0.578828
72,72,Public image of Vladimir Putin.txt,0.57224


In [21]:
# Example query spanning two topics: the 5 highest-scoring documents for 'Google Cloud'.
query_vec = create_query_vector('Google Cloud')
search(vector, idf, query_vec, 5)

Unnamed: 0,doc,docname,score
34,34,Google.txt,0.622013
45,45,Arcus cloud.txt,0.553243
35,35,Stratus cloud.txt,0.549608
20,20,Google Talk.txt,0.526714
19,19,Cumulus cloud.txt,0.525371
