In [1]:
import nltk
from nltk.util import ngrams
from nltk.corpus import movie_reviews

from textblob import TextBlob

import pandas as pd
from collections import defaultdict
from operator import itemgetter

### here we are will look into a list of documents in terms of their text

In [2]:
fileids = movie_reviews.fileids()[:5]
doc_words = [movie_reviews.words(fileid) for fileid in fileids]
doc_words

[['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...],
 ['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...],
 ['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...],
 ['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...],
 ['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...]]

In [3]:
documents = [' '.join(words) for words in doc_words]
# we are getting 5 reviews, each document is a list of words (hence 0)
# and then get first 20 words.
print(documents[:1][0][:100])

plot : two teen couples go to a church party , drink and then drive . they get into an accident . on


### How about getting rid of punctuation

In [4]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
stop

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 'her',
 'hers',
 'herself',
 'it',
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each',
 'few',
 'more',
 'most',
 'other',
 'some',
 'such',
 'no',
 'nor',
 '

In [5]:
for doc in documents:
    print(doc)
    print('\n')

plot : two teen couples go to a church party , drink and then drive . they get into an accident . one of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . what ' s the deal ? watch the movie and " sorta " find out . . . critique : a mind - fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . which is what makes this review an even harder one to write , since i generally applaud films which attempt to break the mold , mess with your head and such ( lost highway & memento ) , but there are good and bad ways of making all types of films , and these folks just didn ' t snag this one correctly . they seem to have taken this pretty neat concept , but executed it terribly . so what are the problems with the movie ? well , its main problem is that it ' s simply too jumbled . it starts off " normal " but then downshifts into this " fantasy " world in which you , as an audience member , have no idea

In [6]:
from textblob import TextBlob

words = TextBlob(documents[0]).words
words

WordList(['plot', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', 'drink', 'and', 'then', 'drive', 'they', 'get', 'into', 'an', 'accident', 'one', 'of', 'the', 'guys', 'dies', 'but', 'his', 'girlfriend', 'continues', 'to', 'see', 'him', 'in', 'her', 'life', 'and', 'has', 'nightmares', 'what', 's', 'the', 'deal', 'watch', 'the', 'movie', 'and', 'sorta', 'find', 'out', 'critique', 'a', 'mind', 'fuck', 'movie', 'for', 'the', 'teen', 'generation', 'that', 'touches', 'on', 'a', 'very', 'cool', 'idea', 'but', 'presents', 'it', 'in', 'a', 'very', 'bad', 'package', 'which', 'is', 'what', 'makes', 'this', 'review', 'an', 'even', 'harder', 'one', 'to', 'write', 'since', 'i', 'generally', 'applaud', 'films', 'which', 'attempt', 'to', 'break', 'the', 'mold', 'mess', 'with', 'your', 'head', 'and', 'such', 'lost', 'highway', 'memento', 'but', 'there', 'are', 'good', 'and', 'bad', 'ways', 'of', 'making', 'all', 'types', 'of', 'films', 'and', 'these', 'folks', 'just', 'didn', 't', 'snag'

# Prelim cleanup

In [7]:
# Here is an example of taking a single sentence (listOfLines[0]) and removing the punctuations
import string

strX = listOfLines[0].replace(string.punctuation, '')

for i in string.punctuation:
    strX = strX.replace(i, '')
    
strX

NameError: name 'listOfLines' is not defined

In [8]:
# Better way is to remove punctuation from whole document (except .) and then split by .
strPunctuation = string.punctuation.replace('.', '')

strPunctuation # this contains all punctuation except .
document1 = documents[0]

for punct in strPunctuation:
    document1 = document1.replace(punct, '')
    
# Lets split the document
document1 = document1.split('.')

document1 = filter(lambda x: len(x)>1, document1) # get rid of any string that has length greater than 1
document1 = map(lambda x: x.strip(), document1) # strip away all empty strings at the ends

# section below 
# for i in range(3):
#     print(next(document1))
#     print('\n')
doc = next(document1)

In [9]:

docList = list(filter(lambda x: len(x)>0, doc.split(' ')))
docList

['plot',
 'two',
 'teen',
 'couples',
 'go',
 'to',
 'a',
 'church',
 'party',
 'drink',
 'and',
 'then',
 'drive']

In [12]:
input_list = ['all', 'this', 'happened', 'more', 'or', 'less']

def find_ngrams(input_list, n):
    return zip(*[input_list[i:] for i in range(n)])

list(find_ngrams(input_list, 3))

[('all', 'this', 'happened'),
 ('this', 'happened', 'more'),
 ('happened', 'more', 'or'),
 ('more', 'or', 'less')]

In [16]:
d = defaultdict(list)

for trigrams in find_ngrams(input_list,3):
    word1, word2, word3 = trigrams
    d[(word1, word2)].append(word3)
    
d

defaultdict(list,
            {('all', 'this'): ['happened'],
             ('happened', 'more'): ['or'],
             ('more', 'or'): ['less'],
             ('this', 'happened'): ['more']})

#### how about using some text

In [4]:
import numpy as np
import random
import string
from collections import defaultdict

# ======= (1) open a single file and upload its string into documents

speech = []
with open('obamaSpeech.txt', 'r') as f:
    # sppech is not a list where each element is a new line
    speech.extend(f.readlines())
    
document = ''.join(speech)

# ======= (2) remove all punctuation except '.' which could be used to split lines later 

strPunctuation = string.punctuation.replace('.', '').replace("'", '')

# remove all punctuation
for punct in strPunctuation:
    document = document.replace(punct, ' ')

# remove newLines, tabs and single quote
for i in ["\t","\n", "'"]:
    document = document.replace(i, '')

# ======= (3) Now split by '.'

docList = document.split('.')
docList = filter(lambda x: len(x)>1, docList) # get rid of any string that has length greater than 1
docList = map(lambda x: x.strip(), docList) # strip away all empty strings at the ends
docList = list(docList)

# ======= (4) Start doing n-grams

def find_ngrams(input_string, n):
    # replace double space by single space
    input_string = input_string.replace('  ', ' ')
    input_list = input_string.split(' ')
    return zip(*[input_list[i:] for i in range(n)])

# ======= (5) Start doing n-grams

def generateNGramDict(docList):
    d = defaultdict(list)
    for doc in docList:
        trigrams = list(find_ngrams(doc,3))
        for trigram in trigrams:
            d[trigram[:2]].append(trigram[2])
            
    return d

dtemp = generateNGramDict(docList)

# ======= (6) Test sentence generator

def generateText(triGramDict, numOfLoops, firstWord='getting', secondWord='the'):
    newSpeech = [firstWord,secondWord]
    counter = 0
    
    while counter < numOfLoops:
        try:
            firstWord, secondWord = secondWord, np.random.choice(triGramDict[(firstWord,secondWord)])
            #print(newSpeech)
            newSpeech.append(secondWord)
        except:
            break
        
    return ' '.join(newSpeech)

generateText(dtemp, 10, 'I', 'want')
#docList    
#dtemp[('getting','the')]

'I want to recruit 100 000 new math and science teachers so we could reduce our deficit but we do it alone Parents have to fight for you and God bless you and God bless you and your families every single day as hard as I know how'

In [5]:
trigrams = list(find_ngrams(docList[5],3))
trigrams

[('I', 'cannot', 'imagine'),
 ('cannot', 'imagine', 'not'),
 ('imagine', 'not', 'being'),
 ('not', 'being', 'fired'),
 ('being', 'fired', 'up'),
 ('fired', 'up', 'after'),
 ('up', 'after', 'listening'),
 ('after', 'listening', 'to'),
 ('listening', 'to', 'Bruce'),
 ('to', 'Bruce', 'Springsteen')]

In [7]:
d = defaultdict(list)

for trigram in trigrams:
    d[trigram[:2]].append(trigram[2])
    
d

defaultdict(list,
            {('I', 'cannot'): ['imagine'],
             ('after', 'listening'): ['to'],
             ('being', 'fired'): ['up'],
             ('cannot', 'imagine'): ['not'],
             ('fired', 'up'): ['after'],
             ('imagine', 'not'): ['being'],
             ('listening', 'to'): ['Bruce'],
             ('not', 'being'): ['fired'],
             ('to', 'Bruce'): ['Springsteen'],
             ('up', 'after'): ['listening']})

In [219]:
# code taken from: https://github.com/heaven00/skipgram/blob/master/skipgram.py

import timeit


# def skipgram_ndarray(sent, k=1, n=2):
#     """
#     This is not exactly a vectorized version, because we are still
#     using a for loop
#     """
#     tokens = sent.split()
#     if len(tokens) < k + 2:
#         raise Exception("REQ: length of sentence > skip + 2")
#     matrix = np.zeros((len(tokens), k + 2), dtype=object)
#     matrix[:, 0] = tokens
#     matrix[:, 1] = tokens[1:] + ['']
#     result = []
#     for skip in range(1, k + 1):
#         matrix[:, skip + 1] = tokens[skip + 1:] + [''] * (skip + 1)
#     for index in range(1, k + 2):
#         temp = matrix[:, 0] + ',' + matrix[:, index]
#         map(result.append, temp.tolist())
#     limit = (((k + 1) * (k + 2)) / 6) * ((3 * n) - (2 * k) - 6)
#     return result[:limit]


def skipgram_list(sent, k=1, n=2):
    """
    Form skipgram features using list comprehensions
    """
    tokens = sent.split()
    tokens_n = ['''tokens[index + j + {0}]'''.format(index) for index in range(n - 1)]
    x = '(tokens[index], ' + ', '.join(tokens_n) + ')'
    query_part1 = 'result = [' + x + ' for index in range(len(tokens))'
    query_part2 = ' for j in range(1, k+2) if index + j + n < len(tokens)]'
    exec(query_part1 + query_part2)
    return result

if __name__ == "__main__":
    text = """Pretty awesome except for the mate choice data.
            Yes, in all cultures studied men ranked appearance higher than
            women did, and women ranked ambition higher than men did.
            All cultures studied also contained within them the fact
            that women do not have equal economic opportunities,
            so of course a partner with earning power is important.
            So culture is playing a part here, but all cultures studied
            had the similar economic inequities so of course produced
            this difference. *When women make their own money, their
            desire for a good-looking partner, even a younger partner,
            increases* and I'm willing to bet if we analyzed the raw
            data we'd see less stark differences between men and women
            in this realm in cultures with lesser degrees of economic
            inequality.Another interesting point about that study
            (I believe it was David Buss's surveys from the 1990s)
            is that while men ranked beauty higher than women,
            women ranked ambition higher than men -- all people,
            male and female, in all cultures, ranked intelligence
            and kindness and their top requirements in a mate"""
    print("```````````````````````````````````````````````````````````````")
#     loops_list = []
#     timer_lc = []
#     timer_ar = []
#     for index in range(3):
#         loops = 100 * 10 ** index
#         loops_list.append(loops)
#         timer_lc.append(timeit.timeit(lambda: skipgram_list(text, 1, 2), number=loops))
#         timer_ar.append(timeit.timeit(lambda: skipgram_ndarray(text, 1, 2),number=loops))
#     print("Loops, List comprehensions, ndarray\n")
#     for index in range(len(loops_list)):
#         print("{0}, {1}, {2}".format(loops_list[index], timer_lc[index],timer_ar[index]))
    skipgram_list(text, 1, 2)

```````````````````````````````````````````````````````````````


NameError: name 'k' is not defined

In [10]:
str = 'To be or not to be that is the question'

def find_ngrams_with_skip(input_string, skip, n):
    

list(find_ngrams_with_skip(str, 1, 3))

[('or', 'not', 'to'),
 ('not', 'to', 'be'),
 ('to', 'be', 'that'),
 ('be', 'that', 'is'),
 ('that', 'is', 'the'),
 ('is', 'the', 'question')]

In [None]:
st