# Lab 16: Mining data from Twitter

In [3]:
# Unicode Handling
from __future__ import unicode_literals
import codecs

import numpy as np
import gensim

# spacy is used for pre-processing and traditional NLP
import spacy
from spacy.en import English

# Gensim is used for LDA and word2vec
from gensim.models import Word2Vec

## Processing the tweets we pulled from Twitter

In [20]:
# Run `capture-tweets.py` in the parent folder before running this cell.

# Loading the tweet data
filename = '../../assets/dataset/news-tweets.txt'

tweets = []
for tweet in codecs.open(filename, 'r', encoding="utf-8"):
    tweets.append(tweet)

# Setting up spacy
nlp_toolkit = English()

In [6]:
tweets_word2vec = Word2Vec(size=300, min_count=10)
tweets_word2vec.build_vocab(tweets)

#Train the model over train_reviews (this may take several minutes)
tweets_word2vec.train(tweets)

709647

In [9]:
tweets_word2vec.vocab

{u'\n': <gensim.models.word2vec.Vocab at 0x111cdd290>,
 u'\r': <gensim.models.word2vec.Vocab at 0x111cdd910>,
 u' ': <gensim.models.word2vec.Vocab at 0x111ccd5d0>,
 u'!': <gensim.models.word2vec.Vocab at 0x111cdd9d0>,
 u'"': <gensim.models.word2vec.Vocab at 0x10aaa54d0>,
 u'#': <gensim.models.word2vec.Vocab at 0x111ccdc90>,
 u'$': <gensim.models.word2vec.Vocab at 0x111ccd650>,
 u'%': <gensim.models.word2vec.Vocab at 0x111cddd90>,
 u'&': <gensim.models.word2vec.Vocab at 0x111ccd990>,
 u"'": <gensim.models.word2vec.Vocab at 0x111ccdd50>,
 u'(': <gensim.models.word2vec.Vocab at 0x111ccd690>,
 u')': <gensim.models.word2vec.Vocab at 0x111cdda10>,
 u'*': <gensim.models.word2vec.Vocab at 0x111cdd2d0>,
 u'+': <gensim.models.word2vec.Vocab at 0x111ccdd90>,
 u',': <gensim.models.word2vec.Vocab at 0x111ccd6d0>,
 u'-': <gensim.models.word2vec.Vocab at 0x10bca69d0>,
 u'.': <gensim.models.word2vec.Vocab at 0x111cdd310>,
 u'/': <gensim.models.word2vec.Vocab at 0x111ccddd0>,
 u'0': <gensim.models.word

In [18]:
model = Word2Vec("".join(tweets), size=100, window=5, min_count=5, workers=4)

model.most_similar("Morning")

KeyError: u"word 'Morning' not in vocabulary"

In [19]:
model.vocab

{u'\n': <gensim.models.word2vec.Vocab at 0x118584790>,
 u'\r': <gensim.models.word2vec.Vocab at 0x1185a62d0>,
 u' ': <gensim.models.word2vec.Vocab at 0x11859f0d0>,
 u'!': <gensim.models.word2vec.Vocab at 0x1185a6490>,
 u'"': <gensim.models.word2vec.Vocab at 0x11266c6d0>,
 u'#': <gensim.models.word2vec.Vocab at 0x112664650>,
 u'$': <gensim.models.word2vec.Vocab at 0x11859fd50>,
 u'%': <gensim.models.word2vec.Vocab at 0x1185a6910>,
 u'&': <gensim.models.word2vec.Vocab at 0x112664e50>,
 u"'": <gensim.models.word2vec.Vocab at 0x11266cb10>,
 u'(': <gensim.models.word2vec.Vocab at 0x11859fd10>,
 u')': <gensim.models.word2vec.Vocab at 0x1185a64d0>,
 u'*': <gensim.models.word2vec.Vocab at 0x118584990>,
 u'+': <gensim.models.word2vec.Vocab at 0x11266cad0>,
 u',': <gensim.models.word2vec.Vocab at 0x11859fcd0>,
 u'-': <gensim.models.word2vec.Vocab at 0x11859f090>,
 u'.': <gensim.models.word2vec.Vocab at 0x1185849d0>,
 u'/': <gensim.models.word2vec.Vocab at 0x11266cb90>,
 u'0': <gensim.models.word

## Exercise 1a

Write a function that can take a take a sentence parsed by `spacy` and identify if it mentions a company named 'Google'. Remember, `spacy` can find entities and codes them as `ORG` if they are a company. Look at the slides for class 13 if you need a hint:

### Bonus (1b)

Parameterize the company name so that the function works for any company.

In [None]:
def mentions_company(parsed):
    # Return True if the sentence contains an organization and that organization is Google
    for entity in parsed.ents:
        # Fill in code here
    # Otherwise return False
    return False

# 1b

def mentions_company(parsed, company='Google'):
    # Your code here
    pass

## Exercise 1c

Write a function that can take a sentence parsed by `spacy` 
and return the verbs of the sentence (preferably lemmatized)

In [None]:
def get_actions(parsed):
    actions = []
    # Your code here
    return actions

## Exercise 1d
For each tweet, parse it using spacy and print it out if the tweet has 'release' or 'announce' as a verb. You'll need to use your `mentions_company` and `get_actions` functions.

In [None]:
for tweet in tweets:
    parsed = nlp_toolkit(tweet)
    pass


## Exercise 1e
Write a function that identifies countries - HINT: the entity label for countries is GPE (or GeoPolitical Entity)



In [None]:
def mentions_country(parsed, country):
    pass


## Exercise 1f

Re-run (d) to find country tweets that discuss 'Iran' announcing or releasing.


In [None]:
for tweet in tweets:
    parsed = nlp_toolkit(tweet)
    pass

## Exercise 2
Build a `word2vec` model of the tweets we have collected using `gensim`.

### Exercise 2a:
First take the collection of tweets and tokenize them using spacy.

* Think about how this should be done. 
* Should you only use upper-case or lower-case? 
* Should you remove punctuations or symbols? 

In [None]:
text_split = [[x.text if x.pos != spacy.parts_of_speech.VERB else x.lemma_ 
                for x in nlp_toolkit(t)] for t in tweets]


### Exercise 2b:
Build a `word2vec` model.
Test the window size as well - this is how many surrounding words need to be used to model a word. What do you think is appropriate for Twitter? 

In [None]:
model = Word2Vec(text_split, size=100, window=4, min_count=5, workers=4)

### Exercise 2c:
Test your word2vec model with a few similarity functions. 
* Find words similar to 'Syria'.
* Find words similar to 'war'.
* Find words similar to "Iran".
* Find words similar to 'Verizon'. 



In [None]:
model.most_similar(positive=['Syria'])

# Exercise 2d

Adjust the choices / parameters in (b) and (c) as necessary.


## Exercise 3

Filter tweets to those that mention 'Iran' or similar entities and 'war' or similar entities.
* Do this using just spacy.
* Do this using word2vec similarity scores.

In [None]:
# Using spacy
for tweet in tweets:
    parsed = nlp_toolkit(tweet)
    pass

In [None]:
# Using word2vec similarity scores
for tweet in tweets[:200]:
    parsed = nlp_toolkit(tweet)
    pass
