# Lesson 14 - Solutions

In [1]:
# Unicode Handling
from __future__ import unicode_literals
import codecs

import numpy as np
import gensim

# spacy is used for pre-processing and traditional NLP
import spacy
from spacy.en import English

# Gensim is used for LDA and word2vec
from gensim.models.word2vec import Word2Vec

In [2]:
# Loading the tweet data
filename = "../../../../data/captured-tweets.txt"
tweets = []
for tweet in codecs.open(filename, "r", encoding = "utf-8"):
    tweets.append(tweet)
# Setting up spacy
nlp_toolkit = English()

## Exercise 1a
Write a function that can take a sentence parsed by `spacy` and identify if it mentions a company named "Google". Remember, `spacy` can find entities and codes them as `ORG` if they are a company. Look at the slides for class 13 if you need a hint:

### Bonus (1b)
Parameterise the company name so that the function works for any company.

In [3]:
# 1a
def mentions_company(parsed):
    # Return True if the sentence contains an organization and that organization is Google
    for entity in parsed.ents:
        if entity.text == "Google" and entity.label_ == "ORG":
            return True
    # Otherwise return False
    return False

# 1b
def mentions_company(parsed, company = "Google"):
    for entity in parsed.ents:
        if entity.text == company and entity.label_ == "ORG":
            return True
    return False

## Exercise 1c
Write a function that can take a sentence parsed by `spacy` and return the verbs of the sentence (preferably lemmatised)

In [4]:
def get_actions(parsed):
    actions = []
    for el in parsed:
        if el.pos == spacy.parts_of_speech.VERB:
            actions.append(el.text)
    return actions

## Exercise 1d
For each tweet, parse it using spacy and print it out if the tweet has "release" or "announce" as a verb. You will need to use your `mentions_company` and `get_actions` functions.

In [5]:
for tweet in tweets:
    parsed = nlp_toolkit(tweet)
    if mentions_company(parsed, "Google"):
        actions = get_actions(parsed)
        if "release" in actions or "announce" in actions:
            print(tweet)

Google &amp; Ford rumored to announce partnership at CES https://t.co/zOgm1NjHhD https://t.co/Gzx81ujqVC

Google and Ford to announce partnership on self-driving cars at CES - Fudzilla (blog) https://t.co/6woe56G22Q

Google and Ford to announce partnership on self-driving cars at CES - Fudzilla (blog) https://t.co/4hERVJ4zZK



## Exercise 1e
Write a function that identifies countries - HINT: the entity label for countries is GPE (or GeoPolitical Entity)

In [6]:
def mentions_country(parsed, country):
    for entity in parsed.ents:
        if entity.text == country and entity.label_ == "GPE":
            return True
    return False

## Exercise 1f
Re-run (d) to find country tweets that discuss "Iran" announcing or releasing.

In [7]:
for tweet in tweets:
    parsed = nlp_toolkit(tweet)

    if mentions_country(parsed, "Iran"):
        actions = get_actions(parsed)
        if "release" in actions or "announce" in actions:
            print(tweet)

RT @cerenomri: "Literally every US ally in Mideast is on brink of hot war w/ Iran, so we're going to release $100 billion to Iran this mont…

GOBE! Iran warns Nigeria to release Shiite leader El-Zakzaky - SEE https://t.co/TRshnC6sVU

GOBE! Iran warns Nigeria to release Shiite leader El-Zakzaky - SEE https://t.co/SlvcQtk3vE

RT @cerenomri: "Literally every US ally in Mideast is on brink of hot war w/ Iran, so we're going to release $100 billion to Iran this mont…

Hhmmm. Iran claiming to have 'warned Nigeria' to release detained Shiite leader.... @afalli

RT @cerenomri: "Literally every US ally in Mideast is on brink of hot war w/ Iran, so we're going to release $100 billion to Iran this mont…



## Exercise 2
- Build a word2vec model of the tweets we have collected using `gensim`.
- First take the collection of tweets and tokenise them using `spacy`.

### Exercise 2a
- Think about how this should be done. 
- Should you only use upper-case or lower-case? 
- Should you remove punctuations or symbols? 

In [8]:
text_split = [[x.text if x.pos != spacy.parts_of_speech.VERB else x.lemma_ 
                for x in nlp_toolkit(t)] for t in tweets]

### Exercise 2b
- Build a word2vec model
- Test the window size as well - this is how many surrounding words need to be used to model a word. What do you think is appropriate for Twitter? 

In [9]:
model = Word2Vec(text_split, size = 100, window = 4, min_count = 5, workers = 4)

### Exercise 2c
Test your word2vec model with a few similarity functions
- Find words similar to "Syria"
- Find words similar to "war"
- Find words similar to "Iran"
- Find words similar to "Verizon"

In [15]:
model.most_similar(positive = ["Syria"])

[(u'opposition', 0.9993744492530823),
 (u'Paris', 0.9987044334411621),
 (u"'s", 0.9986689686775208),
 (u'casualties', 0.9985477924346924),
 (u'StopExecutionsIran', 0.9985381364822388),
 (u'UK', 0.9984525442123413),
 (u'defeat', 0.9983157515525818),
 (u'SaudiArabia', 0.9981868267059326),
 (u'+', 0.998157799243927),
 (u'Service', 0.9981530904769897)]

In [14]:
model.most_similar(positive = ["Iran"])

[(u'regime', 0.9978275895118713),
 (u'democratic', 0.9953438639640808),
 (u'opposition', 0.9948534965515137),
 (u'News', 0.994827389717102),
 (u'France', 0.9946572780609131),
 (u'Syria', 0.994074285030365),
 (u'No2Rouhani', 0.9938837289810181),
 (u'Reason', 0.9928948283195496),
 (u'Paris', 0.9928804039955139),
 (u'@lemondefr', 0.9923374056816101)]

In [13]:
model.most_similar(positive = ["Verizon"])

[(u'3', 0.9997420310974121),
 (u'iPhone', 0.9997348785400391),
 (u'Black', 0.9997272491455078),
 (u'follow', 0.9997243881225586),
 (u'2', 0.9997117519378662),
 (u'With', 0.999711275100708),
 (u'Xbox', 0.9996981620788574),
 (u'Israel', 0.9996776580810547),
 (u'after', 0.999665379524231),
 (u'Microsoft', 0.9996631741523743)]

### Exercise 2d
Adjust the choices in (b) and (c) as necessary

In [None]:
model.most_similar(positive = ["war"])

## Exercise 3
Filter tweets to those that mention "Iran" or similar entities and "war" or similar entities
- Do this using just spacy
- Do this using word2vec similarity scores

In [23]:
# Using spacy
for tweet in tweets:
    parsed = nlp_toolkit(tweet)
    if mentions_country(parsed, "Iran") or mentions_country(parsed, "Iraq"): # ... you could add more
        if "attack" in get_actions(parsed):
            print(tweet)

In [27]:
# Using word2vec similarity scores
for tweet in tweets[:200]:
    parsed = nlp_toolkit(tweet)

    similarity_to_iran = max([model.similarity("Iran", tok.text) for tok in parsed if tok.text in model.wv.vocab], 0)
    similarity_to_war = max([model.similarity("war", tok.text) for tok in parsed if tok.text in model.wv.vocab], 0)
    if similarity_to_iran > 0.9 and similarity_to_war > 0.9:
#         print(similarity_to_iran, similarity_to_war, tweet)
        print tweet

I made a(n) Small Tourmaline in Paradise Island! https://t.co/cAoW1b6DRc #Gameinsight #Androidgames #Android

RT @PURELOVEBEAST: -เช็ครายละเอียด- 27th BIRTHDAY SPECIAL GOODS - 3D YOSEOP USB

https://t.co/EOfBdVQUfO

@ViGiGu google it :) simple

nerd ass girl  https://t.co/T7kDirxPEL

LeadCorp Media @leadcorpmedia_  https://t.co/vRJG9Xnzw8

RT mackdrama1017: ChieMoney use google and learn how to!

@ShaffieWeru Morning bro here is my new video i need yr help im talented bt i lack a manager bro https://t.co/NTs3QM5YU5

Google Play Gift Card Code

Claim your Google Play Gift Card Code... https://t.co/ySYH1x5kQl #amazon #itunes #googl… https://t.co/ayDI4X1FKO

@rizsrug216 @tpaquette_IID Per https://t.co/jTlhLW8Ry5 (sorry), St. Devs for the sport come to:

Baseball: .039

Football: .125

King of dark fantasy. Summon today. App Store: https://t.co/XeEuOEaXEG Google Play: https://t.co/vYQdbhrNEb #DarkSummoner

King of dark fantasy. Summon today. App Store: https://t.co/1ce13KchYS Google Play: 