# ¡Hola!
____________________
- **Name:** Maite Giménez.
- **Who?:** Ph.D. student at Universitat Politècnica de València.
- **Twitter:** @maidotgimenez
- **Github:** maigimenez



![PyladiesES!](imgs/pyladiesES.png "PyLadies Spain")

## Where are the slides?
It's a notebook and it's available here: [Github/maigimenez/ep2016_vect4word](https://github.com/maigimenez/ep2016_vect4word)

# Roadmap

1. Point 1
2. Point 2
3. Point 3

# 0. Make a plan!

![Science!](imgs/neil.gif "Loving science!")

## 0.1 Gather your data

In [None]:
from configparser import ConfigParser
from os.path import join
from os import pardir

In [None]:
# Read the Twitter API credentials from the INI file at ../src/credentials.ini.
credentials_path = join(pardir, 'src', 'credentials.ini')
config = ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed, so an empty result means the credentials file is missing.
if not config.read(credentials_path):
    raise FileNotFoundError(
        'Twitter credentials file not found: {}'.format(credentials_path))

# All four values live in the [twitter] section of the file.
APP_KEY = config['twitter']['app_key']
APP_SECRET = config['twitter']['app_secret']
OAUTH_TOKEN = config['twitter']['oauth_token']
OAUTH_TOKEN_SECRET = config['twitter']['oauth_token_secret']

In [None]:
from twitter import oauth, Twitter, TwitterHTTPError

In [None]:
# Build the OAuth credentials object from the values read from the INI file,
# passed in the order: OAuth token, OAuth token secret, app key, app secret.
auth = oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
                   APP_KEY, APP_SECRET)

# API client used by the following cells to download user timelines.
twitter_api = Twitter(auth=auth)
# NOTE(review): presumably makes the client retry failed requests -- confirm
# against the documentation of the `twitter` package.
twitter_api.retry = True

*Full disclaimer: gender is not a binary issue. This is just a simplified example. If you are willing to expand this experiment, go ahead and contact me!*

In [None]:
brogrammers = ['jakevdp', 'rasbt', 'GaelVaroquaux', 'amuellerml', 'fperez_org',
               'fpedregosa', 'ogrisel', 'dontusethiscode', 'randal_olson', 'tdhopper']
sisgrammers = ['pkafei', 'LorenaABarba', 'jessicamckellar', 'heddle317', 'diana_clarke',
               'wholemilk', 'spang', 'cecilycarver', 'juliaelman', 'b0rk']


def _fetch_timelines(api, screen_names, count=100):
    """Download the timeline of every account and concatenate the results.

    Args:
        api: Twitter client exposing ``statuses.user_timeline``.
        screen_names (iterable of str): Accounts to download.
        count (int): Value passed as ``count`` to every timeline request.

    Returns:
        list: Tweets of all the accounts, in the order of ``screen_names``.
    """
    tweets = []
    for screen_name in screen_names:
        tweets.extend(api.statuses.user_timeline(screen_name=screen_name, count=count))
    return tweets


# Same download for both groups; only the list of accounts changes.
brotweets = _fetch_timelines(twitter_api, brogrammers)
sistweets = _fetch_timelines(twitter_api, sisgrammers)

## 0.1 Clean your data

In [None]:
import re

def clean_tweet(tweet):
    """Apply a minimal normalization to a tweet.

    The tweet is lowercased and then, in this order: URLs become ``<url>``,
    @usernames become ``<user>``, runs of two or more whitespace characters
    become a single space, and any character repeated five or more times in
    a row is cut down to three repetitions.

    Args:
        tweet (str): Tweet to clean.

    Returns:
        str: Preprocessed tweet.
    """
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

    # (pattern, replacement, flags) triples, applied strictly in sequence
    # because later rules operate on the output of the earlier ones.
    substitutions = (
        (url_pattern, '<url>', re.MULTILINE),
        ("@([A-Za-z0-9_]+)", "<user>", 0),
        (r"\s{2,}", " ", 0),
        (r'(.)\1{4,}', r'\1\1\1', 0),
    )

    cleaned = tweet.lower()
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    return cleaned

In [None]:
import pandas as pd

# One record per tweet: its id, its cleaned text and its class label
# (0 for the tweets in brotweets, 1 for the tweets in sistweets).
dataset = [
    {'id': status['id'], 'text': clean_tweet(status['text']), 'class': label}
    for label, group in enumerate((brotweets, sistweets))
    for status in group
]

pd_dataset = pd.DataFrame(dataset)

In [None]:
# Preview the first rows of the labelled dataset.
pd_dataset.head()

## 0.2. Share your data

In [None]:
# Save the complete dataset (class, id and cleaned text) as CSV.
pd_dataset.to_csv('../corpora/full_dataset.csv')

In [None]:
# Save a second CSV holding only the 'class' and 'id' columns, i.e. without the tweet text.
pd_dataset[['class', 'id']].to_csv('../corpora/ep16.csv')

![Toast!](imgs/toast.gif "Let's celebrate!")

## 0.3. Study your data

In [18]:
import pandas as pd

DATASET_PATH = "../corpora/full_dataset.csv"
# DataFrame.from_csv was deprecated and later removed from pandas; read_csv
# with index_col=0 and parse_dates=True is its documented replacement.
pd_dataset = pd.read_csv(DATASET_PATH, index_col=0, parse_dates=True)
pd_dataset.head()

Unnamed: 0,class,id,text
0,0,752137929871462400,<user> great seeing you today!
1,0,751510274432131072,"""i just wanted to check that you got my previo..."
2,0,751374701344219136,rt <user>: <user> support for minor bugs may e...
3,0,751373414435328001,"could be controversial, but frankly i think it..."
4,0,751373066987573248,til that <user> plans to drop python 2 support...


In [19]:
import nltk.data
#nltk.download()   

In [20]:
from nltk.tokenize import TweetTokenizer, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import re
import scipy.stats as stats

In [21]:
# Show the first 20 entries of NLTK's English stop word list.
print(', '.join(stopwords.words('english')[:20]))

i, me, my, myself, we, our, ours, ourselves, you, your, yours, yourself, yourselves, he, him, his, himself, she, her, hers


In [22]:
def get_vocabulary(corpus, tokenizer):
    """Count the occurrences of every non stop word token of a corpus.

    For each text, every non-word character is replaced by a space, every
    run of digits is replaced by the ``<number>`` token, the result is
    tokenized and the English stop words are discarded.

    Args:
        corpus (iterable of str): Texts to process.
        tokenizer (callable): Function that splits a string into tokens.

    Returns:
        collections.Counter: Number of occurrences of each remaining token.
    """
    # A set gives constant-time membership tests; the list returned by NLTK
    # would otherwise be scanned once for every token of the corpus.
    stop_words = set(stopwords.words('english'))

    vocabulary = Counter()
    for tweet in corpus:
        # Remove punctuation marks. This also strips the angle brackets of
        # tokens such as "<user>" or "<url>", which are counted as "user"/"url".
        tweet = re.sub(r'\W', ' ', tweet)

        # Replace numbers with a token. No "." is left after the previous
        # step, so a single digit-run pattern covers every case.
        tweet = re.sub(r"\d+\s*", " <number> ", tweet)

        # Tokenize and drop the stop words while counting.
        vocabulary.update(token for token in tokenizer(tweet)
                          if token not in stop_words)

    return vocabulary

In [23]:
tknzr = TweetTokenizer()

# Boolean masks selecting the rows of each class (0 and 1).
is_class_0 = pd_dataset['class'] == 0
is_class_1 = pd_dataset['class'] == 1

brotweets = pd_dataset.loc[is_class_0, 'text'].tolist()
sistweets = pd_dataset.loc[is_class_1, 'text'].tolist()

# One vocabulary (token -> count) per class, built with the same tokenizer.
brocabulary = get_vocabulary(brotweets, tknzr.tokenize)
siscabulary = get_vocabulary(sistweets, tknzr.tokenize)

In [24]:
# Ten most frequent tokens in the class 0 vocabulary.
brocabulary.most_common(10)

[('user', 1313),
 ('url', 676),
 ('<number>', 483),
 ('rt', 420),
 ('python', 96),
 ('talk', 49),
 ('learning', 46),
 ('data', 45),
 ('new', 43),
 ('https', 42)]

In [25]:
# Ten most frequent tokens in the class 1 vocabulary.
siscabulary.most_common(10)

[('user', 1009),
 ('url', 419),
 ('<number>', 337),
 ('rt', 234),
 ('pycon', 54),
 ('one', 40),
 ('sgdq', 40),
 ('get', 39),
 ('like', 37),
 ('code', 30)]

In [26]:
from bokeh.plotting import figure, show, vplot, ColumnDataSource
from bokeh.io import output_notebook
from bokeh.models import HoverTool

output_notebook()

In [27]:
MOST_COMMON = 50

# Each vocabulary contributes half of the requested number of words.
top_per_vocabulary = int(MOST_COMMON/2)
mc_brocavulary = brocabulary.most_common(top_per_vocabulary)
mc_siscavulary = siscabulary.most_common(top_per_vocabulary)

# Union of both top lists, without duplicates.
most_common_words = mc_brocavulary + mc_siscavulary
words = list({word for word, _ in most_common_words})

# Both vocabularies are Counters, so looking up a word that is absent
# yields 0 instead of raising KeyError.
fr_brocavulary = [brocabulary[word] for word in words]
fr_siscavulary = [siscabulary[word] for word in words]

In [28]:
import numpy as np
range_words = list(range(1, len(words) + 1))
source = ColumnDataSource(data=dict(range_words=range_words,
                                    words=words,
                                    freq_true=fr_brocavulary,
                                    freq_false=fr_siscavulary))

# Configure the hover tool in a single constructor call. Creating one tool,
# setting point_policy on it and then creating a second tool would discard
# the "follow_mouse" setting.
hover = HoverTool(tooltips=[("words", "@words")],
                  point_policy="follow_mouse")

TOOLS = "pan,wheel_zoom,box_zoom,reset,save"

p = figure(title="Vocabulary gender", x_range=words, tools=[TOOLS, hover])
p.xaxis.axis_label = 'Words'
p.yaxis.axis_label = 'Frequency'
# Navy: frequency in brocabulary; red: frequency in siscabulary.
p.circle('range_words', 'freq_true', source=source, fill_alpha=0.2, size=10, color="navy")
p.circle('range_words', 'freq_false', source=source, fill_alpha=0.2, size=10, color='red')
# Rotate the word labels so that they do not overlap.
p.xaxis.major_label_orientation = np.pi/4

show(p)

In [29]:
# Tweet lengths in characters, sorted so that the fitted curves below are
# drawn from left to right.
tweet_lens_bro = sorted(len(tweet) for tweet in brotweets)
hist_bro, edges_bro = np.histogram(tweet_lens_bro, density=True, bins=20)

tweet_lens_sis = sorted(len(tweet) for tweet in sistweets)
hist_sis, edges_sis = np.histogram(tweet_lens_sis, density=True, bins=20)

p = figure(title="Tweet length distribution")
p.quad(top=hist_bro, bottom=0, left=edges_bro[:-1], right=edges_bro[1:],
       fill_color="navy", line_color="#033649", fill_alpha=0.3)
p.quad(top=hist_sis, bottom=0, left=edges_sis[:-1], right=edges_sis[1:],
       fill_color="red", line_color="#033649", fill_alpha=0.3)

# Normal distribution with the sample mean and standard deviation of each
# group. Each curve gets its own legend entry so they can be told apart.
sigma = np.std(tweet_lens_bro)
mu = np.mean(tweet_lens_bro)
pdf = stats.norm.pdf(tweet_lens_bro, mu, sigma)
p.line(tweet_lens_bro, pdf, line_color="navy", line_width=6, alpha=0.7, legend="PDF bros")

sigma = np.std(tweet_lens_sis)
mu = np.mean(tweet_lens_sis)
pdf = stats.norm.pdf(tweet_lens_sis, mu, sigma)
p.line(tweet_lens_sis, pdf, line_color="red", line_width=6, alpha=0.7, legend="PDF sis")

p.xaxis.axis_label = 'len(tweets)'
# The histograms are computed with density=True, so the y axis shows a
# probability density rather than a number of tweets.
p.yaxis.axis_label = 'Density'
show(p)


## 0.4 Split your data in train and test 
*(and probably share it too)*

In [30]:
# sklearn.cross_validation was replaced by sklearn.model_selection and later
# removed; fall back to the old module only on installations that lack the new one.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

X = pd_dataset['text'].tolist()
y = pd_dataset['class'].tolist()

# Hold out 40% of the tweets for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
print("Examples in train: {}".format(len(X_train)))
print("Examples in test: {}".format(len(X_test)))

Examples in train: 1200
Examples in test: 800


#  1. In the beginning we had a bag of words *(or maybe the set)* 

## 1.1. What is a bag of words

- Simplest way to represent text.
- Create a vector with the size of the vocabulary seen in train.
- Each sentence is represented counting the number of times each word appears.

```python
dataset = ["I love Python", "I love NLP", "Pyladies are cool"]
vocabulary = set(["I", "love", "Python", "NLP", "Pyladies", "are", "cool"])
dataset_representation =  [[1,1,1,0,0,0,0], 
                           [1,1,0,1,0,0,0],
                           [0,0,0,0,1,1,1]] 
```

Ok, that looks cool. This looks solved. Let's party! 

In [31]:
from sklearn.feature_extraction.text import CountVectorizer

import sys

vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             ngram_range=(1, 1),
                             max_features=5000)

# Learn the vocabulary on the training tweets only and encode them.
BOW_train = vectorizer.fit_transform(X_train).toarray()

# Encode the test tweets with the vocabulary learnt on train.
BOW_test = vectorizer.transform(X_test).toarray()

print('Train: {{0|1}}^({}x{})'.format(BOW_train.shape[0], BOW_train.shape[1]))
print('Test:  {{0|1}}^({}x{})'.format(BOW_test.shape[0], BOW_test.shape[1]))

# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); support both.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()
print('\nVOCABULARY EXTRACT: {}'.format(', '.join(vocab[500:600])))

# Print the whole vector instead of a summary. Recent NumPy versions reject
# threshold=np.nan, so use the largest integer instead.
np.set_printoptions(threshold=sys.maxsize)
print('\nTWEET REPRESENTATION: {}'.format(BOW_train[0]))

Train: {0|1}^(1200x4060)
Test:  {0|1}^(800x4060)

VOCABULARY EXTRACT: broke, broken, brother, brownlee, browser, bruce, brunch, brutal, btw, bu, buddy, buffer, bug, bugfixes, buggy, bugs, build, building, built, bullies, bump, bunch, burnt, bursting, busy, but, butler, button, buy, buys, buzz, by, byegium, ca, cabin, cable, cableporn, caesar, caged, call, called, calling, calls, came, camp, campaign, camping, can, canada, candidate, cannot, capita, capitalism, capitalists, capn, captain, captains, capture, car, card, cardinality, cards, career, careful, caribbean, caring, carpentry, cars, cartoon, case, cash, cat, catch, categorical, catered, cats, catty, caught, causal, cause, caused, cc, celebrate, celebrating, cena, centenial, center, centrality, centre, century, cernan, certainly, ceval, cfd, cffi, cfp, chair, chalk, challenge, challenges

TWEET REPRESENTATION: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

## 1.2. So, what's the problem, then?


- Unseen words.
- One-hot Representation: Represents every word as an $\mathbb{R}^{|V| \times 1}$ vector, so the vectors of two different words never share a non-zero position.
```python
hotel = [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
motel = [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
# w_hotel AND w_motel = 0
```
$w^{hotel} \text{ AND } w^{motel} = 0$
&nbsp;


- *The curse of dimensionality!!!*
  - Generalizing locally (eg. nearest neighbors) requires representative examples for all relevant variations
  - The number of possible configurations of the variables of interest is much larger than the number of training samples.


&nbsp;
![Curse!](imgs/curse.jpg "The curse of the dimensionality")


#  2. Word embeddings

> You shall know a word by the company it keeps.
>
> -- <cite>J.R. Firth 1957:11</cite>

## 2.1. Pretrained data using GloVe

In [32]:
import numpy as np

def load_glove_dict(glove_file):
    """Load GloVe vectors from a text file into a dictionary.

    Every non-empty line is expected to hold a word followed by the
    whitespace-separated components of its vector.

    Args:
        glove_file (str): Path of the GloVe text file.

    Returns:
        dict: Maps each word to a 1-D ``numpy.float32`` array.

    Raises:
        ValueError: If a vector component cannot be parsed as a number.
    """
    glove_embeddings = {}
    # Explicit encoding: the platform default may not be able to decode the
    # non-ASCII words of the file.
    with open(glove_file, encoding='utf-8') as handle:
        for line in handle:
            fields = line.split()
            if not fields:
                # Blank line: nothing to store.
                continue
            # Store numbers rather than the raw text fields, so that callers
            # get vectors they can use directly.
            glove_embeddings[fields[0]] = np.asarray(fields[1:], dtype=np.float32)
    return glove_embeddings

In [None]:
import os

# Folder holding the pre-trained GloVe Twitter vectors. Set the GLOVE_PATH
# environment variable to use another location; the original folder remains
# the default.
GLOVE_PATH = os.environ.get('GLOVE_PATH',
                            '/home/mgimenez/Dev/resources/GloVe/twitter_dataset')

# 25-dimensional vectors.
embedding_size = '25'
glove_file = join(GLOVE_PATH, 'glove.twitter.27B.' + embedding_size + 'd.txt')
glove_25 = load_glove_dict(glove_file)

In [None]:
# 100-dimensional vectors; the size is part of the file name.
embedding_size = '100'
glove_file = join(GLOVE_PATH, 'glove.twitter.27B.{}d.txt'.format(embedding_size))
glove_100 = load_glove_dict(glove_file)

In [38]:
# 200-dimensional vectors; the size is part of the file name.
embedding_size = '200'
glove_file = join(GLOVE_PATH, 'glove.twitter.27B.{}d.txt'.format(embedding_size))
glove_200 = load_glove_dict(glove_file)

In [39]:
def get_most_common_vocab(most_common, brocabulary, siscabulary):
    """Collect the most frequent words of two vocabularies.

    Args:
        most_common (int): Total number of words requested; each vocabulary
            contributes ``int(most_common/2)`` of them.
        brocabulary (collections.Counter): First vocabulary.
        siscabulary (collections.Counter): Second vocabulary.

    Returns:
        set: Union of the top words of both vocabularies.
    """
    per_vocabulary = int(most_common/2)
    top_words = set()
    for vocabulary in (brocabulary, siscabulary):
        top_words.update(word for word, _ in vocabulary.most_common(per_vocabulary))
    return top_words

In [40]:
def get_words_to_plot(most_common, dictionary):
    """Split the most common words into those with and without an embedding.

    The candidate words come from the module-level ``brocabulary`` and
    ``siscabulary`` vocabularies.

    Args:
        most_common (int): Number of words forwarded to
            ``get_most_common_vocab``.
        dictionary: Embedding lookup supporting ``in`` and item access.

    Returns:
        tuple: ``(words_to_plot, unseen_words)``, a dict mapping each known
        word to its embedding and a list of the words missing from
        ``dictionary``.
    """
    candidates = get_most_common_vocab(most_common, brocabulary, siscabulary)
    words_to_plot, unseen_words = {}, []
    for word in candidates:
        if word not in dictionary:
            unseen_words.append(word)
            continue
        words_to_plot[word] = dictionary[word]
    return words_to_plot, unseen_words

In [41]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# t-SNE is stochastic: fix the seed so that the projection (and therefore the
# plot) is the same on every run.
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, random_state=0)
words_to_plot, unseen_words = get_words_to_plot(1000, glove_100)

# Build an explicit float matrix (one row per word): the stored GloVe vectors
# may still hold the raw text fields read from the file.
low_dim_embs = tsne.fit_transform(
    np.asarray(list(words_to_plot.values()), dtype=np.float32))

In [42]:
# Words among the most common ones that have no entry in the embedding dictionary.
print(', '.join(unseen_words))

resnet, ohbm, scikit, automl, nilearn, subsampling, pydatabln, icml, numfocus, uwsgi, reproducibility, pydata, sgdq, factorization, matplotlib, euroscipy, decoders, preprint, __past__, machinelearning, ipython, ngcmsummeracademy, scipy, pypi, brexit, datascience, joblib, polyconf, jupyter, deeplearning, numpy, sklearn, tensorflow, alifexv, pydatalondon, pydataparis, sparsity, dadaist, neuroimaging, print_statement


In [43]:
import numpy as np
words = list(words_to_plot.keys())
range_words = list(range(1, len(words) + 1))

# 2-D coordinates produced by t-SNE, one point per word.
source = ColumnDataSource(data=dict(range_words=range_words,
                                    words=words,
                                    x=low_dim_embs[:, 0],
                                    y=low_dim_embs[:, 1]))

# Configure the hover tool in a single constructor call. Creating one tool,
# setting point_policy on it and then creating a second tool would discard
# the "follow_mouse" setting.
hover = HoverTool(tooltips=[("words", "@words")],
                  point_policy="follow_mouse")

TOOLS = "pan,wheel_zoom,box_zoom,reset,save"

p = figure(title="Word visualization", tools=[TOOLS, hover])
p.circle('x', 'y', source=source, fill_alpha=0.2, size=10, color='navy')

show(p)

In [44]:
def tokenize_dataset(tokenizer, dataset):
    """Replace numbers with a token and tokenize every text of a dataset.

    Args:
        tokenizer (callable): Function that splits a string into tokens.
        dataset (iterable of str): Texts to tokenize.

    Returns:
        list: One tokenizer result per input text, in the same order.
    """
    def _mask_numbers(text):
        # Digits that follow a dot first, then any remaining run of digits.
        text = re.sub(r"\.\d+\s*", ".<number> ", text)
        return re.sub(r"\d+\s*", " <number> ", text)

    return [tokenizer(_mask_numbers(tweet)) for tweet in dataset]

In [45]:
# Tokenize the train and test splits in the same way.
X_train_tokenized, X_test_tokenized = (
    tokenize_dataset(TweetTokenizer().tokenize, split)
    for split in (X_train, X_test)
)

In [46]:
def get_embeddings(dataset, dictionary, embedding_size):
    """Represent every tokenized text by the mean of its word embeddings.

    Tokens missing from ``dictionary`` are ignored; a text without any known
    token is represented by a vector of zeros.

    Args:
        dataset (iterable): Tokenized texts (each one an iterable of tokens).
        dictionary: Embedding lookup supporting ``in`` and item access.
        embedding_size (int): Length of the zero vector used as fallback.

    Returns:
        list: One ``numpy.float32`` vector per text.
    """
    averaged = []
    for tokens in dataset:
        known = [dictionary[token] for token in tokens if token in dictionary]
        if not known:
            known = [np.zeros(embedding_size)]
        # Tweets have different numbers of words, while the classifiers need
        # fixed-size inputs: averaging yields one vector per tweet.
        stacked = np.asarray(known, dtype=np.float32)
        averaged.append(np.mean(stacked, axis=0))
    return averaged

In [47]:
# Average the 100-dimensional GloVe vectors of each tweet.
X_train_GloVe = get_embeddings(dataset=X_train_tokenized,
                               dictionary=glove_100,
                               embedding_size=100)
X_test_GloVe = get_embeddings(dataset=X_test_tokenized,
                              dictionary=glove_100,
                              embedding_size=100)

## 2.2. But I want to train word embeddings with my own data!

In [49]:
from gensim.models import word2vec

In [50]:
# Initialize and train a word2vec model on the tokenized training tweets
# (this will take some time).
# NOTE(review): the keyword names below (e.g. `size`) and `init_sims` depend
# on the installed gensim version -- confirm before upgrading gensim.
model = word2vec.Word2Vec(X_train_tokenized, 
                          workers = 4,
                          size = 100,  
                          min_count = 1,     # How many times a word should appear to be taken into account
                          window = 5, 
                          sample = 1e-3 ,    # Downsample setting for frequent words
                          batch_words = 100) # Batches of examples passed to worker threads 

# This model won't be updated
model.init_sims(replace=True)

# Persist the trained model to a file named "word2vec".
model_name = "word2vec"
model.save(model_name)

In [51]:
# Shape of the learnt embedding matrix.
# NOTE(review): presumably (vocabulary size, vector size) -- confirm against the gensim docs.
model.syn0.shape

(4193, 100)

In [52]:
from sklearn.manifold import TSNE

# t-SNE is stochastic: fix the seed so that the projection (and therefore the
# plot) is the same on every run.
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, random_state=0)
# This time the embeddings are looked up in the word2vec model trained above.
words_to_plot, unseen_words = get_words_to_plot(1000, model)

low_dim_embs = tsne.fit_transform(list(words_to_plot.values()))

In [53]:
words = list(words_to_plot.keys())
range_words = list(range(1, len(words) + 1))

# 2-D coordinates produced by t-SNE, one point per word.
source = ColumnDataSource(data=dict(range_words=range_words,
                                    words=words,
                                    x=low_dim_embs[:, 0],
                                    y=low_dim_embs[:, 1]))

# Configure the hover tool in a single constructor call. Creating one tool,
# setting point_policy on it and then creating a second tool would discard
# the "follow_mouse" setting.
hover = HoverTool(tooltips=[("words", "@words")],
                  point_policy="follow_mouse")

TOOLS = "pan,wheel_zoom,box_zoom,reset,save"

p = figure(title="Word visualization", tools=[TOOLS, hover])
p.circle('x', 'y', source=source, fill_alpha=0.2, size=10, color='navy')

show(p)

In [54]:
# Words that the trained word2vec model considers most similar to "python".
model.most_similar("python")

[('the', 0.9998200535774231),
 ('to', 0.9998089671134949),
 ('a', 0.999801516532898),
 ('!', 0.99979567527771),
 ('i', 0.999795138835907),
 ('and', 0.9997925758361816),
 ('for', 0.9997901916503906),
 ('<url>', 0.9997870922088623),
 ('"', 0.9997828006744385),
 ('in', 0.9997826814651489)]

In [55]:
# Average the 100-dimensional word2vec vectors of each tweet.
X_train_word2vec = get_embeddings(dataset=X_train_tokenized,
                                  dictionary=model,
                                  embedding_size=100)
X_test_word2vec = get_embeddings(dataset=X_test_tokenized,
                                 dictionary=model,
                                 embedding_size=100)

##  3.  Now fight! 

In [56]:
from sklearn import svm
from sklearn.metrics import classification_report, roc_curve, auc

In [57]:
# Class names shown in the reports: label 0 -> 'Bros', label 1 -> 'Sis'.
target_names = ['Bros', 'Sis']

# SVM with default parameters on the bag-of-words representation.
clf = svm.SVC().fit(BOW_train, y_train)
predicction_BOW = clf.predict(BOW_test)
print(classification_report(y_test, predicction_BOW, target_names=target_names))

             precision    recall  f1-score   support

       Bros       0.00      0.00      0.00       411
        Sis       0.49      1.00      0.65       389

avg / total       0.24      0.48      0.32       800



In [58]:
# SVM with default parameters on the averaged GloVe embeddings.
clf = svm.SVC().fit(X_train_GloVe, y_train)
predicction_GloVe = clf.predict(X_test_GloVe)
print(classification_report(y_test, predicction_GloVe, target_names=target_names))

             precision    recall  f1-score   support

       Bros       0.66      0.60      0.62       411
        Sis       0.61      0.67      0.64       389

avg / total       0.63      0.63      0.63       800



In [94]:
# SVM with default parameters on the averaged word2vec embeddings.
clf = svm.SVC().fit(X_train_word2vec, y_train)
predicction_word2vec = clf.predict(X_test_word2vec)
print(classification_report(y_test, predicction_word2vec, target_names=target_names))

             precision    recall  f1-score   support

       Bros       0.00      0.00      0.00       411
        Sis       0.49      1.00      0.65       389

avg / total       0.24      0.49      0.32       800



  'precision', 'predicted', average, warn_for)


In [95]:
def _roc_summary(predicted_labels):
    """Return the ROC curve of a prediction against y_test and its area."""
    fpr, tpr, _ = roc_curve(y_test, predicted_labels)
    return fpr, tpr, auc(fpr, tpr)


# The curves are computed from the predicted class labels of each model.
false_positive_rate_bow, true_positive_rate_bow, roc_auc_bow = _roc_summary(predicction_BOW)
false_positive_rate_glove, true_positive_rate_glove, roc_auc_glove = _roc_summary(predicction_GloVe)
false_positive_rate_w2v, true_positive_rate_w2v, roc_auc_w2v = _roc_summary(predicction_word2vec)

In [96]:
from bokeh.palettes import Spectral6

p = figure(title="Receiver Operating Characteristic", tools=TOOLS)

# One ROC curve per representation, with its area in the legend.
p.line(false_positive_rate_bow, true_positive_rate_bow,
       legend='BoW ROC curve (area = {:.2f})'.format(roc_auc_bow),
       line_color="green", line_width=2)
p.line(false_positive_rate_glove, true_positive_rate_glove,
       legend='GloVE ROC curve (area = {:.2f})'.format(roc_auc_glove),
       line_color="blue", line_width=2)
p.line(false_positive_rate_w2v, true_positive_rate_w2v,
       legend='W2V ROC curve (area = {:.2f})'.format(roc_auc_w2v),
       line_color="yellow", line_width=2)

# Chance level: the diagonal from (0, 0) to (1, 1). A true positive rate
# cannot exceed 1, so the line must end at 1.0.
p.line([0.0, 1.0], [0.0, 1.0], legend='Guessing',
       line_color="gray", line_width=2, line_dash=(4, 4))

p.xaxis.axis_label = 'False Positive Rate'
p.yaxis.axis_label = 'True Positive Rate'

p.legend.location = 'bottom_right'
show(p)