In [1]:
%matplotlib inline

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import json
from tqdm import tqdm

from collections import Counter, defaultdict, namedtuple

import re

from utilities import *

In [2]:
sns.set_context('paper')
sns.set_style('ticks')

In [3]:
FRIENDS_FILE="data/friends.json"
TWEETS_FILE="data/tweets.json"

In [4]:
TweetDataTuple=namedtuple("TweetData", "text hashtags mentions urls")

def extract_screen_name(tweet_data):
    return tweet_data["user"]["screen_name"]

def extract_tweet_data(tweet_data):
    hashtags = extract_hashtags(tweet_data)
    mentions = extract_mentions(tweet_data)
    url_domains = extract_url_domains(tweet_data)
    text = tweet_data["full_text"]
    return TweetDataTuple(text, hashtags, mentions, url_domains)

tweet_data_extractor = make_tweet_data_extractor([extract_screen_name, extract_tweet_data])    

In [5]:
with open(FRIENDS_FILE) as fp:
    users_data = dict()
    for line in tqdm(fp):
        screen_name, user_data = extract_user_data(line)
        users_data[screen_name] = user_data

671it [00:00, 7657.80it/s]


In [6]:
with open(TWEETS_FILE) as fp:
    tweets_data = defaultdict(list)
    for line in tqdm(fp):
        screen_name, tweet_data = tweet_data_extractor(line)
        tweets_data[screen_name].append(tweet_data)

64382it [00:06, 9899.23it/s] 


## Create datasets for Starspace

In [7]:
def user_docs_types(filename, tweets_data, label="hashtags"):
    with open(filename, 'w+', encoding='utf-8') as fp:
        for screen_name, tweets in tweets_data.items():
            all_documents = [" ".join(getattr(tweet, label))
                             for tweet in tweets if getattr(tweet, label)]
            print("\t".join(all_documents).lower(), file=fp)

In [8]:
user_docs_types("data/user_hashtag_docs.txt", tweets_data, label="hashtags")
user_docs_types("data/user_mention_docs.txt", tweets_data, label="mentions")

In [9]:
%%bash
/homed/smishra8/Downloads/Starspace/starspace train\
    -trainFile data/user_hashtag_docs.txt -model models/user_hashtag_docs -trainMode 1 -fileFormat labelDoc

Arguments: 
lr: 0.01
dim: 10
epoch: 5
maxTrainTime: 8640000
loss: hinge
margin: 0.05
similarity: cosine
maxNegSamples: 10
negSearchLimit: 50
thread: 10
minCount: 1
minCountLabel: 1
label: __label__
ngrams: 1
bucket: 2000000
adagrad: 1
trainMode: 1
fileFormat: labelDoc
normalizeText: 1
dropoutLHS: 0
dropoutRHS: 0
Start to initialize starspace model.
Build dict from input file : data/user_hashtag_docs.txt
Loading data from file : data/user_hashtag_docs.txt
Total number of examples loaded : 612
Training epoch 0: 0.01 0.002

 ---+++                Epoch    0 Train error : 0.04161384 +++--- â
Training epoch 1: 0.008 0.002

 ---+++                Epoch    1 Train error : 0.01914659 +++--- â
Training epoch 2: 0.006 0.002

 ---+++                Epoch    2 Train error : 0.01437732 +++--- â
Training epoch 3: 0.004 0.002

 ---+++                Epoch    3 Train error : 0.01188871 +++--- â
Training epoch 4: 0.002 0.002

 ---+++                Epoch    4 Train error : 0.01305983 +++--- â

Read 0M words
Number of words in dictionary:  6926
Number of labels in dictionary: 0
Epoch: 98.5%  lr: 0.010000  loss: 0.045242  eta: <1min   tot: 0h0m0s  (19.7%)Epoch: 98.5%  lr: 0.008000  loss: 0.019683  eta: <1min   tot: 0h0m0s  (39.7%)Epoch: 98.5%  lr: 0.006000  loss: 0.011647  eta: <1min   tot: 0h0m0s  (59.7%)Epoch: 98.5%  lr: 0.004000  loss: 0.013096  eta: <1min   tot: 0h0m0s  (79.7%)Epoch: 98.5%  lr: 0.002000  loss: 0.015656  eta: <1min   tot: 0h0m0s  (99.7%)

In [10]:
%%bash
/homed/smishra8/Downloads/Starspace/starspace train\
    -trainFile data/user_mention_docs.txt -model models/user_mention_docs -trainMode 1 -fileFormat labelDoc -normalizeText 0

Arguments: 
lr: 0.01
dim: 10
epoch: 5
maxTrainTime: 8640000
loss: hinge
margin: 0.05
similarity: cosine
maxNegSamples: 10
negSearchLimit: 50
thread: 10
minCount: 1
minCountLabel: 1
label: __label__
ngrams: 1
bucket: 2000000
adagrad: 1
trainMode: 1
fileFormat: labelDoc
normalizeText: 0
dropoutLHS: 0
dropoutRHS: 0
Start to initialize starspace model.
Build dict from input file : data/user_mention_docs.txt
Loading data from file : data/user_mention_docs.txt
Total number of examples loaded : 649
Training epoch 0: 0.01 0.002

 ---+++                Epoch    0 Train error : 0.04337383 +++--- â
Training epoch 1: 0.008 0.002

 ---+++                Epoch    1 Train error : 0.01443337 +++--- â
Training epoch 2: 0.006 0.002

 ---+++                Epoch    2 Train error : 0.01172100 +++--- â
Training epoch 3: 0.004 0.002

 ---+++                Epoch    3 Train error : 0.00936020 +++--- â
Training epoch 4: 0.002 0.002

 ---+++                Epoch    4 Train error : 0.01163457 +++--- â

Read 0M words
Number of words in dictionary:  23051
Number of labels in dictionary: 0
Epoch: 98.6%  lr: 0.010000  loss: 0.045959  eta: <1min   tot: 0h0m0s  (19.7%)Epoch: 98.6%  lr: 0.008000  loss: 0.010431  eta: <1min   tot: 0h0m0s  (39.7%)Epoch: 98.6%  lr: 0.006000  loss: 0.014189  eta: <1min   tot: 0h0m0s  (59.7%)Epoch: 98.6%  lr: 0.004000  loss: 0.005728  eta: <1min   tot: 0h0m0s  (79.7%)Epoch: 98.6%  lr: 0.002000  loss: 0.011894  eta: <1min   tot: 0h0m0s  (99.7%)

In [11]:
from nltk.tokenize.casual import TweetTokenizer

In [12]:
Tokenizer = TweetTokenizer(reduce_len=True, preserve_case=False)

In [13]:
def tweet_types_data(filename, tweets_data, label="hashtags"):
    with open(filename, 'w+', encoding='utf-8') as fp:
        for screen_name, tweets in tweets_data.items():
            for tweet in tweets:
                labels = getattr(tweet, label)
                if not labels:
                    continue
                text = " ".join(Tokenizer.tokenize(tweet.text))
                labels = " ".join(["#{}".format(l) for l in labels])
                print("\t".join([text, labels]).lower(), file=fp)
                

In [14]:
tweet_types_data("data/tweet_hashtag.txt", tweets_data, label="hashtags")
tweet_types_data("data/tweet_mention.txt", tweets_data, label="mentions")

In [15]:
%%bash
/homed/smishra8/Downloads/Starspace/starspace train\
    -trainFile data/tweet_hashtag.txt -model models/tweet_hashtag -label '#' -normalizeText 0

Arguments: 
lr: 0.01
dim: 10
epoch: 5
maxTrainTime: 8640000
loss: hinge
margin: 0.05
similarity: cosine
maxNegSamples: 10
negSearchLimit: 50
thread: 10
minCount: 1
minCountLabel: 1
label: #
ngrams: 1
bucket: 2000000
adagrad: 1
trainMode: 0
fileFormat: fastText
normalizeText: 0
dropoutLHS: 0
dropoutRHS: 0
Start to initialize starspace model.
Build dict from input file : data/tweet_hashtag.txt
Loading data from file : data/tweet_hashtag.txt
Total number of examples loaded : 17602
Training epoch 0: 0.01 0.002

 ---+++                Epoch    0 Train error : 0.04234586 +++--- â
Training epoch 1: 0.008 0.002

 ---+++                Epoch    1 Train error : 0.01399041 +++--- â
Training epoch 2: 0.006 0.002

 ---+++                Epoch    2 Train error : 0.00687298 +++--- â
Training epoch 3: 0.004 0.002

 ---+++                Epoch    3 Train error : 0.00467202 +++--- â
Training epoch 4: 0.002 0.002

 ---+++                Epoch    4 Train error : 0.00343028 +++--- â
Saving model 

Read 0M words
Number of words in dictionary:  44690
Number of labels in dictionary: 7311
Epoch: 5.1%  lr: 0.009882  loss: 0.057750  eta: <1min   tot: 0h0m0s  (1.0%)Epoch: 10.2%  lr: 0.009765  loss: 0.058404  eta: <1min   tot: 0h0m0s  (2.0%)Epoch: 15.3%  lr: 0.009647  loss: 0.056842  eta: <1min   tot: 0h0m0s  (3.1%)Epoch: 20.4%  lr: 0.009647  loss: 0.053815  eta: <1min   tot: 0h0m0s  (4.1%)Epoch: 25.5%  lr: 0.009529  loss: 0.052572  eta: <1min   tot: 0h0m0s  (5.1%)Epoch: 30.6%  lr: 0.009529  loss: 0.051712  eta: <1min   tot: 0h0m0s  (6.1%)Epoch: 35.8%  lr: 0.009294  loss: 0.050858  eta: <1min   tot: 0h0m0s  (7.2%)Epoch: 40.9%  lr: 0.009294  loss: 0.049861  eta: <1min   tot: 0h0m0s  (8.2%)Epoch: 46.0%  lr: 0.009059  loss: 0.048855  eta: <1min   tot: 0h0m0s  (9.2%)Epoch: 51.1%  lr: 0.008941  loss: 0.047221  eta: <1min   tot: 0h0m0s  (10.2%)Epoch: 56.2%  lr: 0.008824  loss: 0.046472  eta: <1min   tot: 0h0m0s  (11.2%)Epoch: 61.3%  lr: 0.008824  loss: 0.045793  eta: <1min   tot:

In [16]:
%%bash
/homed/smishra8/Downloads/Starspace/starspace train\
    -trainFile data/tweet_mention.txt -model models/tweet_mention -label '#' -normalizeText 0

Arguments: 
lr: 0.01
dim: 10
epoch: 5
maxTrainTime: 8640000
loss: hinge
margin: 0.05
similarity: cosine
maxNegSamples: 10
negSearchLimit: 50
thread: 10
minCount: 1
minCountLabel: 1
label: #
ngrams: 1
bucket: 2000000
adagrad: 1
trainMode: 0
fileFormat: fastText
normalizeText: 0
dropoutLHS: 0
dropoutRHS: 0
Start to initialize starspace model.
Build dict from input file : data/tweet_mention.txt
Loading data from file : data/tweet_mention.txt
Total number of examples loaded : 43549
Training epoch 0: 0.01 0.002

 ---+++                Epoch    0 Train error : 0.04165869 +++--- â
Training epoch 1: 0.008 0.002

 ---+++                Epoch    1 Train error : 0.01560798 +++--- â
Training epoch 2: 0.006 0.002

 ---+++                Epoch    2 Train error : 0.00829013 +++--- â
Training epoch 3: 0.004 0.002

 ---+++                Epoch    3 Train error : 0.00571649 +++--- â
Training epoch 4: 0.002 0.002

 ---+++                Epoch    4 Train error : 0.00427885 +++--- â
Saving model 

Read 0M words
Number of words in dictionary:  85615
Number of labels in dictionary: 28459
Epoch: 2.0%  lr: 0.010000  loss: 0.056597  eta: <1min   tot: 0h0m0s  (0.4%)Epoch: 4.1%  lr: 0.009860  loss: 0.056300  eta: <1min   tot: 0h0m0s  (0.8%)Epoch: 6.2%  lr: 0.009814  loss: 0.056161  eta: <1min   tot: 0h0m0s  (1.2%)Epoch: 8.2%  lr: 0.009814  loss: 0.055461  eta: <1min   tot: 0h0m0s  (1.6%)Epoch: 10.3%  lr: 0.009767  loss: 0.056150  eta: <1min   tot: 0h0m0s  (2.1%)Epoch: 12.4%  lr: 0.009721  loss: 0.055862  eta: <1min   tot: 0h0m0s  (2.5%)Epoch: 14.4%  lr: 0.009721  loss: 0.055431  eta: <1min   tot: 0h0m0s  (2.9%)Epoch: 16.5%  lr: 0.009721  loss: 0.055123  eta: <1min   tot: 0h0m0s  (3.3%)Epoch: 18.6%  lr: 0.009721  loss: 0.054086  eta: <1min   tot: 0h0m0s  (3.7%)Epoch: 20.6%  lr: 0.009674  loss: 0.053679  eta: <1min   tot: 0h0m0s  (4.1%)Epoch: 22.7%  lr: 0.009581  loss: 0.053036  eta: <1min   tot: 0h0m0s  (4.5%)Epoch: 24.8%  lr: 0.009488  loss: 0.053222  eta: <1min   tot: 0h0

In [17]:
def tokens_data(filename, tweets_data):
    with open(filename, 'w+', encoding='utf-8') as fp:
        for screen_name, tweets in tweets_data.items():
            for tweet in tweets:
                text = " ".join(Tokenizer.tokenize(tweet.text)).lower()
                print(text, file=fp)
                

In [18]:
tokens_data("data/tokens.txt", tweets_data)

In [19]:
%%bash
/homed/smishra8/Downloads/Starspace/starspace train\
    -trainFile data/tokens.txt -model models/tokens -trainMode 5 -normalizeText 0

Arguments: 
lr: 0.01
dim: 10
epoch: 5
maxTrainTime: 8640000
loss: hinge
margin: 0.05
similarity: cosine
maxNegSamples: 10
negSearchLimit: 50
thread: 10
minCount: 1
minCountLabel: 1
label: __label__
ngrams: 1
bucket: 2000000
adagrad: 1
trainMode: 5
fileFormat: fastText
normalizeText: 0
dropoutLHS: 0
dropoutRHS: 0
Start to initialize starspace model.
Build dict from input file : data/tokens.txt
Loading data from file : data/tokens.txt
Total number of examples loaded : 64400
Training epoch 0: 0.01 0.002


Read 1M words
Number of words in dictionary:  124239
Number of labels in dictionary: 0
bash: line 2: 18047 Floating point exception/homed/smishra8/Downloads/Starspace/starspace train -trainFile data/tokens.txt -model models/tokens -trainMode 5 -normalizeText 0
