In [7]:
import pandas
import itertools as it
import string
from collections import Counter
import json
import re
import numpy

In [8]:
with open('data/stopwords.txt') as fh:
    stopwords = set(fh.read().splitlines())

IOError: [Errno 2] No such file or directory: 'data/stopwords.txt'

In [9]:
def normalise(text):
    
    text = text.lower()
    text = text.replace("\n", " ")
    text = re.sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", " <url> ", text)
    text = text.replace("/", " / ")
    text = re.sub(r"@\w+", " <user> ", text)
    text = re.sub(r"[8:=;]['`\-][)d]+|[)d]+['`\-][8:=;]", " <smile> ", text)
    text = re.sub(r"[8:=;]['`\-]p+", " <lolface> ", text)
    text = re.sub(r"[8:=;]['`\-]\(+|\)+['`\-][8:=;]", " <sadface> ", text)
    text = re.sub(r"[8:=;]['`\-][\/|l*]", " <neutralface> ", text)
    text = text.replace(r"<3", " <heart> ")
    text = re.sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", " <number> ", text)
    text = text.replace(r"#", " <hashtag> ")
    text = re.sub(r"([!?.,()])+", r" \1 ", text)
    
    return text

In [4]:
def tokenise(text):
    return [t.strip() for t in text.strip().split()]

In [5]:
def parse_treatment_definitons(definion_file):
    treatment_set = set()
    treatment_mapping = {}
    max_length = 1
    for line in definion_file:
        line = line.strip()
        treatments = line.split(',')
        name = treatments[0]
        for treatment in treatments:
            treatment = tuple(tokenise(normalise(treatment)))
            max_length = max(len(treatment), max_length)
            treatment_set.add(treatment)
            treatment_mapping[treatment] = name
    return treatment_set, treatment_mapping, max_length

In [6]:
def window_sliding(iterable, n):
    gens = (
        it.chain(it.repeat(None, n - 1 - i), iterable, it.repeat(None, i))
        for i, gen in enumerate(it.tee(iterable, n)))
    return zip(*gens)

def find_treatments(tokens):
    found_treatments = []
    for x in range(max_treatment_length, 0, -1):
        for window in window_sliding(tokens, x):
            if tuple(window) in treatment_set:
                found_treatments.append(treatment_mapping[tuple(window)])
    return list(set(found_treatments)) or None

In [7]:
def filter_stopwords(tokens):
    return [t for t in tokens if len(t) > 2 and t not in stopwords]

In [8]:
def calculate_embeddings(tokens):
    token_embdgs = [embeddings.get(t) for t in tokens]
    token_embdgs = [t for t in token_embdgs if t is not None]
    return numpy.mean(token_embdgs, axis=0)

In [10]:
with open('data/merged_tweets.jsonl', 'r') as fh:
    df = pandas.read_json(fh.read(), lines=True, convert_dates=True)

IOError: [Errno 2] No such file or directory: 'data/merged_tweets.jsonl'

In [11]:
df = df.loc[(df['retweeted_status'].isnull()) & (df['lang'] == "en")]

In [12]:
df = df.loc[(df.created_at >= "2017-12-04") & (df.created_at <= "2018-02-03")]

In [13]:
df["extended_tweet"] = df["extended_tweet"].apply(lambda x: x["full_text"]  if type(x) == dict else x )

In [14]:
df.head()

Unnamed: 0,contributors,coordinates,created_at,display_text_range,entities,extended_entities,extended_tweet,favorite_count,favorited,filter_level,...,reply_count,retweet_count,retweeted,retweeted_status,source,text,timestamp_ms,truncated,user,withheld_in_countries
11631,,,2017-12-04 00:00:52,"[13, 140]","{u'user_mentions': [{u'indices': [0, 12], u'id...",,@germanBruin I watched a Ted talk about a blac...,0,False,low,...,0.0,0,False,,"<a href=""http://twitter.com/download/android"" ...",@germanBruin I watched a Ted talk about a blac...,2017-12-04 00:00:52.006,True,"{u'follow_request_sent': None, u'profile_use_b...",
11636,,,2017-12-04 00:09:44,,"{u'user_mentions': [], u'symbols': [], u'hasht...",,,0,False,low,...,0.0,0,False,,"<a href=""http://twitter.com/#!/download/ipad"" ...",😻😻😻Neither will I or loads of others. If THIS ...,2017-12-04 00:09:44.732,False,"{u'follow_request_sent': None, u'profile_use_b...",
11645,,,2017-12-04 00:21:11,,"{u'user_mentions': [], u'symbols': [], u'hasht...",,,0,False,low,...,0.0,0,False,,"<a href=""http://www.forextraining.com"" rel=""no...",#Tinnitus can be perceived in one or both ears...,2017-12-04 00:21:11.003,False,"{u'follow_request_sent': None, u'profile_use_b...",
11647,,,2017-12-04 00:25:03,,"{u'user_mentions': [], u'symbols': [], u'hasht...",,,0,False,low,...,0.0,0,False,,"<a href=""http://twitter.com/download/iphone"" r...",And now i just sit in silence...\n\nJust kiddi...,2017-12-04 00:25:03.859,False,"{u'follow_request_sent': None, u'profile_use_b...",
11648,,,2017-12-04 00:25:15,,"{u'user_mentions': [], u'symbols': [], u'hasht...",,,0,False,low,...,0.0,0,False,,"<a href=""http://bufferapp.com"" rel=""nofollow"">...",Have a weird half tinnitus half ears want to p...,2017-12-04 00:25:15.339,False,"{u'follow_request_sent': None, u'profile_use_b...",


In [15]:
df["text"] = df["extended_tweet"].combine_first(df["text"])

In [16]:
df["text"] = df["text"].apply(lambda x: x.replace("\n", " "))

In [17]:
df["user_name"] = df["user"].apply(lambda x: x["screen_name"])

In [18]:
df = df.loc[:, ["id", "created_at", "text", "retweet_count", "user_name"]]

In [19]:
df["tokens"] = df["text"].apply(lambda x: tokenise(normalise(x)))

In [20]:
len(df.user_name.unique())

7133

In [21]:
treatment_set, treatment_mapping, max_treatment_length = parse_treatment_definitons(open("data/treatment_definitons.txt", 'r'))

In [22]:
df['filtered_tokens'] = df['tokens'].apply(filter_stopwords)

In [None]:
word_counts = Counter()
for elem in df["filtered_tokens"]:
    word_counts += Counter(elem)

In [None]:
word_counts.most_common(1)

In [None]:
df['treatments'] = df['tokens'].apply(find_treatments)

In [None]:
df = df.loc[df['treatments'].notnull()]

In [None]:
len(df.user_name.unique())

In [None]:
df.to_csv("tweets_with_treatments.csv")

In [None]:
treatment_counts = Counter()
for elem in df["treatments"]:
    treatment_counts += Counter(elem)

In [None]:
treatment_counts.most_common(100)