In [41]:
import pandas
import itertools as it
import string
from collections import Counter
import json
from gensim.models import Word2Vec
import nltk

In [3]:
# Translation table for str.translate(): every ASCII punctuation character is
# mapped to a single space (stored as code point -> code point).
punctuation_translator = dict.fromkeys(map(ord, string.punctuation), ord(" "))

In [4]:
# Read the stop-word file (one entry per line) into a set used for
# membership tests when filtering tokens.
with open('data/stopwords.txt') as fh:
    stopwords = {entry for entry in fh.read().splitlines()}

In [5]:
def normalise(text):
    """Return `text` lower-cased, with punctuation blanked out and whitespace collapsed.

    Punctuation characters are replaced through `punctuation_translator`,
    newlines become spaces, and every run of whitespace is reduced to a
    single space with no leading or trailing blanks.
    """
    without_punctuation = text.translate(punctuation_translator)
    single_line = without_punctuation.replace("\n", " ")
    words = single_line.lower().split()
    return " ".join(words)

In [6]:
def tokenise(text):
    """Split `text` into a list of whitespace-separated tokens.

    Args:
        text: The string to split.

    Returns:
        list[str]: The tokens in their original order. Empty or
        whitespace-only input yields an empty list.
    """
    # str.split() without a separator splits on runs of whitespace and never
    # produces empty strings. split(" ") returned [""] for empty input and ""
    # entries for consecutive spaces, which then leaked into the token lists.
    return text.split()

In [7]:
def parse_treatment_definitons(definion_file):
    """Build the treatment lookup structures from a definitions file.

    Each non-blank line is a comma-separated list of spellings. The first
    entry is the canonical treatment name; every entry (including the first)
    is normalised, tokenised and mapped back to that name.

    Args:
        definion_file: Iterable of text lines, e.g. an open file object.

    Returns:
        tuple: `(treatment_set, treatment_mapping, max_length)` where
        `treatment_set` is a set of token tuples, `treatment_mapping` maps
        each token tuple to its canonical name, and `max_length` is the
        largest token count of any entry (never less than 1).
    """
    treatment_set = set()
    treatment_mapping = {}
    max_length = 1
    for line in definion_file:
        line = line.strip()
        if not line:
            # A blank line would otherwise register an empty treatment
            # mapped to the empty name.
            continue
        aliases = line.split(',')
        # Drop whitespace left between the name and the following comma.
        name = aliases[0].strip()
        for alias in aliases:
            tokens = tuple(t for t in tokenise(normalise(alias)) if t)
            if not tokens:
                # Empty entry (e.g. a trailing comma or punctuation-only
                # alias): nothing to match against.
                continue
            max_length = max(len(tokens), max_length)
            treatment_set.add(tokens)
            treatment_mapping[tokens] = name
    return treatment_set, treatment_mapping, max_length

In [8]:
def window_sliding(iterable, n):
    """Return an iterator over every length-`n` window of `iterable`.

    The input is padded with `n - 1` `None` values on each side, so windows
    that overhang the start or end are included. For example, `n=2` over
    `[a, b]` gives `(None, a)`, `(a, b)`, `(b, None)`.

    Args:
        iterable: Any iterable, including one-shot iterators and generators.
        n: Window length.

    Returns:
        A `zip` iterator of `n`-tuples.
    """
    # Column i of the result is the input shifted right by (n - 1 - i)
    # positions. Each column must read from its own tee() copy: chaining the
    # original iterable n times only works for re-iterable inputs such as
    # lists and silently interleaves the items of a one-shot iterator.
    shifted = [
        it.chain(it.repeat(None, n - 1 - i), stream, it.repeat(None, i))
        for i, stream in enumerate(it.tee(iterable, n))
    ]
    return zip(*shifted)

def find_treatments(text):
    """Return the canonical names of all treatments mentioned in `text`.

    The text is normalised and tokenised, then every window of
    `max_treatment_length` down to 1 tokens is looked up in `treatment_set`.
    Matches are reported through `treatment_mapping`, longest window sizes
    first. Returns `None` when nothing matches.
    """
    words = tokenise(normalise(text))
    matches = [
        treatment_mapping[tuple(candidate)]
        for size in range(max_treatment_length, 0, -1)
        for candidate in window_sliding(words, size)
        if tuple(candidate) in treatment_set
    ]
    if not matches:
        return None
    return matches

In [9]:
def filter_stopwords(text):
    """Return the normalised tokens of `text` that are longer than two
    characters and not listed in `stopwords`."""
    kept = []
    for token in tokenise(normalise(text)):
        if len(token) <= 2 or token in stopwords:
            continue
        kept.append(token)
    return kept

In [10]:
# Load the merged tweets (JSON Lines: one JSON object per line).
# The path is handed straight to pandas: passing the file's contents as a
# literal JSON string is deprecated in recent pandas releases and needlessly
# copies the whole file into a Python string before parsing.
df = pandas.read_json('data/merged_tweets.jsonl', lines=True, convert_dates=True)

In [11]:
# Keep rows that have no "retweeted_status" value and whose "lang" is "en".
df = df[df['retweeted_status'].isna() & df['lang'].eq("en")]

In [12]:
# Keep tweets created in ISO weeks 49 and later, or weeks 1-2.
# `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0, so
# the ISO week is read from each timestamp's own isocalendar() instead, which
# works on every pandas version. Missing timestamps become NaN and therefore
# fail both comparisons, as they did with the old accessor.
iso_week = df['created_at'].apply(
    lambda ts: float('nan') if pandas.isnull(ts) else ts.isocalendar()[1]
)
df = df.loc[(iso_week > 48) | (iso_week < 3)]

In [13]:
# Where "extended_tweet" holds a dict, replace it with that dict's
# "full_text" entry; any other value (e.g. a missing marker) is kept as is.
df["extended_tweet"] = df["extended_tweet"].apply(
    lambda value: value["full_text"] if type(value) is dict else value
)

In [14]:
# Prefer the extended tweet text where one exists; rows whose
# "extended_tweet" is null fall back to the existing "text" value.
df['text'] = df['extended_tweet'].combine_first(df["text"])

In [15]:
# Flatten multi-line tweets onto a single line. The vectorised string
# accessor replaces the per-row lambda and passes missing values through
# instead of raising AttributeError on a non-string entry.
df["text"] = df["text"].str.replace("\n", " ")

In [16]:
# Pull the "screen_name" entry out of each row's "user" record.
df["user_name"] = df["user"].map(lambda user: user["screen_name"])

In [19]:
# Narrow the frame to five columns and preview the first five rows.
df = df.loc[:, ["id", "created_at", "text", "retweet_count", "user_name"]]
df.head()

Unnamed: 0,id,created_at,text,retweet_count,user_name
11631,937471728049147906,2017-12-04 00:00:52,@germanBruin I watched a Ted talk about a blac...,0,adrenalun
11636,937473962463973378,2017-12-04 00:09:44,😻😻😻Neither will I or loads of others. If THIS ...,0,Lunarteddy
11645,937476840893173761,2017-12-04 00:21:11,#Tinnitus can be perceived in one or both ears...,0,TinnitusErase
11647,937477817561837568,2017-12-04 00:25:03,And now i just sit in silence... Just kidding...,0,Tannerguest1
11648,937477865712603136,2017-12-04 00:25:15,Have a weird half tinnitus half ears want to p...,0,FourQ


In [20]:
# Order the tweets by creation time (ascending).
df = df.sort_values(by="created_at")

In [21]:
# Parse the treatment definitions inside a `with` block so the file handle is
# closed as soon as parsing finishes, rather than being left open for the
# lifetime of the kernel.
with open("data/treatment_definitons.txt", 'r') as definition_file:
    treatment_set, treatment_mapping, max_treatment_length = parse_treatment_definitons(definition_file)

In [22]:
# Per tweet: list of canonical treatment names found in the text, or None.
df['treatments'] = df['text'].map(find_treatments)

In [23]:
# Per tweet: normalised tokens with short words and stop words removed.
df['filtered_tokens'] = df['text'].map(filter_stopwords)

In [45]:
# Keep only the tweets in which at least one treatment was found.
df = df.loc[df['treatments'].notnull()]

# Train a small Word2Vec model on the first five remaining tweets.
temp_df = df.head(5)
sentences = temp_df["text"]
# code for word embeddings
# NOTE: tokens keep their original case, so 'tinnitus', 'Tinnitus' and
# 'TINNITUS' are separate vocabulary entries.
tokens = sentences.apply(nltk.word_tokenize)
print(tokens)
model = Word2Vec(tokens, min_count=1)
print(model)
# NOTE(review): `wv.vocab` is the gensim 3.x vocabulary API; gensim 4
# replaced it with `wv.key_to_index` -- adjust if the environment is upgraded.
words = list(model.wv.vocab)
print(words)
# Vectors are looked up through the KeyedVectors in `model.wv`; indexing the
# model object itself is deprecated and emits a DeprecationWarning.
print(model.wv['tinnitus'])

11649    [It, 's, like, my, tinnitus, is, on, steroids,...
11691    [Still, Searching, For, A, Tinnitus, Cure, ?, ...
11701    [Hi, John, ,, On, one, show, you, were, mentio...
11706    [I, added, a, video, to, a, @, YouTube, playli...
11798    [Still, Searching, For, A, Tinnitus, Cure, ?, ...
Name: text, dtype: object
Word2Vec(vocab=77, size=100, alpha=0.025)
['It', "'s", 'like', 'my', 'tinnitus', 'is', 'on', 'steroids', '.', 'Every', 'slight', 'movement', 'Had', 'it', 'a', 'couple', 'of', 'days', ',', 'if', 'anyone', 'can', 'shed', 'light', 'please', 'do', 'Still', 'Searching', 'For', 'A', 'Tinnitus', 'Cure', '?', 'Try', 'These', 'Ideas', '!', 'https', ':', '//t.co/njjWyQJmtA', '#', 'Hi', 'John', 'On', 'one', 'show', 'you', 'were', 'mentioning', 'cure', 'for', 're-iterate', 'what', 'help', 'should', 'seek', 'Thanks', 'I', 'added', 'video', 'to', '@', 'YouTube', 'playlist', '//t.co/3H9YLUbTCx', 'MOST', 'POWERFUL', 'TINNITUS', 'SOUND', 'THERAPY', '1', 'Hr|Tinnitus', 'Treatment', 'Ringi

  # This is added back by InteractiveShellApp.init_path()


In [25]:
# Write the current frame to "out.csv" in the working directory.
df.to_csv(path_or_buf="out.csv")

In [26]:
# Tally how often each treatment and each filtered token occurs.
treatment_counts = Counter()
word_counts = Counter()
# The columns are addressed by name. With itertuples() position 0 is the
# index, so the previous row[5] / row[6] actually read "user_name" and
# "treatments": the treatment tally counted the characters of the user name
# and the word tally counted treatments.
for treatments, tokens in zip(df['treatments'], df['filtered_tokens']):
    # update() adds in place (and accepts None for rows with no treatments),
    # avoiding a fresh Counter being built and merged for every row.
    treatment_counts.update(treatments)
    word_counts.update(tokens)