In [None]:
import re
import pandas as pd
import preprocessor as p
import sys
sys.path.append("/Library/Python/2.7/site-packages")
from wordcloud import WordCloud
from nltk.probability import FreqDist, ConditionalFreqDist
from textblob import TextBlob   
import nltk
import string
from tqdm import tqdm
from nltk import ngrams
pd.set_option('display.max_colwidth', -1)

In [None]:
# Open the raw tweet dump (path is relative to the notebook's working directory).
# NOTE(review): the handle is not closed in this cell; it is consumed by the next one.
f = open("tweets_1.txt")

In [None]:
# Read the raw dump and stitch tweets that were split across several lines
# back into one record each. A line starting with "79" begins a new record
# (the next cell treats the first 18 characters as the tweet id); any other
# line is a continuation of the previous record.
data = f.readlines()
f.close()  # every line is in memory now, so release the file handle

clean_data = []
for line in data:
    text = line.strip()
    if line.startswith("79"):
        clean_data.append(text)
    elif clean_data:
        clean_data[-1] = clean_data[-1] + " " + text
    # else: a continuation line before any record has started has nothing to
    # attach to; drop it instead of failing on clean_data[-1].

In [None]:
# Split each stitched record into its id (first 18 characters) and its text
# (everything after the separator at position 18). String slicing cannot
# raise, so no exception handling is needed here.
tweet_records = [
    {'tweet_id': tweet[:18], 'text': tweet[19:]}
    for tweet in clean_data
    if tweet[:2] == "79"
]

df = pd.DataFrame(tweet_records)
df.head()

In [None]:
# Running tally of hashtag -> number of occurrences, filled in by parse_tweets.
all_hashtags = {}

def parse_tweets(tweet):
    """Extract the hashtags of one tweet and return its cleaned text.

    Side effect: increments the module-level ``all_hashtags`` counter once
    for every hashtag occurrence found.

    Returns a 3-tuple ``(clean_tweet, hashtags_str, n_hashtags)`` where
    ``clean_tweet`` is ``p.clean`` applied to the original tweet,
    ``hashtags_str`` is the space-joined lowercase hashtags (without the
    leading character of each match) and ``n_hashtags`` is their count.
    """
    # NOTE(review): str.decode() only exists on Python 2 byte strings.
    ascii_lowered = tweet.decode('ascii', 'ignore').encode('ascii').lower()
    found = p.parse(ascii_lowered).hashtags

    hashtags = []
    for item in (found if found is not None else []):
        tag = item.match[1:].lower()
        all_hashtags[tag] = all_hashtags.get(tag, 0) + 1
        hashtags.append(tag)

    # Cleaning is applied to the raw tweet, not the ASCII-lowered copy.
    return p.clean(tweet), " ".join(hashtags), len(hashtags)

In [None]:
# Parse every tweet once, then fan the resulting 3-tuples out into columns.
# NOTE(review): parse_tweets updates all_hashtags as a side effect, so the
# counts include rows dropped below and grow again if this cell is re-run.
parsed_rows = df['text'].map(parse_tweets)
df['tweet'], df['hashtags'], df['length'] = zip(*parsed_rows)
# Collapse rows that share the same raw text.
df = df.drop_duplicates(subset="text")

Wordcloud for all the secondary hashtags

In [None]:
# Build the word cloud directly from the hashtag -> count tally.
wc = WordCloud().generate_from_frequencies(all_hashtags)

import matplotlib.pyplot as plt
# Render the cloud as an image and hide the axes.
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Give every distinct hashtag an integer id, in the iteration order of all_hashtags.
concept_dict = {hashtag: position for position, hashtag in enumerate(all_hashtags)}

In [None]:
# Raw tweet texts as a plain list, in the row order of df.
tweets = df['text'].tolist()
print('# of tweets:', len(tweets))
# Preview the first five raw tweets.
for preview in tweets[:5]:
    print(preview)

Create a hashtag-based term-document matrix

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# One document per tweet: the space-joined hashtags produced by parse_tweets.
tweet_hashtags = df['hashtags'].tolist()
# use_idf=False turns off the IDF re-weighting of the vectorizer.
vectorizer = TfidfVectorizer(max_features = 20000, use_idf=False)
tf_vectors = vectorizer.fit_transform(tweet_hashtags)

# Reduce the hashtag vectors to 50 components; random_state fixes the seed.
svd = TruncatedSVD(n_components=50, random_state=0)
svd_tf_vectors = svd.fit_transform(tf_vectors)

Load GloVe vectors and tokenize tweets

In [None]:
import numpy as np
print("loading glove model...")
embedding_size = 100
glove_file = 'glove.twitter.27B.100d.txt'
# word -> embedding vector (float64 array built from the fields after the word)
glove = {}
# Iterate over the file object so only one line is held at a time;
# readlines() would first build a list of every line of the embedding file.
with open(glove_file) as glove_handle:
    for line in glove_handle:
        fields = line.replace("\n", "").split(" ")
        glove[fields[0]] = np.array(fields[1:], dtype='float64')

In [None]:
from nltk.tokenize import TweetTokenizer
# Module-level tokenizer instance, used by tweetVector below.
tknzr = TweetTokenizer()

def tweetVector(tweet):
    """Return the mean GloVe embedding of a tweet's tokens.

    The tweet is lowercased, '#' is replaced by a space, and the result is
    tokenized with the module-level ``tknzr``. Token vectors are looked up in
    the module-level ``glove`` dict; tokens without an entry contribute a
    zero vector but still count towards the average.

    Returns an array of length ``embedding_size`` (all zeros for a tweet
    with no tokens).
    """
    words = tknzr.tokenize(tweet.lower().replace("#", " "))
    total = np.zeros(embedding_size)
    # Iterate the tokens, not the characters of the raw tweet string.
    for word in words:
        vector = glove.get(word)
        if vector is not None:
            total += vector
    # Average over the token count; `or 1` guards the empty-tweet case.
    return total / float(len(words) or 1)

In [None]:
# Embed at most the first 25000 tweets. Slicing (instead of indexing over
# range(25000)) avoids an IndexError when fewer tweets are available.
tweet_vectors = [tweetVector(tweet) for tweet in tweets[:25000]]

In [None]:
# Convert the per-tweet vectors collected above into a single NumPy array.
tweet_vectors = np.array(tweet_vectors)

Use Denoising AutoEncoders to create representation vectors

In [None]:
# Project-local modules; their behavior is not visible from this notebook.
from DenoisingAutoencoder import DenoisingAutoencoder
from StackedDenoisingAutoencoders import StackedDenoisingAutoencoders

# Fit a single denoising autoencoder on the averaged GloVe tweet vectors.
# NOTE(review): StackedDenoisingAutoencoders is imported but not used in this cell.
da = DenoisingAutoencoder(n_hidden=400, verbose=True, training_epochs=5)
da.fit(tweet_vectors)

In [None]:
# Replace the tweet vectors with the autoencoder's latent representation.
# NOTE(review): rebinding tweet_vectors to its own transform makes this cell
# non-idempotent; running it twice feeds already-transformed vectors back in.
tweet_vectors = da.transform_latent_representation(tweet_vectors)

In [None]:
from sklearn.cluster import MiniBatchKMeans

num_clusters = 20
# random_state is fixed (matching the SVD cell) so that the initialisation and
# mini-batch sampling, and therefore the cluster labels, are reproducible.
kmeans_model = MiniBatchKMeans(n_clusters=num_clusters, init='k-means++', n_init=1,
                         init_size=1000, batch_size=1000, verbose=False, max_iter=1000,
                         random_state=0)
# Clusters are computed on the hashtag term vectors (tf_vectors), whose rows
# are in the same order as `tweets` (both come from df).
kmeans = kmeans_model.fit(tf_vectors)
kmeans_clusters = kmeans.predict(tf_vectors)
kmeans_distances = kmeans.transform(tf_vectors)

# Show the assigned cluster for the first 15 tweets only; slicing avoids
# walking the whole list just to skip every later element.
for i, tweet in enumerate(tweets[:15]):
    cluster = kmeans_clusters[i]
    print("Cluster " + str(cluster) + ": " + tweet + "(distance: " + str(kmeans_distances[i][cluster]) + ")")

Calculating Collocations

In [None]:
def get_score(bigram, wfd, bfd, n_xx):
    """Score a bigram from its 2x2 contingency table.

    Parameters:
        bigram: pair ``(w1, w2)``.
        wfd:    mapping word -> count, used for the marginals of w1 and w2.
        bfd:    mapping bigram -> count, used for the joint count.
        n_xx:   total count the table is normalised against.

    Returns ``(n_ii*n_oo - n_io*n_oi)**2`` divided by the product of the four
    marginals, or 0.0 when that product is zero (the score is undefined
    there and the original expression raised ZeroDivisionError).
    """
    n_ix = wfd[bigram[0]]          # occurrences of w1
    n_xi = wfd[bigram[1]]          # occurrences of w2
    n_ii = bfd[bigram]             # occurrences of (w1, w2)
    n_oi = n_xi - n_ii             # w2 not preceded by w1
    n_io = n_ix - n_ii             # w1 not followed by w2
    n_oo = n_xx - n_ii - n_oi - n_io

    denominator = (n_ii + n_io) * (n_ii + n_oi) * (n_io + n_oo) * (n_oi + n_oo)
    if denominator == 0:
        return 0.0
    # float() keeps the division a true division on Python 2 as well.
    return float(n_ii * n_oo - n_io * n_oi) ** 2 / denominator

tknzr = nltk.TweetTokenizer()

wfd = FreqDist()   # word -> count
bfd = FreqDist()   # (w1, w2) -> count of adjacent occurrences
stopwords = nltk.corpus.stopwords.words('english')
# Set copy for O(1) membership tests inside the per-token filter.
stopword_set = set(stopwords)

for raw_tweet in tqdm(tweets):
    # Strip punctuation characters, lowercase, tokenize, drop stopwords.
    tweet = "".join(ch for ch in raw_tweet if ch not in string.punctuation)
    data_tokens = tknzr.tokenize(tweet.lower())
    tokens = [w for w in data_tokens if w.lower() not in stopword_set]

    # pad_right=True yields a final (last_token, None) window so that the
    # last token is also counted in wfd.
    for window in ngrams(tokens, 2, pad_right=True):
        w1 = window[0]
        if w1 is None:
            continue
        wfd[w1] += 1
        for w2 in window[1:]:
            if w2 is not None:
                bfd[(w1, w2)] += 1

# Keep only bigrams seen at least 3 times.
# items() works on both Python 2 and 3; iteritems() exists only on Python 2.
after_filter_bigrams = FreqDist()
for words, freq in bfd.items():
    if freq >= 3:
        after_filter_bigrams[words] = freq

# The table total must be consistent with the wfd marginals passed to
# get_score, so use the total word count rather than the (much smaller)
# total of the filtered bigram counts, which could make n_oo negative.
total_words = wfd.N()
score_FD = FreqDist()
for bigram in after_filter_bigrams:
    score_FD[bigram] = get_score(bigram, wfd, after_filter_bigrams, total_words)

In [None]:
def compute_distance(w1, w2):
    """Return the collocation score stored for the ordered pair (w1, w2).

    Pairs without an entry in the module-level ``score_FD`` score 0.0, so
    the result is always numeric (the original fell through and returned
    None, which cannot be used in arithmetic).
    """
    return score_FD.get((w1, w2), 0.0)

# Materialise the hashtags as a list first: on Python 3, passing the dict
# view from keys() to np.asarray yields a 0-d object array that cannot be
# iterated.
words = list(all_hashtags)
words = np.asarray(words) #So that indexing with a list will work
# Negated pairwise scores; rows follow w2 and columns follow w1.
lev_similarity = -1*np.array([[compute_distance(w1,w2) for w1 in words] for w2 in words])

Create skip-thought vectors

In [None]:
sys.path.append("skip-thoughts")
import skipthoughts
import numpy as np

model = skipthoughts.load_model()
encoder = skipthoughts.Encoder(model)
skip_vectors = []
skip_failures = 0  # tweets that could not be encoded and got a zero vector

for tweet in tweets:
    try:
        # NOTE(review): str.decode() only exists on Python 2 byte strings; on
        # Python 3 this line fails for every tweet and lands in the fallback.
        tweet = tweet.decode('ascii', 'ignore').encode('ascii').lower()
        # NOTE(review): a single string is passed here; confirm whether
        # encoder.encode expects a list of sentences instead.
        skip_vectors.append(encoder.encode(tweet, verbose=0))
    except Exception:
        # Best-effort: keep list positions aligned with `tweets`. Catching
        # Exception (not a bare except) lets Ctrl-C interrupt the loop.
        # 4800 is presumably the encoder's output width; TODO confirm.
        skip_failures += 1
        skip_vectors.append(np.zeros(4800))

if skip_failures:
    print("skip-thought encoding failed for " + str(skip_failures) + " tweets")

Clustering 

In [None]:
# NOTE(review): adjusted_mutual_info_score is imported but not used in this cell.
from sklearn.metrics import adjusted_mutual_info_score
import rcc
clusterer = rcc.RccCluster(measure='cosine')

# Cluster the first 25000 SVD-reduced hashtag vectors.
P = clusterer.fit(svd_tf_vectors[:25000])

LDA

In [None]:
import lda
from sklearn.feature_extraction.text import CountVectorizer
# Lowercase and replace '#' with a space for the first 25000 tweets.
# NOTE(review): this rebinds `tweets`, so later cells see the truncated,
# normalised list rather than the raw texts.
tweets = [tweet.lower().replace("#"," ") for tweet in tweets[:25000]]
cvectorizer = CountVectorizer(min_df=4, max_features=10000, stop_words='english')
cvz = cvectorizer.fit_transform(tweets)

n_topics = 15
n_iter = 2000
# NOTE(review): no seed is passed here; results may differ between runs - confirm.
lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter)
X_topics = lda_model.fit_transform(cvz)

In [None]:
n_top_words = 8
topic_summaries = []

topic_word = lda_model.topic_word_  # get the topic words
# get_feature_names() was removed from newer scikit-learn releases in favour
# of get_feature_names_out(); use whichever this installation provides.
if hasattr(cvectorizer, 'get_feature_names_out'):
    vocab = cvectorizer.get_feature_names_out()
else:
    vocab = cvectorizer.get_feature_names()
vocab_array = np.array(vocab)  # built once instead of once per topic

for i, topic_dist in enumerate(topic_word):
    # argsort is ascending; the reversed slice takes the n_top_words
    # highest-weighted terms, largest first.
    topic_words = vocab_array[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    summary = ' '.join(topic_words)
    topic_summaries.append(summary)
    print('Topic {}: {}'.format(i, summary))

Sentiment Analysis

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Module-level analyzer instance, used by get_sentiment_score below.
sid = SentimentIntensityAnalyzer()

def get_sentiment_score(tweet):
    """Return the (pos, neg, neu) entries of ``sid.polarity_scores(tweet)``."""
    scores = sid.polarity_scores(tweet)
    return tuple(scores[key] for key in ("pos", "neg", "neu"))

# Score every raw tweet once, then split the 3-tuples into three columns.
sentiment_rows = df['text'].map(get_sentiment_score)
df['pos_score'], df['neg_score'], df['neu_score'] = zip(*sentiment_rows)
df.head()

In [None]:
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

def language_detect(tweet):
    """Return the language code ``detect`` reports for ``tweet``.

    Returns None when langdetect cannot determine a language (it raises
    LangDetectException in that case), so that one undetectable tweet does
    not abort a whole ``.map`` over the column.
    """
    try:
        return detect(tweet)
    except LangDetectException:
        return None

# Detect the language only for cleaned tweets longer than 3 characters; the
# remaining rows get a missing value through index alignment.
has_enough_text = df['tweet'].str.len() > 3
df['lang'] = df.loc[has_enough_text, 'tweet'].map(language_detect)