In [5]:
%%time
import numpy as np
from numpy.linalg import norm
import pandas as pd
import nltk
nltk.download('punkt')
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

n_tweets_to_read = 15000 # Choose the number of tweets to read

cosine_similarity = lambda a, b: np.inner(a, b) / norm(a) * norm(b) if norm(a) != 0.0 and norm(b) != 0.0 else 0.0

def TermDocumentMatrix(docs, docIDs=None):
    vectorizer = CountVectorizer(lowercase=True, stop_words=None)
    tdm = vectorizer.fit_transform(docs)
    tdm_feature_names = vectorizer.get_feature_names()
    #
    df = pd.DataFrame(tdm.toarray(), columns=tdm_feature_names, dtype="float64")
    if docIDs is not None:
        df.index = docIDs    
    return df

#Initialisation
ps = PorterStemmer()

# Read the data and remove duplicates if exist
tweets = []
with open("data/data.txt", encoding="utf-8") as file:
    for i, line in enumerate(file):
        if i < n_tweets_to_read:
            tweets.append(line)
        else:
            break  
tweets = list(set(tweets)) # delete duplicates
print("{} unique Tweets loaded\n".format(len(tweets)))

# Data Cleanning and prepocesing phase
cleanTweet = []
tweetIDs = []
tweetsProcessed = []
for tweet in tweets:
    try:
        doc = tweet.split("\t") 
        if len(doc)==7:
            cleanTweet.append(doc[5])
            tweetIDs.append(doc[0])     # add ID to a list
            doc = doc[5:6]   # remove everything that's note the tweet content      
            tok_doc = word_tokenize(" ".join(doc))    # tokenize remaining document
            stemmed_doc = [ps.stem(word) for word in tok_doc] 
            tweetsProcessed.append(" ".join(stemmed_doc))   #stemmed words
    except:
        pass

# TF-IDF construction part

# Terms frequency
tdf = TermDocumentMatrix(tweetsProcessed, tweetIDs)

# Document frequency
documentFrequencies = []
for index, series in tdf.iteritems(): 
    documentFrequencies.append(len(series.nonzero()[0])) 

# TF-IDF weight
tdf.applymap(lambda x: 1.0 + np.log10(x) if x > 0.0 else 0.0) # log frequency weight
idf = pd.Series(np.log10(len(tweets)/np.array(documentFrequencies))) # Inverse document frequency
tf_idf = tdf * idf.values

def printTopSimilarTweets(tf_idf=tf_idf,tweet='Hello', n=20):

    cleanT = tweet.split()   # tokenize remaining document
    vec = [ps.stem(word) for word in cleanT] 
    tf_idf=tf_idf.append(pd.Series(name='TestTweet'))
    tf_idf=tf_idf.fillna(0)
    for i in vec:
        try:
            tf_idf.loc['TestTweet', i] += 1
        except:
            pass
    a= tf_idf.loc['TestTweet']
    result = tf_idf.apply(lambda row: cosine_similarity(a, row), 
                          axis='columns').sort_values(ascending=False) 
    
    for i in range(0, n):
        try:
            print("{}: ".format(i+1) + cleanTweet[tweetIDs.index(result.index[i])] + "\n")
        except:
            pass

# output
printTopSimilarTweets(tweet="Hopefully the violence & unrest in Charlotte will come to an immediate end")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


14951 unique Tweets loaded





1: "Hopefully the violence & unrest in Charlotte will come to an immediate end. To those injured, get well soon. We need unity & leadership."

2: "Jeb Bush spent more than $40,000,000 in New Hampshire to come in 4 or 5, I spent $3,000,000 to come in 1st. Big difference in capability!"

3: Via @HuffPostPol by @_under_current: “Donald Trump Will End Outsourcing If President” http://www.huffingtonpost.com/lauren-windsor/donald-trump-will-end-outsourcing-if-president_b_7307426.html …

4: """Donald Trump ready to end @ApprenticeNBC for White House run"" http://www.washingtonexaminer.com/donald-trump-ready-to-end-apprentice-for-white-house-run/article/2560839 … via via @dcexaminer by @eScarry"

5: "Saudis just cut oil supply,making prices rise “immediately” while we are fighting ISIS for them http://www.bloomberg.com/news/2014-10-23/saudi-arabia-said-to-cut-crude-oil-supply-to-market-in-september.html?hootPostID=85412050974d86b20b6b79b103db2107 … What are we doing!"

6: The NFL should have i

In [6]:
printTopSimilarTweets(tweet="We will have to see what Russia's next move will be")

1: We will have to see what Russia's next move will be. They may have given him an out of an embarrassing situation or drove into deeper mess!

2: Phyllis Schlafly’s Eagle Forum: ‘National Review Will Be Defunct In The Next Year’http://www.breitbart.com/big-government/2016/01/25/exclusive-schlaflys-eagle-forum-national-review-will-defunct-next-year/ …

3: """@BWW_NYC: Major Jewish Newspaper to Pay Tribute to Joan Rivers & Donald Trump Next Month... http://www.broadwayworld.com/article/Major-Jewish-Newspaper-to-Pay-Tribute-to-Joan-Rivers-Donald-Trump-Next-Month-20150129 … @Joan_Rivers"

4: "Ford is MOVING jobs from Michigan to Mexico AGAIN! http://www.usatoday.com/story/money/cars/2015/07/09/ford-focus-cmax-mexico/29921307/ … As President, this will stop on Day One! Jobs will stay here."

5: "@MileyCyrus is on a very triky and slippery path right now.The right moves will lead to greatness, the wrong moves to oblivion! GUIDANCE."

6: """@CoachJMan: If the majority will bind together and 