# Tag Central
Here we will manipulate data from the data folder(US Election tags and Twitch plays pokemon tags).


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AffinityPropagation
import numpy as np
import pandas as pd
import distance

In [22]:
twitch_data = pd.read_csv("data/Twitch Plays Pokemon Identifiers.csv", low_memory=False, encoding = "ISO-8859-1")
election_data = pd.read_csv("data/US Election Identifiers.csv", low_memory=False, encoding = "ISO-8859-1")

DATA = election_data

Twitch data and election data are loaded using panda.
Each dataset has two columns **Identifier** and **Subject**
The __tokenize_tags__ function below takes each row of tags, splits them up into arrays and puts them all together into a tags array.

In [23]:
def tokenize_tags(data):
    tags = data['Subject']
    all_tags = []
    for tag_string in tags:
        tag_string = str(tag_string)
        all_tags.extend(tag_string.split(","))
    all_tags = np.asarray(all_tags)
    return all_tags

TAGS = tokenize_tags(DATA)
print(list(TAGS[0:10]))
print("Total number of tags", len(TAGS))

['Youtube', 'video', 'Entertainment', 'Superbocky', 'Ran', 'ran', 'ruu', 'Mcroll', 'Japanese', 'Ronald']
Total number of tags 45007


## Inverted Index
Here we are preparing an inverted index of our tags and identifiers

First, we convert the dataframe to a dictionary. The key is the identifier and the  the value is a string of comma separated tags.
The **make_inverted_index** function that converts this dictionary into a dictionary where the key is a tag and the value is a list of documents where is occurs. The documents are labelled by their position. e.g. 0,1,2,3. This is much easier to work with than their longer values e.g. live_user_twitchplayspokemon_1407024801

We create the popularity index and inverted index at the same time.

In [24]:
# Run this just once.this converts the csv data into a dictionary
DATA = DATA.set_index('Identifier').T.to_dict('list') 

In [87]:
def cleanup(word):  
    if (word == None):
            print("Fingerprint keyer accepts a single string parameter")
    # remove whitespace around the string
    word = word.strip()
    # lowercase the string
    word = word.lower()
    # remove all punctuation and control chars, per https://stackoverflow.com/questions/5843518/remove-all-special-characters-punctuation-and-spaces-from-string
    word = ''.join(e for e in word if e.isalnum())
    # finds ASCII equivalent, per https://stackoverflow.com/questions/21701968/python-the-standard-library-ascii-function
    # not sure if the ascii() function is needed, leaving out to test
    #word = ascii(word)
    # splits the word by whitespace, per https://www.tutorialspoint.com/python/string_split.htm
    word = word.split()
    # removes duplicates
    word = "".join(list(set(word)))
    # sorts array in place 
    return word

In [88]:
def make_inverted_index(data):
    inverted_index = {}
    popularity_index = {}
    for i, doc in enumerate(data):
        doc_tags = str(data[doc][0]).split(",")
        for tag in doc_tags:
            if inverted_index.get(tag, None):
                inverted_index[tag].append(i)
                popularity_index[tag] += 1
            else:
                inverted_index[tag] = [i]
                popularity_index[tag] = 1
    return inverted_index, popularity_index

In [105]:
def make_inverted_index_clean(data):
    inverted_index = {}
    popularity_index = {}
    for i, doc in enumerate(data):
        doc_tags = str(data[doc][0]).split(",")
        for tag in doc_tags:
            tag = cleanup(tag) #CLEANUP OCCURS HERE
            if inverted_index.get(tag, None):
                inverted_index[tag].append(i)
                popularity_index[tag] += 1
            else:
                inverted_index[tag] = [i]
                popularity_index[tag] = 1
    return inverted_index, popularity_index

In [106]:
#Generate clean and dirty indices
inverted_index, popularity_index = make_inverted_index(DATA)
inverted_index_clean, popularity_index_clean = make_inverted_index_clean(DATA)

In [107]:
def recommendOriginal(invIndex, invIndexClean, query):
    """Take an input of a partial string (e.g. 'Tru').
    Clean input.
    Access inverted index to find matching (cleaned) tags.
    Figure out which matching cleaned tags are most popuar.
    Recommend uncleaned version of most popular tag."""
    cleanQuery = cleanup(query)
    matchList = []
    #Find all clean tags that match the clean query
    for tag in invIndexClean:
        if cleanQuery in tag:
            matchList.append([tag, len(invIndexClean[tag])])
    #print("matchList:", matchList)
    #find most popular tag in matchList
    highest = 0
    topRecClean = ""
    for (tag, popularity) in matchList:
        #print("tag:", tag, "popularity:", popularity)
        if popularity > highest:
            highest = popularity
            topRecClean = tag
    #print(topRecClean)
    #print(highest)
    
    #What is original version of topRec with the highest popularity?
    #Data is dictionary of element:tags
    #Look in the dirty index
    #Find the multiple tags that clean to cleanquery
    #Recommend the most popular one
    dirtyMatches = []
    topRecDirty = ""
    highestDirty = 0
    
    for tag in invIndex:
        if topRecClean == cleanup(tag):
            dirtyMatches.append([tag, len(invIndex[tag])])
    
    #print("Dirty matches:", dirtyMatches)
    for (tag, popularity) in dirtyMatches:
        if popularity > highestDirty:
            highestDirty = popularity
            dirtyRec = tag
    return dirtyRec

In [108]:
print(recommendOriginal(inverted_index, inverted_index_clean, 'hillary'))

Hillary Clinton


In the following function we use cosine similarity to find similar tags.

In [135]:
def get_similar_tags(tag, inverted_index, inverted_index_clean, numRec):
    tag = cleanup(tag)
    array_of_docs = []
    array_of_tags = []
    first_loop = True
    for key in inverted_index_clean:
        row = ",".join([ str(x) for x in inverted_index_clean[key]])
        array_of_docs.append(row)
        array_of_tags.append(key)
    array_of_docs = np.array(array_of_docs)  
    if tag not in array_of_tags: return []
    t = array_of_tags.index(tag)
    
    tfidf = TfidfVectorizer().fit_transform(array_of_docs)
    cosine_similarities = cosine_similarity(tfidf[t:t + 1], tfidf).flatten()
    most_similar_tags = cosine_similarities.argsort()[:-(numRec + 1):-1]
    #print("most_similar_tags", most_similar_tags)
    #for t in most_similar_tags:
        #print(array_of_docs[t:t+1])
    similar_tags = [array_of_tags[i] for i in most_similar_tags]
    similar_Original_tags = []
    for tag in similar_tags:
        similar_Original_tags.append(recommendOriginal(inverted_index, inverted_index_clean, tag))
    return similar_Original_tags[0:]

get_similar_tags("president", inverted_index, inverted_index_clean, 10)

['President',
 'Election',
 'terrorists',
 'Hillary Clinton Campaign',
 'Bernie Sanders Voters',
 'money laundering',
 'Stolen Election',
 "I'm with Her",
 'terror funding',
 'america']

In [140]:
get_similar_tags("Hillary Clinton", inverted_index, inverted_index_clean, 10)

['Hillary Clinton',
 'Donald Trump',
 'Infowars',
 'Alex Jones',
 'News & Politics',
 'video',
 'Donald Trump',
 'Youtube',
 'Bernie Sanders',
 'Benghazi']

In [131]:
get_similar_tags("election", inverted_index, inverted_index_clean, 10)

['Election',
 'Hillary Clinton',
 'Hillary Clinton',
 'Donald Trump',
 'Infowars',
 'News & Politics',
 'Alex Jones',
 'Youtube',
 'video',
 'Hack']

In [102]:
get_similar_tags("election", inverted_index, inverted_index_clean)

['President', 'presidential election', 'activist', 'voters lists']

In [9]:
get_similar_tags("Youtube", inverted_index)

['video', 'News & Politics', '2016', 'PEG']

In [133]:
get_similar_tags("youtube", inverted_index, inverted_index_clean, 10)

['Youtube',
 'video',
 'News & Politics',
 '2016',
 'PEG',
 'Community Media',
 'Public Access TV',
 'Infowars',
 'Alex Jones',
 'Hillary Clinton']

In [141]:
get_similar_tags("trump", inverted_index, inverted_index_clean, 10)

['Donald Trump',
 'Hillary Clinton',
 'Donald Trump',
 'Donald Trump',
 'Hillary Clinton',
 'News & Politics',
 'Infowars',
 'obama',
 'Election',
 'video']

In [None]:
# 
# Trump returns different results than trump
