# Tag Central
Here we will manipulate data from the data folder(US Election tags and Twitch plays pokemon tags).


In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AffinityPropagation
import numpy as np
import pandas as pd
import distance

In [2]:
twitch_data = pd.read_csv("data/Twitch Plays Pokemon Identifiers.csv", low_memory=False, encoding = "ISO-8859-1")
election_data = pd.read_csv("data/US Election Identifiers.csv", low_memory=False, encoding = "ISO-8859-1")

DATA = twitch_data

Twitch data and election data are loaded using panda.
Each dataset has two columns **Identifier** and **Subject**
The __tokenize_tags__ function below takes each row of tags, splits them up into arrays and puts them all together into a tags array.

In [3]:
def tokenize_tags(data):
    tags = data['Subject']
    all_tags = []
    for tag_string in tags:
        tag_string = str(tag_string)
        all_tags.extend(tag_string.split(","))
    all_tags = np.asarray(all_tags)
    return all_tags

TAGS = tokenize_tags(DATA)
print(list(TAGS[0:10]))
print("Total number of tags", len(TAGS))

['twitch', 'irc', 'twitch plays pokÃ©mon', 'tpp', 'pokÃ©mon', 'pokemon', 'pokemon red', 'pokÃ©mon red', 'red', 'nan']
Total number of tags 302


In [4]:
def cleanup(word):  
    if (word == None):
            print("Fingerprint keyer accepts a single string parameter")
    # remove whitespace around the string
    word = word.strip()
    # lowercase the string
    word = word.lower()
    # remove all punctuation and control chars, per https://stackoverflow.com/questions/5843518/remove-all-special-characters-punctuation-and-spaces-from-string
    word = ''.join(e for e in word if e.isalnum())
    # finds ASCII equivalent, per https://stackoverflow.com/questions/21701968/python-the-standard-library-ascii-function
    # not sure if the ascii() function is needed, leaving out to test
    word = ascii(word)
    # splits the word by whitespace, per https://www.tutorialspoint.com/python/string_split.htm
    word = word.split()
    #sort words
    word = sorted(word)
    # removes duplicates
    word = "".join(list(set(word)))
    # sorts array in place 
    return word.strip("'")

In [5]:
cleaned_tags = [cleanup(tag) for tag in TAGS]

In [6]:
tfidf_vectorizer=TfidfVectorizer()
tfidf_matrix=tfidf_vectorizer.fit_transform(cleaned_tags)
cs_similarity = np.array([cosine_similarity(tfidf_matrix[i:i+1],tfidf_matrix).flatten() for i in range(len(cleaned_tags))])

In [7]:
def cluster(data, tags, clean_tags):
    affprop = AffinityPropagation(preference=100)
    affprop.fit(data)
    clustered_tags = {}
    for cluster_id in np.unique(affprop.labels_):
        exemplar = cleaned_tags[affprop.cluster_centers_indices_[cluster_id]].lower()
        if exemplar in list(clustered_tags.keys()):
            arr = clustered_tags[exemplar]
        else:
            arr = []
        cluster = np.unique(tags[np.nonzero(affprop.labels_==cluster_id)])
        arr.extend(cluster.tolist())
        clustered_tags[exemplar] =  list(set(arr))
        cluster_str = ", ".join(cluster)
    print("No, of labels", len(clustered_tags.keys()))
    return clustered_tags

In [9]:
clustered_tags = cluster(cs_similarity, TAGS, cleaned_tags)
#clustered_tags = cluster(TAGS.reshape(-1, 1), TAGS, cleaned_tags)
len(clustered_tags.keys())

No, of labels 120


120

## Inverted Index
Here we are preparing an inverted index of our tags and identifiers

First, we convert the dataframe to a dictionary. The key is the identifier and the  the value is a string of comma separated tags.
The **make_inverted_index** function that converts this dictionary into a dictionary where the key is a tag and the value is a list of documents where is occurs. The documents are labelled by their position. e.g. 0,1,2,3. This is much easier to work with than their longer values e.g. live_user_twitchplayspokemon_1407024801

We create the popularity index and inverted index at the same time.

In [10]:
# Run this just once.this converts the csv data into a dictionary
DATA = DATA.set_index('Identifier').T.to_dict('list') 

In [11]:
def make_inverted_index(data, clustered_tags):
    inverted_index = {}
    popularity_index = {}
    for i, doc in enumerate(data):
        doc_tags = str(data[doc][0]).split(",")
        for tag in doc_tags:
            for clustered_tag in clustered_tags:
                arr = clustered_tags.get(clustered_tag, None)
                if arr:  arr.append(clustered_tag)
                else:continue
                if tag in arr:
                    if inverted_index.get(tag, None):
                        inverted_index[tag].append(i)
                        popularity_index[tag] += 1
                    else:
                        inverted_index[tag] = [i]
                        popularity_index[tag] = 1
                    break
    return inverted_index, popularity_index
                     
inverted_index, popularity_index = make_inverted_index(DATA, clustered_tags)
#print("Inverted index for trump", inverted_index["trump"])
#print("Popularity index for trump", popularity_index["trump"])
print(inverted_index)

{'twitch': [0, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 19, 20, 21, 22, 23, 24, 25, 29], 'irc': [0], 'twitch plays pokÃ©mon': [0], 'tpp': [0, 3, 18, 26, 28, 29, 30], 'pokÃ©mon': [0], 'pokemon': [0, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 30, 31, 38], 'pokemon red': [0], 'pokÃ©mon red': [0], 'red': [0], 'nan': [1], 'twitch plays pokemon': [2, 6, 18, 27, 31], 'screenshot': [3, 30], 'twitchplayspokemon': [3, 7, 15, 16, 17, 20, 23, 24, 25, 26, 27, 28, 30, 31, 33], 'democracy': [3, 30], 'emulator': [3, 4, 5, 7, 8, 9, 10, 12, 14, 15, 16, 17, 19, 20, 21, 23, 24, 25, 26, 27, 28, 29, 30], 'save file': [4, 5, 7, 8, 9, 10, 12, 14, 15, 16, 17, 19, 20, 21, 23, 24, 25, 27], 'student': [11], 'radio': [11], 'nerd': [11], 'geek': [11, 45], 'belfield': [11], 'fm': [11], 'ucd': [11], 'dublin': [11], 'ireland': [11], 'Gaming': [13, 37, 42], 'youtube': [13], 'GameTrailers': [13], 'moemon': [15], 'dec3199': [22], 'live': [22], 'comments': [22], 'n

In the following function we use cosine similarity to find similar tags.

In [14]:
def get_similar_tags(query, inverted_index):
    query = query.lower()
    array_of_docs = []
    array_of_tags = []
    first_loop = True
    for key in inverted_index:
        row = ",".join([ str(x) for x in inverted_index[key]])
        array_of_docs.append(row)
        array_of_tags.append(key.lower())
    array_of_docs = np.array(array_of_docs)  
    if query not in array_of_tags: return []
    t = array_of_tags.index(query)
    
    tfidf = TfidfVectorizer().fit_transform(array_of_docs)
    cosine_similarities = cosine_similarity(tfidf[t:t + 1], tfidf).flatten()
    most_similar_tags = cosine_similarities.argsort()[:-6:-1]
    #print("most_similar_tags", most_similar_tags)
    #for t in most_similar_tags:
        #print(array_of_docs[t:t+1])
    similar_tags = [array_of_tags[i] for i in most_similar_tags]
    return similar_tags[1:]

get_similar_tags("twitch", inverted_index)

['save file', 'emulator', 'pokemon', 'twitchplayspokemon']

In [17]:
get_similar_tags("twitchplayspokemon", inverted_index)

['pokemon', 'emulator', 'save file', 'twitch']

In [105]:
get_similar_tags("election", inverted_index)

[]

In [106]:
get_similar_tags("Youtube", inverted_index)

['gametrailers', 'gaming', 'oqt', 'bort']