# Tag Central
Here we will manipulate data from the data folder(US Election tags and Twitch plays pokemon tags).


In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AffinityPropagation
import numpy as np
import pandas as pd
import distance

In [2]:
twitch_data = pd.read_csv("data/Twitch Plays Pokemon Identifiers.csv", low_memory=False, encoding = "ISO-8859-1")
election_data = pd.read_csv("data/US Election Identifiers.csv", low_memory=False, encoding = "ISO-8859-1")

DATA = election_data

Twitch data and election data are loaded using panda.
Each dataset has two columns **Identifier** and **Subject**
The __tokenize_tags__ function below takes each row of tags, splits them up into arrays and puts them all together into a tags array.

In [3]:
def tokenize_tags(data):
    tags = data['Subject']
    all_tags = []
    for tag_string in tags:
        tag_string = str(tag_string)
        all_tags.extend(tag_string.split(","))
    all_tags = np.asarray(all_tags)
    return all_tags

TAGS = tokenize_tags(DATA)
print(list(TAGS))
print("Total number of tags", len(TAGS))

Total number of tags 45007


## Clustering
The levenshtein distance calculates how similar words are to each other based on how many steps it would take to convert one
word to the other using deletion, and creation of new characters. This method is not as effective as cosine similarity. It's also very slow.

In [4]:
lev_similarity = -1 * np.array([[distance.levenshtein(t1.lower(),t2.lower()) for t1 in TAGS] for t2 in TAGS])
lev_similarity

array([[  0,  -4, -15, ...,  -7,  -8,  -7],
       [ -4,   0, -19, ...,  -6,  -6,  -6],
       [-15, -19,   0, ..., -18, -18, -18],
       ..., 
       [ -7,  -6, -18, ...,   0,  -6,   0],
       [ -8,  -6, -18, ...,  -6,   0,  -6],
       [ -7,  -6, -18, ...,   0,  -6,   0]])

In [6]:
print(TAGS[0])
print(lev_similarity[0])
print(TAGS[10])

twitch
[  0  -4 -15  -5  -8  -7 -11 -12  -6  -6   0 -14  -7  -5  -7  -9 -12  -8
  -7   0   0  -7  -9  -7   0  -7  -9  -7 -14   0  -7  -9  -7 -12   0  -7
  -9  -7   0  -7  -9  -7   0  -7  -9  -7  -7   0  -6  -6  -6  -6  -7  -6
  -5  -6  -7   0  -7  -9  -7  -6  -6 -10   0  -7  -9  -7   0  -7  -9  -7
 -12  -6   0  -7  -9  -7 -12   0  -7  -9  -7 -12 -14  -5  -7   0  -7  -9
  -7   0  -7  -9  -7 -12   0  -7  -9  -7  -7   0  -5  -8  -7  -7 -10  -6
  -5   0  -7  -9  -7 -12   0  -7  -9  -7 -12  -9   0  -7  -9  -7 -12  -5
 -12  -5  -7  -6 -12  -7  -6 -12 -14  -7  -7  -9 -12 -12  -5  -6 -12  -7
  -3  -5  -7  -5   0  -6  -7  -5 -10  -7  -9  -6  -5 -12  -8  -7  -5 -12
 -14  -6  -7 -12 -12  -7   0 -10 -11 -30 -12 -12  -7   0 -10 -11 -30 -12
  -7   0 -10 -11 -30 -12  -7   0 -10 -11 -30  -6  -5  -6 -20 -13 -18 -19
 -15 -15 -15 -26 -17 -20 -20 -28  -7 -36 -15 -16 -20 -27 -11  -6  -5 -14
 -43 -19 -24 -16 -18 -19 -16 -10 -11 -16  -7 -39 -19 -12 -12 -11 -18 -14
 -19 -11  -5  -7  -6 -11  -5  -7  -6 -14  -7

Here we use TFIDF vectorization to convert words to numbers and use cosine similarity function to determine how similar words are to each other. It is quick but not perfect as seen below.

In [None]:
tfidf_vectorizer=TfidfVectorizer()
tfidf_matrix=tfidf_vectorizer.fit_transform(TAGS)
cs_similarity = np.array([cosine_similarity(tfidf_matrix[i:i+1],tfidf_matrix).flatten() for i in range(len(TAGS))])

print(cs_similarity[0])
print(type(cs_similarity[0]))
words = [TAGS[i] for i,x in enumerate(cs_similarity[0]) if x > 0.5 ]
print(len(words))
print(words)

This function uses Affinity Propagation to cluster words that are most similar together. It outputs a dictionary that looks like this :

{ 'Most popular tag called exemplar' : [ tags similar to exemplar comma separated ] }

We can explore other clustering algorithms as well.

In [4]:
def cluster(data, tags):
    affprop = AffinityPropagation(affinity="precomputed", damping=0.5)
    affprop.fit(data)
    clustered_tags = {}
    print(len(affprop.labels_))
    print(affprop.labels_)
    print(len(affprop.cluster_centers_indices_))
    print(np.unique(affprop.labels_))
    for cluster_id in np.unique(affprop.labels_):
        exemplar = tags[affprop.cluster_centers_indices_[cluster_id]].lower()
        if exemplar in list(clustered_tags.keys()):
            arr = clustered_tags[exemplar]
        else:
            arr = []
        cluster = np.unique(tags[np.nonzero(affprop.labels_==cluster_id)])
        arr.extend(cluster.tolist())
        clustered_tags[exemplar] =  list(set(arr))
        cluster_str = ", ".join(cluster)
    print("No, of labels", len(clustered_tags.keys()))
    return clustered_tags

Cosine similarity works a little better than levenshtein distaance in predicting more similar words.
We can explore other ways of doing this.

In [5]:
#clustered_tags  = cluster(lev_similarity)
clustered_tags = cluster(cs_similarity, TAGS)
clustered_tags

NameError: name 'cs_similarity' is not defined

## Popularity Index
Here we will prepare a popularity index dictionary that will map each (exemplar) tag to a count. The count indicates how many documents have that tag. 
This will be used in autocompletion

In [64]:
def popularity(clustered_tags, all_tags):
    popularity_index = {}
    for exemplar in clustered_tags:
        count = 0
        arr = clustered_tags[exemplar]
        arr.append(exemplar)
        for tag in all_tags:
            if tag and tag in arr:
                count +=1
        popularity_index[exemplar] = count
    return popularity_index
                
popularity_index = popularity(clustered_tags, TAGS)
popularity_index

{'3 hit combo podcast': 3,
 'anime': 3,
 'battle': 2,
 'belfield': 1,
 'chroma': 2,
 'comedy': 3,
 'democracy': 2,
 'emulator': 38,
 'funny': 6,
 'gametrailers': 1,
 'gaming': 5,
 'geek': 2,
 'jolly swag men': 3,
 'live stream': 5,
 'nerds': 3,
 'news': 3,
 'nintendo': 1,
 'pinball': 2,
 'podcast': 10,
 'pokemon': 49,
 'pokemon x and y': 3,
 'pokã©mon': 9,
 'pokã©mon trading card game online': 5,
 'projectrevotpp': 4,
 'radio': 1,
 'red': 3,
 'rune factory 4': 1,
 'save file': 28,
 'screenshot': 4,
 'sesame street': 2,
 'sgm': 2,
 'technology': 3,
 'television': 2,
 'titanfall': 3,
 'tpp': 7,
 'twitch': 38,
 'twitchplayspokemon': 16,
 'ucd': 1,
 'video games': 15,
 'vito gesualdi': 5,
 'vitozone': 5,
 'youtube': 3}

## Inverted Index
Here we are preparing an inverted index of our tags and identifiers

First, we convert the dataframe to a dictionary. The key is the identifier and the  the value is a string of comma separated tags.
The **make_inverted_index** function that converts this dictionary into a dictionary where the key is the (exemplar) tag and the value is a list of documents where is occurs. The documents are labelled by their position. e.g. 0,1,2,3. This is much easier to work with than their longer values e.g. live_user_twitchplayspokemon_1407024801

In [21]:
type(DATA)
len(DATA.keys())
DATA.keys()
DATA = DATA.set_index('Identifier').T.to_dict('list')
DATA['tppvietcrystal']

['twitchplayspokemon,tpp,pokemon,revo,projectrevotpp,emulator,sgm']

In [66]:
def make_inverted_index(data, clustered_tags):
    inverted_index = {}
    for i, doc in enumerate(data):
        doc_tags = str(data[doc][0]).split(",")
        for j, exemplar in enumerate(clustered_tags):
            arr = clustered_tags[exemplar]
            arr.append(exemplar)
            for tag in doc_tags:
                if tag in arr:
                    if inverted_index.get(exemplar, None):
                        inverted_index[exemplar].append(i)  
                    else:
                        inverted_index[exemplar] = [i]
    return inverted_index
                     
inverted_index = make_inverted_index(DATA, clustered_tags)
inverted_index

{'3 hit combo podcast': [28, 42, 44],
 'anime': [11, 40, 41],
 'battle': [22, 38],
 'belfield': [11],
 'chroma': [11, 46],
 'comedy': [22, 42, 44],
 'democracy': [3, 30],
 'emulator': [1,
  3,
  4,
  5,
  7,
  8,
  9,
  10,
  11,
  11,
  12,
  14,
  15,
  16,
  17,
  19,
  20,
  21,
  22,
  23,
  24,
  24,
  25,
  26,
  26,
  27,
  28,
  28,
  29,
  30,
  30,
  30,
  30,
  38,
  39,
  39,
  42,
  48],
 'funny': [37, 37, 37, 37, 42, 44],
 'gametrailers': [13],
 'gaming': [13, 37, 42, 44, 46],
 'geek': [11, 45],
 'jolly swag men': [48, 48, 48],
 'live stream': [22, 32, 34, 35, 36],
 'nerds': [40, 41, 46],
 'news': [42, 44, 45],
 'nintendo': [22],
 'pinball': [43, 47],
 'podcast': [40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
 'pokemon': [0,
  2,
  3,
  4,
  5,
  7,
  8,
  9,
  10,
  11,
  12,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  30,
  31,
  37,
  37,
  37,
  37,
  37,
  37,
  37,
  37,
  37,
  38,
  38,
  38,
  38,
  38,
  38,
  38,
  38

In [67]:
def get_similar_tags(tag, clustered_tags, inverted_index):
    array_of_exemplars = [key for key in clustered_tags]
    if tag.lower() not in array_of_exemplars:
        return []
    index  = array_of_exemplars.index(tag.lower())
    array_of_docs = [','.join(str(doc) for doc in inverted_index[exemplar]) for exemplar in array_of_exemplars]
    tfidf = TfidfVectorizer().fit_transform(array_of_docs)
    cosine_similarities = cosine_similarity(tfidf[index:index+1], tfidf).flatten()
    most_similar_tags = cosine_similarities.argsort()[:-5:-1]
    similar_tags = [array_of_exemplars[i] for i in most_similar_tags]
    return similar_tags[1:]

In [68]:
get_similar_tags("pokemon", clustered_tags, inverted_index)

['pokã©mon', 'youtube', 'funny']

Using the andSearch from homework 4
    

In [69]:
get_similar_tags("emulator", clustered_tags, inverted_index)

['screenshot', 'twitchplayspokemon', 'save file']

In [107]:
DATA = DATA.set_index('Identifier').T.to_dict('list')

In [108]:
def cleanup(word):
    return word.lower()

def make_inverted_index(data):
    inverted_index = {}
    popularity_index = {}
    for i, doc in enumerate(data):
        doc_tags = str(data[doc][0]).split(",")
        for tag in doc_tags:
            if inverted_index.get(tag, None):
                inverted_index[tag].append(i)
                popularity_index[tag] += 1
            else:
                inverted_index[tag] = [i]
                popularity_index[tag] = 1
    return inverted_index, popularity_index
                     
inverted_index, popularity_index = make_inverted_index(DATA)
print(inverted_index)
print(popularity_index)

554


In [112]:
def get_similar_tags(tag, inverted_index):
    array_of_docs = []
    array_of_tags = []
    first_loop = True
    for key in inverted_index:
        row = ",".join([ str(x) for x in inverted_index[key]])
        array_of_docs.append(row)
        array_of_tags.append(key)
    array_of_docs = np.array(array_of_docs)  
    if tag not in array_of_tags: return []
    t = array_of_tags.index(tag)
    
    tfidf = TfidfVectorizer().fit_transform(array_of_docs)
    cosine_similarities = cosine_similarity(tfidf[t:t + 1], tfidf).flatten()
    most_similar_tags = cosine_similarities.argsort()[:-5:-1]
    print("most_similar_tags", most_similar_tags)
    for t in most_similar_tags:
        print(array_of_docs[t:t+1])
    similar_tags = [array_of_tags[i] for i in most_similar_tags]
    return similar_tags[1:]

get_similar_tags("president", inverted_index)

most_similar_tags [ 409   17 7909 4363]
[ '30,161,297,326,330,364,366,375,391,404,423,445,925,995,1000,1041,1126,1155,1205,1232,1423,1439,1470']
[ '1,39,44,65,91,116,160,185,186,210,215,283,292,294,297,305,306,321,325,326,328,329,335,339,341,343,350,353,356,359,364,370,375,376,377,378,381,383,385,387,391,392,397,400,423,440,446,456,473,492,524,548,562,759,864,896,897,898,912,925,933,941,954,955,963,968,971,975,976,977,992,995,996,998,1000,1028,1032,1041,1044,1070,1081,1126,1131,1136,1155,1164,1174,1190,1205,1214,1215,1228,1230,1232,1235,1332,1420,1423,1439,1466,1470,1472,1474,1528,1574,1581,1584,1585,1597,1598,1599,1601']
['1423,1439']
['404,1439']


['election', 'politicians', 'trump rally']

{'twitch plays pokemon': [42, 44, 46, 39]}
