# TF-IDF
In this notebook we include the key word extraction that is in the report.

In [1]:
import json 
import nltk
import re
import pandas as pd
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer 
import numpy as np


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [2]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\matij\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\matij\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
data = pd.read_csv("dataset/data.csv", sep = ";")
data.head()

Unnamed: 0,text,label
0,fucks sake go away stupid anon — ^ https://t....,abusive
1,Damn dean just put Corbin to sleep. That Match...,abusive
2,@TheRealCamerota THAT BEER BUYING FREAKING IDI...,abusive
3,what idiot called them antacids and not afterb...,abusive
4,RT @gogglepossum: Don't you hate people that p...,abusive


## Select the label for key word extraction here!!!

#### Available labels

In [4]:
set(data["label"].tolist())

{'abusive',
 'benevolent',
 'cyberbulling',
 'hate',
 'hateful',
 'identity',
 'insult',
 'obscene',
 'offensive',
 'profane',
 'racism',
 'sexism',
 'spam',
 'threat',
 'toxic'}

#### Label

In [5]:
label = "racism"

In [6]:
df = data[data["label"] == label]

labels = df["label"].tolist()
print(len(labels))

texts = df["text"].tolist()
print(len(texts))


148
148


#### Preprocessing our tweets by removing retweet text RT, hyperlinks, hashtags, taggs @, new lines, and zero length tweets

In [7]:
def preprocess(A, labels):
    B = []
    labels_new = []
    for i in range(len(A)):

        text = A[i]
        # remove old style retweet text "RT"
        text = re.sub(r'^RT[\s]+', '', text)

        # remove hyperlinks
        text= re.sub(r'https?:\/\/.*[\r\n]*', '', text)

        # remove hashtags
        # only removing the hash # sign from the word
        text = re.sub(r'#', '', text)

        # remove tagging @
        text = re.sub(r"(?:\@|https?\://)\S+", "", text)

        # remove new line \n
        text = re.sub("\n", "", text)

        # remove zero length tweets
        if(len(text) == 0 ):
            continue

        B.append(text)
        labels_new.append(labels[i])
    return B, labels_new

In [8]:
texts, labels = preprocess(texts, labels)
len(texts)

140

#### Merge lists into one large list of tweets. Filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation).

In [9]:

tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
"""
def tokenize(text):
    filtered_tokens = []
    tweet_tokens = tokenizer.tokenize(text)

    for token in tweet_tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

"""
def tokenize_only(text):
    # First tokenize by sentence, then by word to ensure that punctuation is caught as it's own token.
    #tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    tokens = tokenizer.tokenize(text)
    filtered_tokens = []
    # Filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation).
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens
stemmer = SnowballStemmer("english")


def tokenize_and_stem(text):
    #tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    tokens = tokenizer.tokenize(text)
    filtered_tokens = []
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)

    #exclude stopwords from stemmed words
    stems = [stemmer.stem(t) for t in filtered_tokens]
    
    return stems



In [10]:
texts

['These girls are the equivalent of the irritating Asian girls a couple years ago. Well done, 7. MKR',
 "Drasko they didn't cook half a bird you idiot mkr",
 'Hopefully someone cooks Drasko in the next ep of MKR',
 "of course you were born in serbia...you're as fucked as A Serbian Film MKR",
 'So Drasko just said he was impressed the girls cooked half a chicken.. They cooked a whole one  MKR',
 '"He can\'t be a server at our restaurant, that beard makes him look like a terrorist." Everyone laughs. fuckthanksgiving',
 '   SANTA JUST *IS* WHITE',
 '  Why do people even talk about white privilege when the majority of food stamp recipients are white people!!',
 ' In his Ferguson testimony, Darren Wilson said he thought "it looks like a demon" before he shot Mike Brown. Ironic word…',
 ' I spent the morning at the Board of Elections getting maps/data to start registering every black person in Ferguson. ht…',
 ' Eric Holder from ferguson: "I understand that mistrust. I am the Attorney Genera

In [11]:
totalvocab_stemmed = []
totalvocab_tokenized = []
for i in texts:
    allwords_stemmed = tokenize_and_stem(i)
    totalvocab_stemmed.extend(allwords_stemmed)
    
    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)

# Create a pandas DataFrame
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)

print("There are '{}' items in our data frame.".format(str(vocab_frame.shape[0])))
print("Data frame contents: \n{}".format(vocab_frame.head(10)))

There are '2111' items in our data frame.
Data frame contents: 
              words
these         these
girl          girls
are             are
the             the
equival  equivalent
of               of
the             the
irrit    irritating
asian         asian
girl          girls


#### TF-IDF

Add stopwords that often occur

In [12]:
stopword_list = stopwords.words("english")
stopword_list.extend(["u"])
stopword_list.extend(["i'v"])
stopword_list.extend(["you'v"])
stopword_list.extend(["you'r"])
stopword_list.extend(["i'm"])
stopword_list.extend(["mkr", "whi"])


print(stopword_list)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [13]:
# Define vectorizer parameters

tfidf_vectorizer = TfidfVectorizer(
                        max_df=0.60, 
                        max_features=None,
                        min_df=0.05,  
                        use_idf=True, 
                        stop_words= stopword_list,
                        tokenizer=tokenize_and_stem, 
                        ngram_range=(1,1))

#tfidf_vectorizer = TfidfVectorizer()

# Fit the vectorizer to synopses texts
%time tfidf_matrix = tfidf_vectorizer.fit_transform(texts) 


print("TF-IDF matrix shape: {}".format(tfidf_matrix.shape))

Wall time: 49.9 ms
TF-IDF matrix shape: (140, 13)


  'stop_words.' % sorted(inconsistent))


### Extracted keywords:

In [14]:
feature_array = np.array(tfidf_vectorizer.get_feature_names())
tfidf_sorting = np.argsort(tfidf_matrix.toarray()).flatten()[::-1]

n = 5
top_n = feature_array[tfidf_sorting][:n]
print(top_n)

['peopl' 'white' 'terror' 'man' 'look']


# Following part is commented out - not used and tested

This were some initial experiments that are not included anywhere in the report.

#### Use cosine similarity

In [None]:
#terms = tfidf_vectorizer.get_feature_names()

#dist = 1 - cosine_similarity(tfidf_matrix)
#print(dist)

#### Use k-means clustering
##### First choose optimal number of clusters using the silhouette score

In [None]:
"""
range_clusters = list(range(2,10))
for num_clusters in range_clusters:
    km = KMeans(n_clusters=num_clusters)
    km.fit(tfidf_matrix)
    clusters = km.labels_.tolist()
    silhouette_avg = silhouette_score(tfidf_matrix, clusters)
    print(f"{num_clusters} clusters - silhouette: {silhouette_avg}")
"""

In [None]:
"""
num_clusters = 3
km = KMeans(n_clusters=num_clusters)

# Perform clustering
%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()
print("Clusters: {}".format(clusters))

print(len(clusters))
"""

#### Get number of tweets per cluster

In [None]:
"""
tweets = {"label": labels, "text": texts, "cluster": clusters}
frame = pd.DataFrame(tweets, index = [clusters] , columns = ["label", "text", "cluster"])

print("Number of tweets per cluster: \n{}".format(frame["cluster"].value_counts()))
"""

In [None]:
"""
print("Top terms per cluster:\n")

# Sort cluster centers by proximity to centroid.
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 
print(order_centroids)

# Helper function
def getClusterWords(cluster, n=10):
    words = []
    for ind in order_centroids[cluster, :n]: # Print 6 words per cluster
        words.append(vocab_frame.loc[terms[ind].split(' '),].values.tolist()[0][0])
    return ", ".join(words)

for i in range(num_clusters):
    print("Cluster {} words: {}".format(i, getClusterWords(i)))
    
    print("Cluster {} labels:".format(i), end='')
    
    for label in frame[frame["cluster"]==i]["label"].values.tolist():
        print(" {},".format(label), end='')
    print("\n")
"""

#### Plotting

In [None]:
"""
from sklearn.manifold import MDS

# Parameter "precomputed" because we provide a distance matrix.
# Parameter "random_state" so the plot is reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)

# Shape of the result will be (n_components, n_samples).
pos = mds.fit_transform(dist)  

xs, ys = pos[:, 0], pos[:, 1]
"""

In [None]:
"""
# Define colors for clusters.
cluster_colors = {0: '#1b9e77', 1: '#d95f02', 2: '#7570b3', 3: '#e7298a', 4: '#66a61e'}

# Define cluster names
cluster_names = dict([(i, getClusterWords(i, 3)) for i in range(5)])
"""

In [None]:
"""
import matplotlib.pyplot as plt
import matplotlib as mpl
# Enable to draw plot inline.
%matplotlib inline

# Create a data frame that has the result of the MDS plus the cluster numbers and titles.
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters)) 

# Group by cluster.
groups = df.groupby('label')


# Set up plot.
fig, ax = plt.subplots(figsize=(17, 9)) # set size
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling

# Iterate through groups to layer the plot.
# Note that we use the cluster_name and cluster_color dicts with the 'name' 
# lookup to return the appropriate color/label.
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12, 
            label=cluster_names[name], color=cluster_colors[name], 
            mec='none')
    ax.set_aspect('auto')
    ax.tick_params(\
        axis= 'x',         # changes apply to the x-axis
        which='both',      # both major and minor ticks are affected
        bottom='off',      # ticks along the bottom edge are off
        top='off',         # ticks along the top edge are off
        labelbottom='off')
    ax.tick_params(\
        axis= 'y',         # changes apply to the y-axis
        which='both',      # both major and minor ticks are affected
        left='off',        # ticks along the bottom edge are off
        top='off',         # ticks along the top edge are off
        labelleft='off')
    
ax.legend(numpoints=1)  #show legend with only 1 point

# Add label in x,y position with the label as the film title.
for i in range(len(df)):
    # old pandas:
    #ax.text(df.ix[i]['x'], df.ix[i]['y'], df.ix[i]['title'], size=8) 
    ax.text(df.loc[df.index[i], 'x'], df.loc[df.index[i], 'y'], df.loc[df.index[i], 'title'], size=8)  

# Uncomment the below to show or save the plot.
plt.show()                                       #show the plot
#plt.savefig('clusters_small_noaxes.png', dpi=200) # save the plot as an image 

plt.close()
"""