In [88]:
import numpy as np
import pandas as pd
import nltk
import os
import mpld3
from nltk.stem.snowball import SnowballStemmer
from sklearn import feature_extraction
from sklearn.cluster import KMeans
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [98]:
stopwords = nltk.corpus.stopwords.words('english')
stemmer = SnowballStemmer("english")
data_dir = '../data/azsecure'
model_dir = '../models/'
data = {
    'ansar1': {'path': os.path.join(data_dir, 'Ansar1.txt.gz'), 'delimiter': '\t'},
    'hackhound': {'path': os.path.join(data_dir, 'Hackhound.csv.gz'), 'delimiter': ','},
    'islamic_awakening': {'path': os.path.join(data_dir, 'IslamicAwakening.txt.gz'), 'delimiter': '\t'},
    'islamic_network': {'path': os.path.join(data_dir, 'IslamicNetwork.txt.gz'), 'delimiter': '\t'},
}
sites = sorted(data.keys())

# Functions

In [101]:
def get_dataframe(site):
    """
    Return Pandas Dataframe from site tsv/csv.
    """
    df = pd.read_csv(data[site]['path'], header=0, sep=data[site]['delimiter'],
                     error_bad_lines=False, warn_bad_lines=True)
    return df

def get_text_blob(df):
    """
    Concatenate all text in df 'Message' column into a blob of text.
    """
    blob = ''
    for index, row in df['Message'].iteritems():
        blob += '\n{}'.format(row)
    return blob 

def tokenize_and_stem(text):
    tokens = nltk.wordpunct_tokenize(text)
    clean_tokens = [w.lower() for w in tokens if w.isalpha()]
    stems = [stemmer.stem(token) for token in clean_tokens]
    return stems

def tokenize(text):
    tokens = nltk.wordpunct_tokenize(text)
    clean_tokens = [w.lower() for w in tokens if w.isalpha()]
    return clean_tokens

def save_model(model, name):
    """
    Pickle model in models directory using joblib.
    """
    filepath = os.path.join(model_dir, name)
    joblib.dump(model, filepath)
    
def load_model(name):
    """
    Get pickled model `name` from model directory
    """
    filepath = os.path.join(model_dir, name)
    return joblib.load(filepath)
        

# Build Corpus Vocabulary

In [72]:
# Aim here is index of stems-words used across all the sites.

total_words = []
total_stems = []
for site in sites:
    df = get_dataframe(site)
    print('[*] Built dataframe for {}'.format(site))
    text = get_text_blob(df)
    data[site]['text'] = text
    print('[*] Built text blob for {}'.format(site))
    stems = tokenize_and_stem(text)
    print('[*] Built word stems for {}'.format(site))
    words = tokenize(text)
    print('[*] Built word tokens for {}'.format(site))
    total_stems.extend(stems)
    total_words.extend(words)
print('Done')

b'Skipping line 29064: expected 11 fields, saw 12\nSkipping line 29065: expected 11 fields, saw 12\nSkipping line 29071: expected 11 fields, saw 12\nSkipping line 29072: expected 11 fields, saw 12\nSkipping line 29078: expected 11 fields, saw 12\nSkipping line 29338: expected 11 fields, saw 12\nSkipping line 29428: expected 11 fields, saw 12\nSkipping line 29436: expected 11 fields, saw 12\nSkipping line 29438: expected 11 fields, saw 12\n'


[*] Built dataframe for ansar1
[*] Built text blob for ansar1
[*] Built word stems for ansar1
[*] Built word tokens for ansar1
[*] Built dataframe for hackhound
[*] Built text blob for hackhound
[*] Built word stems for hackhound
[*] Built word tokens for hackhound


b'Skipping line 95059: expected 11 fields, saw 14\n'
b'Skipping line 138730: expected 11 fields, saw 14\nSkipping line 184781: expected 11 fields, saw 14\n'
  if self.run_code(code, result):


[*] Built dataframe for islamic_awakening
[*] Built text blob for islamic_awakening
[*] Built word stems for islamic_awakening
[*] Built word tokens for islamic_awakening
[*] Built dataframe for islamic_network
[*] Built text blob for islamic_network
[*] Built word stems for islamic_network
[*] Built word tokens for islamic_network
Done


In [73]:
vocab_df = pd.DataFrame({'words': total_words}, index = total_stems)
print('There are {} words in the vocabulary used across all sites.'.format(vocab_df.shape[0]))

There are 51235079 words in the vocabulary used across all sites.


# Site Similarity

In [74]:
site_texts = [data[site]['text'] for site in sites]
#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(site_texts) #fit the vectorizer to synopses

print(tfidf_matrix.shape)

CPU times: user 13min 11s, sys: 12 s, total: 13min 23s
Wall time: 13min 48s
(4, 200000)


In [106]:
terms = tfidf_vectorizer.get_feature_names()

In [75]:
dist = 1 - cosine_similarity(tfidf_matrix)

In [76]:
dist

array([[  2.22044605e-15,   9.99899296e-01,   6.02882248e-01,
          6.77900459e-01],
       [  9.99899296e-01,   0.00000000e+00,   9.99761427e-01,
          9.99601286e-01],
       [  6.02882248e-01,   9.99761427e-01,  -2.22044605e-15,
          1.22180109e-01],
       [  6.77900459e-01,   9.99601286e-01,   1.22180109e-01,
         -1.11022302e-15]])

# Clustering

In [86]:
num_clusters = 2

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

CPU times: user 4.05 s, sys: 16 ms, total: 4.06 s
Wall time: 4.11 s


In [87]:
clusters

[1, 0, 1, 1]

In [97]:
save_model(km, 'azsecure.pkl')

In [102]:
km = load_model('azsecure.pkl')

In [103]:
clusters = km.labels_.tolist()

In [104]:
clusters

[1, 0, 1, 1]

In [109]:
site_matrix = {'site': sites, 'text': site_texts, 'cluster': clusters}
sites_df = pd.DataFrame(site_matrix, index=[clusters])


In [110]:
sites_df['cluster'].value_counts()

1    3
0    1
Name: cluster, dtype: int64

In [119]:
for i in range(num_clusters):
    print('Cluster {}:'.format(i))
    print()
    print(sites_df.loc[sites_df['cluster'] == i]['site'].values.tolist())
    print()
    

Cluster 0:

['hackhound']

Cluster 1:

['ansar1', 'islamic_awakening', 'islamic_network']



In [120]:
print("Top terms per cluster:")
print()
#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        #print(' %s' % vocab_df.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
        print(' %s' % vocab_df.ix[terms[ind].split(' ')].values.tolist()[0][0], end=',')
    print() #add whitespace
    print() #add whitespace
    
    
print()
print()

Top terms per cluster:

Cluster 0 words: ff, ff, eax, dword, mov, ptr,

Cluster 1 words: allah, muslim, islamic, م, في, wa,



