In [1]:
import pandas as pd
import csv
import numpy as np
import re
import string
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
import sys
from nltk.stem.porter import PorterStemmer
import math
import hdbscan
import warnings
warnings.filterwarnings('ignore')
sys.setrecursionlimit(100000)

# Cluster

In [2]:
videos = []
with open("sentences_en.tsv", 'rt') as csvfile:
    reader = csv.reader(csvfile, delimiter='\t')
    for row in reader:
        videos.append([row[0], row[1], row[2], row[3]])
        
videos_en_new = pd.DataFrame(videos, columns=['id', 'description', 'channelTitle', 'sentence'])
del videos

In [3]:
print(videos_en_new.shape)
print(videos_en_new.loc[0])

(155345, 4)
id                                                    JnJCH-sPG3w
description     I OWN NOTHING!!! \n\nSong: Paramore "The Only ...
channelTitle                                        Vampire Tears
sentence                                          I OWN NOTHING!!
Name: 0, dtype: object


In [4]:
videos_en_new = videos_en_new[:100000]

In [5]:
stemmer = PorterStemmer()

def tokenize(line):
    if (line is None):
        line = ''
    printable = set(string.printable)
    line = ''.join(filter(lambda x: x in printable, line)) 
    stopwords = nltk.corpus.stopwords.words('english')

    tokenizer = nltk.RegexpTokenizer(r'[a-zA-Z]*\'[a-zA-Z]*|\w+')
    
    tokens = []
    
    line = re.sub(r'(http[s]?://|www.)(?:[a-zA-Z]|[0-9]|[$-_@.&+]*|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))*', '', line).lower()
    tokens.extend(tokenizer.tokenize(line))
    
    tokens_ = [f.strip(string.punctuation) for f in tokens]
    tokens_ = [f for f in tokens_ if f != '' and f not in stopwords and len(f) != 1]
    tokens_ = [f for f in tokens_ if not (f.isdigit() or f[0] == '-' and f[1:].isdigit())]
    tokens_ = [stemmer.stem(f) for f in tokens_]

    return tokens_


In [7]:
countVec = CountVectorizer(tokenizer=tokenize).fit(videos_en_new['sentence'])
#try with binary=True as well

In [8]:
lineVec = countVec.transform(videos_en_new['sentence'])

In [9]:
print(lineVec.shape)

(100000, 50550)


In [10]:
clusterer = hdbscan.HDBSCAN(min_cluster_size=5)

In [11]:
clusterer.fit(lineVec)

HDBSCAN(algorithm='best', allow_single_cluster=False, alpha=1.0,
    approx_min_span_tree=True, cluster_selection_method='eom',
    core_dist_n_jobs=4, gen_min_span_tree=False, leaf_size=40,
    match_reference_implementation=False, memory=Memory(location=None),
    metric='euclidean', min_cluster_size=5, min_samples=None, p=None,
    prediction_data=False)

In [12]:
videos_en_new["cluster"] = clusterer.labels_

In [19]:
with open('sentences_clustered.tsv', 'wt') as csvfile:
    writer = csv.writer(csvfile, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
    count = 0
    for index, row in videos_en_new.iterrows():
        writer.writerow([
            row["id"].encode('utf-8'), 
            row["description"].encode('utf-8'), 
            row["channelTitle"].encode('utf-8'), 
            row["sentence"].encode('utf-8'), 
            row.cluster,
        ])
    print(count)

0
