In [1]:
import time
start_time = time.time()

In [2]:
import pandas as pd
import os
from bidi.algorithm import get_display
import re
import multiprocessing
import numpy as np

In [3]:
from scipy import sparse

In [4]:
import matplotlib.pyplot as plt

In [5]:
import pickle

In [6]:
file_type = 'thread'
field_name = 'title'

In [7]:
pickle_file = open(f'{file_type}_{field_name}_token_lists.pickle', "rb" )
corpus = pickle.load( pickle_file )
pickle_file.close()

In [8]:
corpus_temp = list()
for sentence in corpus:
    sentence_temp = list()
    for word in sentence:
        try:
            if len(word) > 1:
                sentence_temp.append(word)
        except:
            print(word)
    corpus_temp.append(' '.join(reversed(sentence_temp)))
corpus = corpus_temp

In [9]:
tf_idf_size = 30

In [10]:
##Creating a list of stop words and adding custom stopwords
import codecs
with codecs.open('stopwords.txt', 'r', encoding='utf8') as f:
    text = f.read()
stop_words = set(word for word in text.split('\r\n'))

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
import re
cv=CountVectorizer(max_df=0.8,stop_words=stop_words, max_features=tf_idf_size)
X=cv.fit_transform(corpus)

In [12]:
from sklearn.feature_extraction.text import TfidfTransformer
 
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(X)
# get feature names
feature_names=cv.get_feature_names()
 
# fetch document for which keywords needs to be extracted
tf_idf_vectors = tfidf_transformer.transform(X).todense()

In [13]:
tf_idf_vectors.shape

(23163, 30)

In [14]:
from sklearn.cluster import DBSCAN

In [15]:
db = DBSCAN(eps=0.75, min_samples=10, n_jobs=-1).fit(tf_idf_vectors)

In [16]:
labels = db.labels_

In [17]:
print(f'number of clusters are {len(set(labels)) - (1 if -1 in labels else 0)}')

number of clusters are 2


In [18]:
print(f'number of outliers {list(labels).count(-1)}')

number of outliers 38


In [19]:
grouped_labels = dict()
for i in range(len(labels)):
    label = labels[i]
    if not label in grouped_labels:
        grouped_labels[label] = list()
    grouped_labels[label].append(tf_idf_vectors[i,])

In [20]:
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""
    
    #use only topn items from vector
    sorted_items = sorted_items[:topn]
 
    score_vals = []
    feature_vals = []
    
    # word index and corresponding tf-idf score
    for idx, score in sorted_items:
        
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])
 
    #create a tuples of feature,score
    #results = zip(feature_vals,score_vals)
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    
    return results


In [21]:
from scipy.sparse import coo_matrix
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

In [22]:
for label in grouped_labels:
    print(f'cluster label {label}')
    print(f'number of documents {len(grouped_labels[label])}')
    avg_vector = sparse.csr_matrix(np.average(grouped_labels[label],axis=0))
    print(extract_topn_from_vector(feature_names,sort_coo(avg_vector.tocoo())))

cluster label 0
number of documents 13170
{}
cluster label 1
number of documents 9955
{'גיוס': 0.095, 'פרופיל': 0.068, 'חיל': 0.057, 'תפקיד': 0.055, 'גיבוש': 0.051, 'מיון': 0.05, 'קורס': 0.049, 'פטור': 0.042, 'טירונות': 0.041, 'עזרה': 0.04}
cluster label -1
number of documents 38
{'קורס': 0.204, 'קרבי': 0.187, 'צו': 0.113, 'טופס': 0.112, 'לוחם': 0.111, 'מיונים': 0.093, 'יחידה': 0.087, 'דחוף': 0.074, 'מתגייס': 0.071, 'תפקידים': 0.056}


In [25]:
for document in grouped_labels[0]:
    print(document)

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

In [23]:
from sklearn.metrics import silhouette_score
silhouette_score(X, db.labels_, sample_size=1000)

-0.10538584421053629

In [24]:
def get_values_of_silhouette_from_vec_size_and_eps(vec_size_values, eps_values,stop_words):
    silhouette_score_df = pd.DataFrame(columns=['vec_size','eps','nclusters','noutliers','silhouette_score'])
    for vec_size in vec_size_values:
        cv=CountVectorizer(max_df=0.8,stop_words=stop_words, max_features=vec_size)
        X=cv.fit_transform(corpus)

        tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
        tfidf_transformer.fit(X)
        # get feature names
        feature_names=cv.get_feature_names()

        # fetch document for which keywords needs to be extracted
        tf_idf_vectors = tfidf_transformer.transform(X).todense()
        for eps in eps_values:
            db = DBSCAN(eps=eps, min_samples=10, n_jobs=-1).fit(tf_idf_vectors)
            labels = db.labels_
            nclusters = len(set(labels)) - (1 if -1 in labels else 0)
            noutliers = list(labels).count(-1)
            try:
                score = silhouette_score(X, db.labels_)
            except:
                score  = -1
            new_row_df = pd.DataFrame([[vec_size,eps,nclusters,noutliers,score]],columns=silhouette_score_df.columns)
            silhouette_score_df = silhouette_score_df.append(new_row_df , ignore_index=True)
            print('finish to calc vec_size=%d and eps=%f nclusters=%d noutliers=%d score=%f'%(vec_size,eps,nclusters,noutliers,score), flush=True)
    return silhouette_score_df

In [None]:
pd_scores = get_values_of_silhouette_from_vec_size_and_eps([5,10,15,25,30,50,100,200,500,1000,1500,2000], [0.1,0.2,0.5,0.75,1,1.5,2,3],stop_words)
pd_scores.to_csv('cluster_preformace.csv')

finish to calc vec_size=5 and eps=0.100000 nclusters=12 noutliers=23 score=0.994078
finish to calc vec_size=5 and eps=0.200000 nclusters=12 noutliers=23 score=0.994078
finish to calc vec_size=5 and eps=0.500000 nclusters=9 noutliers=20 score=0.988142
finish to calc vec_size=5 and eps=0.750000 nclusters=6 noutliers=4 score=0.982976
finish to calc vec_size=5 and eps=1.000000 nclusters=1 noutliers=0 score=-1.000000
finish to calc vec_size=5 and eps=1.500000 nclusters=1 noutliers=0 score=-1.000000
finish to calc vec_size=5 and eps=2.000000 nclusters=1 noutliers=0 score=-1.000000
finish to calc vec_size=5 and eps=3.000000 nclusters=1 noutliers=0 score=-1.000000
finish to calc vec_size=10 and eps=0.100000 nclusters=29 noutliers=166 score=0.982697
finish to calc vec_size=10 and eps=0.200000 nclusters=29 noutliers=166 score=0.982697
finish to calc vec_size=10 and eps=0.500000 nclusters=23 noutliers=140 score=0.971263
finish to calc vec_size=10 and eps=0.750000 nclusters=3 noutliers=29 score=0.

In [None]:
pd_scores

In [None]:
def send_email(user='dsakaidf@gmail.com', pwd='d54k4idf!', recipient='shkasta@post.bgu.ac.il',
               subject='finish expirement', body='finish the expirement'):
    import smtplib

    gmail_user = user
    gmail_pwd = pwd
    FROM = user
    TO = recipient if type(recipient) is list else [recipient]
    SUBJECT = subject
    TEXT = body

    # Prepare actual message
    message = """From: %s\nTo: %s\nSubject: %s\n\n%s
    """ % (FROM, ", ".join(TO), SUBJECT, TEXT)
    try:
        # SMTP_SSL Example
        server_ssl = smtplib.SMTP_SSL("smtp.gmail.com", 465)
        server_ssl.ehlo()  # optional, called by login()
        server_ssl.login(gmail_user, gmail_pwd)
        # ssl server doesn't support or need tls, so don't call server_ssl.starttls()
        server_ssl.sendmail(FROM, TO, message)
        # server_ssl.quit()
        server_ssl.close()
        print('successfully sent the mail')
    except:
        print("failed to send mail")


In [None]:
send_email()

In [None]:
elapsed_time = time.time() - start_time
print(elapsed_time)