In [368]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from nltk.corpus import stopwords
from nltk.cluster import KMeansClusterer, \
cosine_distance
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import gensim
from gensim import corpora

In [369]:
# Q1
def cluster_kmean(train_file, test_file):
    # add your code here
    train=pd.read_json(train_file)
    test=pd.read_json(test_file)
    test_label = [test[1][x][0] for x in range(len(test))]

    tfidf_vect = TfidfVectorizer(stop_words="english",\
                             min_df=5) 

    dtm= tfidf_vect.fit_transform(train[0])
    
    num_clusters=3
    
    #### COSINE ####

    clusterer = KMeansClusterer(num_clusters, \
                        cosine_distance, \
                            repeats=25)

    clusters_cosine = clusterer.cluster(dtm.toarray(), \
                             assign_clusters=True)
    test_dtm = tfidf_vect.transform(test[0])

    predicted_cosine = [clusterer.classify(v) for v in test_dtm.toarray()]    
    
    confusion_df_cs = pd.DataFrame(list(zip(test_label, predicted_cosine)),\
                            columns = ["actual_class", "cluster"])

    crosstab_cs = pd.crosstab( index=confusion_df_cs.cluster, columns=confusion_df_cs.actual_class)
    cs_idx = crosstab_cs.idxmax(axis = 0)
    cs_idx = cs_idx.sort_values(ascending=True) 
    
    cluster_dict={0:'Travel & Transportation',\
              1:"Disaster and Accident",\
              2:'News and Economy'}
    predicted_target=[cluster_dict[i] \
                  for i in predicted_cosine]
    
    print("cosine")
    print(crosstab_cs)
    for index, val in cs_idx.iteritems():
        print("Cluster",val,": Topic",index)  
    print(metrics.classification_report\
      (test_label, predicted_target))


    ###### EUCLIDEAN ######
    
    km = KMeans(n_clusters=num_clusters, n_init=25).fit(dtm)
    clusters = km.labels_.tolist()
    
    test_dtm = tfidf_vect.transform(test[0])

    predicted_euclidean = km.predict(test_dtm)
    
    confusion_df_eu = pd.DataFrame(list(zip(test_label, predicted_euclidean)),\
                            columns = ["actual_class", "cluster"])
    crosstab_eu = pd.crosstab( index=confusion_df_eu.cluster, columns=confusion_df_eu.actual_class)
    eu_idx = crosstab_eu.idxmax(axis = 0)
    
    eu_idx = eu_idx.sort_values(ascending=True)
    cluster_dict={1:'Travel & Transportation',\
              0:"Disaster and Accident",\
              2:'News and Economy'}
    predicted_target=[cluster_dict[i] \
                  for i in predicted_euclidean]

    print("L2")
    print(crosstab_eu)
    for index, val in eu_idx.iteritems():
        print("Cluster",val,": Topic",index)
    print(metrics.classification_report\
      (test_label, predicted_target))


    return None

# Q1
print("Q1")
cluster_kmean('/Users/parthxparab/Documents/Fall 2019/BIA660/Lab5/Assignment/train_text.json', \
             '/Users/parthxparab/Documents/Fall 2019/BIA660/Lab5/Assignment/test_text.json')

Q1
cosine
actual_class  Disaster and Accident  News and Economy  Travel & Transportation
cluster                                                                       
0                                71                 5                      157
1                               129                 6                       18
2                                10               195                        9
Cluster 0 : Topic Travel & Transportation
Cluster 1 : Topic Disaster and Accident
Cluster 2 : Topic News and Economy
                         precision    recall  f1-score   support

  Disaster and Accident       0.84      0.61      0.71       210
       News and Economy       0.91      0.95      0.93       206
Travel & Transportation       0.67      0.85      0.75       184

               accuracy                           0.80       600
              macro avg       0.81      0.80      0.80       600
           weighted avg       0.81      0.80      0.80       600

L2
actual_class  Dis

In [382]:
# Q2
def cluster_lda(train_file, test_file):
    
    train=pd.read_json(train_file)
    test=pd.read_json(test_file)
    test_label = [test[1][x][0] for x in range(len(test))]
    topic_assign = None

    tf_vectorizer = CountVectorizer(max_df=0.90, \
                min_df=5, stop_words='english')
    tf = tf_vectorizer.fit_transform(train[0])

    tf_feature_names = tf_vectorizer.get_feature_names()
    
    
    num_topics = 3
#    lda = LatentDirichletAllocation(n_components=num_topics, \
#                                max_iter=25,verbose=1,
#                                evaluate_every=1, n_jobs=1,
#                                random_state=0).fit(tf)
#    topic_assign=lda.transform(tf)

            
    corpus = gensim.matutils.Sparse2Corpus(tf, \
                            documents_columns=False)

    id2word={idx:w for idx, w in \
    enumerate(tf_vectorizer.get_feature_names())}
    
    dictionary = corpora.Dictionary.from_corpus(corpus, \
                id2word=id2word)
    ldamodel = gensim.models.\
    ldamodel.LdaModel(corpus, alpha='asymmetric',\
                            num_topics = num_topics, \
                            id2word=id2word, \
                            iterations=20)
    tf_test = tf_vectorizer.fit_transform(test[0])

    
    test_corpus = gensim.matutils.Sparse2Corpus(tf_test, \
                    documents_columns=False)
    predict = ldamodel.get_document_topics(test_corpus)
    pred_val = []
    for i in range(len(list(predict))):
        val = max(list(predict)[i],key = lambda item:item[1])
        pred_val.append(val[0])
    print(pred_val)
    
    
    ########## PRINT ############
    
    confusion_df_cs = pd.DataFrame(list(zip(test_label, pred_val)),\
                            columns = ["actual_class", "cluster"])
    crosstab_cs = pd.crosstab( index=confusion_df_cs.cluster, columns=confusion_df_cs.actual_class)
    cs_idx = crosstab_cs.idxmax(axis = 0)
    cs_idx = cs_idx.sort_values(ascending=True) 
    
    cluster_dict={0:'Travel & Transportation',\
              1:"Disaster and Accident",\
              2:'News and Economy'}
    predicted_target=[cluster_dict[i] \
                  for i in pred_val]
    print(crosstab_cs)
    for index, val in cs_idx.iteritems():
        print("Cluster",val,": Topic",index)  
    print(metrics.classification_report\
      (test_label, predicted_target))

def find_similar(doc_id, topic_assign):
    
    docs = None
        
    return docs
cluster_lda('/Users/parthxparab/Documents/Fall 2019/BIA660/Lab5/Assignment/train_text.json', \
             '/Users/parthxparab/Documents/Fall 2019/BIA660/Lab5/Assignment/test_text.json')

[2, 1, 0, 1, 1, 1, 1, 0, 1, 2, 0, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 0, 2, 1, 2, 2, 1, 0, 0, 0, 0, 2, 2, 0, 1, 0, 1, 0, 1, 2, 0, 1, 0, 1, 1, 1, 2, 0, 2, 1, 1, 2, 0, 1, 1, 0, 1, 0, 2, 0, 1, 2, 0, 1, 2, 1, 0, 1, 1, 0, 1, 0, 2, 1, 1, 2, 1, 2, 2, 2, 1, 1, 0, 1, 1, 2, 0, 0, 2, 1, 1, 2, 1, 2, 0, 1, 1, 0, 0, 1, 1, 2, 1, 0, 1, 2, 0, 0, 1, 2, 2, 1, 0, 2, 0, 1, 1, 1, 2, 1, 1, 0, 2, 2, 0, 1, 1, 2, 1, 2, 0, 1, 1, 0, 1, 1, 0, 1, 2, 2, 0, 0, 1, 2, 0, 2, 1, 2, 1, 1, 1, 1, 0, 1, 1, 2, 1, 0, 2, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 2, 1, 1, 1, 0, 1, 0, 0, 1, 0, 2, 2, 1, 1, 1, 1, 0, 2, 0, 1, 0, 2, 1, 2, 0, 1, 0, 2, 2, 1, 2, 1, 0, 2, 1, 1, 1, 2, 1, 0, 0, 2, 2, 1, 1, 1, 1, 0, 1, 2, 2, 2, 1, 2, 0, 2, 0, 1, 0, 1, 1, 1, 1, 0, 0, 2, 2, 1, 1, 0, 2, 1, 2, 2, 2, 1, 2, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 2, 1, 0, 0, 0, 0, 0, 1, 2, 1, 1, 0, 2, 1, 1, 2, 1, 0, 2, 2, 1, 1, 0, 2, 0, 1, 2, 2, 1, 1, 1, 0, 1, 2, 1, 2, 2, 0, 1, 1, 0, 2, 2, 0, 2, 1, 2, 0, 0, 2, 0, 2, 1, 1, 1, 0, 2, 0, 1, 2, 0, 1, 1, 2, 1, 0, 0, 1, 2, 2, 1, 0, 


print("\nQ2")
topic_assign =cluster_lda('/Users/parthxparab/Documents/Fall 2019/BIA660/Lab5/Assignment/train_text.json', \
             '/Users/parthxparab/Documents/Fall 2019/BIA660/Lab5/Assignment/test_text.json')
doc_ids = find_similar(10, topic_assign)
print ("docs similar to {0}: {1}".format(10, doc_ids))