In [1]:
# Importing the required libraries.
import numpy as np
import pandas as pd
import pickle, zlib
from random import sample
import scipy.cluster.hierarchy as sch
from gensim.models.doc2vec import Doc2Vec
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans 
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import single, fcluster
from scipy.spatial.distance import pdist
from scipy.cluster import hierarchy
from scipy.cluster.hierarchy import single, cophenet
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.neighbors import NearestCentroid
from collections import Counter
import sys
import math
# Raise the interpreter's recursion limit to 5000 for the whole kernel session.
# NOTE(review): presumably needed because the dendrogram code recurses over
# deep linkage trees - confirm before lowering or removing.
sys.setrecursionlimit(5000)

In [2]:
#Downsampling
def sampling(data, limit=10000):
    """Randomly downsample ``data`` to at most ``limit`` items.

    Parameters
    ----------
    data : sequence
        Any object supporting ``len`` and integer indexing.
    limit : int, optional
        Maximum number of items to keep. Defaults to 10000, the value that
        used to be hard-coded.

    Returns
    -------
    tuple
        ``(data, ind)``. When ``len(data) > limit`` the first element is a new
        list of ``limit`` items drawn without replacement and ``ind`` lists
        their original positions *in draw order*, i.e. ``data[k]`` is the
        original item at position ``ind[k]``. Otherwise the input object is
        returned unchanged and ``ind`` is ``[0, 1, ..., len(data) - 1]``.
    """
    if len(data) > limit:
        # random.sample picks by position, so a range is enough here.
        ind = sample(range(len(data)), limit)
        data = [data[i] for i in ind]
    else:
        ind = list(range(len(data)))
    return data, ind

In [3]:
def fancy_dendrogram(*args, **kwargs):
    """Wrapper around ``dendrogram`` that annotates merge heights.

    Two extra keyword arguments are consumed here and not forwarded:

    * ``max_d`` - if truthy, a horizontal line is drawn at this height and,
      unless the caller passed ``color_threshold``, it is also used as the
      colour threshold.
    * ``annotate_above`` - only merges whose height is strictly greater than
      this value get a marker and a numeric label (default 0).

    Everything else is passed to ``dendrogram`` unchanged. When ``no_plot`` is
    truthy no matplotlib call is made. Returns the dict produced by
    ``dendrogram``.
    """
    cut_height = kwargs.pop('max_d', None)
    if cut_height and 'color_threshold' not in kwargs:
        kwargs['color_threshold'] = cut_height
    label_floor = kwargs.pop('annotate_above', 0)

    ddata = dendrogram(*args, **kwargs)

    # Nothing to decorate when the caller only wants the computed layout.
    if kwargs.get('no_plot', False):
        return ddata

    plt.title('Hierarchical Clustering Dendrogram (truncated)')
    plt.xlabel('sample index or (cluster size)')
    plt.ylabel('distance')
    links = zip(ddata['icoord'], ddata['dcoord'], ddata['color_list'])
    for xs, ys, colour in links:
        height = ys[1]
        if height > label_floor:
            # The horizontal bar of a link spans xs[1]..xs[2]; label its midpoint.
            mid_x = 0.5 * sum(xs[1:3])
            plt.plot(mid_x, height, 'o', c=colour)
            plt.annotate("%.3g" % height, (mid_x, height), xytext=(0, -5),
                         textcoords='offset points',
                         va='top', ha='center')
    if cut_height:
        plt.axhline(y=cut_height, c='k')
    return ddata

In [4]:
#choose method and metric
def choose_metric(data):
    """Return the linkage method and distance metric to cluster with.

    The choice is fixed: Ward linkage with Euclidean distance. ``data`` is
    accepted so callers keep a uniform signature, but it is not inspected.

    A previous experiment that tried every method/metric pair and compared
    cophenetic correlation coefficients was disabled (it lived in an inert
    string literal) and has been removed together with its unused locals.

    Returns
    -------
    tuple of str
        ``(method, metric)``, currently always ``('ward', 'euclidean')``.
    """
    use_method = 'ward'
    use_metric = 'euclidean'
    return use_method, use_metric

In [5]:
# Plot the dendrogram; a large jump in merge distance indicates a natural place to cut.
def plot_dendro(linkage_matrix, threshold=10):
    """Show a truncated dendrogram with a cut line drawn at ``threshold``.

    Parameters
    ----------
    linkage_matrix : array-like
        Linkage matrix as produced by ``sch.linkage``.
    threshold : number, optional
        Height of the horizontal cut line (also used as the colour
        threshold). Defaults to 10, which is the only value the previous
        loop-with-``break`` ever plotted.

    The earlier version also computed flat cluster labels for the threshold
    and discarded them; that unused work has been dropped.
    """
    fancy_dendrogram(
        linkage_matrix,
        truncate_mode='lastp',
        p=30,
        leaf_rotation=90.,
        leaf_font_size=12.,
        show_contracted=True,
        annotate_above=10,  # useful in small plots so annotations don't overlap
        max_d=threshold
        )
    plt.show()

In [31]:
#Eliminate outlier clusters
def eliminate_outlier(data,linkage_matrix,method,metric,thresh=50):
    """Pick the minimum cluster size that gives the best cophenetic fit.

    The linkage is cut at distance ``thresh``. For every cluster size ``art``
    in 1..100 that actually occurs, all clusters with fewer than ``art``
    members are dropped, the remaining points are re-clustered with
    ``method``/``metric`` and the cophenetic correlation coefficient is
    computed. The ``art`` with the highest coefficient wins; ties go to the
    larger ``art``.

    Parameters
    ----------
    data : sequence of vectors
        The observations ``linkage_matrix`` was built from.
    linkage_matrix : array-like
        Linkage of ``data``; only used for the initial flat clustering.
    method, metric : str
        Passed to ``sch.linkage`` when re-clustering; ``metric`` is also used
        for the pairwise distances the coefficient is measured against.
    thresh : number, optional
        Distance at which the original linkage is cut.

    Returns
    -------
    int or None
        The selected size threshold, or ``None`` when no cluster has a size
        in 1..100 (the previous version raised ``NameError`` in that case).
    """
    labels = sch.fcluster(linkage_matrix, thresh, criterion='distance')
    cluster_sizes = Counter(labels)
    sizes_present = set(cluster_sizes.values())

    best_coeff = None
    art_thresh = None
    coeff = None  # last coefficient computed; stays None if nothing is evaluated
    for art in range(1,101):
        if art not in sizes_present:
            continue
        small_clusters = {lab for lab, size in cluster_sizes.items() if size < art}
        # Set membership keeps the filtering linear in the number of points.
        dropped = {i for i, lab in enumerate(labels) if lab in small_clusters}
        revised_data = [data[i] for i in range(len(data)) if i not in dropped]
        print(len(revised_data),len(dropped),len(data))
        revised_linkage = sch.linkage(revised_data, method, metric=metric)
        # Compare against distances in the same metric the linkage was built with.
        coeff, _ = cophenet(revised_linkage, pdist(revised_data, metric=metric))

        if best_coeff is None or coeff >= best_coeff:
            best_coeff = coeff
            art_thresh = art
    print(art_thresh,coeff,best_coeff)
    return art_thresh

In [53]:
#Compute rank
def compute_scr(linkage_matrix,data,art_thresh,thresh=50):
    """Cluster ``data``, drop small clusters and score items against centroids.

    The linkage is cut at distance ``thresh``; clusters with at most
    ``art_thresh`` members are discarded. For every remaining item the score
    is ``(cosine_similarity(item, centroid of its cluster) + 1) / 2``.

    NOTE(review): clusters are dropped here when ``size <= art_thresh`` whereas
    ``eliminate_outlier`` evaluates ``size < art`` - confirm which is intended.

    Parameters
    ----------
    linkage_matrix : array-like
        Linkage of ``data``.
    data : sequence of numpy vectors
        Items are reshaped with ``.reshape(1, -1)``, so they must be arrays.
    art_thresh : int
        Largest cluster size that is treated as an outlier cluster.
    thresh : number, optional
        Distance at which the linkage is cut.

    Returns
    -------
    tuple
        ``(revised_data, labels, centroids, ranks, ind)`` where ``ind`` holds
        the positions in ``data`` that were kept (ascending), ``revised_data``
        the kept items, ``labels`` their cluster ids renumbered to 0..k-1 in
        ascending order of the original ids, ``centroids`` one row per
        renumbered cluster and ``ranks`` the per-item scores.
    """
    flat_labels = sch.fcluster(linkage_matrix, thresh, criterion='distance')
    cluster_sizes = Counter(flat_labels)
    outlier_clusters = {lab for lab, size in cluster_sizes.items() if size <= art_thresh}

    # Set membership instead of list membership: linear rather than quadratic.
    dropped = {i for i, lab in enumerate(flat_labels) if lab in outlier_clusters}
    ind = [i for i in range(len(data)) if i not in dropped]
    revised_data = [data[i] for i in ind]
    labels = [flat_labels[i] for i in ind]
    print(len(ind),len(revised_data),len(data))

    distinct_labels = sorted(set(labels))
    if len(distinct_labels)==1:
        # NearestCentroid is only used when there are several classes.
        centroids = [np.mean(revised_data,axis=0)]
    else:
        clf = NearestCentroid()
        clf.fit(revised_data, labels)
        centroids = clf.centroids_
    print(len(ind),len(revised_data),len(data),len(centroids))

    # Renumber cluster ids to 0..k-1 (ascending) so they index into centroids.
    renumber = {old: new for new, old in enumerate(distinct_labels)}
    labels = [renumber[lab] for lab in labels]

    ranks = []
    for i in range(len(labels)):
        similarity = cosine_similarity(centroids[labels[i]].reshape(1,-1),
                                       revised_data[i].reshape(1,-1))[0][0]
        ranks.append((similarity+1)/2)  # map cosine from [-1, 1] to [0, 1]
    return revised_data,labels,centroids,ranks,ind

In [47]:
#driver function
def driver_func(data1, thresh=100, art_thresh=0):
    """Run the clustering pipeline on one collection of vectors.

    Steps: downsample with ``sampling``, pick method/metric with
    ``choose_metric``, build the linkage, show the dendrogram with
    ``plot_dendro`` and score the items with ``compute_scr``.

    Parameters
    ----------
    data1 : sequence of vectors
        Input observations.
    thresh : number, optional
        Distance at which the linkage is cut. Defaults to 100, the value that
        was previously hard-coded.
    art_thresh : int, optional
        Cluster-size threshold forwarded to ``compute_scr``. Defaults to 0,
        the previously hard-coded value.

    Returns
    -------
    tuple
        ``(indx, result)`` where ``indx`` is the index list returned by
        ``sampling`` and ``result`` is whatever ``compute_scr`` returns.
    """
    data, indx = sampling(data1)
    method, metric = choose_metric(data)
    linkage_matrix = sch.linkage(data, method, metric=metric)
    plot_dendro(linkage_matrix)
    return indx, compute_scr(linkage_matrix, data, art_thresh, thresh)
    

In [48]:
#global centroid
def global_centroid(data):
    """Return the component-wise median of the vectors in ``data``.

    Despite the name, the value is a median (not a mean) taken along axis 0.
    """
    stacked = np.asarray(data)
    return np.median(stacked, axis=0)

In [49]:
#rank B,C
def get_rank(data,global_centroid):
    """Score every vector in ``data`` by similarity to ``global_centroid``.

    Each score is ``(cosine_similarity + 1) / 2``. Both the reference vector
    and the items are reshaped to a single row, so they must be numpy arrays.
    Returns a list with one score per item, in input order.
    """
    reference = global_centroid.reshape(1,-1)
    return [
        (cosine_similarity(reference, vec.reshape(1,-1))[0][0]+1)/2
        for vec in data
    ]
    

In [50]:
#get filtered
def get_filtered(data,indx,indA,art):
    """Map selected positions back through the sampling and filtering steps.

    Parameters
    ----------
    data : sequence
        Original, full-length collection (e.g. titles).
    indx : sequence of int
        Positions in ``data`` that were kept by downsampling, in the order the
        downsampled collection was built.
    indA : sequence of int
        Positions in the downsampled collection that survived filtering.
    art : sequence of int
        Positions in the filtered collection to return.

    Returns
    -------
    list
        ``data`` items corresponding to ``art``, in the order of ``art``.

    The previous implementation tested ``a in indx`` while walking ``data`` in
    ascending order. That lost the order of ``indx``, so whenever downsampling
    had shuffled the items the result no longer lined up with the vectors; it
    was also quadratic. Direct indexing fixes both.
    """
    sampled = [data[a] for a in indx]
    kept = [sampled[a] for a in indA]
    return [kept[a] for a in art]

In [55]:
#get pattern result
def _combine_ranks(rank_a, rank_b, rank_c):
    """Return ``a*b - c`` for each aligned triple of the three rank lists."""
    return [a*b-c for a, b, c in zip(rank_a, rank_b, rank_c)]


def _top_per_cluster(labels, scores, n_centroids):
    """Return positions of the highest-scoring items of every cluster.

    Up to ``int(100/n_centroids)+1`` positions are taken per cluster; clusters
    are visited in order of first appearance in ``labels``.
    """
    per_cluster = int(100/n_centroids)+1
    members = {}
    for pos, lab in enumerate(labels):
        positions, values = members.setdefault(lab, ([], []))
        positions.append(pos)
        values.append(scores[pos])
    selected = []
    for positions, values in members.values():
        order = np.argsort(np.array(values))
        selected += [positions[i] for i in order[-per_cluster:]]
    return selected


def get_pat_res(data1,data2):
    """Select representative items from two collections of vectors.

    Each collection is run through ``driver_func``. Every retained item then
    gets the score ``A*B - C`` where A is its rank against its own cluster
    centroid, B its rank against its own collection's global centroid and C
    its rank against the *other* collection's global centroid. The best items
    of each cluster are gathered and at most 100 of them are drawn at random
    per collection.

    Returns
    -------
    tuple
        ``(articles1, articles2, indA1, indA2, indx1, indx2)``: the chosen
        positions (into each collection's filtered data), the filter index
        lists from ``compute_scr`` and the sampling index lists from
        ``driver_func``.
    """
    indx1, out1 = driver_func(data1)
    (revised_data1, labels1, centroids1, rankA1, indA1) = out1
    indx2, out2 = driver_func(data2)
    (revised_data2, labels2, centroids2, rankA2, indA2) = out2

    gc1 = global_centroid(revised_data1)
    gc2 = global_centroid(revised_data2)
    rankB1 = get_rank(revised_data1, gc1)
    rankB2 = get_rank(revised_data2, gc2)
    # Cross ranks: similarity to the other collection's global centroid.
    rankC1 = get_rank(revised_data1, gc2)
    rankC2 = get_rank(revised_data2, gc1)

    scores1 = _combine_ranks(rankA1, rankB1, rankC1)
    scores2 = _combine_ranks(rankA2, rankB2, rankC2)

    articles1 = _top_per_cluster(labels1, scores1, len(centroids1))
    articles2 = _top_per_cluster(labels2, scores2, len(centroids2))

    chosen1 = sample(articles1,min(len(articles1),100))
    chosen2 = sample(articles2,min(len(articles2),100))
    return chosen1,chosen2,indA1,indA2,indx1,indx2

In [4]:
datasets = ['dataset_agriculture', 'dataset_development', 'dataset_environment', 'dataset_industrialization', 'dataset_lifestyle']
models = ['model_agriculture', 'model_development', 'model_environment', 'model_industrialization', 'model_lifestyle']

for dataset_name, model_name in zip(datasets,models):

    # Printing the collection name (file name without its 'dataset_' prefix).
    collection_name = dataset_name[8:]
    print('\nCollection:',collection_name.capitalize())

    # Loading the dataset and the model from the drive. The context manager
    # closes the file even if unpickling/decompression fails.
    # NOTE: pickle can execute arbitrary code - only load trusted files.
    with open('Datasets/'+dataset_name, 'rb') as file:
        dataset = pickle.loads(zlib.decompress(pickle.load(file)))
    model = Doc2Vec.load('Models/'+model_name)

    # Collecting the article_ids, and corresponding article_vectors for each class.
    temp_ids = [[] for _ in range(3)]
    temp_vectors = [[] for _ in range(3)]
    temp_datasets = [[] for _ in range(3)]
    temp_titles = [[] for _ in range(3)]
    seen_ids = [set() for _ in range(3)]  # per-class ids already collected
    for record in dataset:
        if record[-5]==619: # thiruvarur 16-17 Unemp
            group = 0
        elif record[-5]==620: #thanjavur 16-20 Unemp
            group = 1
        else:
            continue
        article_id = record[0]
        if article_id in seen_ids[group]:
            continue
        seen_ids[group].add(article_id)
        vector = model.docvecs[article_id]
        temp_ids[group].append(article_id)
        temp_titles[group].append(record[1])
        temp_vectors[group].append(vector)
        temp_datasets[group].append([article_id,vector])

    art1,art2,indA1,indA2,indx1,indx2 = get_pat_res(temp_vectors[0],temp_vectors[1])
    titles1 = get_filtered(temp_titles[0],indx1,indA1,art1)
    titles2 = get_filtered(temp_titles[1],indx2,indA2,art2)
    # Show at most ten randomly chosen titles per class.
    titles1 = sample(titles1,min(len(titles1),10))
    titles2 = sample(titles2,min(len(titles2),10))
    for k in titles1:
        print(k)
    print('\n\n')
    for k in titles2:
        print(k)
    print('\n\n')


Collection: Agriculture
agriculture0
Number of articles: 378 	Number of clusters: 11
Will monsoon turn saviour of samba?
Monsoon may be good but kuruvai prospects bleak
Grand Anicut to be opened today
Samba paddy cultivation picking up in Tiruvarur
Distressed over crop failure, another farmer in Tiruvarur dies
A dim Deepavali for delta farmers
Coconut growers seek remunerative support price
‘Use green manure for sustainable agriculture’
Heavy rain in Tiruvarur and Thanjavur
Intermittent showers drench central districts
Govt assures compensation for entire crop damage
Government caps coconut farmers’ relief, ‘clarifies’ after uproar
Delta districts get a respite from rain
Paddy inundated on large tracts of land
Over 5K held in delta as Cauvery stir spills on to streets
Central region gets seven berths in the Cabinet
CPI to launch rail rokos in delta districts on October 9
Delta farmer commits suicide
Crop failure claims five lives in the delta
Two farmers in Madurai, Tuticorin commit s

In [16]:
datasets = ['dataset_agriculture', 'dataset_development', 'dataset_environment', 'dataset_industrialization', 'dataset_lifestyle']
models = ['model_agriculture', 'model_development', 'model_environment', 'model_industrialization', 'model_lifestyle']

for dataset_name, model_name in zip(datasets,models):

    # Printing the collection name (file name without its 'dataset_' prefix).
    collection_name = dataset_name[8:]
    print('\nCollection:',collection_name.capitalize())

    # Loading the dataset and the model from the drive. The context manager
    # closes the file even if unpickling/decompression fails.
    # NOTE: pickle can execute arbitrary code - only load trusted files.
    with open('Datasets/'+dataset_name, 'rb') as file:
        dataset = pickle.loads(zlib.decompress(pickle.load(file)))
    model = Doc2Vec.load('Models/'+model_name)

    # Collecting the article_ids, and corresponding article_vectors for each class.
    temp_ids = [[] for _ in range(3)]
    temp_vectors = [[] for _ in range(3)]
    temp_datasets = [[] for _ in range(3)]
    temp_titles = [[] for _ in range(3)]
    seen_ids = [set() for _ in range(3)]  # per-class ids already collected
    for record in dataset:
        if record[-4]==370: #Bargarh 10-9 Agri
            group = 0
        elif record[-4]==372: #Sambalpur 10-15 Agri
            group = 1
        else:
            continue
        article_id = record[0]
        if article_id in seen_ids[group]:
            continue
        seen_ids[group].add(article_id)
        vector = model.docvecs[article_id]
        temp_ids[group].append(article_id)
        temp_titles[group].append(record[1])
        temp_vectors[group].append(vector)
        temp_datasets[group].append([article_id,vector])

    art1,art2,indA1,indA2,indx1,indx2 = get_pat_res(temp_vectors[0],temp_vectors[1])
    titles1 = get_filtered(temp_titles[0],indx1,indA1,art1)
    titles2 = get_filtered(temp_titles[1],indx2,indA2,art2)
    # Show at most ten randomly chosen titles per class.
    titles1 = sample(titles1,min(len(titles1),10))
    titles2 = sample(titles2,min(len(titles2),10))
    for k in titles1:
        print(k)
    print('\n\n')
    for k in titles2:
        print(k)
    print('\n\n')


Collection: Agriculture
218
agriculture0
Number of articles: 218 	Number of clusters: 12
Continuous ruckus in Odisha Assembly over farmer suicide
Odisha BJP demands judicial probe into farm deaths
No government relief to farmers who burnt pest-infected paddy crop in Odhisha
Odisha government directs collectors of eight districts to submit report on crop loss
BJD on farmer appeasement drive
BJD to project pro-farmer image at Bargarh rally
Farmer ends life over crop loss
Day after setting fire to his crop, farmer commits suicide
Farmer suicide: One dies, another survives bid
Villagers force officials to reveal suicide findings
Crop loss cause for 2 more farm deaths
2 more farmers end lives in Bargarh due to crop loss
BJP steps up protest over pest attack
Pest Menace back to haunt Farmers in Sambalpur
Jumbo scare
Massive protests likely at Shivraj Singh Chauhan's public rally in Odisha on Saturday
Modi to address farmer rally at Bargarh on Feb. 21
Cong takes potshots at PM for silence on

Analyse outliers

In [16]:
datasets = ['dataset_agriculture', 'dataset_development', 'dataset_environment', 'dataset_industrialization', 'dataset_lifestyle']
models = ['model_agriculture', 'model_development', 'model_environment', 'model_industrialization', 'model_lifestyle']

for dataset_name, model_name in zip(datasets[3:4],models[3:4]):

    # Printing the collection name (file name without its 'dataset_' prefix).
    collection_name = dataset_name[8:]
    print('\nCollection:',collection_name.capitalize())

    # Loading the dataset and the model from the drive. The context manager
    # closes the file even if unpickling/decompression fails.
    # NOTE: pickle can execute arbitrary code - only load trusted files.
    with open('Datasets/'+dataset_name, 'rb') as file:
        dataset = pickle.loads(zlib.decompress(pickle.load(file)))
    model = Doc2Vec.load('Models/'+model_name)

    # Collecting the article_ids, and corresponding article_vectors for each class.
    temp_ids = [[] for _ in range(3)]
    temp_vectors = [[] for _ in range(3)]
    temp_datasets = [[] for _ in range(3)]
    temp_titles = [[] for _ in range(3)]
    seen_ids = [set() for _ in range(3)]  # per-class ids already collected
    for record in dataset:
        if record[-5]==99: # Ganganagar
            group = 0
        elif record[-4]=='Agri' and record[-1]=='Fast':
            group = 1
        else:
            continue
        article_id = record[0]
        # Duplicates are checked against the class being filled. The old code
        # checked class 2 (never populated) while appending to class 1, so
        # repeated articles were never filtered out of the second class.
        if article_id in seen_ids[group]:
            continue
        seen_ids[group].add(article_id)
        vector = model.docvecs[article_id]
        temp_ids[group].append(article_id)
        temp_titles[group].append(record[1])
        temp_vectors[group].append(vector)
        temp_datasets[group].append([article_id,vector])

    art1,art2,indA1,indA2,indx1,indx2 = get_pat_res(temp_vectors[0],temp_vectors[1])
    titles1 = get_filtered(temp_titles[0],indx1,indA1,art1)
    titles2 = get_filtered(temp_titles[1],indx2,indA2,art2)
    # Show at most ten randomly chosen titles per class.
    titles1 = sample(titles1,min(len(titles1),10))
    titles2 = sample(titles2,min(len(titles2),10))
    for k in titles1:
        print(k)
    print('\n\n')
    for k in titles2:
        print(k)
    print('\n\n')


Collection: Industrialization
industrialization0
Number of articles: 9 	Number of clusters: 8
I-T searches on Jaipur-based businessman's 30 properties
I-T raids on state BJP leader's premises
In Rajasthan’s biggest haul, I-T finds Rs 105cr undisclosed income
Industry upbeat on electronics zone, spl investment Act
Raje govt to go ahead with distillery in Ganganagar
Briefly Region; Haryana cop bags PMs medal for saving drowning drivers life
Small comfort from spectrum cash
Lack of agro-based supporting industry a major drawback
In Pics:&thinsp;US mother traces son's final journey after study trip to Uttarakhand led...



industrialization2
Number of articles: 1833 	Number of clusters: 14
New lease of life for water resources
A clean-up act, literally
Thakre reiterates opposition to power plants
Political egos affecting vidarbha's fortunes
Take Action against Mining Units without Clearances on Western Ghats, Centre Tells Maharashtra
Warrant against MoEFCC director
Businessmen pin hopes

In [63]:
datasets = ['dataset_agriculture', 'dataset_development', 'dataset_environment', 'dataset_industrialization', 'dataset_lifestyle']
models = ['model_agriculture', 'model_development', 'model_environment', 'model_industrialization', 'model_lifestyle']

for dataset_name, model_name in zip(datasets[0:1],models[0:1]):

    # Printing the collection name (file name without its 'dataset_' prefix).
    collection_name = dataset_name[8:]
    print('\nCollection:',collection_name.capitalize())

    # Loading the dataset and the model from the drive. The context manager
    # closes the file even if unpickling/decompression fails.
    # NOTE: pickle can execute arbitrary code - only load trusted files.
    with open('../../Datasets/newADIdataset/'+dataset_name, 'rb') as file:
        dataset = pickle.loads(zlib.decompress(pickle.load(file)))
    model = Doc2Vec.load('../../Datasets/newADImodels/'+model_name)

    # Collecting the article_ids, and corresponding article_vectors for each class.
    temp_ids = [[] for _ in range(3)]
    temp_vectors = [[] for _ in range(3)]
    temp_datasets = [[] for _ in range(3)]
    temp_titles = [[] for _ in range(3)]
    seen_ids = [set() for _ in range(3)]  # per-class ids already collected
    for record in dataset:
        if record[-5]==65: # Champawat
            group = 0
        elif record[6]=='Unemp' and record[-1]=='Fast':
            group = 1
        else:
            continue
        article_id = record[0]
        # Duplicates are checked against the class being filled. The old code
        # checked class 2 (never populated) while appending to class 1, so
        # repeated articles were never filtered out of the second class.
        if article_id in seen_ids[group]:
            continue
        seen_ids[group].add(article_id)
        vector = model.docvecs[article_id]
        temp_ids[group].append(article_id)
        temp_titles[group].append(record[1])
        temp_vectors[group].append(vector)
        temp_datasets[group].append([article_id,vector])

    art1,art2,indA1,indA2,indx1,indx2 = get_pat_res(temp_vectors[0],temp_vectors[1])
    titles1 = get_filtered(temp_titles[0],indx1,indA1,art1)
    titles2 = get_filtered(temp_titles[1],indx2,indA2,art2)
    # Show at most ten randomly chosen titles per class.
    titles1 = sample(titles1,min(len(titles1),10))
    titles2 = sample(titles2,min(len(titles2),10))
    for k in titles1:
        print(k)
    print('\n\n')
    for k in titles2:
        print(k)
    print('\n\n')


Collection: Agriculture
agriculture0
Number of articles: 4 	Number of clusters: 3
'Eye on Immense Potential', Uttarakhand Plans to Distil Badri Cow Urine, Sell It to Pharma Firms
'Eye on immense potential', Uttarakhand plans to distil Badri cow urine, sell it...
Rains brings death, loss to property and constant fear
Tyre tubes, not poll promises, keep Lakhimpur villagers afloat



agriculture2
Number of articles: 3671 	Number of clusters: 14
Met predicts more rain in coming days
Heavy rains predicted in coming days
Rain batters Mumbai, Thane, Pune and Nashik; swollen Godavari floods Andhra districts
Flood crisis deepens in Gujarat, Rajasthan; PM&thinsp;Modi inspects home state, gives relief...
Vegetable prices soar, milk supply hit as farmers’ protest for third day
Ahead of Rabi crop, farmers in Bulandshahr rue lack of cash
Farmer found murdered
Dacoits kill two villagers, escape with booty
Water released from Grand Anicut for irrigation in Cauvery delta
Chances of opening Mettur dam 

In [62]:
datasets = ['dataset_agriculture', 'dataset_development', 'dataset_environment', 'dataset_industrialization', 'dataset_lifestyle']
models = ['model_agriculture', 'model_development', 'model_environment', 'model_industrialization', 'model_lifestyle']

for dataset_name, model_name in zip(datasets[4:],models[4:]):

    # Printing the collection name (file name without its 'dataset_' prefix).
    collection_name = dataset_name[8:]
    print('\nCollection:',collection_name.capitalize())

    # Loading the dataset and the model from the drive. The context manager
    # closes the file even if unpickling/decompression fails.
    # NOTE: pickle can execute arbitrary code - only load trusted files.
    with open('Datasets/'+dataset_name, 'rb') as file:
        dataset = pickle.loads(zlib.decompress(pickle.load(file)))
    model = Doc2Vec.load('Models/'+model_name)

    # Collecting the article_ids, and corresponding article_vectors for each class.
    temp_ids = [[] for _ in range(3)]
    temp_vectors = [[] for _ in range(3)]
    temp_datasets = [[] for _ in range(3)]
    temp_titles = [[] for _ in range(3)]
    seen_ids = [set() for _ in range(3)]  # per-class ids already collected
    for record in dataset:
        if record[-5]==136: #Rampur
            group = 0
        elif record[6]=='Unemp' and record[-1]=='Slow':
            group = 1
        else:
            continue
        article_id = record[0]
        # Duplicates are checked against the class being filled. The old code
        # checked class 2 (never populated) while appending to class 1, so
        # repeated articles were never filtered out of the second class.
        if article_id in seen_ids[group]:
            continue
        seen_ids[group].add(article_id)
        vector = model.docvecs[article_id]
        temp_ids[group].append(article_id)
        temp_titles[group].append(record[1])
        temp_vectors[group].append(vector)
        temp_datasets[group].append([article_id,vector])

    art1,art2,indA1,indA2,indx1,indx2 = get_pat_res(temp_vectors[0],temp_vectors[1])
    titles1 = get_filtered(temp_titles[0],indx1,indA1,art1)
    titles2 = get_filtered(temp_titles[1],indx2,indA2,art2)
    # Show at most ten randomly chosen titles per class.
    titles1 = sample(titles1,min(len(titles1),10))
    titles2 = sample(titles2,min(len(titles2),10))
    for k in titles1:
        print(k)
    print('\n\n')
    for k in titles2:
        print(k)
    print('\n\n')


Collection: Lifestyle
lifestyle0
Number of articles: 28 	Number of clusters: 11
Egypt declares 3-month state of emergency after twin bombings
Blasts at Egypt's Coptic churches leave at least 44 dead, over 100 injured
Egypt parliament approves three-month state of emergency
Veil of insecurity shrouds Egypt
Egypt President Abdel-Fattah al-Sisi declares three-month emergency
23 Coptic Christians killed in Egypt attack
Attack in Cairo kills three policemen, injures five
Tension in Shahabad after desecration of portrait
One Killed in UP's Sambhal Dist in Mob Violence over Child-Lifting Rumours, Cops Initiate Action
The Rumour That Killed: Incidences of Mob Violence over Child-lifting Hoax on Rise in Uttar Pradesh
In Yogi raj, men should keep their women indoors: Azam Khan on Rampur molestati...
Govt push to update district chronicles
4 arrested for transporter's murder
Metamorphosis of a Village Girl into a Photography Icon



lifestyle2
Number of articles: 23695 	Number of clusters: 40
St

In [14]:
datasets = ['dataset_agriculture', 'dataset_development', 'dataset_environment', 'dataset_industrialization', 'dataset_lifestyle']
models = ['model_agriculture', 'model_development', 'model_environment', 'model_industrialization', 'model_lifestyle']

for dataset_name, model_name in zip(datasets[0:1],models[0:1]):

    # Printing the collection name (file name without its 'dataset_' prefix).
    collection_name = dataset_name[8:]
    print('\nCollection:',collection_name.capitalize())

    # Loading the dataset and the model from the drive. The context manager
    # closes the file even if unpickling/decompression fails.
    # NOTE: pickle can execute arbitrary code - only load trusted files.
    with open('Datasets/'+dataset_name, 'rb') as file:
        dataset = pickle.loads(zlib.decompress(pickle.load(file)))
    model = Doc2Vec.load('Models/'+model_name)

    # Collecting the article_ids, and corresponding article_vectors for each class.
    temp_ids = [[] for _ in range(3)]
    temp_vectors = [[] for _ in range(3)]
    temp_datasets = [[] for _ in range(3)]
    temp_titles = [[] for _ in range(3)]
    seen_ids = [set() for _ in range(3)]  # per-class ids already collected
    for record in dataset:
        if record[-5]==587: # Lakshdeep
            group = 0
        elif record[-4]=='Non Agri' and record[-1]=='Slow':
            group = 1
        else:
            continue
        article_id = record[0]
        # Duplicates are checked against the class being filled. The old code
        # checked class 2 (never populated) while appending to class 1, so
        # repeated articles were never filtered out of the second class.
        if article_id in seen_ids[group]:
            continue
        seen_ids[group].add(article_id)
        vector = model.docvecs[article_id]
        temp_ids[group].append(article_id)
        temp_titles[group].append(record[1])
        temp_vectors[group].append(vector)
        temp_datasets[group].append([article_id,vector])

    art1,art2,indA1,indA2,indx1,indx2 = get_pat_res(temp_vectors[0],temp_vectors[1])
    titles1 = get_filtered(temp_titles[0],indx1,indA1,art1)
    titles2 = get_filtered(temp_titles[1],indx2,indA2,art2)
    # Show at most ten randomly chosen titles per class.
    titles1 = sample(titles1,min(len(titles1),10))
    titles2 = sample(titles2,min(len(titles2),10))
    for k in titles1:
        print(k)
    print('\n\n')
    for k in titles2:
        print(k)
    print('\n\n')


Collection: Agriculture
agriculture0
Number of articles: 13 	Number of clusters: 10
Monsoon to reach Kerala in next 48 hours
Cyclonic storm building over Arabian Sea, says Met dept
Monsoon likely to hit Kerala in 4 days: IMD
Monsoon advances after lying low for over a week
Mumbai receives pre-monsoon showers, dust storm brings down temperature in North India
23 including women killed in rain, lightening-related incidents in Bihar
Conditions favourable for onset of monsoon: IMD
Monsoon hits Kerala three days early
Dry weather reported in TN, K'taka
Below average monsoon likely in other parts too
Rain lashes Kerala; Holiday for schools, colleges tomorrow
Pollution turning country's rainfall acidic, says study
Working women and working men – A country divided



agriculture2
Number of articles: 4594 	Number of clusters: 17
Vector alarm sounded at heal hubs
Man, machine take on diseased city 
Kulathummal Canal gets a fillip to improve and restore the soil quality
Stress laid on cleaning u