In [1]:
from elasticsearch import Elasticsearch
from itertools import combinations
from os import listdir
import numpy as np
import pandas as pd
import TFIDFViewer_m as tfidf
from ssl import create_default_context
context = create_default_context(cafile="/home/mohana/http_ca.crt")
client = Elasticsearch(timeout=1000,use_ssl=True,ssl_context=context, scheme = "https",http_auth = ("elastic", "h_9RfOqHmS-v5-ZV9Lk7"))


## Compare texts within groups

In newspapers we have 20 groups:['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc'] and in each of these groups we will select 5 random texts to compare comparision in each group. At the end we have 200 combination of texts in different news subjects.

In [2]:
def within_groups_samples(path, n_sample):
    groups = sorted(listdir(path))
    group_text1_text2 = [[] for _ in range(3)]
    for i in groups:
        samples = np.random.choice(listdir(path +'/'+ i), size=n_sample, replace=False)
        #consider all combination of texts, forexample([0085,0147],[0085, 0524])
        for text1, text2 in combinations(samples, 2):
            group_text1_text2[0].append(i)
            group_text1_text2[1].append(text1)
            group_text1_text2[2].append(text2)
    df_within_groups = pd.DataFrame({'Group': group_text1_text2[0], 'Text1': group_text1_text2[1], 'Text2': group_text1_text2[2]})
    return df_within_groups


In [3]:
path_news = '20_newsgroups'
n_sample =5
df_total_within= within_groups_samples(path_news, n_sample)
df_total_within

Unnamed: 0,Group,Text1,Text2
0,alt.atheism,0000893,0000468
1,alt.atheism,0000893,0000747
2,alt.atheism,0000893,0000436
3,alt.atheism,0000893,0000248
4,alt.atheism,0000468,0000747
...,...,...,...
195,talk.religion.misc,0019565,0019392
196,talk.religion.misc,0019565,0019225
197,talk.religion.misc,0019404,0019392
198,talk.religion.misc,0019404,0019225


## Compare texts between groups

To compare different groups, we will consider all comination of groups and for each group, we will select 10 random texts for each of them. To not have repetitive samples in our comparison we considered different random.seed for each iteration of combinations.At the end we have 950 combinations. 

In [4]:
def between_groups_samples(path, n_sample):
    groups = sorted(listdir(path))
    group1_text1_group2_text2 = [[] for _ in range(4)]
    counter = 0
    #All combinations of groups
    for group1, group2 in combinations(groups, 2):
        np.random.seed(1 + counter)
        sample_group1 = np.random.choice(listdir(path +'/' + group1), size=n_sample, replace=False)
        sample_group2 = np.random.choice(listdir(path +'/' + group2), size=n_sample, replace=False)
        for i in range(n_sample):
            group1_text1_group2_text2[0].append(group1)
            group1_text1_group2_text2[1].append(sample_group1[i])
            group1_text1_group2_text2[2].append(group2)
            group1_text1_group2_text2[3].append(sample_group2[i])
        counter += 1

    df_between_groups = pd.DataFrame({'Group1': group1_text1_group2_text2[0], 'Text1': group1_text1_group2_text2[1], 
                                      'Group2': group1_text1_group2_text2[2], 'Text2': group1_text1_group2_text2[3]})
    return df_between_groups

In [5]:
df_total_btw = between_groups_samples(path_news, n_sample)
print(df_total_btw)
#to check different combinations in group alt.atheism
#df_total_btw.loc[df_total_btw['Group1']=="alt.atheism"]

                 Group1    Text1              Group2    Text2
0           alt.atheism  0000143       comp.graphics  0001936
1           alt.atheism  0000426       comp.graphics  0001888
2           alt.atheism  0000110       comp.graphics  0001951
3           alt.atheism  0000136       comp.graphics  0001281
4           alt.atheism  0000907       comp.graphics  0001673
..                  ...      ...                 ...      ...
945  talk.politics.misc  0018742  talk.religion.misc  0019792
946  talk.politics.misc  0018074  talk.religion.misc  0019226
947  talk.politics.misc  0018101  talk.religion.misc  0019781
948  talk.politics.misc  0018196  talk.religion.misc  0019389
949  talk.politics.misc  0018835  talk.religion.misc  0019934

[950 rows x 4 columns]


## Calculate cosine similarity between groups

Based on functions that defined in TFIDFViewer_m, first we should find the id of a document in the index and then calculate cosine similarity.

In [6]:
def calculate_cos_sim(client_, index_, path1_, path2_):
    file_group1_id = tfidf.search_file_by_path(client_, index_, path1_)
    file_group2_id = tfidf.search_file_by_path(client_, index_, path2_)

    file1_tw = tfidf.toTFIDF(client_, index_, file_group1_id)
    file2_tw = tfidf.toTFIDF(client_, index_, file_group2_id )

    return tfidf.cosine_similarity(file1_tw, file2_tw)

We condider index news that considered token =letter and filter= lowercase

In [7]:
index = 'news'

For different combination of groups we will calculate mean of similarity, so we will have 190 rows at the end, because we consider mean of each group and our first table that has 950 rows.

In [8]:
total_similarity = []
for ix, row in df_total_btw.iterrows():
    path1 = path_news + '/' + row.Group1 + '/' + row.Text1
    path2 = path_news + '/' + row.Group2 + '/' + row.Text2
    total_similarity.append(calculate_cos_sim(client, index, path1, path2))

between_groups_sim = df_total_btw.copy()
between_groups_sim['similarity'] = total_similarity
between_groups_sim = between_groups_sim.groupby(['Group1', 'Group2'])['similarity'].mean()

Based on our result, highest similarity exist between "talk.politics.guns" & "talk.politics.misc" and lowest similairty exist between "comp.windows.x" & "rec.autos".

In [11]:
df_between_sorted=pd.DataFrame(between_groups_sim).sort_values('similarity')
df_between_sorted

Unnamed: 0_level_0,Unnamed: 1_level_0,similarity
Group1,Group2,Unnamed: 2_level_1
comp.windows.x,rec.autos,0.002747
comp.os.ms-windows.misc,sci.electronics,0.003396
alt.atheism,misc.forsale,0.003868
misc.forsale,rec.sport.hockey,0.004360
alt.atheism,comp.windows.x,0.004720
...,...,...
talk.politics.mideast,talk.religion.misc,0.035132
soc.religion.christian,talk.religion.misc,0.035421
soc.religion.christian,talk.politics.mideast,0.036313
alt.atheism,talk.religion.misc,0.039683


In [12]:
test = ["alt.atheism"]
x = df_between_sorted[df_between_sorted["Group1"].isin(test)]
x

KeyError: 'Group1'

## Calculate cosine similarity within groups

In [None]:
nw_similarity = []
for ix, row in df_total_within.iterrows():
    path1 = path_news + '/' + row.Group + '/' + row.Text1
    path2 = path_news + '/' +row.Group + '/' + row.Text2
    nw_similarity.append(calculate_cos_sim(client, index, path1, path2))

within_groups_sim= df_total_within.copy()
within_groups_sim['similarity'] = nw_similarity
mean_within = within_groups_sim.groupby('Group')['similarity'].mean()
print(mean_within)

In [None]:
df_within_sorted = pd.DataFrame(mean_within).sort_values('similarity', ascending=False)
df_within_sorted

## Compare the result of within and between groups