In [None]:
# Import os for creating the output directory
# Import pandas for dataframe 
# Import pprint for printing the outcomes 
import os
from pprint import pprint 

import pandas as pd 

# Get the dataset from the web URL. The source is a tab-separated (.tsv) file,
# so the delimiter is passed explicitly; read_csv splits on commas by default.
df = pd.read_csv('https://raw.githubusercontent.com/okanbulut/blog/master/data_and_codes/train_rel_2.tsv', sep='\t')

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("csv", exist_ok=True)

# Write a local tab-separated copy (header row kept, index column omitted)
df.to_csv("csv/train_rel_2.tsv", sep="\t", header=True, index=False)


In [None]:
# Import train_rel_2.tsv into Python
# pandas writes UTF-8 by default, so read the file back with that encoding
# rather than the platform-dependent default of open().
with open('csv/train_rel_2.tsv', 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Header row split into column names (trailing newline removed)
columns = lines[0].rstrip('\n').split('\t') if lines else []

data = []         # response text (last field of each kept row)
response_id = []  # first field, converted to int
score = []        # third field, converted to int
for line in lines[1:]:
    if not line.strip():
        # Skip blank lines (e.g. a trailing empty line) instead of failing on temp[1]
        continue
    # Drop the line terminator so it does not end up inside the response text
    temp = line.rstrip('\n').split('\t')
    # Keep only the rows whose second field is '1'
    # (presumably the essay-set number -- confirm against the file header)
    if temp[1] == '1':
        data.append(temp[-1])
        response_id.append(int(temp[0]))
        score.append(int(temp[2]))

# Construct a dataframe ("doc") which includes the response_id, responses, and the score
# Naming the columns at construction time also works when no row matched.
doc = pd.DataFrame({'id': response_id, 'response': data, 'score': score})

In [None]:
# Show the text of the first response in the data set
print('Sample response 1:')
pprint(doc['response'].iloc[0])

# Display the top five rows of the data frame (head() defaults to 5)
doc.head()

In [None]:
# Activate CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Count Vectorizer: learn the vocabulary and count every term in every response
vect = CountVectorizer()
vects = vect.fit_transform(doc.response)

# get_feature_names() was removed in scikit-learn 1.2. Use its replacement when
# available and fall back to the old name on releases that predate it.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()

# Select the first five rows from the data set. The sparse matrix is sliced
# before densifying so that only those five rows are expanded in memory.
td = pd.DataFrame(vects[:5].toarray(), columns=feature_names)

# Transpose so that rows are terms and columns are documents
term_document_matrix = td.T
term_document_matrix.columns = ['Doc ' + str(i) for i in range(1, 6)]
term_document_matrix['total_count'] = term_document_matrix.sum(axis=1)

# Top 25 words by total count over the five documents
term_document_matrix = term_document_matrix.sort_values(by='total_count', ascending=False)[:25]

# Print the first 10 rows
print(term_document_matrix.drop(columns=['total_count']).head(10))

In [None]:
# Bar chart of the total_count column, one bar per term
term_document_matrix['total_count'].plot(kind='bar')

In [None]:
# Show the counts of the terms "the" and "to", one row per column of the matrix
term_document_matrix.T[['the', 'to']]

In [None]:
# Scatter plot of the "to" counts against the "the" counts, one point per
# document; the total_count entry is removed so only documents are plotted
term_document_matrix.T.drop(index=['total_count']).plot(kind='scatter', x='the', y='to')

In [None]:
# Activate math
import math

# Define a cosine similarity function
def cosine_similarity(a,b):
    """Compute the cosine similarity of a to b: (a dot b) / (||a|| * ||b||).

    Parameters
    ----------
    a, b : sequences of numbers, indexable by position
        The two vectors. They are expected to have the same length; only
        the first ``len(a)`` positions of each are read.

    Returns
    -------
    float
        The cosine of the angle between the two vectors. When either vector
        has zero magnitude (including empty input) the ratio is undefined,
        and 0.0 is returned instead of raising ZeroDivisionError.
    """
    sumxx, sumxy, sumyy = 0, 0, 0
    for i in range(len(a)):
        x, y = a[i], b[i]
        sumxx += x * x
        sumyy += y * y
        sumxy += x * y

    norm_product = math.sqrt(sumxx * sumyy)
    if norm_product == 0:
        # At least one vector is all zeros: no direction to compare
        return 0.0
    return sumxy / norm_product

In [None]:
# Activate numpy
import numpy as np 

# Save the similarity index between the documents
def pair(s):
    """Yield every two-element combination of s as a list [earlier, later].

    Each element is combined with all elements that come after it, so the
    pairs come out in positional order and no element is paired with itself.
    """
    for position, first in enumerate(s):
        yield from ([first, s[later]] for later in range(position + 1, len(s)))

# Cosine similarity for every pair of the five documents, stored under the
# (first label, second label) tuple
dic = dict()
for a, b in pair(['Doc 1', 'Doc 2', 'Doc 3', 'Doc 4', 'Doc 5']):
    counts_a = term_document_matrix[a].tolist()
    counts_b = term_document_matrix[b].tolist()
    dic[a, b] = cosine_similarity(counts_a, counts_b)

# Show the resulting {(doc, doc): similarity} mapping
pprint(dic)

In [None]:
import matplotlib.pyplot as plt

documents = ['Doc 1', 'Doc 2', 'Doc 3', 'Doc 4', 'Doc 5']

# Square table of pairwise similarities. `dic` is looked up by (row, column)
# label; any pair that is not a key in `dic` is filled with 0.
final_df = pd.DataFrame(
    np.asarray([[dic.get((x, y), 0) for y in documents] for x in documents]),
    index=documents,
    columns=documents,
)

fig, ax = plt.subplots()

# Draw the matrix first: matshow installs its own tick locators, so ticks and
# labels set before this call would be replaced and end up misaligned.
ax.matshow(final_df, cmap='seismic')
ax.set_xticks(np.arange(len(documents)))
ax.set_yticks(np.arange(len(documents)))
ax.set_xticklabels(documents)
ax.set_yticklabels(documents)

# Annotate only the non-zero cells with their value
for (i, j), z in np.ndenumerate(final_df):
    if z != 0:
        ax.text(j, i, '{:0.2f}'.format(z), ha='center', va='center',
                bbox=dict(boxstyle='round', facecolor='white', edgecolor='0.3'))

fig.suptitle('Cosine Similarity Index between the Documents')
plt.show()

