In [1]:
import re
import glob
import string
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def retrieve_docs_and_clean():
    documents=[]
    for file in glob.glob('Dataset'+"//*.csv"):
        text = ""
        with open(file,'r',encoding='latin-1') as nf:
            items = nf.readlines()
            items = items[1:]
            for item in items:
                row_text = item.split(',',6)
                text=row_text[6]
        documents.append(text)
    # Clean Paragraphs
    documents_clean = []
    for d in documents:
        # Remove Unicode
        document_test = re.sub(r'[^\x00-\x7F]+', ' ', d)
        # Remove Mentions
        document_test = re.sub(r'@\w+', '', document_test)
        # Lowercase the document
        document_test = document_test.lower()
        # Remove punctuations
        document_test = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', document_test)
        # Lowercase the numbers
        document_test = re.sub(r'[0-9]', '', document_test)
        # Remove the doubled space
        document_test = re.sub(r'\s{2,}', ' ', document_test)
        documents_clean.append(document_test)

    return documents_clean

In [10]:
docs = retrieve_docs_and_clean()
docs


[' i think it will try to repeal obamacare i think it will try to repeal obamaca re without any i think it will try to repeal obamacare without any idea of how to cover the million americans who have been added and who have gotten security by it i think it will try to ta ke security by it i think it will try to take america out of the climate change chords without any idea of what to do about serious issues ',
 ' reason why we need to continue china s massive renewables programme was one sign of the world s determination to carry on tackling climate change china says it will show leadership if the us does pull out of the un climate deal but even the us itself has a boom in wind and solar power the election of president trump is a setback for miss espinosa ',
 ' the organisation says he is not even sure human activity affects climate change but is all of this going to bring backjobs change but is all of this going to bring back jobs trump change but is all of this going to bring backjob

In [4]:
# Create Term-Document Matrix with TF-IDF weighting

vectorizer = TfidfVectorizer()# Instantiate a TfidfVectorizer object
X = vectorizer.fit_transform(docs)# fit the data and transform it as a vector

# Create a DataFrame
df = pd.DataFrame(X.T.toarray(), index=vectorizer.get_feature_names())# Convert the X as transposed matrix and Create a DataFrame and set the vocabulary as the index
print(df.head())
print(df.shape)

           0    1    2    3    4    5    6    7    8    9    ...  408  409  \
aaaa       0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
aaaaaa     0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
abilities  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
ability    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
able       0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   

                410  411  412  413  414  415  416  417  
aaaa       0.000000  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
aaaaaa     0.000000  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
abilities  0.000000  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
ability    0.000000  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
able       0.130441  0.0  0.0  0.0  0.0  0.0  0.0  0.0  

[5 rows x 418 columns]
(3595, 418)


In [5]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,408,409,410,411,412,413,414,415,416,417
aaaa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaaaaa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abilities,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ability,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
able,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.130441,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zellic,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zero,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zika,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zip,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
#Calculate the similarity using cosine similarity.
def get_similar_articles(q, df):
    print("query:", q)
    # Convert the query to a vector
    q = [q]
    q_vec = vectorizer.transform(q).toarray().reshape(df.shape[0],)
    sim = {}
    # Calculate the similarity
    for i in range(10):
        sim[i] = np.dot(df.loc[:, i].values, q_vec) / np.linalg.norm(df.loc[:, i]) * np.linalg.norm(q_vec)
  
    sim_sorted = sorted(sim.items(), key=lambda x: x[1], reverse=True)
    
    # Print the articles and their similarity values
    for k, v in sim_sorted:
        if v != 0.0:
            print("Similarity value:", v)
            print(docs[k])
            print()


q1 = 'trump'
get_similar_articles(q1, df)
print('-'*100)


query: trump
Similarity value: 0.10738742077858963
 ainsley this is a fox news alert john roberts our correspondent in washington has one source that is telling him and is he working on a second source that the president president trump is pulling out of the paris climate change deal stuart varney joins us from the business channel to tell us what this means for you and your family 

Similarity value: 0.08155280774303338
 emergency committee cobra the third in the past a hours good progress with the operation has been made with a number of arrests made over night and that will continue we have now gone to a critical level in terms of the threat we ll have all the latest live with huw in manchester the other headlines this lunchtime president trump meets the pope at the vatican and receives a signed copy of the pontiff s views on climate change and profits at marks and spencer fell by almost two thirds last year 

-------------------------------------------------------------------------