In [1]:
#Mongo imports
import pymongo
from pymongo import MongoClient
# Semantic Analysis imports
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

## Get Mongo Data

In [2]:
# Open a MongoDB connection and return a cursor over an entire collection
def mongo_cursor(collection_name,db_name='project4_database',IP='52.38.12.228',port=27016):
    """Return a cursor over every document in ``collection_name``.

    Parameters
    ----------
    collection_name : name of the collection to read.
    db_name : name of the database that holds the collection.
    IP : host passed to ``MongoClient``.
    port : port passed to ``MongoClient``.

    Returns
    -------
    The cursor produced by calling ``find({})`` on the collection.

    Notes
    -----
    A new ``MongoClient`` is created on every call and is not closed here.
    """
    collection = MongoClient(IP, port)[db_name][collection_name]

    # an empty filter document matches every record
    return collection.find({})

In [3]:
# Flatten the Mongo documents into a dictionary held locally
def extract_mongo_data(cursor,key_field='page_title',value_field='page_text' ):
    """Build a ``{document[key_field]: document[value_field]}`` dictionary.

    Parameters
    ----------
    cursor : iterable of documents that can be indexed by field name.
    key_field : field whose value becomes the dictionary key.
    value_field : field whose value becomes the dictionary value.

    Returns
    -------
    dict with one entry per distinct ``key_field`` value; when several
    documents share a key, the last one iterated wins.
    """
    # keep only the two required fields from each document
    return {document[key_field]: document[value_field] for document in cursor}

## Declare Global Variables

In [4]:
# Names of the Mongo collections searched in this notebook
ML_collection_name = 'machine_learning_collection'
# NOTE(review): this name ends in "collections" (plural), unlike the one above —
# confirm it matches the collection name in the database.
BS_collection_name = 'business_software_collections'

# Number of components kept by the SVD step. Set at 120 because that is all the
# author's machine memory could handle; during testing, results improved the
# higher this went.
SVD_COMPONENTS = 120
# TruncatedSVD uses a randomized solver by default, so pin the seed to make the
# similarity scores reproducible across kernel restarts.
SVD_RANDOM_STATE = 42

# Tune text_vectorizer. Bigrams only (ngram_range=(2, 2)) to maintain more context.
text_vectorizer = TfidfVectorizer(min_df=1, stop_words='english', ngram_range=(2, 2))
SVD = TruncatedSVD(n_components=SVD_COMPONENTS, random_state=SVD_RANDOM_STATE)

## Build Document Matrix

In [5]:
def build_doc_matrix(data,text_vectorizer=text_vectorizer,SVD=SVD):
    """Vectorize ``data`` and reduce it, returning the reduced document matrix.

    Parameters
    ----------
    data : iterable of document texts.
    text_vectorizer : vectorizer whose ``fit_transform`` is applied to ``data``;
        defaults to the module-level ``text_vectorizer``.
    SVD : reducer whose ``fit_transform`` is applied to the vectorized matrix;
        defaults to the module-level ``SVD``.

    Returns
    -------
    The output of ``SVD.fit_transform`` on the vectorized documents.

    Notes
    -----
    Both objects are fitted by this call; with the defaults that means the
    shared module-level objects are updated.
    """
    # raw text -> vectorized matrix -> fewer columns/features via SVD
    return SVD.fit_transform(text_vectorizer.fit_transform(data))


## Transform Search Terms

In [6]:
def transform_search_results(search_terms,text_vectorizer=text_vectorizer,SVD=SVD):
    """Project a search string through the vectorizer and the SVD.

    Only ``transform`` is called (never ``fit``), so the result has the same
    number of columns as the document matrix produced by the objects' last fit.

    Parameters
    ----------
    search_terms : the query text, treated as a single document.
    text_vectorizer : vectorizer used to transform the query; defaults to the
        module-level ``text_vectorizer``.
    SVD : reducer used to transform the vectorized query; defaults to the
        module-level ``SVD``.

    Returns
    -------
    The output of ``SVD.transform`` for the single query document.
    """
    # the vectorizer takes an iterable of documents, so wrap the one query in a
    # list (the original built a one-entry dict just to call .values() on it)
    query_matrix = text_vectorizer.transform([search_terms])

    return SVD.transform(query_matrix)

## Prepare Data

In [7]:
def prep_data(collection_name, search_terms):
    """Load a collection, build its document matrix and transform the query.

    Parameters
    ----------
    collection_name : Mongo collection to read via ``mongo_cursor``.
    search_terms : query text passed to ``transform_search_results``.

    Returns
    -------
    tuple ``(svd_matrix, search_svd, data_dict)``: the reduced document matrix,
    the reduced query, and the dictionary returned by ``extract_mongo_data``.
    """
    # pull the collection from Mongo into a local dictionary
    cursor = mongo_cursor(collection_name)
    pages = extract_mongo_data(cursor)

    # build the document matrix from the dictionary values; this must run
    # before the query transform so the vectorizer and SVD are already fitted
    doc_vectors = build_doc_matrix(pages.values())

    # transform the search terms with the fitted objects
    query_vector = transform_search_results(search_terms)

    return doc_vectors, query_vector, pages
    

## Search and Display Results

In [8]:
def search_articles(search_terms,collection_name,number_of_results=5):
    """Rank the pages of a collection by cosine similarity to a query.

    Parameters
    ----------
    search_terms : query text.
    collection_name : Mongo collection to search.
    number_of_results : how many top-ranked rows to return.

    Returns
    -------
    pandas.DataFrame with columns ``'page'`` and ``'similiarity'``, sorted by
    similarity in descending order and cut to ``number_of_results`` rows. The
    index holds each page's position in the collection dictionary.
    """
    svd_matrix, search_svd, data_dict = prep_data(collection_name, search_terms)

    # cosine_similarity returns one column per query row; there is a single
    # query, so flatten explicitly to one score per document
    scores = cosine_similarity(svd_matrix, search_svd).ravel()

    # dictionary keys and values iterate in the same order, so the keys line up
    # with the matrix rows that prep_data built from the values
    results_df = pd.DataFrame({
        'page': list(data_dict.keys()),
        # column name (including its spelling) is kept: it is part of the output
        'similiarity': scores,
    })

    return results_df.sort_values('similiarity', ascending=False).head(number_of_results)

In [19]:
# Machine learning collection test
search_articles('machine learning',ML_collection_name)

Unnamed: 0,page,similiarity
2,Machine learning,0.766729
7,Adversarial machine learning,0.701522
3,Outline of machine learning,0.627945
478,Apache SystemML,0.573717
813,Michael I. Jordan,0.570788


In [None]:
# Business software collection test
search_articles('QuickBooks',BS_collection_name)