In [2]:
import numpy as np
import pandas as pd

# TfIdfVectorizer from scikit-learn for text
from sklearn.feature_extraction.text import TfidfVectorizer


# Import CountVectorizer to create count matrix for tags
# This is an alternative to tfidf
from sklearn.feature_extraction.text import CountVectorizer


# Required to tokenise the text before stemming
from nltk.tokenize import sent_tokenize, word_tokenize
# from nltk.stem import PorterStemmer
from porter2stemmer import Porter2Stemmer

# Import linear_kernel for Cosine Similarity calculation of bodytext and title
# This will be applied on a tfidf matrix and NOT a count matrix
from sklearn.metrics.pairwise import linear_kernel

# Compute the Cosine Similarity matrix based on a count_matrix
from sklearn.metrics.pairwise import cosine_similarity



# Functions interacting with the database
from db_functions import *

# DB Queries generated in here
from queries import *

from bs4 import BeautifulSoup


import re

In [3]:
#-------------------------------------#
# MODEL CREATE HELPER FUNCTIONS
#-------------------------------------#



def filter_html(text):
    """Strip HTML markup from ``text`` and return the remaining text.

    The markup is parsed with BeautifulSoup's ``html5lib`` parser and the
    extracted text pieces are joined with a single space.

    Args:
        text: HTML markup to parse. ``None`` and float ``NaN`` (how missing
            values typically show up in a pandas column) are treated as
            empty input instead of being handed to the parser.

    Returns:
        str: the extracted text, or ``''`` when ``text`` is missing.
    """
    # A missing value is not valid markup; mirror clean_tags and yield ''.
    # (NaN is the only float that is not equal to itself.)
    if text is None or (isinstance(text, float) and text != text):
        return ''

    soup = BeautifulSoup(text, features="html5lib")
    return soup.get_text(separator=' ')



def text_stemmer(txt, stemmer):
    """Tokenise ``txt`` and return its stemmed tokens as a single string.

    Each token produced by ``word_tokenize`` is passed through
    ``stemmer.stem`` and is followed by one space, so a non-empty result
    always ends with a trailing space.

    Args:
        txt: text to tokenise.
        stemmer: object exposing a ``stem(word)`` method.

    Returns:
        str: the stemmed tokens, each followed by a space.
    """
    stemmed_tokens = (stemmer.stem(token) for token in word_tokenize(txt))
    return "".join(stem + " " for stem in stemmed_tokens)



def clean_tags(x):
    """Normalise a comma-separated tag string into space-separated tokens.

    Spaces are removed first (so a multi-word tag collapses into one token),
    the text is lower-cased, and every comma is turned into a space.

    Args:
        x: the raw tag string; any non-string value is accepted.

    Returns:
        str: the normalised tags, or ``''`` when ``x`` is not a string.
    """
    if not isinstance(x, str):
        return ''

    without_spaces = x.replace(" ", "")
    return without_spaces.lower().replace(",", " ")




#-------------------------------------#
# MODEL EXPORT HELPER FUNCTIONS
#-------------------------------------#

def matrix_to_jason(matrix):
    """Serialise every row of ``matrix`` to a JSON string.

    Args:
        matrix: pandas DataFrame; each row is converted with
            ``Series.to_json``.

    Returns:
        DataFrame sharing the index of ``matrix`` with two columns:
        ``jsol_col`` (the JSON text of the row) and ``local_id`` (a copy of
        the index label of the row).
    """
    def row_as_json(row):
        return row.to_json()

    json_rows = matrix.apply(row_as_json, axis=1)
    frame = pd.DataFrame(json_rows, columns=['jsol_col'])
    frame['local_id'] = frame.index
    return frame


def export_content_similarity(similarity_matrix):
    """Export the content similarity matrix.

    The matrix is converted row by row with ``matrix_to_jason`` and passed to
    ``export_data`` together with the statement returned by
    ``export_content_similarity_query``.
    """
    export_data(
        matrix_to_jason(similarity_matrix),
        export_content_similarity_query(),
    )


def export_title_similarity(similarity_matrix):
    """Export the title similarity matrix.

    The matrix is converted row by row with ``matrix_to_jason`` and passed to
    ``export_data`` together with the statement returned by
    ``export_title_similarity_query``.
    """
    export_data(
        matrix_to_jason(similarity_matrix),
        export_title_similarity_query(),
    )


def export_cat_tags_similarity(similarity_matrix):
    """Export the category/tags similarity matrix.

    The matrix is converted row by row with ``matrix_to_jason`` and passed to
    ``export_data`` together with the statement returned by
    ``export_cat_tags_similarity_query``.
    """
    export_data(
        matrix_to_jason(similarity_matrix),
        export_cat_tags_similarity_query(),
    )






In [4]:
#-------------------------------------#
# MODEL CREATE DRIVER
#-------------------------------------#


# NOTE(review): truncate_similarities() runs before the new model is built;
# presumably the stored similarities stay empty if a later step fails — confirm
# against db_functions.
truncate_similarities()
article_master = import_content()



## PREPROCESS CONTENT

# Raw-string patterns: '\s' inside a plain string literal is an invalid escape
# sequence (DeprecationWarning on older Pythons, SyntaxWarning on 3.12+).
# Numbers are removed from the content but kept in the title.
CONTENT_NOISE = re.compile(r'[^a-z\s]')
TITLE_NOISE = re.compile(r'[^a-z0-9\s]')

#-- Potential Global Variable

# porter = PorterStemmer()
snowball = Porter2Stemmer()


# REDUCE CONTENT: strip HTML, lower-case, keep only letters and whitespace.
# Missing bodies are replaced by '' *before* processing; filling the stemmed
# column afterwards could never take effect because text_stemmer always
# returns a string.
article_master['reduced_content'] = article_master['bodytext'].fillna('').apply(
    lambda body: CONTENT_NOISE.sub('', filter_html(body).lower())
)

article_master['stemmed_content'] = article_master['reduced_content'].apply(
    lambda content: text_stemmer(content, snowball)
)


# REDUCE TITLE: lower-case, keep letters, digits and whitespace.
article_master['reduced_title'] = article_master['title'].fillna('').apply(
    lambda title: TITLE_NOISE.sub('', title.lower())
)

article_master['stemmed_title'] = article_master['reduced_title'].apply(
    lambda title: text_stemmer(title, snowball)
)


# REDUCE TAGS AND CATEGORY
article_master['reduced_category'] = article_master['category'].apply(clean_tags)
article_master['reduced_tags'] = article_master['tags'].apply(clean_tags)
article_master["meta_soup"] = article_master["reduced_category"] + ' ' + article_master['reduced_tags']


"""
#-- At this point the newly stemmed metadata content can be written to the database.
"""

#-------------------------------------#
## Preprocess Content - End
#-------------------------------------#

MySQL connection is closed
MySQL connection is closed


'\n#-- At this point the newly stemmed metadata content can be written to the database.\n'

In [11]:
import math
from sklearn.metrics.pairwise import euclidean_distances


def get_theta(cosine_similarity):
    """Convert cosine similarities to angles in radians, offset by 10 degrees.

    Args:
        cosine_similarity: array-like of cosine similarity values.

    Returns:
        numpy array of ``arccos(similarity) + radians(10)`` with the same
        shape as the input.
    """
    # Drop everything past the 14th decimal place to shave off floating-point
    # noise (e.g. 1.0000000000000002 on the diagonal).
    scale = 100000000000000
    sim = np.divide(np.trunc(np.multiply(cosine_similarity, scale)), scale)

    # Truncation only absorbs overshoot smaller than 1e-14; clamp to the
    # domain of arccos so larger rounding errors cannot turn into NaN.
    sim = np.clip(sim, -1.0, 1.0)

    return np.arccos(sim) + math.radians(10)

def get_magnitude(matrix):
    """Return the Euclidean length of every row of ``matrix``.

    Args:
        matrix: object exposing an element-wise ``multiply`` method whose
            result supports ``sum(1)`` (e.g. a scipy sparse matrix).

    Returns:
        The square root of each row's sum of squared entries, in whatever
        container ``sum(1)`` produces.
    """
    squared = matrix.multiply(matrix)
    row_totals = squared.sum(1)
    return np.sqrt(row_totals)


def get_euclidean(vectors):
    """Return the result of scikit-learn's ``euclidean_distances`` for ``vectors``.

    Only ``vectors`` is passed, so the distances are taken between the rows
    of ``vectors`` themselves.
    """
    return euclidean_distances(vectors)

In [12]:

# Fit TF-IDF on the stemmed article bodies; norm=None disables the
# vectorizer's row normalisation.
tfidf = TfidfVectorizer(norm=None, stop_words='english')
tfidf_vectors = tfidf.fit_transform(article_master['stemmed_content'])
# TODO: Write vectors to the db.


# Per-article vector length.
vector_size = get_magnitude(tfidf_vectors)
# TODO: Write vector_size values to the db.


# Pairwise Euclidean distance between the article vectors.
euclidean_distance = get_euclidean(tfidf_vectors)
# TODO: Write euclidean_distance values to the db.


# Pairwise cosine similarity and the angles derived from it.
cosine_sim_content = cosine_similarity(tfidf_vectors)
angles = get_theta(cosine_sim_content)
# TODO: Write theta values to the db.

# This completes model creation.

In [None]:
def get_ts_ss_similarity():
    """Placeholder for the combined similarity measure (not implemented).

    NOTE(review): the function was left without a body, which is a syntax
    error and stops the cell from running. The name suggests the TS-SS
    measure (triangle similarity combined with sector similarity), but the
    signature takes no inputs — confirm the intended arguments before
    implementing.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError("get_ts_ss_similarity is not implemented yet")

In [69]:
def get_triangle_similarity(magnitude, angles):
    """Pairwise triangle area spanned by every pair of vectors.

    For two vectors with lengths ``|A|`` and ``|B|`` and angle ``theta``
    between them this computes ``|A| * |B| * sin(theta) / 2``.

    NOTE(review): the function was left without a body (a syntax error).
    This implementation assumes the triangle-similarity component of TS-SS
    was intended — confirm.

    Args:
        magnitude: the n vector lengths in any shape (for example the n x 1
            result of ``get_magnitude``); dense or scipy sparse.
        angles: n x n array-like of angles in radians (for example the
            result of ``get_theta``).

    Returns:
        n x n numpy array whose entry ``[i, j]`` is the triangle similarity
        of vectors ``i`` and ``j``.
    """
    # Sparse containers must be densified before numpy can flatten them.
    if hasattr(magnitude, "toarray"):
        magnitude = magnitude.toarray()

    # Flatten to 1-D so that np.matrix inputs do not switch '*' to a matrix
    # product below.
    lengths = np.asarray(magnitude, dtype=float).ravel()
    theta = np.asarray(angles, dtype=float)

    return np.outer(lengths, lengths) * np.sin(theta) / 2.0

<410x1 sparse matrix of type '<class 'numpy.float64'>'
	with 410 stored elements in Compressed Sparse Row format>

In [10]:
# Inspection only: displays the pairwise distance matrix computed from
# tfidf_vectors; the result is not stored in a variable.
get_euclidean(tfidf_vectors)

array([[  0.        , 137.86974857, 137.0264865 , ..., 126.74526777,
        126.77152799, 123.63912415],
       [137.86974857,   0.        ,  81.25884899, ...,  71.4583223 ,
         77.72380463,  69.31166048],
       [137.0264865 ,  81.25884899,   0.        , ...,  66.22403245,
         72.95078988,  64.44082863],
       ...,
       [126.74526777,  71.4583223 ,  66.22403245, ...,   0.        ,
         45.01046088,  31.01588147],
       [126.77152799,  77.72380463,  72.95078988, ...,  45.01046088,
          0.        ,  38.45708066],
       [123.63912415,  69.31166048,  64.44082863, ...,  31.01588147,
         38.45708066,   0.        ]])