In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 20 18:00:07 2019

@author: smkj33
"""



"""
Previous Version: 
    bkp_model_20_Aug.py

-- Removing html filter using HTML.parser 
-- Removing related import statement
-- Beautiful Soup will be used in its place


    bkp_model_19_Oct.py 
    
--  Replaced Porter Stemmer with Snowball Stemmer
--  Added Cosine similarity with TS-SS.

"""


import math
import re

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize, word_tokenize
# from nltk.stem import PorterStemmer
from porter2stemmer import Porter2Stemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import linear_kernel

from db_functions import *
from queries import *





#-------------------------------------#
# MODEL CREATE HELPER FUNCTIONS
#-------------------------------------#



def filter_html(text):
    """Return the text content of an HTML fragment.

    The markup is parsed by BeautifulSoup using the ``html5lib`` backend and
    the extracted text pieces are joined with a single space. No case folding
    or character filtering is done here.
    """
    return BeautifulSoup(text, features="html5lib").get_text(separator=' ')


def text_stemmer(txt, stemmer):
    """Tokenise ``txt`` and return its stemmed tokens as one string.

    Parameters
    ----------
    txt : str
        Text handed to ``word_tokenize``.
    stemmer : object
        Anything exposing ``stem(word)``.

    Returns
    -------
    str
        Each stemmed token followed by a single space, so a non-empty
        result ends with a trailing space.
    """
    pieces = [
        part
        for token in word_tokenize(txt)
        for part in (stemmer.stem(token), " ")
    ]
    return "".join(pieces)


def clean_tags(x):
    """Normalise a comma-separated tag/category string.

    Spaces are removed first (so a multi-word tag collapses into one token),
    the text is lower-cased, and commas are then turned into spaces.
    Any non-string input yields an empty string.
    """
    if not isinstance(x, str):
        return ''

    collapsed = x.replace(" ", "")
    return collapsed.lower().replace(",", " ")


def get_theta(cosine_similarity):
    """Convert cosine similarities into angles with a fixed 10-degree offset.

    Parameters
    ----------
    cosine_similarity : array-like or float
        Cosine similarity value(s). Note that inside this function the
        parameter name shadows the scikit-learn function of the same name.

    Returns
    -------
    numpy.ndarray or numpy.floating
        ``arccos(similarity) + radians(10)``, element-wise, in radians.
    """
    # Drop everything beyond the 14th decimal place so floating-point noise
    # such as 1.0000000000000002 collapses back onto 1.0.
    scale = 100000000000000
    sim = np.divide(np.trunc(np.multiply(cosine_similarity, scale)), scale)

    # Truncation alone cannot repair overshoots larger than 1e-14; clamp to the
    # domain of arccos so those entries give a finite angle instead of NaN.
    sim = np.clip(sim, -1.0, 1.0)

    # NOTE(review): the 10-degree offset is presumably the TS-SS adjustment
    # that keeps the angle non-zero for identical vectors -- confirm.
    return np.arccos(sim) + math.radians(10)


def get_magnitude(matrix):
    """Return the Euclidean (L2) norm of every row of ``matrix``.

    ``matrix`` must provide an element-wise ``multiply`` method and
    ``sum(axis)``, as SciPy sparse matrices do. The result is whatever
    ``np.sqrt`` returns for the row sums (one value per row).
    """
    squared = matrix.multiply(matrix)
    row_totals = squared.sum(1)
    return np.sqrt(row_totals)


def get_euclidean(vectors):
    """Return the pairwise Euclidean distances between the rows of ``vectors``.

    Thin wrapper around scikit-learn's ``euclidean_distances`` called with a
    single argument.
    """
    return euclidean_distances(vectors)


#-------------------------------------#
# MODEL EXPORT HELPER FUNCTIONS
#-------------------------------------#

def matrix_to_json(matrix):
    """Serialise every row of a DataFrame to a JSON string.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Frame whose rows are encoded one by one with ``Series.to_json``.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``data_col`` (the JSON text of the row) and
        ``local_id`` (a copy of the row's index label).
    """
    encoded_rows = matrix.apply(lambda row: row.to_json(), axis=1)
    frame = pd.DataFrame(encoded_rows, columns = ['data_col'])
    return frame.assign(local_id=frame.index)


def export_content_cosine_similarity(similarity_matrix):
    """Export the content cosine-similarity matrix.

    Rows are JSON-encoded with ``matrix_to_json`` and handed to
    ``export_data`` together with the statement returned by
    ``export_content_cosine_similarity_query()``.
    """
    export_data(
        matrix_to_json(similarity_matrix),
        export_content_cosine_similarity_query(),
    )


def export_title_similarity(similarity_matrix):
    """Export the title similarity matrix.

    Rows are JSON-encoded with ``matrix_to_json`` and handed to
    ``export_data`` together with the statement returned by
    ``export_title_similarity_query()``.
    """
    export_data(
        matrix_to_json(similarity_matrix),
        export_title_similarity_query(),
    )


def export_cat_tags_similarity(similarity_matrix):
    """Export the category/tags similarity matrix.

    Rows are JSON-encoded with ``matrix_to_json`` and handed to
    ``export_data`` together with the statement returned by
    ``export_cat_tags_similarity_query()``.
    """
    export_data(
        matrix_to_json(similarity_matrix),
        export_cat_tags_similarity_query(),
    )


def export_content_theta(angles):
    """Export the content angle (theta) matrix.

    Rows are JSON-encoded with ``matrix_to_json`` and handed to
    ``export_data`` together with the statement returned by
    ``export_content_theta_query()``.
    """
    export_data(
        matrix_to_json(angles),
        export_content_theta_query(),
    )


def export_content_distance(distance):
    """Export the content distance matrix.

    Rows are JSON-encoded with ``matrix_to_json`` and handed to
    ``export_data`` together with the statement returned by
    ``export_content_distance_query()``.
    """
    export_data(
        matrix_to_json(distance),
        export_content_distance_query(),
    )


def export_content_magnitude(vector_size):
    """Export the per-row TF-IDF vector magnitudes.

    Parameters
    ----------
    vector_size : pandas.DataFrame or array-like
        One magnitude per row. A numpy ``matrix`` / ndarray (as returned by
        ``get_magnitude``) is accepted as well as a ready-made DataFrame.

    The data goes through ``matrix_to_json`` like every other exporter in
    this file, so each exported row carries both ``data_col`` and
    ``local_id``.
    """
    if isinstance(vector_size, pd.DataFrame):
        magnitudes = vector_size
    else:
        # np.asarray drops the numpy ``matrix`` subclass so the cells hold
        # plain floats rather than matrix objects the DB driver cannot escape.
        magnitudes = pd.DataFrame(np.asarray(vector_size))

    # NOTE(review): rows are stored as JSON text such as {"0": 127.41...},
    # the same shape matrix_to_json produces for the other exports --
    # confirm that consumers of the magnitude table expect this format.
    df = matrix_to_json(magnitudes)
    sql = export_content_magnitude_query()
    export_data(df, sql)


#-------------------------------------#
# MODEL CREATE DRIVER
#-------------------------------------#


truncate_similarities()
article_master = import_content()



## PREPROCESS CONTENT
print("Previous Model Truncated.")
print("Pre-processing....")


# REDUCE CONTENT:
# Strip the HTML, lower-case, then drop every character that is not a-z or
# whitespace. The patterns are raw strings so that "\s" reaches the regex
# engine untouched instead of being an invalid string escape.
# NOTE(review): a missing (None/NaN) bodytext or title is not handled here --
# confirm import_content() never returns one.
article_master['reduced_content'] = article_master['bodytext'].apply(
    lambda body: re.sub(r'[^a-z\s]', '', filter_html(body).lower()))

#-- Potential Global Variable

snowball = Porter2Stemmer()

article_master['stemmed_content'] = article_master['reduced_content'].apply(
    lambda text: text_stemmer(text, snowball))

# text_stemmer always returns a str, so this fillna looks like a pure safeguard.
article_master['stemmed_content'] = article_master['stemmed_content'].fillna('')



# REDUCE TITLE:
# It must be noted that numbers are removed from the content and not from the title
article_master['reduced_title'] = article_master['title'].apply(
    lambda title: re.sub(r'[^a-z0-9\s]', '', title.lower()))

article_master['stemmed_title'] = article_master['reduced_title'].apply(
    lambda text: text_stemmer(text, snowball))



# REDUCE TAGS AND CATEGORY
# clean_tags maps non-string values to '', so meta_soup is always a string.
article_master['reduced_category'] = article_master['category'].apply(clean_tags)
article_master['reduced_tags'] = article_master['tags'].apply(clean_tags)
article_master["meta_soup"] = article_master["reduced_category"] + ' ' + article_master['reduced_tags']




"""
#-- At this point the newly stemmed metadata content can be written to the database.
"""

#-------------------------------------#
## Preprocess Content - End
#-------------------------------------#



MySQL connection is closed
MySQL connection is closed
Previous Model Truncated.
Pre-processing....


'\n#-- At this point the newly stemmed metadata content can be written to the database.\n'

In [2]:
print("Creating new Model.")

# MODEL CREATION

# Define a TF-IDF Vectorizer Object for Un-normalised TF-IDF vectors
# Remove all english stop words such as 'the', 'a'
# norm=None leaves the rows unscaled; the later TS-SS cells read magnitudes and
# Euclidean distances from these same vectors (presumably why they are kept raw).


tfidf = TfidfVectorizer(stop_words = 'english', norm = None)
tfidf_vectors = tfidf.fit_transform(article_master['stemmed_content'])


# Called with a single argument, so every row is compared with every other row.
cosine_sim_content = cosine_similarity(tfidf_vectors)

Creating new Model.


In [3]:

# Export content similarity matrix
# The array is wrapped in a DataFrame because the export helper serialises it
# row by row via DataFrame.apply / Series.to_json (see matrix_to_json).
# NOTE(review): `df` is reassigned by several later cells; its content depends
# on which cell ran last.
df = pd.DataFrame.from_records(cosine_sim_content)
export_content_cosine_similarity(df)
print("Exported Content Cosine Similarity Matrix .")

MySQL connection is closed
Exported Content Cosine Similarity Matrix .


In [4]:
# Theta, Euclidean Distance and Magnitude of TF-IDF vectors: required for TS-SS similarity
# angles: arccos of the cosine similarities plus a 10-degree offset (see get_theta)
angles = get_theta(cosine_sim_content)
# euclidean_distance: pairwise distances between the TF-IDF rows
euclidean_distance = get_euclidean(tfidf_vectors)
# vector_size: one L2 norm per TF-IDF row
vector_size = get_magnitude(tfidf_vectors)

In [5]:
# Export Theta matrix
# Wrapped in a DataFrame so the export helper can JSON-encode it row by row.
df = pd.DataFrame.from_records(angles)
export_content_theta(df)

MySQL connection is closed


In [6]:
# Export Euclidean Distance  matrix
# Wrapped in a DataFrame so the export helper can JSON-encode it row by row.
df = pd.DataFrame.from_records(euclidean_distance)
export_content_distance(df)

MySQL connection is closed


In [60]:
# NOTE(review): scratch cell -- recomputes vector_size with the same expression
# that get_magnitude() already wraps.
vector_size = np.sqrt(tfidf_vectors.multiply(tfidf_vectors).sum(1))

In [138]:
# Scratch: view the magnitudes as a one-column DataFrame (column label 0).
# NOTE(review): displays the whole frame; df.head() would keep the output short.
df = pd.DataFrame.from_records(vector_size)
df


Unnamed: 0,0
0,127.418847
1,67.624464
2,62.065443
3,143.204814
4,53.245535
5,46.637391
6,58.557181
7,105.572534
8,248.123607
9,62.782771


In [139]:
# Scratch: JSON-encode each magnitude row; this repeats the first statement of
# matrix_to_json(). The cell rebinds `df` from its own previous value, so it is
# not safe to run twice in a row.
# df = pd.DataFrame.from_records(vector_size, columns=['data_col'])
df = pd.DataFrame(df.apply(lambda row: row.to_json(), axis=1), columns = ['data_col'])

In [140]:
 # Scratch: copy the index into local_id (second statement of matrix_to_json()).
 df['local_id'] = df.index

In [141]:
# Scratch: inspect the frame that is about to be exported (data_col, local_id).
df

Unnamed: 0,data_col,local_id
0,"{""0"":127.4188473234}",0
1,"{""0"":67.6244639575}",1
2,"{""0"":62.0654428818}",2
3,"{""0"":143.2048138668}",3
4,"{""0"":53.2455353861}",4
5,"{""0"":46.6373914753}",5
6,"{""0"":58.5571806358}",6
7,"{""0"":105.5725344669}",7
8,"{""0"":248.1236070731}",8
9,"{""0"":62.7827711249}",9


In [98]:
# NOTE(review): imports in the middle of the notebook; cells above already use
# `pd` and `np`, so on a fresh kernel those names have to come from elsewhere.
import pandas as pd
import numpy as np

# Required for connection to MySQL db
import pymysql
from db_config import *
from queries import *

# connection_properties is not defined in this notebook; presumably it is
# provided by the db_config star import -- verify.
# NOTE(review): no visible cell closes `conn` / `cursor`.
conn = pymysql.connect(**connection_properties)
cursor = conn.cursor()
    
    

In [145]:
# Scratch: fetch the statement used to export vector magnitudes.
sql = export_content_magnitude_query()

In [144]:
# Scratch: execute the statement once per row with (local_id, data_col) as parameters.
# NOTE(review): the recorded run failed with IntegrityError 1062 (duplicate
# primary key '0') -- presumably the target table already held these rows.
df.apply(lambda row: cursor.execute(sql, (row.local_id, row.data_col)), axis = 1)

IntegrityError: (1062, "Duplicate entry '0' for key 'PRIMARY'", 'occurred at index 0')

In [146]:
# Scratch: commit whatever the manual cursor.execute calls wrote on `conn`.
conn.commit()

In [126]:
# Scratch: look at data_col for the row at position 2.
df.iloc[[2]].data_col

2    62.065443
Name: data_col, dtype: float64

In [148]:

# Scratch: export the current `df` with the magnitude statement held in `sql`
# (both come from earlier scratch cells).
export_data(df, sql)

MySQL connection is closed


In [39]:
# Export Vector Magnitudes
# NOTE(review): `df` is not built in this cell; the recorded run reported
# "'matrix' object has no attribute 'translate'", so the frame in scope at the
# time presumably still held numpy matrix objects -- verify before re-running.

sql = export_content_magnitude_query()

export_data(df, sql)

Error while connecting to MySQL ("'matrix' object has no attribute 'translate'", 'occurred at index 0')


In [76]:
# Scratch: display the magnitudes cast to int; the result is not assigned.
vector_size.astype(int)

matrix([[127],
        [ 67],
        [ 62],
        [143],
        [ 53],
        [ 46],
        [ 58],
        [105],
        [248],
        [ 62],
        [150],
        [ 79],
        [ 86],
        [130],
        [133],
        [174],
        [ 77],
        [221],
        [130],
        [166],
        [123],
        [143],
        [ 84],
        [124],
        [ 87],
        [ 79],
        [108],
        [119],
        [ 73],
        [137],
        [286],
        [105],
        [119],
        [173],
        [222],
        [107],
        [134],
        [106],
        [ 76],
        [149],
        [203],
        [138],
        [301],
        [ 93],
        [174],
        [142],
        [105],
        [ 90],
        [ 38],
        [ 84],
        [157],
        [ 68],
        [111],
        [121],
        [ 50],
        [230],
        [ 80],
        [ 84],
        [ 48],
        [189],
        [131],
        [105],
        [372],
        [176],
        [123],
        [178],
        [1

In [43]:
# Scratch: rebuild and display the content cosine-similarity frame.
# NOTE(review): displays the whole frame and rebinds the shared name `df`.
df = pd.DataFrame.from_records(cosine_sim_content)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,400,401,402,403,404,405,406,407,408,409
0,1.000000,0.104482,0.082914,0.061083,0.047005,0.124568,0.032556,0.092386,0.515766,0.057733,...,0.071749,0.056079,0.062083,0.210171,0.136310,0.156927,0.338101,0.122195,0.173731,0.254307
1,0.104482,1.000000,0.217075,0.160044,0.117866,0.139348,0.193486,0.081696,0.157214,0.144680,...,0.064025,0.093637,0.113956,0.034278,0.060457,0.051357,0.038987,0.013311,0.026930,0.088829
2,0.082914,0.217075,1.000000,0.161446,0.158887,0.155142,0.175954,0.084396,0.145292,0.283712,...,0.068752,0.073040,0.113915,0.012304,0.052209,0.058223,0.037622,0.014409,0.028985,0.071749
3,0.061083,0.160044,0.161446,1.000000,0.115371,0.117243,0.111671,0.079302,0.113797,0.134122,...,0.107152,0.079064,0.099320,0.035586,0.023210,0.083986,0.036157,0.031647,0.013316,0.049020
4,0.047005,0.117866,0.158887,0.115371,1.000000,0.092860,0.113998,0.085072,0.089066,0.098925,...,0.078345,0.059542,0.090554,0.010974,0.032596,0.058066,0.033699,0.016796,0.007089,0.044744
5,0.124568,0.139348,0.155142,0.117243,0.092860,1.000000,0.061158,0.100885,0.157906,0.111670,...,0.061715,0.067939,0.072643,0.026925,0.093754,0.093286,0.058142,0.049888,0.018824,0.075193
6,0.032556,0.193486,0.175954,0.111671,0.113998,0.061158,1.000000,0.062641,0.087022,0.070677,...,0.051870,0.044008,0.074286,0.008953,0.021030,0.019947,0.004657,0.019594,0.019213,0.048920
7,0.092386,0.081696,0.084396,0.079302,0.085072,0.100885,0.062641,1.000000,0.078609,0.109317,...,0.070357,0.047382,0.089658,0.047537,0.040286,0.048535,0.021111,0.042749,0.032744,0.040248
8,0.515766,0.157214,0.145292,0.113797,0.089066,0.157906,0.087022,0.078609,1.000000,0.119481,...,0.077280,0.096891,0.082285,0.129284,0.244774,0.157856,0.257348,0.094910,0.161852,0.278119
9,0.057733,0.144680,0.283712,0.134122,0.098925,0.111670,0.070677,0.109317,0.119481,1.000000,...,0.078075,0.051716,0.084020,0.000000,0.026910,0.043121,0.039872,0.013201,0.014887,0.033786


In [None]:


# NOTE(review): nothing is exported in this cell before the message below; the
# theta and distance exports live in earlier cells, and no pipeline cell calls
# export_content_magnitude(vector_size) -- confirm the magnitude export is run.
print("Exported Content TS-SS Similarity Matrices .")

# Define a TF-IDF Vectorizer Object for normalised TF-IDF vectors
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix_title = tfidf.fit_transform(article_master['stemmed_title'])
# With normalised rows the plain dot product (linear_kernel) serves as the cosine similarity.
cosine_sim_title = linear_kernel(tfidf_matrix_title, tfidf_matrix_title)

# Export title similarity matrix
df = pd.DataFrame.from_records(cosine_sim_title)
export_title_similarity(df)
print("Exported Title Cosine Similarity Matrix .")


#-- Potential Global Variable
# Bag-of-words counts over the combined category + tags text (meta_soup).
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(article_master["meta_soup"])
cosine_sim_cat_tags = cosine_similarity(count_matrix, count_matrix)

# Export category/tags similarity matrix
df = pd.DataFrame.from_records(cosine_sim_cat_tags)
export_cat_tags_similarity(df)
print("Exported Tags/Category Cosine Similarity Matrix .")


# Map each (article_id, title) pair to its row label in article_master;
# drop_duplicates keeps the original index labels, so local_id is not renumbered.
article_map = (article_master[['article_id','title']].copy()).drop_duplicates()
article_map['local_id'] = article_map.index

# Export article_map
export_map(article_map)

print("Model Created")