In [1]:
import pandas as pd

# TfIdfVectorizer from scikit-learn for text
from sklearn.feature_extraction.text import TfidfVectorizer


# Import CountVectorizer to create count matrix for tags
# This is an alternative to tfidf
from sklearn.feature_extraction.text import CountVectorizer


# Requried to tokenise the text before Stemming
from nltk.tokenize import sent_tokenize, word_tokenize
# from nltk.stem import PorterStemmer
from porter2stemmer import Porter2Stemmer

# Import linear_kernel for Cosine Similarity calculation of bodytext and title
# This wil be applied on a tfidf matrix and NOT a count matrix
from sklearn.metrics.pairwise import linear_kernel

# Compute the Cosine Similarity matrix based on a count_matrix
from sklearn.metrics.pairwise import cosine_similarity



# Funtions interacting with the database
from db_functions import *

# DB Queries generated in here
from queries import *

from bs4 import BeautifulSoup


import re

In [2]:
#-------------------------------------#
# MODEL CREATE HELPER FUNCTIONS
#-------------------------------------#



def filter_html(text):
    soup = BeautifulSoup(text, features="html5lib")
    # text = re.sub('[^a-z\s]', '',soup.get_text(separator=' ').lower())
    text = soup.get_text(separator=' ')
    return text



def text_stemmer (txt, stemmer):
    token_words=word_tokenize(txt)
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(stemmer.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)



def clean_tags(x):
    if isinstance(x, str):
        return str.lower(x.replace(" ", "")).replace(","," ")

    else:
        return ''




#-------------------------------------#
# MODEL EXPORT HELPER FUNCTIONS
#-------------------------------------#

def matrix_to_jason(matrix):
    df = pd.DataFrame(matrix.apply(lambda row: row.to_json(), axis=1), columns = ['jsol_col'])
    df['local_id'] = df.index
    return df


def export_content_similarity (similarity_matrix):
    df = matrix_to_jason(similarity_matrix)
    sql = export_content_similarity_query()
    export_data(df, sql)


def export_title_similarity (similarity_matrix):
    df = matrix_to_jason(similarity_matrix)
    sql = export_title_similarity_query()
    export_data(df, sql)


def export_cat_tags_similarity (similarity_matrix):
    df = matrix_to_jason(similarity_matrix)
    sql = export_cat_tags_similarity_query()
    export_data(df, sql)






In [9]:
#-------------------------------------#
# MODEL CREATE DRIVER
#-------------------------------------#


truncate_similarities()
article_master = import_content()



## PREPROCESS CONTENT



# REDUCE CONTENT:
article_master['reduced_content'] = article_master.apply\
    (lambda row: re.sub('[^a-z\s]', '',filter_html(row.bodytext).lower()), axis = 1)

#-- Potential Global Variable

# porter = PorterStemmer()
snowball = Porter2Stemmer()

article_master['stemmed_content'] = article_master.apply\
    (lambda row: text_stemmer(row.reduced_content, snowball), axis = 1)

article_master['stemmed_content'] = article_master['stemmed_content'].fillna('')



# REDUCE TITLE:
# It must be noted that numbers are removed from the content and not from the title
article_master['reduced_title'] = article_master.apply\
    (lambda row: re.sub('[^a-z0-9\s]', '',row.title.lower()), axis = 1)

article_master['stemmed_title'] = article_master.apply\
    (lambda row: text_stemmer(row.reduced_title, snowball), axis = 1)



# REDUCE TAGS AND CATEGORY
article_master['reduced_category'] = article_master['category'].apply(clean_tags)
article_master['reduced_tags'] = article_master['tags'].apply(clean_tags)
article_master["meta_soup"] = article_master["reduced_category"] + ' ' + article_master['reduced_tags']




"""
#-- At this point the newly stemmed metadata content can be written to the database.
"""

#-------------------------------------#
## Preprocess Content - End
#-------------------------------------#



# MODEL CREATION

# Define a TF-IDF Vectorizer Object.
# Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix_content = tfidf.fit_transform(article_master['stemmed_content'])


# Create additional step that uses TS-SS similarity.
cosine_sim_content = linear_kernel(tfidf_matrix_content, tfidf_matrix_content)

# Export content similarity matrix
df = pd.DataFrame.from_records(cosine_sim_content)
export_content_similarity(df)




tfidf_matrix_title = tfidf.fit_transform(article_master['stemmed_title'])
cosine_sim_title = linear_kernel(tfidf_matrix_title, tfidf_matrix_title)

# Export title similarity matrix
df = pd.DataFrame.from_records(cosine_sim_title)
export_title_similarity(df)



#-- Potential Global Variable
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(article_master["meta_soup"])
cosine_sim_cat_tags = cosine_similarity(count_matrix, count_matrix)

# Export title similarity matrix
df = pd.DataFrame.from_records(cosine_sim_cat_tags)
export_cat_tags_similarity(df)



article_map = (article_master[['article_id','title']].copy()).drop_duplicates()
article_map['local_id'] = article_map.index

# Export article_map
export_map(article_map)

print("Model Created")

MySQL connection is closed
MySQL connection is closed
MySQL connection is closed
MySQL connection is closed
MySQL connection is closed
MySQL connection is closed
Model Created


In [10]:
cosine_sim_content

array([[1.        , 0.10448185, 0.08291432, ..., 0.1221949 , 0.1737309 ,
        0.25430651],
       [0.10448185, 1.        , 0.2170747 , ..., 0.01331078, 0.02692974,
        0.08882918],
       [0.08291432, 0.2170747 , 1.        , ..., 0.01440918, 0.02898548,
        0.07174855],
       ...,
       [0.1221949 , 0.01331078, 0.01440918, ..., 1.        , 0.08540134,
        0.10624187],
       [0.1737309 , 0.02692974, 0.02898548, ..., 0.08540134, 1.        ,
        0.35348908],
       [0.25430651, 0.08882918, 0.07174855, ..., 0.10624187, 0.35348908,
        1.        ]])

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

vectorizer = CountVectorizer(stop_words='english')
trainVectorizerArray = vectorizer.fit_transform(article_master['stemmed_content'])
transformer = TfidfTransformer()
res = transformer.fit_transform(trainVectorizerArray)
cos_sim = linear_kernel(res, res)

In [21]:
cos_sim

array([[1.        , 0.10448185, 0.08291432, ..., 0.1221949 , 0.1737309 ,
        0.25430651],
       [0.10448185, 1.        , 0.2170747 , ..., 0.01331078, 0.02692974,
        0.08882918],
       [0.08291432, 0.2170747 , 1.        , ..., 0.01440918, 0.02898548,
        0.07174855],
       ...,
       [0.1221949 , 0.01331078, 0.01440918, ..., 1.        , 0.08540134,
        0.10624187],
       [0.1737309 , 0.02692974, 0.02898548, ..., 0.08540134, 1.        ,
        0.35348908],
       [0.25430651, 0.08882918, 0.07174855, ..., 0.10624187, 0.35348908,
        1.        ]])

In [23]:
print ((res))

  (0, 7600)	0.0323988290846911
  (0, 7593)	0.03437920720840217
  (0, 7554)	0.044203027822719815
  (0, 7448)	0.03197450236689372
  (0, 7426)	0.08840605564543963
  (0, 7361)	0.018483561653999895
  (0, 7351)	0.0237767742494319
  (0, 7346)	0.0475535484988638
  (0, 7345)	0.09292158970487996
  (0, 7336)	0.036263849805273675
  (0, 7332)	0.04698271231670053
  (0, 7275)	0.02613203486239744
  (0, 7223)	0.044203027822719815
  (0, 7216)	0.023632767777440975
  (0, 7134)	0.05217460288604086
  (0, 7132)	0.0496429387552095
  (0, 6932)	0.07567748003436155
  (0, 6897)	0.030823938872783996
  (0, 6886)	0.0410208839199503
  (0, 6836)	0.025384027940294318
  (0, 6808)	0.03541735507889205
  (0, 6758)	0.05436001385597321
  (0, 6753)	0.030475075549150233
  (0, 6739)	0.027407504958946446
  (0, 6727)	0.0945369205760957
  :	:
  (409, 6608)	0.23385330101976834
  (409, 6604)	0.22098214315140088
  (409, 6600)	0.13638377581341257
  (409, 5837)	0.2830062151484228
  (409, 5683)	0.12589647480227914
  (409, 5535)	0.163362

In [24]:
tfidf1 = TfidfVectorizer(stop_words='english', norm='None')
res2 = tfidf.fit_transform(article_master['stemmed_content'])

In [26]:
print(res2)

  (0, 3832)	0.044203027822719815
  (0, 3566)	0.028931267090839943
  (0, 3374)	0.019179911664006385
  (0, 3144)	0.04964293875520949
  (0, 7275)	0.02613203486239744
  (0, 1027)	0.030823938872783993
  (0, 2957)	0.022443384092761094
  (0, 741)	0.03437117802332962
  (0, 3785)	0.01909941729144975
  (0, 7448)	0.03197450236689372
  (0, 6171)	0.03876311689023013
  (0, 2681)	0.03783874001718077
  (0, 932)	0.0248663810679727
  (0, 5600)	0.01770867753944602
  (0, 4490)	0.061647877745567986
  (0, 867)	0.018192856844951297
  (0, 4868)	0.029512876277474738
  (0, 6753)	0.03047507554915023
  (0, 4922)	0.04964293875520949
  (0, 1364)	0.029216685181921574
  (0, 2862)	0.036819739454714645
  (0, 1195)	0.04102088391995029
  (0, 5265)	0.04646079485243997
  (0, 7351)	0.023776774249431896
  (0, 5433)	0.04964293875520949
  :	:
  (409, 4222)	0.16184558335516752
  (409, 5509)	0.24201053348233292
  (409, 6875)	0.17000281581773208
  (409, 1815)	0.13392035124795357
  (409, 2431)	0.16037853682552583
  (409, 4878)	0.1

In [27]:
res2

<410x7666 sparse matrix of type '<class 'numpy.float64'>'
	with 54328 stored elements in Compressed Sparse Row format>

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
import numpy as np
import numpy.linalg as LA

train_set = ["The sky is blue.", "The sun is bright."]  # Documents

vectorizer = CountVectorizer(stop_words='english')
trainVectorizerArray = vectorizer.fit_transform(train_set)
trainVectorizerArray = trainVectorizerArray.toarray()

print ('Fit Vectorizer to train set')
print(trainVectorizerArray)

transformer = TfidfTransformer()
res = transformer.fit_transform(trainVectorizerArray)
print()
print ((res.todense()))



Fit Vectorizer to train set
[[1 0 1 0]
 [0 1 0 1]]

[[0.70710678 0.         0.70710678 0.        ]
 [0.         0.70710678 0.         0.70710678]]


In [16]:
tfidf = TfidfVectorizer(stop_words='english')
res1 = tfidf.fit_transform(train_set)
print ((res1.todense()))

[[0.70710678 0.         0.70710678 0.        ]
 [0.         0.70710678 0.         0.70710678]]


In [18]:
tfidf1 = TfidfVectorizer(stop_words='english', norm='None')
res2 = tfidf.fit_transform(train_set)
print(type(res2))
print ((res2.todense()))

<class 'scipy.sparse.csr.csr_matrix'>
[[0.70710678 0.         0.70710678 0.        ]
 [0.         0.70710678 0.         0.70710678]]
