In [75]:
import numpy as np
import pandas as pd
from pymongo import MongoClient
from bson.json_util import dumps
from datetime import datetime

import re, string, unicodedata, contractions, inflect

from bs4 import BeautifulSoup
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.stem import LancasterStemmer, WordNetLemmatizer, SnowballStemmer, PorterStemmer
from nltk.corpus import stopwords
from textblob import TextBlob, Word
from stemming.porter2 import stem

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from scipy.stats import pearsonr

from subprocess import check_output
import bson
stop = stopwords.words('english')

## reading data from the file dataset

In [76]:
# ratings = pd.read_csv("../data/processed/rating.csv")
# users   = pd.read_csv("../data/processed/users.csv")
# blogs = pd.read_csv("../data/processed/blogs.csv")

# ratings["rating"] = ratings["rating"].astype(float)

# # ratings to pivot table
# rating_piot = pd.pivot_table(ratings, values="rating", index="userId", columns="movieId")
# df_tags = pd.DataFrame(blogs.fillna("").apply(lambda x: x[3]+x[4]+x[5]+x[6], axis=1))
# df_tags.columns = ["content"]
# df_tags['raw_data'] = df_tags["content"]

## Reading data from the mongodb

In [77]:
def _connect_mongo(host='localhost', port=27017, username=None, password=None, db="Blog_Recommendation"):
    """ A util for making a connection to mongo """

    if username and password:
        mongo_uri = 'mongodb://%s:%s@%s:%s/%s' % (username, password, host, port, db)
        conn = MongoClient(mongo_uri)
    else:
        conn = MongoClient(host, port)


    return conn[db]


def read_mongo(db, collection, query={}, selection=None, host='localhost', port=27017, username=None, password=None, no_id=True):
    """ Read from Mongo and Store into DataFrame """

    # Connect to MongoDB
    db = _connect_mongo(host=host, port=port, username=username, password=password, db=db)

    # Make a query to the specific DB and Collection
    cursor = db[collection].find(query, selection)

    # Expand the cursor and construct the DataFrame
    df =  pd.DataFrame(list(cursor))

    # Delete the _id
    if no_id:
        del df['_id']

    return df

In [78]:
users = read_mongo('Blog_Recommendation',"Users", {})
blogs = read_mongo('Blog_Recommendation',"Blogs", {})
views = read_mongo('Blog_Recommendation',"Views", query={},selection={"blogid": 1, "userid" : 1, "viewedcount":1})

In [79]:

views["viewedcount"] = views["viewedcount"].astype(float)

# ratings to pivot table
rating_piot = pd.pivot_table(views, values="viewedcount", index="userid", columns="blogid")

data_blog = blogs.fillna("")[["content", "subtitle", "tags", "title"]]
df_tags = pd.DataFrame(data_blog["content"] + " " +data_blog["subtitle"] + " " +data_blog["tags"] + " " +data_blog["title"])

# df_tags = pd.DataFrame(blogs.fillna("").apply(lambda x: x[3]+x[4]+x[5]+x[6], axis=1))
df_tags.columns = ["content"]
df_tags['raw_data'] = df_tags["content"]

## Feature extraction dataset

In [None]:
# ## removal of noise and contractions

# def strip_html(text):
#     soup = BeautifulSoup(text, "html.parser")
#     return soup.get_text()

# def remove_between_square_brackets(text):
#     return re.sub('\[[^]]*\]', '', text)

# def denoise_text(text):
#     text = strip_html(text)
#     text = remove_between_square_brackets(text)
#     return text

# sample = denoise_text(sample)
# # print(sample)


# def replace_contractions(text):
#     """Replace contractions in string of text"""
#     return contractions.fix(text)

# sample = replace_contractions(sample)
# # print(sample)


def replace_noise_contraction(text):
    return contractions.fix(re.sub('\[[^]]*\]', '', BeautifulSoup(text, "html.parser").get_text()))

def 


In [46]:
# word count
df_tags['word_count'] = df_tags['content'].apply(lambda x: len(str(x).split(" ")))
# char count
df_tags['char_count'] = df_tags['content'].str.len()

# average words
def avg_word(sentence):
    words = sentence.split()
    return (sum(len(word) for word in words)/len(words))

df_tags['avg_word'] = df_tags["content"].apply(avg_word)

# stop words count
df_tags["stopwords"] = df_tags["content"].apply(lambda x: len([y for y in x.split() if y in stop]))

#hash tags
df_tags["hashtags"] = df_tags["content"].apply(lambda x: len([y for y in x.split() if y.startswith('#')]))

#numerics
df_tags["numerics"] = df_tags["content"].apply(lambda x: len([y for y in x.split() if y.isdigit()]))

#uppercase
df_tags["uppercase"] = df_tags["content"].apply(lambda x: len([y for y in x.split() if y.isupper()]))


## Preprocessing dataset

In [98]:
words = nltk.word_tokenize(sample)
# print(words)

In [99]:
def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

def replace_numbers(words):
    """Replace all interger occurrences in list of tokenized words with textual representation"""
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words

def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        if word not in stopwords.words('english'):
            new_words.append(word)
    return new_words

def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas

def normalize(words):
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    words = replace_numbers(words)
    words = remove_stopwords(words)
    return words

words = normalize(words)
# print(words)

In [89]:
def stem_and_lemmatize(words):
    stems = stem_words(words)
    lemmas = lemmatize_verbs(words)
    return stems, lemmas

stems, lemmas = stem_and_lemmatize(words)
print('Stemmed:\n', stems)
print('\nLemmatized:\n', lemmas)

Stemmed:
 ['giv', 'blood', 'yo', 'today', 'lik', 'join', 'elit', 'hyperexclud', 'club', 'pap', 'know', 'gay', 'men', 'elig', 'don', 'blood', 'stop', 'try', 'pap', 'know', 'elig', 'giv', 'blood', 'stop', 'try', 'heal', 'lgbtq', 'eq', 'gay', 'memoir', 'mean', 'don', 'blood', 'gay', 'man', 'americ', 'venezuel', 'ang', 'econom', 'ineq', 'fuel', 'ris', 'strongman', 'powerhungry', 'milit', 'act', 'interest', 'undermin', 'democr', 'institut', 'could', 'hap', 'yo', 'institut', 'protect', 'peopl', 'chos', 'pow', 'weal', 'instead', 'could', 'hap', 'yo', 'venezuel', 'govern', 'polit', 'econom', 'donaldtrump', 'cris', 'venezuel', 'tel', 'us', 'democr', 'peopl', 'thrive', 'tak', 'adv', 'every', 'chant', 'optim', 'person', 'decid', 'big', 'smal', 'on', 'peopl', 'efficy', 'mat', 'perspect', 'money', 'simpl', 'complex', 'machinelearn', 'fight', 'complex', 'bia', 'ten', 'year', 'serv', 'op', 'iraq', 'freedom', 'upd', 'is', 'lead', 'chang', 'iraq', 'mak', 'quest', 'everyth', 'try', 'mak', 'sens', 'war',

In [100]:
# Perform all the functions

In [111]:
vect = TfidfVectorizer(max_features=1200, lowercase=True, analyzer="word", stop_words="english", ngram_range=(1,1))
vect.fit_transform(stems)

<1083x579 sparse matrix of type '<class 'numpy.float64'>'
	with 1013 stored elements in Compressed Sparse Row format>

In [112]:
len(vect.get_feature_names())

579

In [106]:
len(stems)

1083

In [47]:


#transform content to lower case
df_tags["content"] = df_tags["content"].apply(lambda x: " ".join(y.lower() for y in x.split()))

# removing punctuation
df_tags["content"] = df_tags["content"].str.replace('[^\w\s]', '')

# removal of stopwords
df_tags["content"] = df_tags["content"].apply(lambda x: " ".join(y for y in x.split() if y not in stop))

#common word removal
# freq = list(pd.Series(' '.join(df_tags["content"]).split()).value_counts()[:10].index)
# df_tags["content"] = df_tags["content"].apply(lambda x: " ".join(y for y in x.split() if y not in freq))

#Rare words removal
# freq = list(pd.Series(" ".join(df_tags["content"]).split()).value_counts()[-10:].index)
# df_tags['content'] = df_tags["content"].apply(lambda x: " ".join( y for y in x.split() if y not in freq))

# spelling correction
df_tags["content"] = df_tags["content"].apply(lambda x: str(TextBlob(x).correct()))

# Tokenization
# TextBlob(df_tags["content"][0]).words

#stemming
# df_tags["content"] = df_tags["content"].apply(lambda x: " ".join([stem(y) for y in x.split()]))

#Lemmatization
df_tags["content"] = df_tags["content"].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))



## Text processing with TfIdf and count vector

In [48]:
## Adv Text Processing

#N-gram N=2
# ngramVec = df_tags["content"].apply(lambda x: TextBlob(x).ngrams(2))

# Term Frequency
# tf1 = df_tags["content"].apply(lambda x: pd.value_counts(x.split(" "))).sum(axis=0).reset_index()
# tf1.columns = ["words", "tf"]

# # Inverse document frequency
# for i, word in enumerate(tf1["words"]):
#     tf1.loc[i, 'idf'] = np.log(df_tags.shape[0]/(len(df_tags[df_tags["content"].str.contains(word)])))


# Term frequency and inverse document frequency
# tf1["tfidf"] = tf1["tf"]*tf1["idf"]


# Term frequency and inverse document frequency with sklearn lib
tfidf = TfidfVectorizer(max_features=450, lowercase=True, analyzer="word", stop_words="english", ngram_range=(1,1))
tfidf_vect = tfidf.fit_transform(df_tags["content"])

df_tfidf = pd.DataFrame(tfidf_vect.toarray())
df_tfidf.columns = tfidf.get_feature_names()

# merge columns with tfidf
df_tags_process = df_tags.merge(df_tfidf, left_index=True, right_index=True)

#Bag of words
# bow = CountVectorizer(max_features=300, lowercase=True, analyzer="word", ngram_range=(1,1))
# train_bow = bow.fit_transform(df_tags["content"])
# bow.get_feature_names()

#Sentiment analysis
df_tags_process["sentiment"] = df_tags["raw_data"].apply(lambda x: TextBlob(x).sentiment[0])
df_tags_process = df_tags_process.drop(["content", "raw_data"], axis=1)

## Similarity of a matrix with pearson and cosine 

In [351]:
df_similarity = df_tags.drop(["content", "raw_data"], axis=1)

## pearson correlation with pandas dataframe
df_pearsonr = df_similarity.transpose().corr(method='pearson', min_periods=1)  # [1].sort_values(ascending=False)
## cosine similarity  correlation
cosine_sim = cosine_similarity(df_similarity, df_similarity)       #pd.Series( [1]).sort_values(ascending=False)


In [302]:
# cosine_sim finding with index
cosine_values = pd.Series(cosine_sim[1]).sort_values(ascending=False)
pearson_values = df_pearsonr[1].sort_values(ascending=False)

In [325]:
for i in range(0, 42):
    print(round(cosine_values[i]-pearson_values[i], 6))

-1e-06
0.0
-1e-06
-1e-06
-1e-06
0.0
0.0
0.0
-0.0
1e-06
-1e-06
-1e-06
-0.0
0.0
1e-06
1e-06
3e-06
1e-06
0.0
2e-06
0.0
-0.0
1e-06
-0.0
-1e-06
0.0
0.0
-2e-06
-1e-06
-0.0
-0.0
0.0
1e-06
1e-06
1e-06
2e-06
-3e-06
4e-06
1e-06
-0.0
1e-06
1e-06


In [67]:
# pd.Series(cosine_similarity(df_similarity, df_similarity)[1]).sort_values(ascending=False)

# blogs
df_tags_cnt22 = lambda x: " ".join([y for y in x.split() if not y.isdigit()])

In [69]:
# len(df_tags.columns)
views
df_tags_cnt22(df_tags["content"][22])
df_tags["content"][22].split()

['It',
 'is',
 '1997.',
 'It',
 'is',
 '2017.',
 'It',
 'doesn’t',
 'matter.',
 'It',
 'is',
 'both.',
 'In',
 'years,',
 'my',
 'life',
 'has',
 'come',
 'full',
 'circle,',
 'degrees',
 'for',
 'real.',
 'At',
 '47,',
 'my',
 'life',
 'looks',
 'uncannily',
 'the',
 'same',
 'way',
 'it',
 'did',
 'at',
 '27.',
 'Post-divorce,',
 'I’ve',
 'returned',
 'to',
 'my',
 'old',
 'ways',
 'aging',
 'adulthood',
 'dating',
 'self',
 'culture',
 'My',
 'Life',
 'at',
 'Is',
 'Back',
 'to',
 'What',
 'It',
 'Was',
 'Like',
 'at']

In [74]:
# pearsonr = np.corrcoef(df_similarity)[0]
# pearsonr.sort()

# df_tags['content'][18]

from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\!()-[]{};:'"\,<>./?@#$%^&*_~+')
tokenizer.tokenize('Eighty-seven miles to go, yet.  Onward!')


SyntaxError: EOL while scanning string literal (<ipython-input-74-f1d04bb67996>, line 8)

In [339]:
# df_similarity.corr(method='pearson', min_periods=1)

# define punctuation
punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''

my_str = "Hello!!!, he said ---and went."

# To take input from the user
# my_str = input("Enter a string: ")

# remove punctuation from the string
no_punct = ""
for char in my_str:
   if char not in punctuations:
       no_punct = no_punct + char

# display the unpunctuated string
print(no_punct)



In [57]:
"90".isdigit()

True

In [None]:
blogs_db.insert_many(data_many)

In [355]:
df_tags_process = df_tags_process.drop(["content", "raw_data"], axis=1)

In [356]:
df_tags_process.columns[-10:]

Index(['workethic', 'workforce', 'workstech', 'worldkonmari', 'writer',
       'wrote', 'year', 'youprivacy', 'yoursdesign', 'sentiment'],
      dtype='object')

In [359]:
db.content_similarity.insert_many(list(df_tags_process.T.to_dict().values()))

<pymongo.results.InsertManyResult at 0x151f3c43fc8>

In [358]:
db.content_similarity

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'Blog_Recommendation'), 'content_similarity')

In [368]:
bson.son.SON({})

SON([])

In [49]:
tfidf.get_feature_names()

['1000',
 '1997',
 '20',
 '2016',
 '2017',
 '2018',
 '2019',
 '25',
 '27',
 '360',
 '47',
 '780000',
 'adulthood',
 'advance',
 'advantage',
 'advertising',
 'advice',
 'aesthetic',
 'afford',
 'ago',
 'anger',
 'apple',
 'art',
 'author',
 'available',
 'avoid',
 'awkward',
 'axillary',
 'bad',
 'barrier',
 'base',
 'basic',
 'basicincome',
 'believe',
 'better',
 'bias',
 'big',
 'bland',
 'blood',
 'blossom',
 'bowl',
 'build',
 'business',
 'canon',
 'care',
 'career',
 'cash',
 'chance',
 'cigarette',
 'commerce',
 'communicating',
 'communication',
 'community',
 'company',
 'complex',
 'complexity',
 'composer',
 'compromise',
 'concept',
 'confidencebuilding',
 'confirmation',
 'conservative',
 'conspiracy',
 'conspiracytheories',
 'control',
 'conversation',
 'copying',
 'counterpart',
 'couture',
 'covering',
 'cowardly',
 'crazy',
 'creating',
 'creative',
 'creativity',
 'creator',
 'credit',
 'crisis',
 'criticizing',
 'crush',
 'cultural',
 'culture',
 'customer',
 'cysti

In [16]:
tfidf.get_feature_names()

['1000',
 '1cb6b5a58072companies',
 '1e11a13afbecif',
 '1e11a13afbecpostdivorce',
 '255075e61b5f2019',
 '27',
 '2879ca55026aefficiency',
 '29927d35d476how',
 '2abb3ee3a489when',
 '324326475b88and',
 '36bb350e8f70better',
 '36bb350e8f70credit',
 '3a2108752087he',
 '41e48b43404fif',
 '441cd3efcd54how',
 '47',
 '486aefe3d462the',
 '5254bd738dcdecigarettes',
 '56ad0b00a091the',
 '64c49bd92b9ethe',
 '64dc0564597athe',
 '6dedc06fe4e4what',
 '73720195c226vonnegut',
 '78e429aad85ait',
 '799bdc99a0fdencouragement',
 '7bcec4b8c02ea',
 '7bcec4b8c02eget',
 '7bcec4b8c02ei',
 '7da2a328fd85great',
 '88325be720b4lets',
 '890b995a50b5people',
 '96d8c816c5dbcustomers',
 '9cca1cc5944fheres',
 'a8634106cb64on',
 'add55d826321sometimes',
 'addictionexperts',
 'adsadvertising',
 'adulthood',
 'advancetechnology',
 'advertising',
 'advice',
 'aesthetic',
 'afc1ff39833eembarrassing',
 'afford',
 'affordability',
 'age',
 'agriculture',
 'america',
 'analysis',
 'androidyour',
 'anger',
 'anniversary',
 'anxie

In [27]:
df_tags

Unnamed: 0,content,raw_data,word_count,char_count,avg_word,stopwords,hashtags,numerics,uppercase
0,a8634106cb64on paper know in eligible give blo...,"a8634106cb64On paper, I know why I’m not eligi...",32,185,4.470588,12,0,0,1
1,64dc0564597athe institution protected people c...,64dc0564597aThe institutions that should have ...,31,234,6.580645,10,0,0,0
2,2879ca55026aefficiency matter perspectivemoney...,2879ca55026aEfficiency is a matter of perspect...,14,129,8.285714,4,0,0,0
3,73720195c226vonnegut dresden fallujahveterans ...,"73720195c226Vonnegut had Dresden, and I had Fa...",17,118,6.0,5,0,0,1
4,78e429aad85ait cash netflix audience idea stil...,"78e429aad85aIt has the cash, and Netflix has t...",25,168,5.76,7,0,0,0
5,9cca1cc5944fheres new smartphone teach workste...,9cca1cc5944fHere’s how the new smartphone tech...,20,136,5.85,5,0,0,0
6,96d8c816c5dbcustomers willing pay upwards 1000...,96d8c816c5dbCustomers are willing to pay upwar...,29,221,6.16129,11,0,0,0
7,486aefe3d462the untold origin story ironic loo...,486aefe3d462The untold origin story of an icon...,28,182,5.535714,7,0,0,0
8,b42354b051f1a small group welleducated profess...,b42354b051f1A small group of well-educated pro...,31,225,6.290323,9,0,0,1
9,7da2a328fd85great news creator follow step fin...,7da2a328fd85Great news! You can be a creator. ...,27,203,6.555556,7,0,0,0


In [26]:
 blogs["tags"][0] + blogs["content"] + blogs["title"] + blogs["subtitle"]

0     health lgbtq equality gay memoirGiving blood i...
1     health lgbtq equality gay memoirIn Venezuela, ...
2     health lgbtq equality gay memoirSome people th...
3     health lgbtq equality gay memoirTen years afte...
4                                                   NaN
5                                                   NaN
6     health lgbtq equality gay memoirCustomers are ...
7     health lgbtq equality gay memoirA chance meeti...
8                                                   NaN
9                                                   NaN
10    health lgbtq equality gay memoirThe author of ...
11    health lgbtq equality gay memoirEveryone seems...
12                                                  NaN
13    health lgbtq equality gay memoirSociety is lea...
14                                                  NaN
15                                                  NaN
16                                                  NaN
17                                              

In [22]:
blogs.fillna("")

Unnamed: 0,blogid,content,createrdate,createrid,subtitle,tags,title
0,2524c43bb922,Giving blood in the U.S. today is like joining...,1544250000000.0,a8634106cb64,"On paper, I know why I’m not eligible to give ...",health lgbtq equality gay memoir,What it Means to Donate Blood as a Gay Man in ...
1,e11c38f98977,"In Venezuela, anger over economic inequality f...",1548510000000.0,64dc0564597a,The institutions that should have protected th...,venezuela government politics economics donald...,What the Crisis in Venezuela Tells Us About De...
2,6052b0cc43ac,Some people thrive on taking advantage of ever...,1539530000000.0,2879ca55026a,Efficiency is a matter of perspective,money simplicity complexity machine-learning,How to Fight Against Complexity Bias
3,daaaaff085c,Ten years after I served in Operation Iraqi Fr...,1533730000000.0,73720195c226,"Vonnegut had Dresden, and I had Fallujah",veterans war life iraq kurt-vonnegut,Making Sense of My Time at War
4,f52733f4666,,1549490000000.0,78e429aad85a,"It has the cash, and Netflix has the audience....",netflix apple predictions business analysis,8 Reasons Why Apple Won’t Buy Netflix
5,bc30074fb93d,,1549470000000.0,9cca1cc5944f,Here’s how the new smartphone tech works,tech huawei smartphones cameras android,Your Next Phone May Have a Hole in the Screen
6,66771889da10,"Customers are willing to pay upwards of $1,000...",1548930000000.0,96d8c816c5db,"Customers are willing to pay upwards of $1,000...",apple business psychology economics product-ma...,The Simple Psychology Behind Apple’s Fall
7,ebbcb4723c0e,A chance meeting between a man with idle machi...,1548890000000.0,486aefe3d462,The untold origin story of an iconic workout m...,fitness oprah exercise history marketing,From Oil to Oprah: An Oral History of the Stai...
8,b5b8c7911819,,1549420000000.0,b42354b051f1,A small group of well-educated professionals e...,technology economics work inequality,Tech Is Splitting the U.S. Workforce in Two
9,ecfca5ab7518,,1549310000000.0,7da2a328fd85,Great news! You can be a creator. Follow these...,creativity productivity self how-to,How to Be Creative
