In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
import scipy.sparse as sp
from numpy.linalg import norm
import numpy.matlib as mb

class TFIDF(object):

    def __init__(self, corpus_title,appendix,corpus,title_grams = 3,appendix_grams = 2,corpus_grams = 1,title_weight = 10, appendix_weight = 5):        
        self.corpus = corpus
        self.corpus_title = corpus_title
        self.corpus_appendix = appendix
        

        self.title_weight = title_weight
        self.appendix_weight = appendix_weight
        
        self.title_vect = CountVectorizer(ngram_range=(1,title_grams))
        self.appen_vect = CountVectorizer(ngram_range=(1,appendix_grams))        
        self.corpus_vect = CountVectorizer(ngram_range=(1,corpus_grams)) 
        
        self.norm_corpus  = None        
        self.norm_corpus_title = None
        self.norm_corpus_appendix = None
    
    
    def __clean__(self,d):
        #Removing Stop Words
        stop_words = nltk.corpus.stopwords.words('english')
        d = re.sub(r'[^a-zA-Z0-9\s]', '', str(d), re.I|re.A)
        d = d.lower().strip()
        tks = nltk.word_tokenize(d)
        return(' '.join([t for t in tks if t not in stop_words]))
        
    def __normalize_corpus(self, title,appendix, corp):
        corp = self.__clean__(corp)
        title = self.__clean__(title)
        appendix = self.__clean__(appendix)
        
        # Title ngrams
        self.title_vect.fit([str(title)])
        titletrigrams = self.title_vect.get_feature_names() 
        
        # Appenidx ngrams
        self.appen_vect.fit([str(appendix)])
        apptrigrams = self.appen_vect.get_feature_names() 
        
        # Corpus ngrams
        self.corpus_vect.fit([str(corp)])
        corptrigrams = self.corpus_vect.get_feature_names()
        
   
        return ('|'.join(titletrigrams),'|'.join(apptrigrams),'|'.join(corptrigrams))

###### Vector Functions
    def preprocessing_text(self):
        n_c = np.vectorize(self.__normalize_corpus)
        self.norm_corpus_title,self.norm_corpus_appendix,self.norm_corpus = n_c(self.corpus_title,self.corpus_appendix,self.corpus)
        del(self.corpus_title,self.corpus_appendix,self.corpus)

    def build_corp_vect(self,title,appendix,corp):   
        document = title + '|' + appendix + '|'+ corp
        words = document.split('|')
        # words = np.lib.pad(words, ((0,self.N-len(words))), 'constant', constant_values= '')
        words = np.array(words,dtype= object)
        return(words)
    
    def n_tf_builder(self,title,appendix,corp):
        doc_words = corp.split('|')
        title_words = title.split('|')
        appendix_words = appendix.split('|')
        bowf_doc = Counter({w:1 for w in doc_words})
        bowf_doc.update({w:self.title_weight for w in title_words})
        bowf_doc.update({w:self.appendix_weight for w in appendix_words})
        bowf_doc.update(self.features_dict)
        return(bowf_doc)
######

    def build_corpus(self):
        #Corpus
        corp_vect = np.vectorize(self.build_corp_vect)
        corpus_words_array = corp_vect(self.norm_corpus_title,self.norm_corpus_appendix,self.norm_corpus)
        corpus_words = list(set(np.concatenate(corpus_words_array,axis =0)))
        features_dict = {w:0 for w in corpus_words}
        self.features_dict = features_dict
        # self.tf = pd.DataFrame([],columns = dict(sorted(self.features_dict.items(),key= lambda x:x[0])).keys())
        
    def cal_tf(self):
        self.build_corpus()
        tf_vect = np.vectorize(self.n_tf_builder)
        tf = tf_vect(self.norm_corpus_title,self.norm_corpus_appendix,self.norm_corpus)
        self.tf = pd.DataFrame(list(tf),columns = sorted(self.features_dict.keys()))
        
    def cal_df(self):
        features_names = list(self.tf.columns)
        df = np.diff(sp.csc_matrix(self.tf, copy=True).indptr)
        df = 1 + df
        self.df = df
        
    def cal_idf(self):
        N = 1 + len(self.norm_corpus)
        idf = (1.0 + np.log(float(N) / self.df)) 
        idf_d = sp.spdiags(idf, diags= 0, m=len(df), n= len(df)).todense()
        del(self.df)
        self.idf = idf
#         self.idf_d = idf_d

    def get_tfidf(self):
        self.cal_tf()
        self.cal_df()
        self.cal_idf()
        tf = np.array(self.tf, dtype='float64')
        tfidf = tf * self.idf
        norms = norm(tfidf , axis=1)
        self.tfidf = pd.DataFrame(tfidf / norms[:,None],columns=self.tf.columns)
        return (self.tfidf)
    
    
    def vectorize(self,query):
        vect = CountVectorizer(ngram_range=(1,3))
        vect.fit(query)
        vect = Counter({w:1 for w in vect.get_feature_names() if w in self.features_dict.keys()})
        vect.update(self.features_dict)
        return(np.array(list(({k:vect[k] for k in sorted(vect.keys())}).values()),dtype = 'float64'))
    
    def similarity(self,query_vector,tf_idf):
        query_matrix = mb.repmat(query_vector,tf_idf.shape[0],1)
        scores = np.sum(tf_idf*query_matrix,axis =1)/(np.linalg.norm(tf_idf,axis=1)/np.linalg.norm(query_matrix,axis=1))
        similar_scores = {i: score for i,score in enumerate(list(scores))}
        similar_scores = sorted(similar_scores.items(), key=lambda x: x[1], reverse=True)
        return(similar_scores)
    
    def update(self,New_Title,New_Appendix,New_Corpus):
        n_c = np.vectorize(self.__normalize_corpus)
        self.New_Title,self.New_Appendix,self.New_Corpus = n_c(New_Title,New_Appendix,New_Corpus)
        
        # Corpus
        corp_vect = np.vectorize(self.build_corp_vect)
        corpus_words_array = corp_vect(self.New_Title,self.New_Appendix,self.New_Corpus)
        corpus_words = list(set(np.concatenate(corpus_words_array,axis =0)))
        original_corpus = list(self.features_dict.keys())
        new_words = [w for w in corpus_words if w not in original_corpus]
        new_full_corpus = new_words + original_corpus
        self.features_dict = {w:0 for w in new_full_corpus}
        # Update_Files
        self.norm_corpus_title = list(self.norm_corpus_title) + list(self.New_Title)
        self.norm_corpus_appendix =  list(self.norm_corpus_appendix) + list(self.New_Appendix)
        self.norm_corpus = list(self.norm_corpus) + list(self.New_Corpus)
        
        
        tf_vect = np.vectorize(self.n_tf_builder)
        tf = tf_vect(self.norm_corpus_title,self.norm_corpus_appendix,self.norm_corpus)
        self.tf = pd.DataFrame(list(tf),columns = sorted(self.features_dict.keys()))
        
        self.cal_df()
        self.cal_idf()
        tf = np.array(self.tf, dtype='float64')
        tfidf = tf * self.idf
        norms = norm(tfidf , axis=1)
        self.tfidf = pd.DataFrame(tfidf / norms[:,None],columns=self.tf.columns)
        return (self.tfidf)

In [7]:
df = pd.read_csv('tfidf_test2.csv')

In [99]:
df.shape

(332, 7)

In [89]:
df_course_names = pd.read_csv('CourseNames.csv')

In [90]:
df_course_names

Unnamed: 0.1,Unnamed: 0,Name,Course Code
0,0,ABMO72 ABC Modeling Using SAS Activity Based M...,ABMO72
1,1,AML62OV SAS Anti Money Laundering Solution Ori...,AML62OV
2,2,ATEVA74C SAS Visual Analytics 7 4 An Introduct...,ATEVA74C
3,3,BNGDM17 Analytics Driven Forecasting,BNGDM17
4,4,BQBD14 Quality by Design QbD Using JMP Software,BQBD14
...,...,...,...
336,336,YBUFTV2 SAS Business Intelligence Reporting Fa...,YBUFTV2
337,337,YOLPM2 Designing Tuning and Maintaining SAS OL...,YOLPM2
338,338,YVA283ES SAS Visual Analytics 1 para SAS Viya,YVA283ES
339,339,YVAE73 Exploring Data with SAS Visual Analytics,YVAE73


In [100]:
Vectorizer = TFIDF(df['Title'],df['Appendix'],df['text'])
Vectorizer.preprocessing_text()
tf_idf = Vectorizer.get_tfidf()

In [103]:
tf_idf.shape

(332, 116918)

In [105]:
tf_idf.to_pickle("./tfidf_Complete.pkl")

import json
with open('tfidf_Complete.txt', 'w') as file:
    json.dump(Vectorizer.features_dict, file)

In [117]:
def get_similar_articles(q, df_):
  print("topic:", q)
  print("Finding Document Using Cosine Similarity: ")
  # Convert the query become a vector
  q_vec = Vectorizer.vectorize([q])
  # print(q_vec)
  sim_sorted = Vectorizer.similarity(q_vec,df_)
  # Print the articles and their similarity values
  # print(sim_sorted)
  i = 1
  for k, v in sim_sorted:
    if v != 0:
        try:
#             print(df.iloc[k,1].replace('.txt',''))
#             print(v)
            print(df.iloc[k,0].replace('.txt',''))
            # print(str(i) + ' : ' + str(list(df_course_names[df_course_names['Course Code'] == (df.iloc[k,0].replace('.txt',''))]['Course Code'])[0]))
            print(str(i) + ' : ' + str((df.iloc[k,:]['Title'])))
            print("Cosine Similarity:", v)
            print('-----------------------------------------------')
            i = i+1
            if i == 10:
                break
        except:
            continue
#       print(list_of_doc[k])
#       print()
  return(q_vec)


# Add The Query
q1 = ''
# Call the function
temp = get_similar_articles(q1, tf_idf)

topic: ARIMA
Finding Document Using Cosine Similarity: 
BNGDM17
1 : analytics driven forecasting
Cosine Similarity: 0.056280599590098214
-----------------------------------------------
LWSTSM51
2 : time series modeling essentials
Cosine Similarity: 0.054718778512563564
-----------------------------------------------
LWFETS42
3 : forecasting sas software programming approach
Cosine Similarity: 0.05026134736707092
-----------------------------------------------
LWAPIT62
4 : sas asset performance analytics industrial application internet things
Cosine Similarity: 0.04878558417982272
-----------------------------------------------
LWFSP42
5 : sas forecast server procedures
Cosine Similarity: 0.0425406073914485
-----------------------------------------------
LWHPF123
6 : sas high performance forecasting software
Cosine Similarity: 0.042125095511869035
-----------------------------------------------
LWCDS01
7 : science statistical methods
Cosine Similarity: 0.011711534791555026
-------------

In [3]:
tf_idf = pd.read_pickle("./tfidf.pkl")

In [2]:
Adv_analytics = pd.read_excel("Adv Analytics.xlsx",sheet_name = 'Sheet2')

In [3]:
Adv_analytics

Unnamed: 0,Course Code,Level,Main Category Final
0,BNGDM17,3 Intermediate,Advanced Analytics
1,BQBD14,2 Fundamental,Advanced Analytics
2,BSPV14,2 Fundamental,Advanced Analytics
3,CAAVD83,4 Expert,Advanced Analytics
4,CAAVS83,3 Intermediate,Advanced Analytics
...,...,...,...
122,RECL2,3 Intermediate,Advanced Analytics
123,RFW22,3 Intermediate,Advanced Analytics
124,SCA41,3 Intermediate,Advanced Analytics
125,SSIMDE42,1 Beginner,Advanced Analytics


In [4]:
Adv_analytics['Level'].value_counts()

3 Intermediate    67
4 Expert          43
2 Fundamental     15
1 Beginner         2
Name: Level, dtype: int64

In [8]:
def cc(text):
    return(text.split('.')[0])

def levels(text):
    return(text.split()[0])

df['Course Code'] = df['file_name'].apply(cc)

Adv_analytics = pd.merge(df,Adv_analytics,on = 'Course Code',how = 'inner')
Adv_analytics['Level'] = Adv_analytics['Level'].apply(levels)

In [11]:
Adv_Beg = Adv_analytics[(Adv_analytics['Level'] == '1') | (Adv_analytics['Level'] == '2')]

Vectorizer_beg = TFIDF(Adv_Beg['Title'],Adv_Beg['Appendix'],Adv_Beg['text'],corpus_grams = 2)
Vectorizer_beg.preprocessing_text()
tf_idf = Vectorizer_beg.get_tfidf()
import os
os.chdir('./')
os.mkdir('Adv_Beg')
os.chdir('./Adv_Beg')
tf_idf.to_pickle("./tfidf_Adv_Beg.pkl")
import json
with open('tfidf_Adv_Beg.txt', 'w') as file:
    json.dump(Vectorizer_beg.features_dict, file)
Adv_Beg.to_csv('Adv_Beg.csv')
os.chdir('./')

In [14]:
Adv_Int = Adv_analytics[(Adv_analytics['Level'] == '3')]

Vectorizer_Int = TFIDF(Adv_Int['Title'],Adv_Int['Appendix'],Adv_Int['text'],corpus_grams = 2)
Vectorizer_Int.preprocessing_text()
tf_idf = Vectorizer_Int.get_tfidf()
os.chdir('./')
os.mkdir('Adv_Int')
os.chdir('./Adv_Int')
tf_idf.to_pickle("./tfidf_Adv_Int.pkl")
with open('tfidf_Adv_Int.txt', 'w') as file:
    json.dump(Vectorizer_Int.features_dict, file)
Adv_Int.to_csv('Adv_Int.csv')

In [18]:
Adv_Exp = Adv_analytics[(Adv_analytics['Level'] == '4')]

Vectorizer_Exp = TFIDF(Adv_Exp['Title'],Adv_Exp['Appendix'],Adv_Exp['text'],corpus_grams = 2)
Vectorizer_Exp.preprocessing_text()
tf_idf = Vectorizer_Exp.get_tfidf()
os.chdir('./')
os.mkdir('Adv_Exp')
os.chdir('./Adv_Exp')
tf_idf.to_pickle("./tfidf_Adv_Exp.pkl")
with open('tfidf_Adv_Exp.txt', 'w') as file:
    json.dump(Vectorizer_Exp.features_dict, file)
Adv_Exp.to_csv('Adv_Exp.csv')

In [19]:
Vectorizer_Adv = TFIDF(Adv_analytics['Title'],Adv_analytics['Appendix'],Adv_analytics['text'],corpus_grams = 1)
Vectorizer_Adv.preprocessing_text()
tf_idf = Vectorizer_Adv.get_tfidf()
os.chdir('./')
os.mkdir('Adv_analytics')
os.chdir('./Adv_analytics')
tf_idf.to_pickle("./tfidf_Adv_analytics.pkl")
with open('tfidf_Adv_analytics.txt', 'w') as file:
    json.dump(Vectorizer_Adv.features_dict, file)
Adv_analytics.to_csv('Adv_analytics.csv')

In [169]:
def get_similar_articles(q, df_):
  print("topic:", q)
  print("Finding Document Using Cosine Similarity: ")
  # Convert the query become a vector
  q_vec = Vectorizer_beg.vectorize([q])
  # print(q_vec)
  sim_sorted = Vectorizer_beg.similarity(q_vec,df_)
  # Print the articles and their similarity values
  # print(sim_sorted)
  i = 1
  for k, v in sim_sorted:
    if v != 0:
        try:
#             print(df.iloc[k,1].replace('.txt',''))
#             print(v)
            print(df.iloc[k,0].replace('.txt',''))
            # print(str(i) + ' : ' + str(list(df_course_names[df_course_names['Course Code'] == (df.iloc[k,0].replace('.txt',''))]['Course Code'])[0]))
            print(str(i) + ' : ' + str((Adv_Beg.iloc[k,:]['Title'])))
            print("Cosine Similarity:", v)
            print('-----------------------------------------------')
            i = i+1
            if i == 10:
                break
        except:
            continue
#       print(list_of_doc[k])
#       print()
  return(q_vec)


# Add The Query
q1 = 'regression'
# Call the function
temp = get_similar_articles(q1, tf_idf)

topic: regression
Finding Document Using Cosine Similarity: 
CUJCOR2
1 : statistics anova regression logistic regression fedex test
Cosine Similarity: 0.14557379013918845
-----------------------------------------------
CDSV74
2 : statistics anova regression logistic regression
Cosine Similarity: 0.04141493184725628
-----------------------------------------------
BSPV14
3 : manipulation analytics sas university edition
Cosine Similarity: 0.023388435643858164
-----------------------------------------------
CAAVS83
4 : programming sas iml software
Cosine Similarity: 0.020515299599448283
-----------------------------------------------
ABMO72
5 : quality design qbd jmp software
Cosine Similarity: 0.019907920051878004
-----------------------------------------------
AML62OV
6 : statistics fda process validation jmp software
Cosine Similarity: 0.017573487774663537
-----------------------------------------------
BNGDM17
7 : science business user
Cosine Similarity: 0.014028647581237614
---------