In [114]:
import numpy as np
import re
import scipy

In [2]:
class TF_IDF:
    """Bag-of-words term counts and TF-IDF weights for a small corpus.

    Parameters
    ----------
    text_data : sequence of str
        One string per document. Must support ``len()`` and repeated
        iteration.

    Attributes
    ----------
    unique_words : list of str
        Sorted, lower-cased vocabulary.
    word_dict_index : dict
        Maps each vocabulary word to its column index.
    word_count_dict : dict
        Total number of occurrences of each word over the whole corpus.
    IDF : dict
        Inverse document frequency per word, ``log(D / df)``, filled in by
        :meth:`idf` (zero until then).
    doc_matrix : numpy.ndarray
        ``(n_documents, n_words)`` raw term counts, filled in by :meth:`tf`.
    tf_idf : numpy.ndarray
        ``(n_documents, n_words)`` term count times IDF, filled in by
        :meth:`create_tf_idf`.
    """

    def __init__(self, text_data):
        self.text_data = text_data
        self.words_cleaned = []
        self.word_dict_index = {}
        self.word_count_dict = {}
        self.IDF = {}
        # A token is any run of characters that is not a space or one of ?.,!:;
        self.pattern = r"[^?.,!:; ]+"
        self.unique_words = self.clean()

        for i, word in enumerate(self.unique_words):
            self.word_dict_index[word] = i
            self.word_count_dict[word] = 0
            self.IDF[word] = 0

        self.total_word_count()
        self.doc_matrix = np.zeros((len(self.text_data), len(self.unique_words)))
        self.tf_idf = np.zeros((len(self.text_data), len(self.unique_words)))

    def _tokenize(self, text):
        """Return the lower-cased tokens of a single document string."""
        return re.findall(self.pattern, text.lower())

    def clean(self):
        """Tokenize the whole corpus.

        Stores every lower-cased token occurrence in ``self.words_cleaned``
        and returns the sorted list of distinct tokens.
        """
        all_text = " ".join(self.text_data)
        words = re.findall(self.pattern, all_text)
        self.words_cleaned = [word.lower() for word in words]
        return sorted(set(self.words_cleaned))

    def total_word_count(self):
        """Count how often each word occurs in the whole corpus.

        The counts are rebuilt from scratch so that calling this method
        more than once does not inflate them.
        """
        counts = dict.fromkeys(self.unique_words, 0)
        for word in self.words_cleaned:
            counts[word] += 1
        self.word_count_dict = counts

    def tf(self):
        """Fill ``self.doc_matrix`` with raw per-document term counts."""
        # Reset first: this method is re-run whenever prepare() is re-run.
        self.doc_matrix.fill(0)
        for i, text_doc in enumerate(self.text_data):
            for word in self._tokenize(text_doc):
                self.doc_matrix[i, self.word_dict_index[word]] += 1

    def idf(self):
        """Fill ``self.IDF`` with ``log(D / df)`` for every vocabulary word.

        ``D`` is the number of documents and ``df`` the number of documents
        that contain the word at least once. Every vocabulary word comes
        from some document, so ``df >= 1`` and the weight is never negative.
        """
        doc_freq = dict.fromkeys(self.unique_words, 0)
        for text_doc in self.text_data:
            # set(): a word counts once per document, however often it occurs.
            for word in set(self._tokenize(text_doc)):
                doc_freq[word] += 1

        D = len(self.text_data)
        for word in self.unique_words:
            self.IDF[word] = np.log(D / doc_freq[word])

    def create_tf_idf(self):
        """Fill ``self.tf_idf`` with term count times IDF.

        Uses the current contents of ``self.doc_matrix`` and ``self.IDF``,
        so :meth:`tf` and :meth:`idf` must have been called first
        (:meth:`prepare` does this in the right order).
        """
        idf_vector = np.array([self.IDF[word] for word in self.unique_words],
                              dtype=float)
        # In-place assignment keeps any outside reference to tf_idf valid.
        self.tf_idf[...] = self.doc_matrix * idf_vector

    def prepare(self):
        """Compute term counts, IDF weights and the TF-IDF matrix, in order."""
        self.tf()
        self.idf()
        self.create_tf_idf()



In [311]:
class LSA:
    """Latent semantic analysis of a document-term matrix via truncated SVD.

    Parameters
    ----------
    doc_matrix : numpy.ndarray
        2-D matrix with one row per document and one column per term.
    weight : float
        Eigenvalue threshold. Only components whose eigenvalue of
        ``doc_matrix.T @ doc_matrix`` is strictly greater than ``weight``
        are kept.

    Attributes
    ----------
    U : numpy.ndarray
        Left singular vectors, truncated to the retained components.
    V : numpy.ndarray
        Right singular vectors (as columns), truncated likewise.
    S : numpy.ndarray
        Retained eigenvalues of ``doc_matrix.T @ doc_matrix``, descending.
    """

    def __init__(self, doc_matrix, weight):
        self.weight = weight
        self.doc_matrix = doc_matrix
        # Singular values are not kept here; the cutoff is applied to the
        # eigenvalues returned by get_eigvals().
        U, _, V_t = np.linalg.svd(doc_matrix)
        self.U = U
        self.V = V_t.T
        self.S = self.get_eigvals()

        self.cutoff()

    def get_eigvals(self):
        """Return the eigenvalues of ``X^T X`` in descending order.

        ``X^T X`` is symmetric, so the symmetric solver is used. It always
        returns real values, whereas the general solver can produce complex
        values with rounding-level imaginary parts on rank-deficient input,
        and those cannot be sorted.
        """
        XTX = np.matmul(self.doc_matrix.T, self.doc_matrix)
        # eigvalsh returns ascending order; reverse to match the SVD ordering.
        return np.linalg.eigvalsh(XTX)[::-1].copy()

    def cutoff(self):
        """Drop every component whose eigenvalue is not above ``self.weight``."""
        self.S = self.S[self.S > self.weight]
        rank = len(self.S)
        self.U = self.U[:, :rank]
        self.V = self.V[:, :rank]

    def topic_affinity(self, word_list):
        """Not implemented yet; currently does nothing and returns ``None``."""
        pass

    def cosine_similarity(self, vec_1, vec_2):
        """Return the cosine of the angle between two vectors.

        Returns ``0.0`` when either vector has zero norm, instead of
        dividing by zero and producing NaN.
        """
        denominator = np.linalg.norm(vec_1) * np.linalg.norm(vec_2)
        if denominator == 0:
            return 0.0
        return np.dot(vec_1, vec_2) / denominator

    def project(self):
        """Project the documents onto the retained right singular vectors.

        Returns ``doc_matrix @ V`` with one row per document and one column
        per retained component.
        """
        doc_topic = np.matmul(self.doc_matrix, self.V)
        return doc_topic
    

In [280]:
# Toy corpus: one string per document. The first document is about roses;
# the remaining ones mention guns, weapons, shootings or violence.
documents = [
    "Roses are lovely. Nobody hates roses.",
    "Gun violence has reached an epidemic proportion in America.",
    "The issue of gun violence is really over-hyped. One can find many instances of violence where no guns were involved.",
    "Guns are for violence prone people. Violence begets guns.",
    " I like guns but I hate violence. I have never been involved in violence. But I own many guns. Gun violence is incomprehensible to me. I do believe gun owners are the most anti violence people on the planet. He who never uses a gun will be prone to senseless violence.",
    "Guns were used in a armed robbery in San Francisco last night.",
    "Acts of violence usually involve a weapon.",
    "Weapon related violence is on a surge, with 1 public shooting a day in New York.",
    "Teen girl kills 7 in violent school shootout, in what is another weapon related crime",
    "Idaho senator criticizes critics of weapon violence"
]

In [281]:
# Building the object tokenizes the corpus and counts total word occurrences;
# the count and weight matrices stay all-zero until prepare() is called.
tfidf = TF_IDF(documents)

In [282]:
# Total number of occurrences of each lower-cased token over all documents.
tfidf.word_count_dict

{'1': 1,
 '7': 1,
 'a': 5,
 'acts': 1,
 'america': 1,
 'an': 1,
 'another': 1,
 'anti': 1,
 'are': 3,
 'armed': 1,
 'be': 1,
 'been': 1,
 'begets': 1,
 'believe': 1,
 'but': 2,
 'can': 1,
 'crime': 1,
 'criticizes': 1,
 'critics': 1,
 'day': 1,
 'do': 1,
 'epidemic': 1,
 'find': 1,
 'for': 1,
 'francisco': 1,
 'girl': 1,
 'gun': 5,
 'guns': 6,
 'has': 1,
 'hate': 1,
 'hates': 1,
 'have': 1,
 'he': 1,
 'i': 5,
 'idaho': 1,
 'in': 7,
 'incomprehensible': 1,
 'instances': 1,
 'involve': 1,
 'involved': 2,
 'is': 4,
 'issue': 1,
 'kills': 1,
 'last': 1,
 'like': 1,
 'lovely': 1,
 'many': 2,
 'me': 1,
 'most': 1,
 'never': 2,
 'new': 1,
 'night': 1,
 'no': 1,
 'nobody': 1,
 'of': 4,
 'on': 2,
 'one': 1,
 'over-hyped': 1,
 'own': 1,
 'owners': 1,
 'people': 2,
 'planet': 1,
 'prone': 2,
 'proportion': 1,
 'public': 1,
 'reached': 1,
 'really': 1,
 'related': 2,
 'robbery': 1,
 'roses': 2,
 'san': 1,
 'school': 1,
 'senator': 1,
 'senseless': 1,
 'shooting': 1,
 'shootout': 1,
 'surge': 1,
 '

In [283]:
# Runs tf(), idf() and create_tf_idf() in that order, filling
# tfidf.doc_matrix, tfidf.IDF and tfidf.tf_idf.
tfidf.prepare()

In [284]:
# X is the raw term-count matrix (documents x vocabulary), not the
# IDF-weighted tfidf.tf_idf matrix.
X = tfidf.doc_matrix

In [285]:
# (number of documents, number of distinct tokens)
X.shape

(10, 93)

In [304]:
# The second argument is the eigenvalue threshold: only components whose
# eigenvalue of X^T X is greater than 50 are kept.
lsa = LSA(X, 50)

In [305]:
# Document coordinates in the reduced space: X @ V, one column per retained
# component.
X_proj = lsa.project()

In [251]:
# Small hand-written example: four terms and a 7 x 4 count matrix.
# NOTE(review): columns presumably follow the order of `terms` - confirm.
terms = [
    "violence",
    "gun",
    "america",
    "roses",
]
doc_term_matrix = np.array(
    [
        [0, 0, 0, 2],
        [1, 1, 1, 0],
        [2, 2, 0, 0],
        [3, 3, 0, 0],
        [5, 5, 0, 0],
        [0, 1, 0, 0],
        [1, 0, 0, 0],
    ]
)