<a href="https://colab.research.google.com/github/myngpog/hangman/blob/main/tfidf%20word%20embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# important imports
!pip install kneed

import pandas as pd
import numpy as np
import pandas as pd
from scipy import spatial
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from kneed import KneeLocator
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

#import word embedding 
!wget http://nlp.stanford.edu/data/glove.42B.300d.zip
!unzip glove.42B.300d.zip

#import csv and drop rows without mission statements
url = 'https://raw.githubusercontent.com/p-ai-org/p-colleges/main/Brian/Mission%20Statement/Data_2-14-2023.csv'
df1 = pd.read_csv(url)
# Assign the results back instead of calling an inplace method on a selected column:
# df1[col].replace(..., inplace=True) acts on a temporary Series and does not update df1
# when pandas Copy-on-Write is active. Plain assignment also keeps the cell idempotent.
df1['Mission statement (IC2020mission)'] = df1['Mission statement (IC2020mission)'].replace('', np.nan)
df1 = df1.dropna(subset=['Mission statement (IC2020mission)'])
# number of rows that still have a mission statement
len(df1)

In [None]:
#previous functions that we need
# write the first 1000 lines of the GloVe text file to top_1000.txt; only these entries are loaded below
!head -n 1000 glove.42B.300d.txt > top_1000.txt

# word -> float32 vector, parsed from lines of the form "<word> <v1> <v2> ..."
embeddings = {}
# be explicit about the encoding so parsing does not depend on the platform default
with open('top_1000.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # a blank line has no word to index; skip it instead of failing on values[0]
            continue
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings[word] = vector

# set of every word that has an embedding, for fast membership checks
words_with_embeddings = set(embeddings)

# function 1: turn raw text into a list of lowercase words without punctuation or stop words

def clean(text):
    """Tokenise text into lowercase words, dropping punctuation and English stop words.

    Characters listed in string.punctuation are deleted, the text is lowercased
    and stripped, then split on single spaces. Words found in NLTK's English
    stop-word list are removed. Because the split is on single spaces,
    consecutive spaces in the input produce empty-string tokens in the result.
    """
    # delete punctuation, then normalise case and outer whitespace
    punctuation_table = str.maketrans('', '', string.punctuation)
    normalized = text.translate(punctuation_table).lower().strip()
    tokens = normalized.split(" ")

    # keep only the tokens that are not stop words
    stop_words = set(stopwords.words('english'))
    kept = []
    for token in tokens:
        if token not in stop_words:
            kept.append(token)
    return kept

In [None]:
def word_frequency_in_all_docs(listOfWords):
  """Count, for every word, how many distinct documents contain it (document frequency).

  Args:
    listOfWords: iterable of raw document strings, e.g.
      df1["Mission statement (IC2020mission)"]. Exact duplicate documents are
      collapsed first, so each distinct text is counted once.

  Returns:
    dict mapping each cleaned word to the number of distinct documents it
    appears in. The empty-string token that clean() can emit is left out.
  """
  # identical documents are only counted once
  uniqueDocs = set(listOfWords)

  wordDic = {}
  for sentence in uniqueDocs:
    # set() so a word repeated inside one document still counts as a single document
    for word in set(clean(sentence)):
      wordDic[word] = wordDic.get(word, 0) + 1

  # clean() splits on single spaces, so consecutive spaces yield '' tokens; drop that entry.
  # The default argument avoids a KeyError when no document produced an empty token.
  wordDic.pop('', None)
  return wordDic

In [None]:
# per-word document counts over the mission statement column of df1
wordDic = word_frequency_in_all_docs(df1["Mission statement (IC2020mission)"])

In [None]:
def word_idf(dicOfWords, numberOfDocs):
  """Compute the inverse document frequency (IDF) of every word.

  Args:
    dicOfWords: dict mapping each word to the number of documents containing it
      (the wordDic returned by word_frequency_in_all_docs).
    numberOfDocs: total number of documents in the corpus.

  Returns:
    dict mapping each word to log(numberOfDocs / document count). The number of
    words in the result is printed as a side effect.
  """
  # natural log of (corpus size / document count) for each word, in input order
  idfDic = {
      term: np.log(numberOfDocs/dicOfWords[term])
      for term in dicOfWords
  }

  # report how many words received an IDF value
  print(len(idfDic))
  return idfDic

In [None]:
# IDF for every word. The corpus size is taken from the data instead of a hard-coded number;
# distinct statements are counted because word_frequency_in_all_docs de-duplicates its input.
idf = word_idf(wordDic, df1["Mission statement (IC2020mission)"].nunique())

In [None]:
def calculate_tfidf_weighted_embedding(listOfWords, idfDic):
  """Return the tf-idf weighted average word embedding of one tokenised document.

  Every token occurrence that has both an embedding and an idf value contributes
  embeddings[token] * tf * idf, where tf is the token's count in listOfWords.
  The contributions are summed and divided by len(listOfWords), so tokens
  without an embedding still count in the denominator.

  Args:
    listOfWords: list of tokens of a single document, e.g. the output of clean().
    idfDic: dict mapping word -> idf, e.g. the output of word_idf().

  Returns:
    The weighted average embedding vector, or None when listOfWords is empty or
    none of its tokens has both an embedding and an idf value.
  """
  # nothing to average
  if len(listOfWords) == 0:
    return None

  # tf: how many times each token appears in this document
  termCounts = {}
  for word in listOfWords:
    termCounts[word] = termCounts.get(word, 0) + 1

  # tf-idf of the document's own words; looping over this document's words rather than
  # the whole idf vocabulary keeps the cost proportional to the document length
  tfidfDic = {w: idfDic[w]*count for w, count in termCounts.items() if w in idfDic}

  # token occurrences (repeats kept, original order) that can contribute a weighted vector
  usableWords = [w for w in listOfWords if (w in words_with_embeddings) and w in tfidfDic]

  # without this guard sum([]) is the int 0 and the result would be the scalar 0.0, not a vector
  if not usableWords:
    return None

  return sum([embeddings[w]*tfidfDic[w] for w in usableWords])/len(listOfWords)

In [None]:
# sample text used below to try out calculate_tfidf_weighted_embedding
mission_statement = 'Boston College was founded in 1863 by the Society of Jesus (the Jesuits) to educate Boston’s predominantly Irish, Catholic immigrant community. It opened its doors on September 5, 1864, in a building on Harrison Avenue in Boston’s South End, a small streetcar college for commuting students. When it outgrew the limitations of the space, then-president Rev. Thomas I. Gasson, S.J., bought 31 acres of the former Lawrence Farm in Chestnut Hill, Massachusetts, and broke ground in 1909 on a new campus, today fondly known as the Heights. BC began as an undergraduate liberal arts college, but as its aspirations grew, it added graduate programs and professional schools fulfilling its charter as a university.'

In [None]:
# clean the sample text and compute its tf-idf weighted embedding; the result is shown as the cell output
calculate_tfidf_weighted_embedding(clean(mission_statement), idf)

array([ 1.33040160e-01,  7.43449330e-02,  4.90354374e-02,  1.01876922e-01,
       -9.72299054e-02, -1.13919787e-02, -2.00068951e+00, -2.40835324e-02,
       -1.00045286e-01,  1.51011229e-01,  1.06512308e-01,  3.78893353e-02,
       -2.01741196e-02, -1.30857840e-01,  5.71837686e-02, -1.57147229e-01,
        4.30384874e-02, -3.33257169e-02,  2.95615103e-02,  1.78992637e-02,
        6.34853616e-02,  3.38694490e-02, -1.38834253e-01,  1.40734136e-01,
       -1.21012079e-02, -9.03032813e-03,  1.18442707e-01,  5.97769860e-03,
       -1.20611191e-01, -8.55118334e-02, -1.68977425e-01, -9.01536122e-02,
       -3.06323986e-04, -4.31827269e-02, -8.60500857e-02,  7.45417103e-02,
        3.16357687e-02, -5.04090730e-03, -1.16233908e-01, -1.39256492e-01,
        4.49653342e-03, -1.70729727e-01,  1.46325901e-01,  4.80137430e-02,
       -4.53421175e-02, -5.97847253e-02,  1.54624939e-01, -7.62414336e-02,
       -8.50594863e-02,  7.03085912e-04,  1.08925058e-02,  1.52528942e-01,
        7.67351538e-02,  