### Vectorization

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
from tqdm import tqdm
import numpy as np
import pandas as pd
import pickle

In [3]:
processed_df = pd.read_csv(filepath_or_buffer='preprocessed_data.csv')
processed_essay = processed_df['essay'].values

#### Bag of Words (BoW) on `essay` feature

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
count_vect = CountVectorizer(min_df=10)
text_vect_bow = count_vect.fit_transform(raw_documents=processed_essay)
print("Shape of matrix after one hot encodig: {}".format(text_vect_bow.shape))

Shape of matrix after one hot encodig: (109248, 16623)


#### TFIDF on `essay` feature

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
tfidf_vect = TfidfVectorizer(min_df=10)
text_vect_tfidf = tfidf_vect.fit_transform(raw_documents=processed_essay)
print("Shape of matrix after one hot encodig: {}".format(text_vect_tfidf.shape))

Shape of matrix after one hot encodig: (109248, 16623)


#### Average Word2Vec

In [8]:
with open(file='glove_vectors', mode='rb') as pkl:
    model = pickle.load(pkl)
    glove_words = set(model.keys())

In [9]:
word = list(glove_words)[0]
print(word)

nationwide


In [10]:
print(model[word])

[-2.7446e-01 -1.8566e-01  1.5203e-01 -3.3871e-01 -4.4707e-01  9.5133e-02
 -1.9979e+00 -1.4693e-01  1.4631e-01  6.7298e-01 -7.2874e-01  8.8087e-02
 -3.1008e-02  2.7573e-01 -1.9594e-01  1.3383e-01  1.4157e-01  1.9674e-01
 -3.6788e-02  3.5562e-01 -4.4018e-01  9.4682e-01  3.9898e-02 -2.9970e-01
 -2.0159e-01  1.5255e-01 -3.6492e-01 -5.4108e-01 -1.6126e-01 -3.4537e-01
 -5.1565e-01  1.7598e-01  3.2276e-01 -2.6215e-01 -2.5877e-01 -1.0866e-01
  2.0171e-01 -1.4057e-01  5.8433e-03 -5.2362e-02 -2.3051e-01 -5.8283e-01
  4.8124e-01 -1.9952e-01  1.4162e-01 -5.2693e-02  2.8681e-02 -1.4770e-01
 -3.2344e-01  1.8428e-01  3.2626e-01  3.5183e-01 -6.2851e-01 -4.8115e-01
 -3.1891e-01 -3.4583e-02 -5.2945e-01  1.4181e-01 -3.5899e-01  3.7177e-01
 -1.4452e-01  1.5200e-01 -4.7772e-01 -9.7155e-02 -2.5967e-01  2.3286e-02
 -5.4634e-03 -2.4719e-01  5.9037e-01  1.9936e-01 -3.5937e-01  2.2459e-01
 -1.4346e-01  1.2081e-01 -7.6962e-02  7.1359e-02  2.1582e-01  1.6143e-01
 -6.9987e-01  3.0448e-01  2.8214e-01 -6.8564e-01 -4

In [11]:
dim = len(model[word])
print(dim)

300


In [12]:
print(processed_essay[0])

i fortunate enough use fairy tale stem kits classroom well stem journals students really enjoyed i would love implement lakeshore stem kits classroom next school year provide excellent engaging stem lessons my students come variety backgrounds including language socioeconomic status many not lot experience science engineering kits give materials provide exciting opportunities students each month i try several science stem steam projects i would use kits robot help guide science instruction engaging meaningful ways i adapt kits current language arts pacing guide already teach material kits like tall tales paul bunyan johnny appleseed the following units taught next school year i implement kits magnets motion sink vs float robots i often get units not know if i teaching right way using right materials the kits give additional ideas strategies lessons prepare students science it challenging develop high quality science activities these kits give materials i need provide students science a

In [13]:
avg_w2v = list()
for essay in tqdm(processed_essay):
    vect = np.zeros(dim)
    count_ = 0
    for word in essay.split():
        if word in glove_words:
            vect += model[word]
            count_ += 1
    if count_ > 0:
        vect /= count_
    avg_w2v.append(vect)

100%|█████████████████████████████████████████████████████████| 109248/109248 [00:20<00:00, 5303.98it/s]


In [14]:
print(avg_w2v[0])

[-3.02569554e-02 -1.10436543e-02 -9.57953829e-03 -1.24331475e-01
  6.95682857e-02  1.55257491e-03 -3.10046897e+00  1.08841991e-01
  7.31905737e-02 -4.22935769e-02  8.34145869e-02 -4.59943806e-02
 -8.24955429e-03 -1.09009263e-01 -3.78188223e-02 -5.98767874e-02
 -2.19672749e-02 -2.65038063e-02  9.17395194e-02  3.04386632e-02
  4.75971257e-02  2.24326669e-02  5.81032211e-02  6.03694229e-03
 -1.20540210e-02 -1.02676334e-01  5.04888880e-02 -8.93170017e-02
 -6.95778834e-03 -1.14556890e-01 -1.95253943e-01 -1.60493758e-01
  4.03892183e-02  5.31978626e-02 -2.19084577e-02 -5.49225829e-02
 -5.40355103e-02 -7.00488914e-02  1.25479417e-02  3.41972669e-02
 -1.20129844e-01  1.37990537e-01 -7.49831257e-03 -1.02170287e-01
 -3.05570554e-02 -5.52082017e-02 -1.42282040e-02 -7.11897981e-02
 -1.48825160e-02 -1.04964546e-01 -2.25689954e-03 -6.81841623e-02
  8.46106806e-02 -3.10060206e-02  6.22965817e-02 -4.42394486e-02
  1.30125634e-02 -5.15549143e-03 -3.56335257e-02  1.28285659e-01
 -1.80263629e-02 -1.00323

In [15]:
print(len(avg_w2v))

109248


In [16]:
print(len(avg_w2v[0]))

300


In [17]:
tfidf_model = TfidfVectorizer()
tfidf_model.fit(raw_documents=processed_essay)

TfidfVectorizer()

In [18]:
print(tfidf_model.get_feature_names_out())

['00' '000' '000s' ... 'zynergy' 'zz' 'zzzzzzz']


In [19]:
print(tfidf_model.idf_)

[ 7.18528456  5.91178569 11.90823778 ... 11.50277267 11.50277267
 11.90823778]


In [20]:
dictionary = dict(zip(tfidf_model.get_feature_names_out(), tfidf_model.idf_))

In [21]:
tfidf_words = set(tfidf_model.get_feature_names_out())

In [22]:
tfidf_w2v = list()
for essay in tqdm(processed_essay):
    vect = np.zeros(dim)
    tfidf_weight = 0
    for word in essay.split():
        if (word in glove_words) and (word in tfidf_words):
            tfidf_vec = model[word]
            idf = dictionary[word]
            tf = essay.count(word) / len(essay.split())
            vect += (tfidf_vec * tf * idf)
            tfidf_weight += (tf * idf)
    if tfidf_weight > 0:
        vect /= tfidf_weight
    tfidf_w2v.append(vect)

100%|██████████████████████████████████████████████████████████| 109248/109248 [02:39<00:00, 684.57it/s]


In [23]:
print(tfidf_w2v[0])

[-2.27924638e-02 -8.69357138e-03  8.65564093e-02 -1.65780617e-01
  2.32385195e-01  7.82539979e-02 -2.57652443e+00  7.35099876e-02
  1.43973216e-01  4.43961921e-02  5.08972257e-02 -1.53119302e-01
 -1.93082123e-01 -6.47660110e-02 -4.87920257e-02 -2.81929431e-02
 -7.76188781e-02 -1.86635906e-02  1.42132297e-01  1.45003574e-01
 -2.64582574e-03  1.50895593e-01  2.40953732e-01  5.38177960e-02
 -3.14382959e-02 -1.50967760e-01 -7.39392300e-03 -1.46922200e-01
  7.66973365e-02 -2.75086000e-01 -6.60259435e-02 -4.67757588e-01
 -3.07433643e-02  3.52752287e-02 -2.68578053e-02 -1.05321659e-01
 -7.07048364e-02 -1.43525362e-01  1.33038214e-01  1.37083250e-01
 -3.09181358e-01  5.29982466e-02  4.73578161e-02  3.56331974e-02
 -8.83575899e-02 -5.22576207e-02 -1.98690783e-01 -2.04844445e-01
 -2.25694214e-02 -8.26605657e-02  8.41722533e-02 -1.24063331e-01
  1.46529801e-01  1.31369888e-02  1.03181216e-01  1.06695794e-01
  2.30884721e-02  1.09845335e-01  5.02774457e-02  2.12857030e-01
 -1.11785480e-01 -2.31516

In [24]:
print(len(tfidf_w2v))

109248


In [25]:
print(len(tfidf_w2v[0]))

300


#### `school_state` binary vectorizer

In [26]:
vect = CountVectorizer(binary=True)
school_state_ohe = vect.fit_transform(raw_documents=processed_df['school_state'].values)
print("Shape of matrix after one hot encodig: {}".format(school_state_ohe.shape))

Shape of matrix after one hot encodig: (109248, 51)


End of the file.