In [1]:
import html
import pickle
import re
from pathlib import Path
from typing import Iterable

import pandas as pd
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm

In [2]:
# Load spaCy's small English pipeline once so later cells can reuse the same object.
nlp = spacy.load(name="en_core_web_sm")

In [3]:
# Raw answers table; the path is relative to the notebook's working directory.
df_answers = pd.read_csv(filepath_or_buffer='../data/python/Answers.csv')

In [4]:

# Compiled once at definition time rather than looked up on every iteration.
# DOTALL lets "." cross newlines so tags wrapped over several lines are removed whole.
_TAG_RE = re.compile(r"<.*?>", re.DOTALL)
_NON_LETTER_RE = re.compile(r"[^a-z]+")


def clean_sentences(sentences: Iterable[str]) -> list:
    """Normalize raw HTML text into lowercase, space-separated ASCII words.

    Each input is processed as follows:

    1. HTML tags are replaced by a space (including tags that span lines).
    2. HTML character references (``&lt;``, ``&amp;``, ...) are decoded, so
       their names do not survive as spurious tokens such as ``lt`` / ``amp``.
    3. The text is lowercased and every run of characters outside ``a-z``
       (digits, punctuation, whitespace, non-ASCII letters) collapses to a
       single space.
    4. Leading and trailing spaces are stripped.

    Args:
        sentences: Iterable of raw strings. Non-string entries (for example
            ``None`` or the float ``NaN`` pandas uses for missing cells) are
            cleaned to ``""`` so the output stays aligned with the input.

    Returns:
        A list with one cleaned string per input element, in input order.
    """
    cleaned_answers_ = []
    for row in tqdm(sentences):
        if not isinstance(row, str):
            # Missing value: keep the slot instead of crashing on .lower().
            cleaned_answers_.append("")
            continue
        # Tags go first: decoding entities beforehand would turn escaped
        # markup such as "&lt;b&gt;" into something that looks like a tag.
        answer = _TAG_RE.sub(" ", row)
        answer = html.unescape(answer)
        answer = _NON_LETTER_RE.sub(" ", answer.lower())
        # Runs of non-letters were already collapsed to single spaces above,
        # so only the two ends need trimming.
        cleaned_answers_.append(answer.strip())
    return cleaned_answers_

In [5]:
# Clean every answer body; the result has one string per row of df_answers.
cleaned_answers = clean_sentences(sentences=df_answers['Body'])

100%|██████████| 987122/987122 [00:30<00:00, 31975.91it/s]


In [6]:
def lemmatize(doc, min_tokens: int = 3):
    """Join the lemmas of a document's non-stop-word tokens into one string.

    Args:
        doc: Iterable of tokens exposing ``lemma_`` and ``is_stop``
            (for example a spaCy ``Doc``).
        min_tokens: Minimum number of lemmas that must remain after stop-word
            removal for the document to be kept. The default of 3 matches the
            previous hard-coded "more than two" rule.

    Returns:
        The remaining lemmas joined by single spaces, or ``None`` when fewer
        than ``min_tokens`` lemmas remain. Callers are expected to filter
        the ``None`` entries out.
    """
    lemmas = [token.lemma_ for token in doc if not token.is_stop]
    if len(lemmas) < min_tokens:
        # Too short to be useful; signal "drop me" explicitly.
        return None
    return ' '.join(lemmas)

# Only a leading sample is lemmatized: spaCy is far slower than the regex cleaning step.
N_LEMMATIZE = 10_000
answers_sample = cleaned_answers[:N_LEMMATIZE]

# lemmatize() reads only token.lemma_ and token.is_stop, so the dependency
# parser and NER components are skipped to save time. `total` gives tqdm a
# real percentage/ETA, which it cannot infer from the nlp.pipe generator.
lemmatized_answers = [
    lemmatize(doc)
    for doc in tqdm(
        nlp.pipe(answers_sample, batch_size=500, disable=["parser", "ner"]),
        total=len(answers_sample),
    )
]

10000it [01:00, 164.48it/s]


In [7]:
# Persist the lemmatized sample (presumably so the slow spaCy pass can be skipped later).
# Entries are strings, or None where lemmatize() rejected an answer as too short.
lemma_path = Path('../pkl/lemma.pkl')
# open(..., 'wb') raises FileNotFoundError if the target directory is missing.
lemma_path.parent.mkdir(parents=True, exist_ok=True)
with lemma_path.open('wb') as f:
    pickle.dump(lemmatized_answers, f)


In [14]:
# One-column frame of lemmatized text, minus the None placeholders and exact repeats.
# The original row index is kept (no reset), so it still points at the source position.
df_clean = (
    pd.DataFrame({"clean": lemmatized_answers})
    .dropna()
    .drop_duplicates()
)
df_clean.shape

(9961, 1)

In [15]:
# Bag-of-words counts, with the vocabulary capped at 100 terms via max_features.
vectorizer = CountVectorizer(max_features=100)
vectorized_answers = vectorizer.fit_transform(raw_documents=df_clean['clean'])

In [17]:
# Convert the vectorizer output to a dense NumPy array for display; as the last
# expression in the cell, its (truncated) repr becomes the cell output.
vectorized_answers.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])