In [None]:
# Install unidecode, which a later cell imports (none of the visible cells actually calls it).
!pip install unidecode

In [10]:
import pyspark
from pyspark.sql import SparkSession

# Create (or reuse) a Spark session running against the "local[*]" master.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# Load the profile records from the "profiles" Parquet dataset.
df = spark.read.parquet("profiles")

In [84]:
from bs4 import BeautifulSoup
from unidecode import unidecode
import re

def clean(text):
    """Return ``text`` with every run of non-ASCII characters collapsed into one space."""
    non_ascii_run = r'[^\x00-\x7F]+'
    replacement = ' '
    return re.sub(non_ascii_run, replacement, text)

def parse(html):
    """
    Extract the text content of an HTML fragment and scrub non-ASCII characters.

    `html` is the markup to parse, either a decoded string or raw bytes.
    Returns the document text with surrounding whitespace stripped and every run of
    non-ASCII characters replaced by a single space (see `clean`).
    """
    # BeautifulSoup ignores `from_encoding` -- and emits a UserWarning on every call --
    # when the markup is already a decoded string, so only pass it for raw bytes.
    if isinstance(html, bytes):
        soup = BeautifulSoup(html, 'html.parser', from_encoding='ascii')
    else:
        soup = BeautifulSoup(html, 'html.parser')
    return clean(soup.text.strip())

In [97]:
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

def detect_language(text):
    """
    Return the language code that `detect` reports for ``text``.

    If detection raises `LangDetectException`, the placeholder ``'blah'`` is returned
    instead, so a caller comparing the result against a language code simply gets no match.
    """
    try:
        language = detect(text)
    except LangDetectException:
        language = 'blah'
    return language

In [98]:
# Build an RDD of cleaned position summaries that are long enough and detected as English.
rdd = (df.select("positions.summary")
    .rdd
    # Each row carries an iterable of summaries; guard against the whole value being
    # None (e.g. a profile whose `positions` is null), which would otherwise make
    # flatMap fail with "NoneType is not iterable".
    .flatMap(lambda row: row['summary'] or [])
    # Drop null / empty individual summaries before parsing.
    .filter(lambda summary: bool(summary))
    .map(parse)
    # Keep only texts longer than 300 characters after HTML stripping.
    .filter(lambda text: len(text) > 300)
    .filter(lambda text: detect_language(text) == 'en'))

In [None]:
from pyspark.sql import Row
# Wrap each text in a single-column Row ("description"), convert to a DataFrame
# and write it out as CSV under "outfile3".
(rdd
    .map(lambda text: Row(description=text))
    .toDF()
    .write
    .csv("outfile3"))

In [104]:
from gensim import utils
from gensim.models.word2vec import LineSentence
import itertools
from gensim.models.word2vec_inner import FAST_VERSION, MAX_WORDS_IN_BATCH


class TokenSentence(object):
    """
    Stream sentences from one or more line-oriented text sources.

    Simple format: one sentence = one line. Every line is converted to a list of tokens by
    `preprocess`; token lists longer than `max_sentence_length` are yielded as consecutive
    chunks of at most that many tokens.
    """

    def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None, preprocess=lambda x: utils.to_unicode(x).split()):
        """
        `source` is either a single item or a list/tuple of items. Each item is either a filename
        (string) or a file-like object; the two kinds may be mixed. Filenames are opened with
        `utils.smart_open`, so whatever that helper can open (the examples this class was adapted
        from list `.bz2` and `.gz` files) should work here as well.
        `max_sentence_length` is the maximum number of tokens per yielded chunk; it must be positive.
        Each source is clipped to its first `limit` lines (not clipped if limit is None, the default).
        `preprocess` is a function that takes one argument, a line, and returns a list of tokens
        (defaults to utils.to_unicode(line).split()).

        Example::

            sentences = TokenSentence('myfile.txt')
            sentences = TokenSentence(['part-00000', 'part-00001'], preprocess=my_tokenizer)
        """
        self.sources = list(source) if isinstance(source, (list, tuple)) else [source]
        self.max_sentence_length = max_sentence_length
        self.limit = limit
        self.preprocess = preprocess

    def _iter_chunks(self, lines):
        """Yield the token chunks for every line (up to `limit`) of one opened source."""
        for line in itertools.islice(lines, self.limit):
            tokens = self.preprocess(line)
            # Split over-long token lists into pieces of at most max_sentence_length tokens.
            for start in range(0, len(tokens), self.max_sentence_length):
                yield tokens[start:start + self.max_sentence_length]

    def __iter__(self):
        """Iterate through the lines of every source, in the order the sources were given."""
        for source in self.sources:
            # Decide per source whether it is file-like. The check is done up front (rather
            # than by catching AttributeError around the whole loop) so that an AttributeError
            # raised inside `preprocess` is not mistaken for "this is a filename".
            if hasattr(source, 'seek'):
                # Rewind every file-like source, not just the first one, so that iterating
                # this object more than once re-reads each stream from the beginning.
                source.seek(0)
                for chunk in self._iter_chunks(source):
                    yield chunk
            else:
                # Not file-like: treat it as a filename and open it for the duration of the pass.
                with utils.smart_open(source) as fin:
                    for chunk in self._iter_chunks(fin):
                        yield chunk

In [119]:
import os
# Collect the path of every file whose name starts with "part" anywhere under outfile3.
files = [
    os.path.join(dirpath, filename)
    for dirpath, _subdirs, filenames in os.walk('outfile3')
    for filename in filenames
    if filename.startswith("part")
]

In [None]:
from gensim.models.word2vec import Word2Vec

def tokenize(x):
    """Turn one line of text into a list of lower-cased, de-accented tokens.

    Delegates to gensim's `utils.tokenize` with `errors='ignore'` and materialises
    the result as a list.
    """
    return list(utils.tokenize(x, lower=True, deacc=True, errors='ignore'))


# Stream the exported files line by line, tokenising each line with `tokenize`.
sentences = TokenSentence(files, preprocess=tokenize)

# Train the Word2Vec model on the streamed sentences.
model = Word2Vec(
    sentences,
    size=100,
    window=5,
    min_count=10,
    workers=8,
)

In [126]:
# Persist the trained model object to "profiles.model".
model.save("profiles.model")
# Additionally export just the word vectors in the binary word2vec format.
model.wv.save_word2vec_format('profiles.model.bin', binary=True)

In [145]:
# Display the corpus size recorded on the trained model
# (presumably the number of sentences seen while scanning the corpus -- confirm against the gensim docs).
model.corpus_count

4124820