In [None]:
from unidecode import unidecode
import re

# Maps English contractions to their expanded forms. Used as the default
# table of expand_contractions. Key order is preserved from the original
# definition because the matching regex is built from the keys.
CONTRACTION_MAP = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}

#for example replace "doesn't" with "does not"
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Expand English contractions in *text* and drop any remaining apostrophes.

    The text is first transliterated to ASCII with ``unidecode``. Every
    occurrence of a key of *contraction_mapping* (matched case-insensitively)
    is then replaced by its expansion, and finally all remaining ``'``
    characters are removed.

    Args:
        text: input string.
        contraction_mapping: dict mapping a contraction to its expansion.

    Returns:
        The expanded string. If the matched contraction starts with an
        upper-case letter, the expansion is capitalised as well.
    """
    text = unidecode(text)

    if contraction_mapping:
        # Fallback table for matches whose exact casing is not a key.
        lowered_mapping = {key.lower(): value
                           for key, value in contraction_mapping.items()}
        # Regex alternation takes the first alternative that matches, not the
        # longest one, so longer contractions ("can't've") must be listed
        # before their prefixes ("can't").
        ordered_keys = sorted(contraction_mapping, key=len, reverse=True)
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
            flags=re.IGNORECASE | re.DOTALL)

        def expand_match(contraction):
            match = contraction.group(0)
            expanded = (contraction_mapping.get(match)
                        or contraction_mapping.get(match.lower())
                        or lowered_mapping.get(match.lower()))
            if not expanded:
                # No usable expansion: leave the text untouched.
                return match
            # Carry over capitalisation only. Splicing the matched first
            # character onto the expansion breaks entries whose expansion
            # starts with a different letter ("ain't" -> "is not").
            if match[0].isupper():
                expanded = expanded[0].upper() + expanded[1:]
            return expanded

        text = contractions_pattern.sub(expand_match, text)

    return text.replace("'", "")

In [None]:
#remove special characters, extra spacing...
def remove_special(text):
    """Lower-case *text* and reduce it to space-separated alphanumeric words.

    Steps, in order:
      1. insert a space before every upper-case letter that is followed by a
         lower-case letter (splits "HelloWorld" into "Hello World");
      2. lower-case and delete every character that is not a letter, digit,
         quote, space or one of ``. ! ? - _ ,`` (tabs and newlines are
         deleted, not turned into spaces);
      3. replace runs of ``. ! ? - _ ,`` and spaces by a single space;
      4. collapse whitespace and strip both ends.

    Single and double quotes are kept.
    """
    # Raw strings throughout: the previous non-raw literals relied on invalid
    # escape sequences such as "\." which newer Python versions warn about.
    text = re.sub(r"([A-Z])([a-z])", r" \1\2", text)
    text = re.sub(r"""[^'"a-zA-Z0-9.!?\-_, ]+""", "", text.lower())
    text = re.sub(r"[.!?\-_, ]+", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [None]:
#Tokenize and Remove Stop Words

from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# English stop words from the NLTK stopwords corpus, held as a set for
# fast membership tests.
stop_words = set(stopwords.words('english'))
# Single WordNet lemmatizer instance reused by lemate_tokenize.
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(treebank_tag):
    """Translate a Treebank POS tag into the matching WordNet POS constant.

    The decision is made on the tag's prefix: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb.

    Returns:
        The corresponding ``wordnet`` constant, or ``None`` when the tag
        starts with none of those letters.
    """
    # (tag prefix, name of the constant on the wordnet corpus reader)
    prefix_table = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, constant_name in prefix_table:
        if treebank_tag.startswith(prefix):
            return getattr(wordnet, constant_name)
    return None

#Tokenize, lemmatize, append the POS letter to each lemma, and drop stop words
def lemate_tokenize(text):
    """Tokenize *text* and return POS-suffixed lemmas of its content words.

    Only tokens tagged as verbs, nouns, adjectives or adverbs (Treebank tags
    starting with V, N, J or R) are kept. Each kept token is lemmatized with
    the matching WordNet part of speech and returned as ``"<lemma>_<X>"``,
    where ``X`` is the first letter of its Treebank tag.

    Stop words are removed. The check is done on the bare token and lemma,
    before the ``_<X>`` suffix is attached; comparing the suffixed string
    against ``stop_words`` can never match.

    Returns:
        list of str.
    """
    # Local import: this function only needs pos_tag, and the cell it lives
    # in does not import the nltk package itself.
    from nltk import pos_tag

    kept = []
    for word, tag in pos_tag(word_tokenize(text)):
        wntag = get_wordnet_pos(tag)
        if wntag is None:
            # Not a verb, noun, adjective or adverb.
            continue
        lemma = lemmatizer.lemmatize(word, pos=wntag)
        if word.lower() in stop_words or lemma.lower() in stop_words:
            continue
        kept.append(lemma + "_" + tag[0])
    return kept

In [None]:
from nltk.tokenize import word_tokenize
import nltk
# English Snowball stemmer instance reused by stem_tokenize.
sno = nltk.stem.SnowballStemmer('english')

#Tokenize, then stem every token
def stem_tokenize(text):
    """Split *text* into word tokens and return the stem of each one.

    Tokens come from ``word_tokenize`` and are stemmed with the module-level
    stemmer ``sno``. Order and number of tokens are preserved.
    """
    return [sno.stem(token) for token in word_tokenize(text)]

In [None]:
def split_sentences(text):
    """Split *text* into sentence-like fragments, gluing short ones together.

    Candidate split points are marked at list bullets ("1)", "a -"), at
    ``:`` / ``;``, at free-standing dashes and at a fixed list of
    conjunctions. The stretches in between are split with
    ``nltk.sent_tokenize``. Fragments with fewer than five space-separated
    words are then merged with a neighbour, re-using the separator that
    stood between them (or ``", "`` when there was none).

    Returns:
        A list of fragment strings, or the empty string ``""`` when nothing
        is left after cleaning (kept for backward compatibility).
    """
    marker = r" <split><conj>\1<conj><split> "

    # hi (hello there) you    ->    hi, hello there, you
    text = re.sub(r"\s*\(([a-z])\)", r" \1)", text)
    text = re.sub(r'\s*\((.*?)\)', r', \1,', text)
    text = re.sub(r'\s*\[(.*?)\]', r', \1,', text)
    text = re.sub(r'\s*{(.*?)}', r', \1,', text)

    # split "categories: 1) first. 2) second."
    text = re.sub(r"((^|\s+)[1-9a-zA-Z]\s*(\)|-|(\.\s)))", marker, text)
    text = re.sub(r"([:;])", marker, text)

    # split "abc -- cde", turn new lines into sentence ends, remove extra spaces
    text = re.sub(r"(\s+-+\s+)", marker, text)
    text = re.sub(r"[\n]+", ". ", text)
    text = re.sub(r"(\s?\.)+", ".", text)
    text = re.sub(r"[\s]+", " ", text)

    # split at conjunctions
    for conj in ['and', 'but', 'also', 'further', 'furthermore', 'moreover', 'so']:
        text = re.sub(r"(\s+" + conj + r"\s+)", marker, text, flags=re.I)

    # Chunks carrying "<conj>" are separators and are kept whole; everything
    # else is split into sentences. Whitespace-only chunks cannot contribute
    # a sentence, so the tokenizer is not called for them.
    pieces = []
    for chunk in text.split("<split>"):
        if "<conj>" in chunk:
            pieces.append(chunk)
        elif chunk.strip():
            pieces.extend(nltk.sent_tokenize(chunk))
    pieces = [re.sub(r"[.?!,:;]+$", "", s.strip()) for s in pieces]
    pieces = [re.sub(r"^[.?!,:;]+", "", s.strip()) for s in pieces if s != '']

    # Separators at either edge have nothing to join. Removing all of them
    # also guarantees that the first and last piece are real text, which the
    # merge loop below relies on.
    while pieces and "<conj>" in pieces[0]:
        pieces.pop(0)
    while pieces and "<conj>" in pieces[-1]:
        pieces.pop()
    if not pieces:
        return ""

    # stick sentences back together if they are short.
    i = 0
    after_conj = False   # True when a separator directly precedes pieces[i]
    joiner = ', '        # text used to glue two fragments together
    while i < len(pieces):
        current = pieces[i]
        if "<conj>" in current:
            # Remember the separator text and drop the marker piece.
            after_conj = True
            joiner = re.findall("<conj>(.*)<conj>", current)[0]
            del pieces[i]
            continue

        if len(current.split(" ")) >= 5:
            # Long enough to stand on its own.
            i += 1
        elif i > 0 and (after_conj or i == len(pieces) - 1):
            # Glue onto the previous fragment. Requires i > 0: with i == 0
            # the index i - 1 would wrap around to the last element.
            pieces[i - 1:i + 1] = [pieces[i - 1] + joiner + current]
        elif i < len(pieces) - 1:
            # Glue onto the next fragment, consuming separators in between.
            # The last piece is never a separator, so this loop stays in range.
            while "<conj>" in pieces[i + 1]:
                joiner = re.findall("<conj>(.*)<conj>", pieces[i + 1])[0]
                del pieces[i + 1]
            pieces[i:i + 2] = [current + joiner + pieces[i + 1]]
        else:
            # A single short fragment with no neighbour: keep it as is.
            i += 1
        after_conj = False
        joiner = ', '

    return [re.sub(r"[\s.]*<conj>[\s.]*", " ", s).strip() for s in pieces]

In [None]:
import re
import nltk
import math
from unidecode import unidecode

# Words that are considered important, a.k.a. topics. Entries are given in
# stemmed form (e.g. "engin", "luggag") and are passed as the topic list to
# sentence_split_tree by split_sentences_2.
# NOTE(review): "it" appears twice, and "feature" / "deaelrship" look like an
# unstemmed and a misspelled variant - confirm whether these are intentional.
TOPICS = ["seat", "seatbelt", "carpet", "headrest", "chair", "backrest", "backseat", "window", "engin", "gearbox", "bluetooth", "audio", "music", "cuphold", "cup", "storag", "armrest",
 "space", "luggag", "paint", "paintwork", "bumper", "wheel", "rim", "tyre", "rack", "design", "style", "shape", "aesthet", "layout", "color", "colour", "devic", "car", "vehicl",
 "motorcycl", "motorbik", "bmw", "bike", "competitor", "product", "build", "brand", "light", "sensor", "headlight", "lamp", "alarm", "wiper", "windshield", "brake", "break", "horn",
 "handbrak", "it", "dealer", "dealership", "deaelrship", "seller", "salesman", "salesperson", "salesmen", "consult", "mirror", "camera", "feature", "gadget", "configur", "model",
 "radar", "trunk", "hatch", "seri", "sporti", "countryman", "materi", "fabric", "interior", "leather", "exterior", "ambient", "usb", "aux", "port", "cd", "it", "system", "gps",
 "function", "interfac", "touchscreen", "softwar", "featur", "comput", "program", "economi", "econom", "handl", "handel"]

def split_sentences_2(text, threshold=10, non_nltk_penalty=0.75):
    """Split *text* into fragments using the topic-driven split tree.

    The text is first annotated with ``<conj>`` markers by
    ``sentence_extract_conj`` and then handed to ``sentence_split_tree``
    together with the module-level ``TOPICS`` list.

    Args:
        text: input string.
        threshold: forwarded to ``sentence_split_tree``.
        non_nltk_penalty: forwarded to ``sentence_split_tree``.

    Returns:
        Whatever ``sentence_split_tree`` returns for the annotated text.
    """
    annotated = sentence_extract_conj(text)
    fragments = sentence_split_tree(
        annotated,
        TOPICS,
        non_nltk_penalty=non_nltk_penalty,
        threshold=threshold,
    )
    return fragments

def sentence_extract_conj(text):
    """Annotate *text* with ``<conj>`` markers at every candidate split point.

    The text is transliterated to ASCII with ``unidecode``, brackets are
    rewritten as comma-separated insertions, and each potential separator
    (free-standing dashes, list bullets such as "1)" or "a -", ``:`` / ``;``,
    and a fixed list of conjunctions) is wrapped as ``<conj>...<conj>``.
    Sentence boundaries found by ``nltk.sent_tokenize`` are marked with
    ``<conj><nltk><conj>``.

    Returns:
        A single annotated string.
    """
    # Replacement template: keep the matched separator text between markers.
    conj_marker = r" <conj>\1<conj> "

    text = unidecode(text)

    # hi (hello there) you    ->    hi, hello there, you
    text = re.sub(r"\s*\(([a-z])\)", r" \1)", text)
    text = re.sub(r'\s*\((.*?)\)', r', \1,', text)
    text = re.sub(r'\s*\[(.*?)\]', r', \1,', text)
    text = re.sub(r'\s*{(.*?)}', r', \1,', text)

    # turn new lines into sentence ends, collapse repeated dots, mark "abc -- cde"
    text = re.sub(r"[\n]+", ". ", text)
    text = re.sub(r"(\s?\.)+", ".", text)
    text = re.sub(r"(\s+-+\s+)", conj_marker, text)

    # mark "categories: 1) first. 2) second."
    text = re.sub(r"((^|\s+)[1-9a-zA-Z]\s*(\)|-))", conj_marker, text)
    text = re.sub(r"([:;])", conj_marker, text)

    # mark conjunctions; applied one after another so that adjacent
    # conjunctions ("and so") are each marked
    for conj in ['and', 'but', 'also', 'further', 'furthermore', 'moreover', 'so']:
        text = re.sub(r"(\s+" + conj + r"\s+)", conj_marker, text, flags=re.I)

    # join the nltk sentences with a marker of their own, so that sentence
    # boundaries can be told apart from the other candidate split points
    return "<conj><nltk><conj>".join(nltk.sent_tokenize(text))

def sentence_split_tree(text, topics, threshold=10, non_nltk_penalty=0.75):
    """Split marker-annotated *text* at the split points the topic scorer rates highest.

    A ``sent_tree`` is built over *text* using ``topic_scorer(topics, ...)``;
    its leaves are then cleaned: remaining ``<conj>`` / ``<nltk>`` markers
    are replaced by a space, whitespace is collapsed and leading/trailing
    punctuation is stripped.

    Args:
        text: string containing ``<conj>...<conj>`` markers.
        topics: collection of topic words handed to ``topic_scorer``.
        threshold: minimum score a split point must exceed to be used.
        non_nltk_penalty: forwarded to ``topic_scorer``.

    Returns:
        list of cleaned fragment strings; fragments that are empty after
        trailing-punctuation removal are dropped.
    """
    tree = sent_tree(text)
    tree.construct(topic_scorer(topics, non_nltk_penalty=non_nltk_penalty), threshold=threshold)
    fragments = tree.get_splits()
    # The alternation must be grouped. Without the group the pattern reads
    # "dots/space + <conj>" OR "<nltk> + dots/space", so separators following
    # a <conj> marker were not stripped.
    fragments = [re.sub(r"[\s.]*(?:<conj>|<nltk>)[\s.]*", " ", s).strip() for s in fragments]
    fragments = [re.sub(r"\s+", " ", s).strip() for s in fragments]
    fragments = [re.sub(r"[.?!,:;]+$", "", s.strip()) for s in fragments]
    fragments = [re.sub(r"^[.?!,:;]+", "", s.strip()) for s in fragments if s != '']
    return fragments

class sent_tree:
    """Binary tree that recursively splits a marker-annotated string.

    A leaf holds a piece of text. An inner node holds the
    ``<conj>...<conj>`` separator it was split at, with the text before and
    after it in ``left_child`` and ``right_child``.
    """

    # Class-level defaults; instances overwrite them as needed.
    text = ""
    left_child = None
    right_child = None

    # A separator: "<conj>", anything not containing "<conj>", "<conj>".
    _separator = re.compile("<conj>((?!<conj>).)*<conj>")

    def __init__(self, text):
        self.text = text

    def construct(self, scorer, threshold=10):
        """Split this node at its best-scoring separator, then recurse.

        Args:
            scorer: callable ``scorer(text, start, end)`` returning a number
                for the separator spanning ``text[start:end]``.
            threshold: a split is made only if the best score is strictly
                greater than this value. The same threshold is applied at
                every level of the tree.

        Scores of -1 or less are never selected. On ties the first
        separator wins.
        """
        best_score = -1
        best_span = None
        for separator in self._separator.finditer(self.text):
            score = scorer(self.text, separator.start(), separator.end())
            if score > best_score:
                best_score = score
                best_span = (separator.start(), separator.end())

        # best_span stays None when there is no separator (or none scored
        # above -1); in that case this node remains a leaf.
        if best_span is not None and best_score > threshold:
            start, end = best_span
            self.left_child = sent_tree(self.text[:start])
            self.right_child = sent_tree(self.text[end:])
            self.text = self.text[start:end]
            # Pass the threshold down; otherwise children silently fall back
            # to the default and ignore the caller's setting.
            self.left_child.construct(scorer, threshold=threshold)
            self.right_child.construct(scorer, threshold=threshold)

    def get_splits(self):
        """Return the texts of all leaves, left to right.

        Separator texts stored in inner nodes are not included.
        """
        if self.left_child is None and self.right_child is None:
            return [self.text]
        lpart = self.left_child.get_splits() if self.left_child is not None else []
        rpart = self.right_child.get_splits() if self.right_child is not None else []
        return lpart + rpart
    
def topic_scorer(topics, non_nltk_penalty=0.75):
    """Build a scoring function that rates a candidate split point.

    Args:
        topics: collection of topic words (hashable).
        non_nltk_penalty: factor applied to split points that are not nltk
            sentence boundaries (i.e. whose separator lacks ``<nltk>``).

    Returns:
        ``score_func(text, start, end)``, where ``text[start:end]`` is the
        separator. The score is ``entropy * topic_count * penalty``:
        entropy of the left/right topic counts (highest when both sides hold
        equally many topic words), the total number of topic words, and a
        penalty of 0.25 if either side has fewer than five tokens, otherwise
        1 for nltk boundaries and *non_nltk_penalty* for all others.

    NOTE(review): tokens are compared to *topics* verbatim - no stemming or
    lower-casing happens here. Confirm that callers pass text in the same
    form as the topic words.
    """
    # Grouped alternation: strip whitespace/dots on both sides of either marker.
    marker_pattern = re.compile(r"[\s.]*(?:<conj>|<nltk>)[\s.]*")
    topic_set = frozenset(topics)

    def entropy(val1, val2):
        """Binary entropy (base 2) of the split val1 : val2; 0 if both are 0."""
        total = val1 + val2
        if total == 0:
            return 0
        result = 0.0
        for value in (val1, val2):
            prob = value / total
            if prob != 0:
                result -= prob * math.log(prob, 2)
        return result

    def score_func(text, start, end):
        # Tokenize each side once and reuse the tokens for both counts.
        ltokens = nltk.word_tokenize(marker_pattern.sub(" ", text[:start]).strip())
        rtokens = nltk.word_tokenize(marker_pattern.sub(" ", text[end:]).strip())

        lsum = sum(token in topic_set for token in ltokens)
        rsum = sum(token in topic_set for token in rtokens)
        entval = entropy(float(lsum), float(rsum))

        if len(ltokens) < 5 or len(rtokens) < 5:
            # Splitting would leave a very short fragment on one side.
            conj_penalty = 0.25
        elif "<nltk>" in text[start:end]:
            conj_penalty = 1
        else:
            conj_penalty = non_nltk_penalty
        return entval * (lsum + rsum) * conj_penalty

    return score_func