In [None]:
import pandas as pd
# Load the training and validation CSV files into DataFrames.
train, val = (
  pd.read_csv(csv_path)
  for csv_path in ('jigsaw-toxic-comment-train.csv', 'validation.csv')
)
# Raise both pandas display limits (columns and rows) to 100.
for display_option in ('display.max_columns', 'display.max_rows'):
  pd.set_option(display_option, 100)

In [None]:
import re 
from string import punctuation

# Keep normalization to a minimum, because we want to retain as many
# surface features of the text as possible.
def soft_normalize(text):
  """Replace line breaks, tabs and whitespace-before-double-quote with a space.

  Args:
    text: the raw comment string.

  Returns:
    A copy of ``text`` in which every newline, every tab, and every
    whitespace character that is immediately followed by a double quote
    (together with that quote) has been replaced by a single space.
  """
  # Raw string: "\s" inside a plain literal is an invalid escape sequence
  # and triggers a SyntaxWarning on recent Python versions.
  return re.sub(r'\n|\t|\s"', " ", text)

def tokenize(text):
  """Split text into tokens, using ASCII punctuation and whitespace as separators.

  Args:
    text: the string to tokenize.

  Returns:
    A list of non-empty tokens; an empty list when ``text`` contains
    nothing but punctuation and whitespace.
  """
  # The punctuation set must be escaped before it is placed in a character
  # class: unescaped, its backslash merely escapes the "]" that follows it,
  # so the backslash itself is never treated as a separator.
  separators = f"[{re.escape(punctuation)}]"
  # split() with no arguments already collapses runs of whitespace, so no
  # separate whitespace-squeezing substitution is needed.
  return re.sub(separators, " ", text).split()

def sentenize(text):
  """Split text into sentence fragments.

  A fragment ends at a newline, ".", "!" or "?" that is directly followed
  by a single space or by the end of the text; the terminator and that
  space are dropped from the result.

  Args:
    text: the string to split.

  Returns:
    A list of the fragments that contain at least one non-whitespace
    character, in their original order.
  """
  # Non-capturing group: with a capturing group re.split would also return
  # the matched " " / "" separators as list items.
  fragments = re.split(r"[\n.!?](?: |$)", text)
  # Filter on strip() so that fragments made only of whitespace (not just
  # the single-space case) are not reported as sentences.
  return [fragment for fragment in fragments if fragment.strip()]

def sents_count(text):
  """Return how many fragments sentenize() extracts from ``text``."""
  sentences = sentenize(text)
  return len(sentences)

def mean_token_length(text):
  """Return the average token length of ``text`` in characters.

  Tokens come from tokenize(); when there are none the result is 0.
  """
  words = tokenize(text)
  if not words:
    return 0
  total_chars = sum(map(len, words))
  return total_chars / len(words)

def caps_ratio(text):
  """Return the share of characters in ``text`` that are upper-case.

  A character counts as upper-case when lower-casing it changes it. The
  denominator is the full length of ``text``, including spaces and
  punctuation.

  Args:
    text: the string to inspect.

  Returns:
    A float in [0, 1], or 0 for an empty string (avoids dividing by zero).
  """
  if not text:
    return 0
  # Spaces and punctuation are unchanged by lower(), so they never count
  # and need no explicit exclusion.
  upper_count = sum(1 for char in text if char != char.lower())
  return upper_count / len(text)

def nonalpha_ratio(text):
  """Return the share of characters in ``text`` that are ASCII punctuation.

  Only characters from ``string.punctuation`` are counted; digits and
  whitespace are not.

  Args:
    text: the string to inspect.

  Returns:
    A float in [0, 1], or 0 for an empty string (avoids dividing by zero).
  """
  if not text:
    return 0
  # Count over the text exactly once. Scanning the whole text once per
  # sentence multiplies the count by the number of sentences and can push
  # the "ratio" above 1.
  punct_count = sum(1 for char in text if char in punctuation)
  return punct_count / len(text)

def numeric_ratio(text):
  """Return the share of characters in ``text`` for which str.isnumeric() is true.

  Args:
    text: the string to inspect.

  Returns:
    A float in [0, 1], or 0 for an empty string (avoids dividing by zero).
  """
  if not text:
    return 0
  numeric_count = sum(1 for char in text if char.isnumeric())
  return numeric_count / len(text)

def has_link(text):
  """Return 1 if ``text`` contains an http:// or https:// URL, otherwise 0."""
  found = re.search("https?://[^ ]+", text)
  if found is None:
    return 0
  return 1

def longest_word(text):
  """Return the length of the longest token from tokenize(); 0 if there are none."""
  return max(map(len, tokenize(text)), default=0)

def max_sent_complexity(text):
  """Return the highest count of separator runs found in a single sentence.

  A separator run is one or more consecutive characters from the set
  comma, semicolon, colon, hyphen and round brackets. Sentences come from
  sentenize().

  Args:
    text: the string to inspect.

  Returns:
    The largest number of runs in any one sentence, or 0 when there are no
    sentences or no runs.
  """
  # Raw string: "\:" and "\-" inside a plain literal are invalid escape
  # sequences and trigger a SyntaxWarning on recent Python versions.
  separator_runs = re.compile(r"[,;:\-()]+")
  return max(
    (len(separator_runs.findall(sentence)) for sentence in sentenize(text)),
    default=0,
  )

def rage_punctuation_length(text):
  """Return the length of the longest run of "?", "!" and "." characters.

  The result is 0 when ``text`` contains none of these characters.
  """
  runs = re.findall("[?!.]+", text)
  return max((len(run) for run in runs), default=0)

def longest_same_char(text):
  """Return the length of the longest run of one repeated character.

  Args:
    text: the string to inspect.

  Returns:
    The length of the longest run of identical consecutive characters, or
    0 for an empty string (which contains no run at all).
  """
  from itertools import groupby

  # groupby yields one group per run of equal consecutive characters;
  # default=0 covers the empty string, which has no groups.
  return max((sum(1 for _ in run) for _, run in groupby(text)), default=0)

In [None]:
# (output column, source column, feature function), listed in the order in
# which the columns are added to the frame.
_TEXT_FEATURES = [
  ('text_mean_tokens_length', 'text_soft_normalized', mean_token_length),
  ('text_sents_count', 'comment_text', sents_count),
  ('text_caps_ratio', 'comment_text', caps_ratio),
  ('text_nonalpha_ratio', 'text_soft_normalized', nonalpha_ratio),
  ('text_numeric_ratio', 'comment_text', numeric_ratio),
  ('text_has_link', 'comment_text', has_link),
  ('text_longest_word', 'text_soft_normalized', longest_word),
  ('text_complexity', 'text_soft_normalized', max_sent_complexity),
  ('text_rage_punctuation_length', 'text_soft_normalized', rage_punctuation_length),
  ('text_longest_same_char', 'text_soft_normalized', longest_same_char),
]


def _add_text_features(df):
  """Add the soft-normalized text and every text feature column to ``df`` in place.

  Args:
    df: a DataFrame with a 'comment_text' column.
  """
  df['text_soft_normalized'] = df['comment_text'].apply(soft_normalize)
  for column, source, feature in _TEXT_FEATURES:
    df[column] = df[source].apply(feature)


# One shared definition keeps the train and validation features identical.
_add_text_features(train)
_add_text_features(val)