In [1]:
import os
# Folder holding the Toxic Comment CSVs and every derived artifact.
# Set the TOXIC_COMMENT_DATA environment variable to point the notebook at
# another location; without it the original hard-coded path is used.
# Note: on Windows the drive-qualified component makes os.path.join drop
# `user_folder`, so the default resolves to 'E:/git/database/Toxic_Comment'.
user_folder = os.path.expanduser("~")
data_folder = os.environ.get(
    'TOXIC_COMMENT_DATA',
    os.path.join(user_folder, 'E:/git/database/Toxic_Comment'),
)
os.listdir(data_folder)

['blends',
 'sample_submission.csv',
 'sample_submission.csv.zip',
 'test.csv',
 'test.csv.zip',
 'test_features.dump',
 'test_preprocess.csv',
 'train.csv',
 'train.csv.zip',
 'train_features.dump',
 'train_preprocess.csv']

In [2]:
import numpy as np
import pandas as pd

# Read the raw train/test tables from the data folder.
raw_train_path = os.path.join(data_folder, 'train.csv')
raw_test_path = os.path.join(data_folder, 'test.csv')
train_df = pd.read_csv(raw_train_path)
test_df = pd.read_csv(raw_test_path, delimiter=',')

In [6]:
import re
# Token -> replacement table used to normalise comments.
#
# filter_comments looks tokens up by exact string equality after lower-casing
# and splitting on whitespace, so a key only fires when it equals a whole token.
#
# Every key is listed exactly once. The previous literal repeated several
# emoticon keys (and "m"); Python keeps only the last value of a repeated key,
# so the values below are the ones that were actually in effect.
repl = {
    # Interjections
    "yay!": " good ",
    "yay": " good ",
    "yaay": " good ",
    "yaaay": " good ",
    "yaaaay": " good ",
    "yaaaaay": " good ",
    # Emoticons
    # NOTE(review): ":&gt;" and "&lt;3" look like HTML-escaped ":>" and "<3" --
    # confirm the comment text is escaped the same way, otherwise they never match.
    ":/": " worry ",
    ":&gt;": " angry ",
    ":')": " sad ",
    ":-(": " sad ",
    ":(": " sad ",
    ":s": " sad ",
    ":-s": " sad ",
    "&lt;3": " heart ",
    ":d": " smile ",
    ":p": " smile ",
    ":dd": " smile ",
    "8)": " smile ",
    ":-)": " smile ",
    ":)": " smile ",
    ";)": " smile ",
    "(-:": " smile ",
    "(:": " smile ",
    # NOTE(review): the keys below are written as regex patterns, but the lookup
    # is a literal comparison, so they only match a token spelled exactly like
    # the pattern text. The plain-word entries further down cover the same words.
    r"\br\b": "are",
    r"\bu\b": "you",
    r"\bhaha\b": "ha",
    r"\bhahaha\b": "ha",
    r"\bdon't\b": "do not",
    r"\bdoesn't\b": "does not",
    r"\bdidn't\b": "did not",
    r"\bhasn't\b": "has not",
    r"\bhaven't\b": "have not",
    r"\bhadn't\b": "had not",
    r"\bwon't\b": "will not",
    r"\bwouldn't\b": "would not",
    r"\bcan't\b": "can not",
    r"\bcannot\b": "can not",
    r"\bi'm\b": "i am",
    # Abbreviations and contractions (whole-token matches)
    "m": "am",
    "r": "are",
    "u": "you",
    "haha": "ha",
    "hahaha": "ha",
    "don't": "do not",
    "doesn't": "does not",
    "didn't": "did not",
    "hasn't": "has not",
    "haven't": "have not",
    "hadn't": "had not",
    "won't": "will not",
    "wouldn't": "would not",
    "can't": "can not",
    "cannot": "can not",
    "i'm": "i am",
    "i'll": "i will",
    "its": "it is",
    "it's": "it is",
    "'s": " is",
    "that's": "that is",
    "weren't": "were not",
}

# Keys in insertion order; passed to filter_comments as the set of known tokens.
keys = list(repl)


In [7]:
def filter_comments(comment, corpus):
    """Lower-case a comment and normalise it token by token.

    The comment is converted with ``str()``, lower-cased and split on
    whitespace. Each token is then handled as follows:

    * tokens starting with ``http`` or ``www`` are split on ``:``, ``/`` and
      ``.`` and the pieces re-joined with single spaces;
    * tokens found in ``corpus`` are replaced;
    * every other token is kept unchanged.

    Parameters
    ----------
    comment : object
        Raw comment; anything accepted by ``str()``.
    corpus : dict or iterable of str
        Either a mapping ``token -> replacement`` (its own values are used),
        or a collection of tokens whose replacements are read from the
        module-level ``repl`` dict (the original call style).

    Returns
    -------
    str
        The processed tokens joined with single spaces.

    Raises
    ------
    KeyError
        If ``corpus`` is not a dict and a matched token is missing from ``repl``.
    """
    has_own_values = isinstance(corpus, dict)
    # Hash-based membership; a plain list of keys would be scanned once per token.
    known = corpus if has_own_values else frozenset(corpus)

    def filter_words(word):
        if word.startswith(('http', 'www')):    # filter website
            return " ".join(re.split("[:/.]", word))
        if word in known:
            # replace abbrev words with full spell
            return corpus[word] if has_own_values else repl[word]
        return word

    return " ".join(filter_words(token) for token in str(comment).lower().split())
    
# Add the normalised text as a new column on both frames.
for frame in (train_df, test_df):
    frame['comment_text_repl'] = [
        filter_comments(raw_text, keys) for raw_text in frame['comment_text']
    ]

In [9]:
import string
from nltk.corpus import stopwords

# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
# language -> frozenset of stop words, filled on first use.
_STOPWORD_CACHE = {}


def _stopword_set(language='english'):
    """Return the NLTK stop words for ``language`` as a frozenset, loading them once."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def text_process(comment, stop_words=None):
    """Strip punctuation and stop words from a comment.

    Parameters
    ----------
    comment : str
        Text to clean.
    stop_words : collection of str, optional
        Lower-case words to drop. Defaults to the NLTK English stop-word
        list, which is loaded once and cached instead of being re-read for
        every word of every comment.

    Returns
    -------
    str
        The remaining words, in their original case, joined with single spaces.
    """
    if stop_words is None:
        stop_words = _stopword_set()
    nopunc = comment.translate(_PUNCTUATION_TABLE)
    # Membership is tested on the lower-cased word; the kept word is not lower-cased.
    return " ".join(word for word in nopunc.split() if word.lower() not in stop_words)
# Add the punctuation- and stop-word-free text as a new column on both frames.
for frame in (train_df, test_df):
    frame['comment_text_repl_stopwords'] = [
        text_process(replaced_text) for replaced_text in frame['comment_text_repl']
    ]

In [10]:
# index=False: the frames only carry the default positional index; writing it
# adds an unnamed column that read_csv brings back as "Unnamed: 0".
train_df.to_csv(os.path.join(data_folder, "train_preprocess.csv"), index=False)
test_df.to_csv(os.path.join(data_folder, "test_preprocess.csv"), index=False)

### Preprocessing: TF-IDF

In [3]:
from scipy.sparse import csr_matrix,hstack 
# scikit-learn no longer ships its vendored joblib (sklearn.externals.joblib).
# Use the standalone package and fall back to the vendored copy on old installs.
try:
    from joblib import dump, load
except ImportError:
    from sklearn.externals.joblib import dump,load
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [4]:
# Reload the preprocessed tables written by the preprocessing stage.
preprocessed_train_path = os.path.join(data_folder, "train_preprocess.csv")
preprocessed_test_path = os.path.join(data_folder, "test_preprocess.csv")
train_df = pd.read_csv(preprocessed_train_path)
test_df = pd.read_csv(preprocessed_test_path)

In [5]:
# IPython help lookup: displays the TfidfVectorizer docstring (interactive aid only).
?TfidfVectorizer

In [6]:
# Comments left empty after stop-word removal are written as empty CSV cells
# and read back as NaN; str(NaN) would inject the literal token "nan" into the
# vocabulary, so missing values are mapped to the empty string first.
train_text = train_df['comment_text_repl_stopwords'].fillna('').astype(str).tolist()
test_text = test_df['comment_text_repl_stopwords'].fillna('').astype(str).tolist()
# Word-level TF-IDF over unigrams and bigrams, capped at 10000 features.
word_tfidf_options = dict(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    max_features=10000,
)
word_vectorizer = TfidfVectorizer(**word_tfidf_options)

# The vocabulary is fitted on the training text only and then applied to the test text.
train_word_features = word_vectorizer.fit_transform(train_text)
print('Word TFIDF 1/2')
test_word_features = word_vectorizer.transform(test_text)
print('Word TFIDF 2/2')

# Character-level TF-IDF over 2- to 6-grams, capped at 10000 features.
# `stop_words` is intentionally not passed: scikit-learn only applies it when
# analyzer == 'word', so with analyzer='char' it had no effect (and recent
# versions warn about the unused parameter).
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(2, 6),
    max_features=10000)
# The vocabulary is fitted on the training text only and then applied to the test text.
train_char_features = char_vectorizer.fit_transform(train_text)
print('Char TFIDF 1/2')
test_char_features = char_vectorizer.transform(test_text)
print('Char TFIDF 2/2')


Word TFIDF 1/2
Word TFIDF 2/2
Char TFIDF 1/2
Char TFIDF 2/2


In [None]:
# Concatenate the feature matrices column-wise: char n-grams first, then word n-grams.
train_blocks = [train_char_features, train_word_features]
train_features = hstack(train_blocks)
print('HStack 1/2')

test_blocks = [test_char_features, test_word_features]
test_features = hstack(test_blocks)
print('HStack 2/2')

HStack 1/2


In [None]:
# Persist the stacked feature matrices next to the source data.
train_dump_path = os.path.join(data_folder, "train_features_10K.dump")
test_dump_path = os.path.join(data_folder, "test_features_10K.dump")
dump(train_features, train_dump_path)
dump(test_features, test_dump_path)