In [4]:
import pandas as pd
import numpy as np

In [5]:
# Load the Jigsaw train and test CSVs straight from S3 into DataFrames.
# Both reads go over HTTPS, so this cell needs network access.
train_df = pd.read_csv(
    filepath_or_buffer='https://s3.amazonaws.com/ccwf-ml-data/jigsaw/train.csv'
)
test_df = pd.read_csv(
    filepath_or_buffer='https://s3.amazonaws.com/ccwf-ml-data/jigsaw/test.csv'
)

In [6]:
# Binarize the label: values below 0.5 become 0, everything else becomes 1.
# A vectorized comparison replaces the per-row Python lambda. Negating `< 0.5`
# (rather than writing `>= 0.5`) keeps the original rule exactly: anything that
# is not strictly below 0.5 -- including NaN -- maps to 1.
train_df['target'] = (~train_df['target'].lt(0.5)).astype('int64')

In [8]:
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

# English stop-word list taken from the NLTK stopwords corpus.
STOP_WORDS = stopwords.words('english')

# Single Porter stemmer instance, used by stem_tokens().
stemmer = nltk.stem.porter.PorterStemmer()

# Table for str.translate(): every character in string.punctuation maps to
# None, which deletes it from the translated text.
remove_punctuation_map = {ord(symbol): None for symbol in string.punctuation}

def stem_tokens(tokens):
    """Return a new list holding the stem of every token in ``tokens``.

    Each item is passed through the module-level ``stemmer``; input order is
    preserved.
    """
    stemmed = []
    for token in tokens:
        stemmed.append(stemmer.stem(token))
    return stemmed

def normalize(text):
    """Lower-case ``text``, delete punctuation, tokenize it and stem each token.

    Returns the list produced by ``stem_tokens``.

    NOTE(review): the TfidfVectorizer configured in this notebook does not
    receive this function (no ``tokenizer=`` argument), so it looks unused --
    confirm whether it was meant to be wired in.
    """
    lowered = text.lower()
    without_punctuation = lowered.translate(remove_punctuation_map)
    tokens = nltk.word_tokenize(without_punctuation)
    return stem_tokens(tokens)

# TF-IDF over 1- to 3-grams: NLTK stop words removed, accents folded to ASCII,
# terms appearing in more than 99% of documents ignored, and the vocabulary
# capped at 60,000 features.
vectorizer = TfidfVectorizer(
    stop_words=STOP_WORDS,
    strip_accents='ascii',
    ngram_range=(1, 3),
    max_features=60000,
    max_df=0.99,
)

In [None]:
# Learn the TF-IDF vocabulary on the training comments and vectorize them.
X = vectorizer.fit_transform(train_df.comment_text)
y = train_df.target

# Vocabulary terms, aligned with the columns of X.
# get_feature_names() was removed in scikit-learn 1.2, so use
# get_feature_names_out() when it exists and only fall back to the old method
# on releases that predate it. dtype=str keeps `words` a unicode string array,
# as it was when built from the old list return value.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = np.array(vectorizer.get_feature_names_out(), dtype=str)
else:
    words = np.array(vectorizer.get_feature_names())

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Keep the 3000 TF-IDF features with the highest chi-squared score against y.
# NOTE(review): the selector is fitted on all of X / y, i.e. before the
# train/holdout split performed later in the notebook, so holdout labels
# influence which features are kept. Expect the holdout metrics to be slightly
# optimistic.
transformer = SelectKBest(score_func=chi2, k=3000)
X_vecs = transformer.fit(X, y).transform(X)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 2% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_vecs,
    y,
    test_size=0.02,
    random_state=42,
)

In [None]:
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_auc_score

# LinearSVC does not expose predict_proba, so it is wrapped in
# CalibratedClassifierCV (5-fold) to obtain probabilities.
# random_state is pinned because LinearSVC's solver can use a random number
# generator while fitting; without a seed the fitted model (and the scores
# below) may differ slightly between runs of the same notebook.
svm = LinearSVC(random_state=42)
model = CalibratedClassifierCV(svm, cv=5)
model.fit(X_train, y_train)

# Holdout probabilities; column 1 belongs to the positive class (target == 1).
p = model.predict_proba(X_test)

# Cell output: (ROC AUC, accuracy) on the holdout split.
(roc_auc_score(y_test, p[:, 1]), np.mean(model.predict(X_test) == y_test))

In [None]:
# Spot check: vectorize one hand-written comment with the fitted TF-IDF vocabulary.
sample_comment = "Gay people cannot think straight"
vec = vectorizer.transform([sample_comment])

In [None]:
# Reduce the sample to the selected features, then score it.
# The cell output is the model's probability row for that comment.
chi_vec = transformer.transform(X=vec)
model.predict_proba(X=chi_vec)

In [None]:
# Spot check: vectorize a second hand-written comment with the fitted TF-IDF vocabulary.
sample_comment = "I am a gay woman, I'm black, I'm also Mexican"
vec = vectorizer.transform([sample_comment])

In [None]:
# Reduce the sample to the selected features, then score it.
# The cell output is the model's probability row for that comment.
chi_vec = transformer.transform(X=vec)
model.predict_proba(X=chi_vec)

In [None]:
# Vectorize the test comments with the already-fitted TF-IDF vocabulary,
# then keep only the features chosen by the fitted selector.
vec = vectorizer.transform(test_df['comment_text'])
chi_vec = transformer.transform(X=vec)

In [None]:
# Class probabilities for every test comment (one row per comment).
p = model.predict_proba(X=chi_vec)

In [None]:
# Store the second probability column (index 1) as the submission score.
positive_class_proba = p[:, 1]
test_df['prediction'] = positive_class_proba

In [None]:
# Remove the raw text column before the frame is written out.
# Rebinding the result (instead of inplace=True) together with
# errors='ignore' makes this cell safe to re-run: the original raised
# KeyError on a second execution because the column was already gone.
test_df = test_df.drop(columns=['comment_text'], errors='ignore')

In [None]:
# Write the remaining columns to submission.csv, leaving out the DataFrame index.
test_df.to_csv(path_or_buf='submission.csv', index=False)