# Homework 2 - TF-IDF Classifier

Ваша цель — обучить классификатор, который будет находить "токсичные" комментарии, и опубликовать решение на Kaggle [Toxic Comment Classification Challenge](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge)

В процессе обучения нужно ответить на ***[вопросы](https://docs.google.com/forms/d/e/1FAIpQLSd9mQx8EFpSH6FhCy1M_FmISzy3lhgyyqV3TN0pmtop7slmTA/viewform?usp=sf_link)***

Данные можно скачать тут - https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data



In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [2]:
# Label columns of the competition; each one is used further down as a
# separate classification target.
class_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Read both splits, then replace missing cells with a single space.
train = pd.read_csv('D:/STUDY/Data_Science_Study/University_course/HW_2/train.csv')
train = train.fillna(' ')
test = pd.read_csv('D:/STUDY/Data_Science_Study/University_course/HW_2/test.csv')
test = test.fillna(' ')

Стандартными подходами для анализа текста являются [Bag of words](https://en.wikipedia.org/wiki/Bag-of-words_model) и его модификация [TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf).

Они реализованы в `sklearn` в виде [CountVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) и [TfidfVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html).

Более подробно про них можно прочитать [тут](https://github.com/udsclub/workshop/blob/master/notebooks/UDS-workshop-feature-extraction-and-engineering.ipynb)

In [118]:
# Pull the raw comment column out of each split.
# (The combined train + test corpus is built further down, after cleaning.)
train_text, test_text = (frame['comment_text'] for frame in (train, test))

In [131]:
# Try different vectorizers and n-gram sizes, stop words, pruning of rare words and of overly frequent words
#word_vectorizer = TfidfVectorizer() # TfidfVectorizer or CountVectorizer

# Data Cleaning

In [4]:
import re

In [5]:
train_text.shape

(159571,)

In [6]:
len(train_text)

159571

In [7]:
test_text.shape

(153164,)

In [8]:
len(test_text)

153164

In [119]:
# Contraction rewrites, applied in this order after lower-casing.
# Order matters: the specific forms ("what's", "'scuse", "can't") must run
# before the generic "'s" / "n't" rules, which would otherwise consume part
# of them and leave fragments such as "cuse".
_CONTRACTION_RULES = (
    (r"what's", "what is "),
    (r"\'scuse", " excuse "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"can't", "cannot "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
)


def cleaning_data(noizy_comment):
    """Normalize one raw comment into lower-case, space-separated word tokens.

    Steps, in order: drop anything starting with ``http`` up to the next
    whitespace, lower-case the text, expand a fixed set of English
    contractions, replace every non-word character with a space, collapse
    whitespace runs into single spaces and strip the ends.

    Args:
        noizy_comment: the raw comment text (str).

    Returns:
        The cleaned comment as a str; may be empty.
    """
    noizy_comment = re.sub(r'http\S+', '', noizy_comment)
    noizy_comment = noizy_comment.lower()
    for pattern, replacement in _CONTRACTION_RULES:
        noizy_comment = re.sub(pattern, replacement, noizy_comment)
    # Punctuation and other non-word characters become separators.
    noizy_comment = re.sub(r'\W', ' ', noizy_comment)
    # One pass over \s+ also removes newlines and repeated spaces, so no
    # further whitespace substitutions are needed afterwards.
    noizy_comment = re.sub(r'\s+', ' ', noizy_comment)
    return noizy_comment.strip(' ')

In [120]:
def data_set_cleaning(noizy_data_set):
    """Clean every comment of a collection and return the results as a Series.

    Args:
        noizy_data_set: iterable of raw comment strings, e.g. a pandas Series
            or a list.

    Returns:
        A pandas Series of cleaned strings in input order, carrying a fresh
        default index (0..n-1) regardless of the input's index.
    """
    # Iterate over the values instead of looking items up by the labels
    # 0..n-1: a Series whose index is not the default range (filtered,
    # shuffled, concatenated) would otherwise raise KeyError or be read in
    # the wrong order.
    cleaned_text = [cleaning_data(comment) for comment in noizy_data_set]
    return pd.Series(cleaned_text).astype(str)

In [121]:
# Replace the raw training comments with their cleaned versions.
train_text = data_set_cleaning(train_text)

In [122]:
train_text.head()

0    explanation why the edits made under my userna...
1    d aww he matches this background colour i am s...
2    hey man i am really not trying to edit war it ...
3    more i cannot make any real suggestions on imp...
4    you sir are my hero any chance you remember wh...
dtype: object

In [123]:
# Replace the raw test comments with their cleaned versions.
test_text = data_set_cleaning(test_text)

In [124]:
test_text.head()

0    yo bitch ja rule is more succesful then you wi...
1              from rfc the title is fine as it is imo
2                       sources zawe ashton on lapland
3    if you have a look back at the source the info...
4            i do not anonymously edit articles at all
dtype: object

In [125]:
test_text[1]

'from rfc the title is fine as it is imo'

In [126]:
# Combined corpus: cleaned training comments followed by cleaned test comments.
# NOTE(review): pd.concat keeps each input's own index by default, so the
# labels 0..n-1 occur twice in all_text; pass ignore_index=True if unique
# labels are ever needed.
all_text = pd.concat([train_text, test_text])

In [127]:
all_text.head()

0    explanation why the edits made under my userna...
1    d aww he matches this background colour i am s...
2    hey man i am really not trying to edit war it ...
3    more i cannot make any real suggestions on imp...
4    you sir are my hero any chance you remember wh...
dtype: object

In [28]:
#all_text_pop = all_text.apply(pd.Series)#after data clean

In [109]:
from sklearn.feature_extraction import stop_words

In [110]:
# English stop-word set; in this notebook it is referenced only by
# commented-out vectorizer variants.
# NOTE(review): newer scikit-learn releases no longer provide the
# `sklearn.feature_extraction.stop_words` module; ENGLISH_STOP_WORDS is
# importable from `sklearn.feature_extraction.text` instead. Verify against
# the installed version.
stopWords = stop_words.ENGLISH_STOP_WORDS

# Best start

In [132]:
# Try different vectorizers (TfidfVectorizer or CountVectorizer) and n-gram
# sizes, stop words, pruning of rare words and of overly frequent words.
# One vectorizer is kept per group of labels that share a configuration.
word_vectorizer_toxic = TfidfVectorizer(decode_error='ignore', min_df=10, max_df=0.5)
word_vectorizer_severe_toxic = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
#word_vectorizer_obscene = TfidfVectorizer(stop_words=stopWords)
#word_vectorizer_threat = TfidfVectorizer(ngram_range=(1,2))
#word_vectorizer_insult = TfidfVectorizer(ngram_range=(1,2))
word_vectorizer_threat_insult = TfidfVectorizer(ngram_range=(1, 2))
#word_vectorizer_identity_hate = TfidfVectorizer(stop_words=stopWords)
word_vectorizer_obscene_identity_hate = TfidfVectorizer(stop_words='english')

# Build the feature matrices consumed by the cross-validation and submission
# cells. Without this step the vectorizers above are never fitted and the
# train_word_features_* / test_word_features_* names are undefined.
# NOTE(review): the cell that originally produced these matrices is absent
# from this notebook; fitting on all_text (train + test) is an assumption
# based on that corpus being built above. Confirm it matches how the
# recorded scores were obtained.
word_vectorizer_toxic.fit(all_text)
train_word_features_toxic = word_vectorizer_toxic.transform(train_text)
test_word_features_toxic = word_vectorizer_toxic.transform(test_text)

word_vectorizer_severe_toxic.fit(all_text)
train_word_features_severe_toxic = word_vectorizer_severe_toxic.transform(train_text)
test_word_features_severe_toxic = word_vectorizer_severe_toxic.transform(test_text)

word_vectorizer_threat_insult.fit(all_text)
train_word_features_threat_insult = word_vectorizer_threat_insult.transform(train_text)
test_word_features_threat_insult = word_vectorizer_threat_insult.transform(test_text)

word_vectorizer_obscene_identity_hate.fit(all_text)
train_word_features_obscene_identity_hate = word_vectorizer_obscene_identity_hate.transform(train_text)
test_word_features_obscene_identity_hate = word_vectorizer_obscene_identity_hate.transform(test_text)

In [134]:
# Try different parameters and find the best ones with cross-validation.
# One LogisticRegression instance is configured per label.
# NOTE(review): scikit-learn documents intercept_scaling as used only by the
# 'liblinear' solver, so with lbfgs / newton-cg the values below probably
# have no effect. Confirm before tuning them further.
classifier_toxic = LogisticRegression(
    C=4.1,  # commented-out alternative: C=8
    intercept_scaling=2.1,
    solver="lbfgs",
)
classifier_severe_toxic = LogisticRegression(
    C=1.1,  # commented-out alternative: C=2
    intercept_scaling=1.9,
    class_weight="balanced",
    solver="lbfgs",
)
classifier_obscene = LogisticRegression(
    C=2,
    intercept_scaling=1.9,
    class_weight="balanced",
    solver="lbfgs",
)
classifier_threat = LogisticRegression(
    C=50,  # commented-out alternative: C=25
    intercept_scaling=1.0,
    solver="lbfgs",
)
classifier_insult = LogisticRegression(
    C=16,  # commented-out alternative: C=32
    intercept_scaling=1.5,
    class_weight="balanced",
    solver="lbfgs",
)
classifier_identity_hate = LogisticRegression(
    C=1.6,
    intercept_scaling=1.0,
    solver="newton-cg",
)

In [135]:
# Cross-validated ROC AUC for every label, each label paired with its own
# classifier and its own training feature matrix.
cv_setup = {
    "toxic": (classifier_toxic, train_word_features_toxic),
    "severe_toxic": (classifier_severe_toxic, train_word_features_severe_toxic),
    "obscene": (classifier_obscene, train_word_features_obscene_identity_hate),
    "threat": (classifier_threat, train_word_features_threat_insult),
    "insult": (classifier_insult, train_word_features_threat_insult),
    "identity_hate": (classifier_identity_hate, train_word_features_obscene_identity_hate),
}

scores = []

for class_name in class_names:
    train_target = train[class_name]
    if class_name not in cv_setup:
        # Labels without a configured model are skipped.
        continue
    cv_estimator, cv_features = cv_setup[class_name]
    fold_scores = cross_val_score(cv_estimator, cv_features, train_target, scoring='roc_auc')
    cv_score = np.mean(fold_scores)
    print('CV score for class {} is {}'.format(class_name, cv_score))
    scores.append(cv_score)

print('Total score is {}'.format(np.mean(scores)))

CV score for class toxic is 0.9724445428935368
CV score for class severe_toxic is 0.9854993012951664
CV score for class obscene is 0.9846522619638045
CV score for class threat is 0.9886367955264644
CV score for class insult is 0.976630682361082
CV score for class identity_hate is 0.9739769604916245
Total score is 0.9803067574219465


# Best End

Для классификации будем использовать логистическую регрессию [LogisticRegression](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html).

In [8]:
classifier = LogisticRegression() # Try different parameters and find the best ones with cross-validation


Будем тренировать по одному классификатору на каждый класс. 

Чтобы провалидировать качество модели, воспользуемся функцией [cross_val_score](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html)

In [7]:
# Baseline: the same default classifier and the same feature matrix are
# cross-validated against every label.
# NOTE(review): train_word_features is not defined in any cell visible in
# this notebook (the baseline word_vectorizer line is commented out), so this
# cell relies on state from an earlier session.
scores = []

for class_name in class_names:
    train_target = train[class_name]
    fold_scores = cross_val_score(
        classifier,
        train_word_features,
        train_target,
        scoring='roc_auc',
    )
    cv_score = np.mean(fold_scores)
    print('CV score for class {} is {}'.format(class_name, cv_score))
    scores.append(cv_score)

print('Total score is {}'.format(np.mean(scores)))

CV score for class toxic is 0.9685394913528859
CV score for class severe_toxic is 0.9831972615849033
CV score for class obscene is 0.981616766505934
CV score for class threat is 0.9822578771961118
CV score for class insult is 0.9741355527269565
CV score for class identity_hate is 0.9713648011395858
Total score is 0.9768519584177295


Попробуйте подобрать лучшие параметры для `word_vectorizer` и `classifier`, оптимизируя метрику [ROC AUC](https://en.wikipedia.org/wiki/Receiver_operating_characteristic)


---

Опубликуйте лучшее решение на [Kaggle Toxic Comment Classification Challenge](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/submit)

In [115]:
# Submission frame starts with the test ids only; the fitting cell below
# adds one probability column per label.
submission = pd.DataFrame.from_dict({'id': test['id']})

In [None]:
for class_name in class_names:
    .....
    classifier.fit(...)
    ...
    submission[class_name] = classifier.predict_proba(test_features)[:, 1]    

In [136]:
# Fit each label's classifier on the whole training set and store column 1
# of predict_proba for the test set in the submission frame.
# Every label is paired with its classifier, its training features and the
# matching test features (built by the same vectorizer).
fit_setup = {
    "toxic": (
        classifier_toxic,
        train_word_features_toxic,
        test_word_features_toxic,
    ),
    "severe_toxic": (
        classifier_severe_toxic,
        train_word_features_severe_toxic,
        test_word_features_severe_toxic,
    ),
    "obscene": (
        classifier_obscene,
        train_word_features_obscene_identity_hate,
        test_word_features_obscene_identity_hate,
    ),
    "threat": (
        classifier_threat,
        train_word_features_threat_insult,
        test_word_features_threat_insult,
    ),
    "insult": (
        classifier_insult,
        train_word_features_threat_insult,
        test_word_features_threat_insult,
    ),
    "identity_hate": (
        classifier_identity_hate,
        train_word_features_obscene_identity_hate,
        test_word_features_obscene_identity_hate,
    ),
}

for class_name in class_names:
    train_target = train[class_name]
    if class_name not in fit_setup:
        # Labels without a configured model get no submission column.
        continue
    label_classifier, x_train, x_test = fit_setup[class_name]
    # Progress markers: 1 = about to fit, 2 = fitted, 3 = predictions stored.
    # They are now numbered the same way for every label.
    print("Ok_1", class_name)
    label_classifier.fit(x_train, train_target)
    print("Ok_2", class_name)
    submission[class_name] = label_classifier.predict_proba(x_test)[:, 1]
    print("Ok_3", class_name)

Ok_1 toxic
Ok_2 toxic
Ok_3 toxic
Ok_1 severe_toxic
Ok_1 severe_toxic
Ok_1 severe_toxic
Ok_1 obscene
Ok_1 obscene
Ok_1 obscene
Ok_1 threat
Ok_1 threat
Ok_1 threat
Ok_1 insult
Ok_1 insult
Ok_1 insult
Ok_1 identity_hate
Ok_1 identity_hate
Ok_1 identity_hate


In [None]:
for class_name in class_names:
    .....
    classifier.fit(...)
    ...
    submission[class_name] = classifier.predict_proba(test_features)[:, 1]    

In [137]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
submission.to_csv('D:/STUDY/Data_Science_Study/University_course/HW_2/submission.csv', index=False)