# Homework 2 - TF-IDF Classifier

Ваша цель обучить классификатор который будет находить "токсичные" комментарии и опубликовать решения на Kaggle [Toxic Comment Classification Challenge](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge)

В процессе обучения нужно ответить на ***[вопросы](https://docs.google.com/forms/d/e/1FAIpQLSd9mQx8EFpSH6FhCy1M_FmISzy3lhgyyqV3TN0pmtop7slmTA/viewform?usp=sf_link)***

Данные можно скачать тут - https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data



In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
plt.style.use('fivethirtyeight')

In [2]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

train = pd.read_csv('train.csv').fillna(' ')
test = pd.read_csv('test.csv').fillna(' ')

Стадартными подходами для анализа текста являются [Bag of words](https://en.wikipedia.org/wiki/Bag-of-words_model) и его модификация [TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf).

Они реалзованны в `sklearn` в виде [CountVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) и [TfidfVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html).

Более подробней про них можно посмотреть [тут](https://github.com/udsclub/workshop/blob/master/notebooks/UDS-workshop-feature-extraction-and-engineering.ipynb)

In [33]:
def other_info(data):
#Letter count
    data['count_letters']=data["comment_text"].apply(lambda x: len(str(x)))
#Average length of the words
    data["mean_word_len"] = data["comment_text"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
    data['word_unique_percent']=(data["comment_text"].apply(lambda x: len(set(str(x).split()))))*100/(data["comment_text"].apply(lambda x: len(str(x).split())))
    
    data.loc[data['count_letters']>1000, 'count_letters'] = 1000
    data.loc[data['mean_word_len']>100, 'mean_word_len'] = 100
    
    smileys_good = r'((:|;|X)-?(\)|P|D))\W'
    smileys_bad =  r'((:|;)-?(\())\W'
    data['smileys_good'] = data['comment_text'].str.extract(smileys_good, expand=True)[0].fillna(0)
    data['smileys_bad'] = data['comment_text'].str.extract(smileys_bad, expand=True)[0].fillna(0)
    data['smileys_good'][data['smileys_good']!=0] = 1
    data['smileys_bad'][data['smileys_bad']!=0] = 1
    
       
    return data

In [34]:
train = other_info(train)
test = other_info(test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
  out=out, **kwargs)


In [5]:
test.head()

Unnamed: 0,id,comment_text,count_letters,mean_word_len,word_unique_percent
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,367,4.111111,84.722222
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,50,3.0,91.666667
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",54,2.916667,83.333333
3,00017563c3f7919a,":If you have a look back at the source, the in...",205,4.421053,78.947368
4,00017695ad8997eb,I don't anonymously edit articles at all.,41,5.0,100.0


In [6]:
train.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,count_letters,mean_word_len,word_unique_percent
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805,316.677084,4.888781,85.501925
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342,294.397262,1.875908,12.817618
min,0.0,0.0,0.0,0.0,0.0,0.0,6.0,1.0,0.08
25%,0.0,0.0,0.0,0.0,0.0,0.0,96.0,4.330472,77.906977
50%,0.0,0.0,0.0,0.0,0.0,0.0,205.0,4.696429,87.5
75%,0.0,0.0,0.0,0.0,0.0,0.0,435.0,5.105263,95.652174
max,1.0,1.0,1.0,1.0,1.0,1.0,1000.0,100.0,100.0


In [7]:
train_text = train['comment_text']
test_text = test['comment_text']
all_text = pd.concat([train_text, test_text])

In [None]:
from collections import Counter

In [None]:
Counter(" ".join(all_text).split()).most_common(1)

In [None]:
#text = pd.Series(' '.join(all_text).split())
#words = pd.Series(text.str.lower().str.split('\s+'))

In [8]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    min_df=4,
    ngram_range=(1,2),
    max_features=50000)

In [None]:
#word_vectorizer_cv = CountVectorizer(max_features=50000, analyzer='word', stop_words='english',ngram_range=(1,3))

In [9]:
word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

Для классификации будем использовать логистическую регрессию [LogisticRegression](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html).

In [43]:
from sklearn.model_selection import GridSearchCV

In [44]:
parameters = {'C': (0.1, 0.3, 0.5, 0.7, 1)}

In [None]:
classifier = LogisticRegression(random_state=17, class_weight= 'balanced') # Попробуйте разные параметры, найтдите оттимальные на кросс-валидации

In [None]:
gs_lr = GridSearchCV(classifier, parameters, n_jobs=-1, scoring ='roc_auc', verbose=1)


In [None]:
for class_name in class_names:
    train_target = train[class_name]
    
    gs_lr.fit(train_word_features,train[class_name])

    gs_score = gs_lr.best_params_
    
    print('CV score for class {} is {}'.format(class_name, gs_score))
    scores.append(cv_score)

print('Total score is {}'.format(np.mean(scores)))

In [10]:
classifier_C = LogisticRegression(C=1, random_state=17, class_weight= 'balanced')

Будем тренировать по одному классификатору на каждый класс. 

Что бы провалидировать качество модели воспользуемся функцией [cross_val_score](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html)

In [11]:
scores= []

for class_name in class_names:
    train_target = train[class_name]

    cv_score = np.mean(cross_val_score(classifier_C, train_word_features, train_target, scoring='roc_auc'))
    
    print('CV score for class {} is {}'.format(class_name, cv_score))
    scores.append(cv_score)

print('Total score is {}'.format(np.mean(scores)))

CV score for class toxic is 0.9700999951598369
CV score for class severe_toxic is 0.9856283535366693
CV score for class obscene is 0.9855489461075196
CV score for class threat is 0.98233574651186
CV score for class insult is 0.9773422785912181
CV score for class identity_hate is 0.9742847166463339
Total score is 0.9792066727589064


Попробуйте подобрать лучшие параметры для `word_vectorizer` и `classifier` оптимизируя метрику [ROC AUC](https://en.wikipedia.org/wiki/Receiver_operating_characteristic)


### Вариант 2

In [None]:
word_vectorizer_2 = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    token_pattern=r'\w{1,}',
    stop_words='english',
    min_df=4,
    ngram_range=(2,5),
    max_features=50000)

In [None]:
word_vectorizer_2.fit(all_text)
train_word_features_2 = word_vectorizer_2.transform(train_text)
test_word_features_2 = word_vectorizer_2.transform(test_text)

In [None]:
scores= []

for class_name in class_names:
    train_target = train[class_name]

    cv_score = np.mean(cross_val_score(classifier_C, train_word_features_2, train_target, scoring='roc_auc'))
    
    print('CV score for class {} is {}'.format(class_name, cv_score))
    scores.append(cv_score)

print('Total score is {}'.format(np.mean(scores)))

### Вариант 3

In [12]:
word_vectorizer_3 = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    stop_words='english',
    min_df=4,
    ngram_range=(2,3),
    max_features=10000)

In [13]:
word_vectorizer_3.fit(all_text)
train_word_features_3 = word_vectorizer_3.transform(train_text)
test_word_features_3 = word_vectorizer_3.transform(test_text)

In [None]:
#classifier_2 = LogisticRegression(C=1, random_state=17)

In [14]:
scores= []

for class_name in class_names:
    train_target = train[class_name]

    cv_score = np.mean(cross_val_score(classifier_C, train_word_features_3, train_target, scoring='roc_auc'))
    
    print('CV score for class {} is {}'.format(class_name, cv_score))
    scores.append(cv_score)

print('Total score is {}'.format(np.mean(scores)))

CV score for class toxic is 0.7804504696023375
CV score for class severe_toxic is 0.8563472309459356
CV score for class obscene is 0.7998012898538324
CV score for class threat is 0.7811913766015661
CV score for class insult is 0.8000544626872695
CV score for class identity_hate is 0.7801227647846734
Total score is 0.7996612657459358


### Вариант 4

In [15]:
from scipy.sparse import hstack

In [16]:
train_features_two = hstack([train_word_features_3, train_word_features])
test_features_two = hstack([test_word_features_3, test_word_features])

In [17]:
scores= []

for class_name in class_names:
    train_target = train[class_name]

    cv_score = np.mean(cross_val_score(classifier_C, train_features_two, train_target, scoring='roc_auc'))
    
    print('CV score for class {} is {}'.format(class_name, cv_score))
    scores.append(cv_score)

print('Total score is {}'.format(np.mean(scores)))

CV score for class toxic is 0.9659046666045944
CV score for class severe_toxic is 0.9837634175869231
CV score for class obscene is 0.9830527903952441
CV score for class threat is 0.97893755156244
CV score for class insult is 0.9739593594191519
CV score for class identity_hate is 0.9712561119086517
Total score is 0.9761456495795008


### Вторя модель логрег

In [28]:
train.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate', 'count_letters', 'mean_word_len',
       'word_unique_percent', 'smileys_good', 'smileys_bad'],
      dtype='object')

In [29]:
add_features = ['count_letters', 'mean_word_len', 'word_unique_percent', 'smileys_good', 'smileys_bad']

In [53]:
classifier_log = LogisticRegression(C=10, random_state=17, class_weight= 'balanced')

In [57]:
train_other = train[add_features]

In [58]:
test_ither = test[add_features]

In [62]:
test_ither = test_ither.fillna(0)

In [54]:
scores= []

for class_name in class_names:
    train_target = train[class_name]

    cv_score = np.mean(cross_val_score(classifier_log, train_other, train_target, scoring='roc_auc'))
    
    print('CV score for class {} is {}'.format(class_name, cv_score))
    scores.append(cv_score)

print('Total score is {}'.format(np.mean(scores)))

CV score for class toxic is 0.6239167550695873
CV score for class severe_toxic is 0.7113087040000794
CV score for class obscene is 0.6420780659718605
CV score for class threat is 0.696392783583883
CV score for class insult is 0.6416188095614094
CV score for class identity_hate is 0.6304301398000071
Total score is 0.6576242096644711


In [55]:
submission2 = pd.DataFrame.from_dict({'id': test['id']})

In [63]:
for class_name in class_names:
    train_target = train[class_name]
    classifier_log.fit(train_other, train_target)
    ...
    submission2[class_name] = classifier_log.predict_proba(test_ither)[:, 1]    

In [64]:
submission2.to_csv('submission_2.csv', index=False)

In [65]:
submission2.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.458662,0.355049,0.444065,0.394614,0.438345,0.442375
1,0000247867823ef7,0.59692,0.61694,0.607758,0.619074,0.609621,0.585009
2,00013b17ad220c46,0.631529,0.723695,0.65171,0.71496,0.655163,0.633776
3,00017563c3f7919a,0.570393,0.637813,0.581493,0.634573,0.583524,0.580324
4,00017695ad8997eb,0.559464,0.527451,0.56144,0.522494,0.564988,0.548044


---

Опубликуйте лучшие решение на [Kaggle Toxic Comment Classification Challenge](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/submit)

In [67]:
submission = pd.DataFrame.from_dict({'id': test['id']})

In [68]:
for class_name in class_names:
    train_target = train[class_name]
    classifier_C.fit(train_features_two, train_target)
    ...
    submission[class_name] = classifier_C.predict_proba(test_features_two)[:, 1]    

In [69]:
submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.99989,0.771937,0.99991,0.248973,0.996095,0.943773
1,0000247867823ef7,0.026384,0.008228,0.009832,0.010894,0.024578,0.013407
2,00013b17ad220c46,0.098538,0.016102,0.044108,0.006907,0.069139,0.021819
3,00017563c3f7919a,0.047892,0.013774,0.00843,0.004627,0.007025,0.002899
4,00017695ad8997eb,0.112502,0.019543,0.05067,0.011498,0.142304,0.009334


In [None]:
submission.to_csv('submission.csv', index=False)

In [71]:
submission_end = submission.merge(submission2, on = "id", how = "left")

In [75]:
submission_end["toxic"] = submission_end["toxic_x"]*0.9+submission_end["toxic_y"]*0.1

In [77]:
submission_end["severe_toxic"] = submission_end["severe_toxic_x"]*0.9+submission_end["severe_toxic_y"]*0.1
submission_end["obscene"] = submission_end["obscene_x"]*0.9+submission_end["obscene_y"]*0.1
submission_end["threat"] = submission_end["threat_x"]*0.9+submission_end["threat_y"]*0.1
submission_end["insult"] = submission_end["insult_x"]*0.9+submission_end["insult_y"]*0.1
submission_end["identity_hate"] = submission_end["identity_hate_x"]*0.9+submission_end["identity_hate_y"]*0.1

In [78]:
submission_end.columns

Index(['id', 'toxic_x', 'severe_toxic_x', 'obscene_x', 'threat_x', 'insult_x',
       'identity_hate_x', 'toxic_y', 'severe_toxic_y', 'obscene_y', 'threat_y',
       'insult_y', 'identity_hate_y', 'toxic', 'severe_toxic', 'obscene',
       'threat', 'insult', 'identity_hate'],
      dtype='object')

In [81]:
submission_end.drop(['toxic_x', 'severe_toxic_x', 'obscene_x', 'threat_x', 'insult_x',
       'identity_hate_x', 'toxic_y', 'severe_toxic_y', 'obscene_y', 'threat_y',
       'insult_y', 'identity_hate_y'],axis=1, inplace = True)

In [82]:
submission_end.to_csv('submission_end.csv', index=False)