# 1. Preparation

In [12]:
# import all necessary libraries
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer 
import re  
from tqdm import notebook 
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import nltk
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /home/armit/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/armit/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
# get data
comm = pd.read_csv('toxic_comments.csv')

In [3]:
comm.head(5)

Unnamed: 0,text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


In [4]:
comm.tail(5)

Unnamed: 0,text,toxic
159566,""":::::And for the second time of asking, when ...",0
159567,You should be ashamed of yourself \n\nThat is ...,0
159568,"Spitzer \n\nUmm, theres no actual article for ...",0
159569,And it looks like it was actually you who put ...,0
159570,"""\nAnd ... I really don't think you understand...",0


In [5]:
# check for empty values
comm\
    .isna()\
    .sum()\
    .to_frame()\
    .assign(perc = lambda row: 100 * row[0] / comm.shape[0])\
    .rename(columns={0: '# of empty values', 'perc': '% of empty values'})\
    .sort_values('% of empty values', ascending = False)

Unnamed: 0,# of empty values,% of empty values
text,0,0.0
toxic,0,0.0


In [6]:
# check for duplicates
print('== Duplicates ==')
print('')
#test.duplicated().sum()

duplicates_comm = comm[comm.duplicated() == True]

print(len(duplicates_comm))

== Duplicates ==

0


In [7]:
comm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 2 columns):
text     159571 non-null object
toxic    159571 non-null int64
dtypes: int64(1), object(1)
memory usage: 2.4+ MB


In [7]:
# apply appropriate data type for text colums . Necessary for the upcoming steps
corpus = comm['text'].values.astype('U')

# Cleaning

In [8]:
# use regular expression to clean text from garbage
def clear_text(text):
    text1 = re.sub(r'[^a-zA-Z?!)(]', ' ', text) 
    return " ".join(text1.split())

In [9]:
for i in range(len(corpus)):
    corpus[i] = clear_text(corpus[i])

# Lemmatization

In [10]:
# initializing lemmatizier and creating a fuction for finding lemms of text
lemmatizer = WordNetLemmatizer()
def nltk_lemm(text):
    for i in range(len(text)):
        word_list = nltk.word_tokenize(text[i])
        lemmatized_output = ' '.join([lemmatizer.lemmatize(w) for w in word_list])
        return lemmatized_output

In [13]:
#applying lemm fuction
nltk_lemm(corpus)

'Explanation Why the edits made under my username Hardcore Metallica Fan were reverted ? They weren t vandalism just closure on some GAs after I voted at New York Dolls FAC And please don t remove the template from the talk page since I m retired now'

In [14]:
# make a deep copy of original data frame so o be on a safe side 
comm_2 = comm.copy(deep= True)

In [15]:
# adding cleaned and lemmatied text to the copy 
comm_2['ready_text'] = pd.Series(corpus)

In [16]:
# check that operations were successful 
comm_2

Unnamed: 0,text,toxic,ready_text
0,Explanation\nWhy the edits made under my usern...,0,Explanation Why the edits made under my userna...
1,D'aww! He matches this background colour I'm s...,0,D aww! He matches this background colour I m s...
2,"Hey man, I'm really not trying to edit war. It...",0,Hey man I m really not trying to edit war It s...
3,"""\nMore\nI can't make any real suggestions on ...",0,More I can t make any real suggestions on impr...
4,"You, sir, are my hero. Any chance you remember...",0,You sir are my hero Any chance you remember wh...
...,...,...,...
159566,""":::::And for the second time of asking, when ...",0,And for the second time of asking when your vi...
159567,You should be ashamed of yourself \n\nThat is ...,0,You should be ashamed of yourself That is a ho...
159568,"Spitzer \n\nUmm, theres no actual article for ...",0,Spitzer Umm theres no actual article for prost...
159569,And it looks like it was actually you who put ...,0,And it looks like it was actually you who put ...


In [17]:
# declaring features and target

features = comm_2.drop(['text','toxic'], axis=1)
target = comm_2['toxic']

In [18]:
# split dataset into train and test
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.20, random_state=42)

In [19]:
#declaring words that are meaningless a.k.a stopwords from nltk
nltk.download('stopwords')
stopwords = set(nltk_stopwords.words('english'))

# preparing TF-IDF  vector with stopwords, so that the TF-IDF  counter will not take stopwords into consideration
count_tf_idf = TfidfVectorizer(stop_words=stopwords)

[nltk_data] Downloading package stopwords to /home/armit/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [20]:
# fitting TF-IDF counter (model) with parts parts of our text
count_tf_idf.fit(features_train['ready_text']) 

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words={'a', 'about', 'above', 'after', 'again', 'against',
                            'ain', 'all', 'am', 'an', 'and', 'any', 'are',
                            'aren', "aren't", 'as', 'at', 'be', 'because',
                            'been', 'before', 'being', 'below', 'between',
                            'both', 'but', 'by', 'can', 'couldn', "couldn't", ...},
                strip_accents=None, sublinear_tf=False,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
                vocabulary=None)

In [21]:
# applying TF-IDF counter (model) to the data
tf_idf_train = count_tf_idf.transform(features_train['ready_text'])
tf_idf_test = count_tf_idf.transform(features_test['ready_text'])

print("Matrix train:", tf_idf_train.shape)
print("Matrix test:", tf_idf_test.shape)

Matrix train: (127656, 148071)
Matrix test: (31915, 148071)


###  Summary:
1. necessary packages imported
2. data loaded
3. check duplicates = 0
4. No empty values
5. Visial inspection - noise detected, i.e simbols that does not have meaning 
6. Noise cleaned with regular expression
7. Lemmatization performed
8. Applied counter TF-IDF to create feature matrix out of texts 
9. Prepared features and target

# 2. Training

In [24]:
# defining list for storing results  
models_results=[]

## DecisionTree

In [25]:
# using DecisionTree model 
for depth in range(1,11):

    model_tree = DecisionTreeClassifier(max_depth=depth, random_state=12345)
    model_tree.fit(tf_idf_train, target_train)
    
    predicted = model_tree.predict(tf_idf_test)
    f1 = f1_score(target_test, predicted, average='micro', labels=np.unique(predicted)) 
    
    models_results.append(['model_tree', 'Depth', depth ,f1])

## RandomForestClassifier


In [26]:
# using RandomForest model 
for estim in range(10, 111, 30):
    for depth in range(1,11,2):

        model_forest = RandomForestClassifier(n_estimators=estim, max_depth=depth, random_state=12345)
        model_forest.fit(tf_idf_train, target_train)
        predicted = model_forest.predict(tf_idf_test)
        f1 = f1_score(target_test, predicted, average='micro', labels=np.unique(predicted))      
        
        models_results.append(['model_forest', 'Estimators/depth', str(estim)+'/'+str(depth), f1])

## LogisticRegression

In [None]:
# lastrly using LogisticRegression model 
solver='liblinear'
for itr in range(100,1251,250):
    model_regres = LogisticRegression(random_state=12345,solver=solver,penalty='l1',max_iter=itr)
    model_regres.fit(tf_idf_train, target_train)
            
    predicted = model_regres.predict(tf_idf_test)
    f1 = f1_score(target_test, predicted,average='micro', labels=np.unique(predicted))    
            
    models_results.append(['model_regres', 'Solver/Max_iter', str(solver)+'/'+str(itr),f1])

### Summary:
1. Trained 3 models with different hyperparams
2. Calculated f1,with option ='micro', i.e without makeing favour to target (0/1)
3. All performed results saved in table 

# 3. Results

In [28]:
# creating final dataframe with results 
column_names = ['model', 'hyper_param', 'hyper_param_value', 'f1_score']

df_results = pd.DataFrame(models_results, columns=column_names)

In [29]:
df_results

Unnamed: 0,model,hyper_param,hyper_param_value,f1_score
0,model_tree,Depth,1,0.913928
1,model_tree,Depth,2,0.920915
2,model_tree,Depth,3,0.924644
3,model_tree,Depth,4,0.92762
4,model_tree,Depth,5,0.930378
5,model_tree,Depth,6,0.932101
6,model_tree,Depth,7,0.934294
7,model_tree,Depth,8,0.93561
8,model_tree,Depth,9,0.93724
9,model_tree,Depth,10,0.938587


In [30]:
#Top 3 results
display(df_results.sort_values(by='f1_score', ascending=False).head(3))

Unnamed: 0,model,hyper_param,hyper_param_value,f1_score
15,model_forest,Estimators/depth,40/1,0.946456
16,model_forest,Estimators/depth,40/3,0.946456
28,model_forest,Estimators/depth,100/7,0.946456


In [31]:
#3 outsiders
display(df_results.sort_values(by='f1_score', ascending=True).head(3))

Unnamed: 0,model,hyper_param,hyper_param_value,f1_score
0,model_tree,Depth,1,0.913928
1,model_tree,Depth,2,0.920915
2,model_tree,Depth,3,0.924644


# Final summary

1. Applied 3 models: Decesion treee, Random Forest, Logistic Regression
2. All got metric higher than trashhold 
4. Top model: Logistic Regression с f1= 0.960457
5. Worst model: Decesion treee с f1 = 0.913928


### Note TBD:
Research if feature "comment length" can improve metrics. 

## Below are experimental cells 

In [22]:
print('f1 macro: ', f1_score(target_test, predict, average='macro'))

print('f1 micro: ', f1_score(target_test, predict, average='micro'))

print('f1 weighted: ', f1_score(target_test, predict, average='weighted'))

print('f1 None: ', f1_score(target_test, predict, average=None))

f1 macro:  0.8566845045529252
f1 micro:  0.955851480495065
f1 weighted:  0.9516641149672989
f1 None:  [0.97589929 0.73746972]


In [None]:
 elif solver=='lbfgs':
        print('---------------')
        for itr in range(100,1251,250):
            model_regres = LogisticRegression(random_state=12345,solver=solver,penalty='l2',max_iter=itr)
            model_regres.fit(tf_idf_train, target_train)
            
            predicted = model_regres.predict(tf_idf_test)
            f1 = f1_score(target_test, predicted,average='micro', labels=np.unique(predicted))      
            
            models_results.append(['model_regres', 'Solver/Max_iter', str(solver)+'/'+str(itr),f1])

In [15]:
def lemmatize(text):
    m = Mystem()
    lemmas = m.lemmatize(text)
    return ''.join(lemmas)

In [21]:
model = LogisticRegression(random_state=0).fit(tf_idf_train, target_train)
predict = model.predict(tf_idf_test)



In [None]:
sl_list=['liblinear','lbfgs']
for solver in sl_list:
    if solver=='liblinear':
        for itr in range(100,1251,250):
            model_regres = LogisticRegression(random_state=12345,solver='liblinear',penalty='l1',max_iter=itr)
            model_regres.fit(tf_idf_train, target_train)
            
            predicted = model_regres.predict(tf_idf_test)
            f1 = f1_score(target_test, predicted,average='micro', labels=np.unique(predicted))    
            
            models_results.append(['model_regres', 'Solver/Max_iter', str(solver)+'/'+str(itr),f1])

In [None]:
corpus_lemm = lemmatize(corpus[:])

In [None]:
for i in range(len(corpus)):
    comm.loc[i:'text_lemm'] = lemmatize(corpus[i])

In [1]:
#train_values = features_train['ready_text'].values.astype('U')
#corpus = comm['text'].values.astype('U')

NameError: name 'features_train' is not defined