In [1]:
# Basic Settings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from IPython.display import display
import re

warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# load datasets

# First Dataset : Tw text Dataset
tw_df_origin = pd.read_csv('data/tw_data.csv')

# Second Dataset : Wiki Pedia Dataset
wiki_df_origin = pd.read_csv('data/wiki_train.csv').drop(columns='id')

In [18]:
# pre-process for merging two DFs

# Pre-processing TW_df
tw_texts = tw_df_origin['tweet']\
            .map(lambda x : re.sub(r'.+! RT.', '', x).strip())\
            .to_frame().rename(columns={'tweet': 'text'})
tw_labels = tw_df_origin['class']\
            .map(lambda x : 0 if x == 2 else 1)\
            .to_frame().rename(columns={'class':'label'})

# Pre-processing Wiki_df
wiki_texts = wiki_df_origin['comment_text']\
            .to_frame().rename(columns={'comment_text':'text'})
wiki_labels = wiki_df_origin.iloc[:, 1:].sum(axis=1)\
            .map(lambda x : 1 if x > 0 else 0).to_frame()\
            .rename(columns={0:'label'})

In [19]:
# Merging Two DFs

tw_df = pd.concat([tw_texts, tw_labels], axis=1)
wiki_df = pd.concat([wiki_texts, wiki_labels], axis=1)

display(tw_df.head())
display(wiki_df.head())

Unnamed: 0,text,label
0,@mayasolovely: As a woman you shouldn't compla...,0
1,@mleew17: boy dats cold...tyga dwn bad for cuf...,1
2,@80sbaby4life: You ever fuck a bitch and she s...,1
3,@C_G_Anderson: @viva_based she look like a tranny,1
4,@ShenikaRoberts: The shit you hear about me mi...,1


Unnamed: 0,text,label
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


In [25]:
# 통합된 하나의 Train Data 선언

text_df = pd.concat([wiki_df, tw_df]).drop(columns='label')
label_df = pd.concat([wiki_df, tw_df]).drop(columns='text')

In [6]:
import nltk
import joblib

In [28]:
from d_pre import tokenize, make_features, make_features2

token_words = tokenize(text_df)
token_words

Unnamed: 0,text
0,"[explanation, edits, made, username, hardcore,..."
1,"[aww, match, background, colour, seemingly, st..."
2,"[hey, man, really, trying, edit, war, guy, con..."
3,"[make, real, suggestion, improvement, wondered..."
4,"[sir, hero, chance, remember, page]"
...,...
24778,"[muthaf, lie, lifeasking, pearl, corey, emanue..."
24779,"[gone, broke, wrong, heart, baby, drove, redne..."
24780,"[young, buck, wanna, eat, dat, nigguh, like, a..."
24781,"[youu, got, wild, bitch, tellin, lie]"


In [29]:
f1_df = make_features(text_df)

In [30]:
f1_df

Unnamed: 0,tot_len,nl,you,Cap,exclamation,question,smile,stop_words,punc
0,264,1,0,17,0,1,0,14,0
1,112,0,0,8,1,0,0,1,0
2,233,0,0,4,0,0,0,18,0
3,622,4,2,11,0,0,0,49,4
4,67,0,2,2,0,1,0,4,0
...,...,...,...,...,...,...,...,...,...
24778,146,0,1,9,1,1,0,3,0
24779,70,0,1,0,0,0,0,5,0
24780,67,0,1,1,2,0,0,2,0
24781,37,0,2,0,0,0,0,1,0


In [31]:
f2_df = make_features2(token_words)

In [32]:
f2_df

Unnamed: 0,profanity,most_rep
0,0,1
1,0,1
2,0,1
3,0,2
4,0,1
...,...,...
24778,0,1
24779,2,1
24780,3,1
24781,1,1


In [33]:
f_df = pd.concat([f1_df, f2_df], axis=1)

In [34]:
f_df

Unnamed: 0,tot_len,nl,you,Cap,exclamation,question,smile,stop_words,punc,profanity,most_rep
0,264,1,0,17,0,1,0,14,0,0,1
1,112,0,0,8,1,0,0,1,0,0,1
2,233,0,0,4,0,0,0,18,0,0,1
3,622,4,2,11,0,0,0,49,4,0,2
4,67,0,2,2,0,1,0,4,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
24778,146,0,1,9,1,1,0,3,0,0,1
24779,70,0,1,0,0,0,0,5,0,2,1
24780,67,0,1,1,2,0,0,2,0,3,1
24781,37,0,2,0,0,0,0,1,0,1,1


In [36]:
w_df = pd.concat([f_df, label_df], axis=1)

In [37]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score


label_name = 'label'
feature_names = w_df.drop(columns=label_name).columns


train_ratio = 0.70
val_ratio = 0.15
test_ratio = 0.15

# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(w_df[feature_names], 
                                                    w_df[label_name], 
                                                    test_size=1 - train_ratio, 
                                                    stratify=w_df[label_name], 
                                                    random_state=42)

# Validation Test Split
X_val, X_test, y_val, y_test = train_test_split(X_test, 
                                                y_test, 
                                                test_size=test_ratio / (test_ratio + val_ratio), 
                                                stratify=y_test, 
                                                random_state=42)

In [43]:
from xgboost import XGBClassifier

xgb_clf = XGBClassifier(subsample=0.8,
                        n_estimators=500,
                        min_child_weight=5,
                        max_depth=50,
                        learning_rate=0.01,
                        gamma=1.5,
                        colsample_bytree=0.8,
                        random_state=42)

In [44]:
xgb_clf.fit(X_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=1.5, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.01, max_delta_step=0, max_depth=50,
              min_child_weight=5, missing=nan, monotone_constraints='()',
              n_estimators=500, n_jobs=8, num_parallel_tree=1, random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.8,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [45]:
xgb_pred = xgb_clf.predict(X_val)

In [46]:
# validation set에 대한 f1_score 점수는 다음과 같다.
f1_score(y_val, xgb_pred, average='micro')

0.9506744295374824

In [47]:
xgb_pred = xgb_clf.predict(X_test)

In [48]:
# test set에 대한 f1 score 점수는 다음과 같다.
f1_score(y_test, xgb_pred, average='micro')

0.9482895783611774

In [50]:
joblib.dump(xgb_clf, 'profanity_clf')

['profanity_clf']

In [51]:
test = joblib.load('profanity_clf')

In [52]:
xgb_pred = test.predict(X_test)

In [53]:
f1_score(y_test, xgb_pred, average='micro')

0.9482895783611774