# Then we scrap the film review and its label using web scrapping and create csv file from that.

In [2]:
import pandas as pd

df = pd.read_csv('train.csv')

# Data Preprocessing

In [3]:
df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,4a16050093a99092,"No, you have to prove that I can't.",0,0,0,0,0,0
1,6bc2122dc27e275a,Re: My major edit summaries \n\nI have receive...,0,0,0,0,0,0
2,5570f7207bade92f,I depersonalized it. It's sad how you don't ob...,0,0,0,0,0,0
3,a17d6408ffb98139,Thats because I was blocked before I could dis...,0,0,0,0,0,0
4,11b1d25d77ffd834,"I apologise for this, I was just angry with ha...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
111694,67aada13e8e980a4,"Honestly, Olaf, the POV fork argument just doe...",0,0,0,0,0,0
111695,fc671995426240b9,Public Domain Image Needed \n\nCan anyone uplo...,0,0,0,0,0,0
111696,b40837a8ba4e5b7e,Unban this ip address or a new online encyclop...,1,0,1,1,1,0
111697,85b7bb9f01b3bef1,"RV \nSorry about the RV, went to check on this...",0,0,0,0,0,0


In [4]:
df.isna().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [7]:
df = df.iloc[:10000]

# First, we create RNN (or bag of words approach) to give text and get features from RNN

In [17]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

class RNN:
    def __init__(self, max_words=10000, max_len=150, embed_dim=128, lstm_units=128, dropout=0.2):
        self.max_words = max_words
        self.max_len = max_len
        self.embed_dim = embed_dim
        self.lstm_units = lstm_units
        self.dropout = dropout
        self.tokenizer = Tokenizer(num_words=self.max_words)
        self.model = Sequential()
        self.model.add(Embedding(input_dim=self.max_words, output_dim=self.embed_dim, input_length=self.max_len))
        self.model.add(Bidirectional(LSTM(units=self.lstm_units, dropout=self.dropout, recurrent_dropout=self.dropout)))
        self.model.add(Dense(units=1, activation='sigmoid'))
        self.model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    def preprocess_data(self, comments, labels):
        self.tokenizer.fit_on_texts(comments)
        sequences = self.tokenizer.texts_to_sequences(comments)
        padded_sequences = pad_sequences(sequences, maxlen=self.max_len)
        labels = np.array(labels)
        return padded_sequences, labels
    
    def train_test_split_data(self, X, y, test_size=0.2, random_state=42):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
        return X_train, X_test, y_train, y_test
    
    def train_model(self, X_train, y_train, batch_size=32, epochs=1, validation_data=None):
        self.model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=validation_data)
    
    def evaluate_model(self, X_test, y_test):
        loss, accuracy = self.model.evaluate(X_test, y_test)
        print("Test loss:", loss)
        print("Test accuracy:", accuracy)

In [18]:
comments = df["comment_text"].values.tolist()
labels = df["toxic"].values.tolist()

rnn = RNN()
X, y = rnn.preprocess_data(comments, labels)
X_train, X_test, y_train, y_test = rnn.train_test_split_data(X, y)
rnn.train_model(X_train, y_train, validation_data=(X_test, y_test))
rnn.evaluate_model(X_test, y_test)



Test loss: 0.18440699577331543
Test accuracy: 0.9430000185966492


# Then we get features from each film review. And create huge csv

In [19]:
from tensorflow.keras.models import Model

class FeatureExtractor:
    def __init__(self, rnn):
        self.lstm = rnn.model.layers[1] # this will give lstm layer from rnn
        
        self.model = Model(inputs=rnn.model.input, outputs=self.lstm.output) # we will create new model that will output the lstm layer directly
    
    def extract_features(self, X):
        return self.model.predict(X)

In [20]:
fe = FeatureExtractor(rnn) # extract features from dataset: 128 features from each row
features = fe.extract_features(X)



In [21]:
features.shape

(10000, 256)

In [22]:
features

array([[ 0.04809287,  0.03946817, -0.06752092, ..., -0.07937111,
         0.12188628,  0.17288674],
       [ 0.04532965,  0.06307178, -0.04092984, ..., -0.07937111,
         0.12188628,  0.17288674],
       [-0.00569175, -0.00604582,  0.00714082, ..., -0.07937112,
         0.12188628,  0.17288674],
       ...,
       [ 0.14676838,  0.11157919, -0.18422341, ..., -0.07937111,
         0.12188627,  0.17288674],
       [ 0.01243918,  0.01899815, -0.03120851, ..., -0.07937111,
         0.12188628,  0.17288674],
       [ 0.04812531,  0.04296872, -0.04840525, ..., -0.06047022,
         0.08444085,  0.16743718]], dtype=float32)

In [25]:
new_dataset = np.concatenate((features, np.array(df.toxic).reshape(-1, 1)), axis=1)
new_dataset.shape

(10000, 257)

In [28]:
new_dataset = pd.DataFrame(new_dataset)
new_dataset

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,247,248,249,250,251,252,253,254,255,256
0,0.048093,0.039468,-0.067521,-0.075772,0.057751,0.122186,-0.097414,0.076574,-0.124151,0.127239,...,-0.443482,-0.300731,-0.345783,0.076296,0.084662,0.039722,-0.079371,0.121886,0.172887,0.0
1,0.045330,0.063072,-0.040930,-0.067218,0.047897,0.071021,-0.123213,0.115749,-0.179058,0.154814,...,-0.443482,-0.300731,-0.345783,0.076296,0.084662,0.039722,-0.079371,0.121886,0.172887,0.0
2,-0.005692,-0.006046,0.007141,-0.050606,0.059983,0.096358,-0.050049,0.040247,-0.073477,0.090478,...,-0.443482,-0.300731,-0.345783,0.076296,0.084662,0.039722,-0.079371,0.121886,0.172887,0.0
3,0.105037,0.085000,-0.114389,-0.107287,0.048096,0.077839,-0.177038,0.145880,-0.237300,0.175787,...,-0.443482,-0.300731,-0.345783,0.076296,0.084662,0.039722,-0.079371,0.121886,0.172887,0.0
4,0.109527,0.085902,-0.134017,-0.114765,0.062802,0.100397,-0.164198,0.165334,-0.242468,0.189257,...,-0.443482,-0.300731,-0.345783,0.076296,0.084662,0.039722,-0.079371,0.121886,0.172887,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.136193,0.108208,-0.163738,-0.113066,0.056395,0.089491,-0.189886,0.201010,-0.267062,0.190707,...,-0.410673,-0.278479,-0.261331,0.082519,0.097686,0.041910,-0.083716,0.089351,0.169146,0.0
9996,0.112434,0.090425,-0.122035,-0.113189,0.063966,0.117686,-0.148866,0.133675,-0.223280,0.184527,...,-0.443482,-0.300731,-0.345783,0.076296,0.084662,0.039722,-0.079371,0.121886,0.172887,0.0
9997,0.146768,0.111579,-0.184223,-0.119454,0.054564,0.098779,-0.196948,0.214242,-0.273064,0.206679,...,-0.443482,-0.300731,-0.345783,0.076296,0.084662,0.039722,-0.079371,0.121886,0.172887,0.0
9998,0.012439,0.018998,-0.031209,-0.072845,0.078142,0.118645,-0.036426,0.054806,-0.089479,0.087507,...,-0.443482,-0.300731,-0.345783,0.076296,0.084662,0.039722,-0.079371,0.121886,0.172887,0.0


In [29]:
new_dataset.to_excel('features_from_10k_comment.xlsx', index=False)

# Then we use LigthGBM or XGBoost to classify reviews for sentiment analysis.

In [30]:
!pip install lightgbm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [53]:
import lightgbm as lgbm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

y = new_dataset[256]
X = new_dataset.iloc[::,:256]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = lgbm.LGBMClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)


print('Accuracy is ', accuracy_score(y_test, y_pred))

Accuracy is  0.943


In [54]:
import xgboost as xgb

model = xgb.XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print('Accuracy is ', accuracy_score(y_test, y_pred))

Accuracy is  0.942


# Hyperparameter Tuning with Optuna

In [66]:
!pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optuna
  Downloading optuna-3.1.1-py3-none-any.whl (365 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.7/365.7 kB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
Collecting alembic>=1.5.0
  Downloading alembic-1.10.4-py3-none-any.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.9/212.9 kB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting cmaes>=0.9.1
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting Mako
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, cmaes, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.10.4 cmaes-0.9.1 colorlog-6.7.0 optuna-3.1.1


In [69]:
import optuna

import sklearn.datasets
import sklearn.ensemble
import sklearn.model_selection
import sklearn.svm

def objective(trial):
    x, y = X_train,y_train

    classifier_name = trial.suggest_categorical("classifier", ["Random Forest","XGBoost", "LightGBM","GradientBoostingClassifier" ])
    if classifier_name == "Random Forest":
         from sklearn.ensemble import RandomForestClassifier
         max_depth = trial.suggest_int("max_depth", 2,50)
         max_features = trial.suggest_int("max_features", 2,50)
         classifier_obj = RandomForestClassifier(random_state=17,  max_depth=max_depth, max_features=max_features )
        
         

    elif classifier_name == "XGBoost":
         from xgboost import XGBClassifier
         max_depth = trial.suggest_int("max_depth", 2,50)
         max_features = trial.suggest_int("max_features", 2,50)
         classifier_obj = XGBClassifier(random_state=17,  max_depth=max_depth, max_features=max_features )
        
         

    elif classifier_name == "LightGBM":
         import lightgbm as lgb
         max_depth = trial.suggest_int("max_depth", 2,50)
         max_features = trial.suggest_int("max_features",2,50)
         classifier_obj = lgb.LGBMClassifier(random_state=17,  max_depth=max_depth, max_features=max_features )
        
       
       
    else:
         max_depth = trial.suggest_int("max_depth", 2,50)
         max_features = trial.suggest_int("max_features", 2,50)
         from sklearn.ensemble import GradientBoostingClassifier
         classifier_obj = GradientBoostingClassifier(random_state=17,  max_depth=max_depth, max_features=max_features )
        
         

    accuracy=sklearn.model_selection.cross_val_score(classifier_obj, x, y, n_jobs=-1, cv=3).mean()
   
    return accuracy


if __name__ == "__main__":
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=20)
    print(study.best_trial)

[32m[I 2023-04-27 21:06:07,424][0m A new study created in memory with name: no-name-ff56b661-6b9a-4676-9062-415db6f515ae[0m
[32m[I 2023-04-27 21:06:19,495][0m Trial 0 finished with value: 0.9661250145682277 and parameters: {'classifier': 'GradientBoostingClassifier', 'max_depth': 41, 'max_features': 9}. Best is trial 0 with value: 0.9661250145682277.[0m
[32m[I 2023-04-27 21:06:25,774][0m Trial 1 finished with value: 0.9645000770252555 and parameters: {'classifier': 'Random Forest', 'max_depth': 2, 'max_features': 33}. Best is trial 0 with value: 0.9661250145682277.[0m
[32m[I 2023-04-27 21:06:45,730][0m Trial 2 finished with value: 0.9669999052068979 and parameters: {'classifier': 'Random Forest', 'max_depth': 20, 'max_features': 32}. Best is trial 2 with value: 0.9669999052068979.[0m
[32m[I 2023-04-27 21:06:58,784][0m Trial 3 finished with value: 0.9655002802069097 and parameters: {'classifier': 'LightGBM', 'max_depth': 31, 'max_features': 49}. Best is trial 2 with value:

FrozenTrial(number=2, state=TrialState.COMPLETE, values=[0.9669999052068979], datetime_start=datetime.datetime(2023, 4, 27, 21, 6, 25, 777374), datetime_complete=datetime.datetime(2023, 4, 27, 21, 6, 45, 729663), params={'classifier': 'Random Forest', 'max_depth': 20, 'max_features': 32}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'classifier': CategoricalDistribution(choices=('Random Forest', 'XGBoost', 'LightGBM', 'GradientBoostingClassifier')), 'max_depth': IntDistribution(high=50, log=False, low=2, step=1), 'max_features': IntDistribution(high=50, log=False, low=2, step=1)}, trial_id=2, value=None)


In [70]:
study.best_params

{'classifier': 'Random Forest', 'max_depth': 20, 'max_features': 32}

In [72]:
from sklearn.ensemble import RandomForestClassifier

tuned_model = RandomForestClassifier(max_depth = 20, max_features = 32)
tuned_model.fit(X_train, y_train)
y_pred = tuned_model.predict(X_test)

print('Accuracy after tuning is ', accuracy_score(y_test, y_pred))

Accuracy after tuning is  0.944


# Comparison with State-Of-Art Techniques: Transformers

In [55]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m89.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m107.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


In [81]:
!pip install detoxify

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting detoxify
  Downloading detoxify-0.5.1-py3-none-any.whl (12 kB)
Collecting sentencepiece>=0.1.94
  Downloading sentencepiece-0.1.98-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers==4.22.1
  Downloading transformers-4.22.1-py3-none-any.whl (4.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m91.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, sentencepiece, transformers, detoxify
  Attempti

In [84]:
from detoxify import Detoxify

print("Example #1")
print('\nTRANSFORMERS PERFORMANCE\n')
print('Example sentence:', df['comment_text'].iloc[9000])
print(Detoxify('original').predict(df['comment_text'].iloc[9000]))

print('\n\nOUR PERFORMANCE')
print(tuned_model.predict([X.iloc[9000]]))

print('\n\nActual label')
print(df['toxic'].iloc[9000])

print("\n\n\n\nExample #2")
print('\nTRANSFORMERS PERFORMANCE\n')
print('Example sentence:', df['comment_text'].iloc[9001])
print(Detoxify('original').predict(df['comment_text'].iloc[9001]))

print('\n\nOUR PERFORMANCE')
print(tuned_model.predict([X.iloc[9001]]))

print('\n\nActual label')
print(df['toxic'].iloc[9001])

Example #1

TRANSFORMERS PERFORMANCE

Example sentence: The PTC is not a critic, but a parents group. They are not reliable for critical reaction. Thank you.
{'toxicity': 0.0006550305, 'severe_toxicity': 0.00011618549, 'obscene': 0.00017535572, 'threat': 0.00011278491, 'insult': 0.00017681593, 'identity_attack': 0.00013972318}


OUR PERFORMANCE
[0.]


Actual label
0




Example #2

TRANSFORMERS PERFORMANCE

Example sentence: "

The titles aren't fictional.  Fiction means ""not real.""  Unless Ric wandered around with some kind of holographic projection around his waist, the titles aren't fictional."
{'toxicity': 0.00075165153, 'severe_toxicity': 0.000110687564, 'obscene': 0.00017966599, 'threat': 0.000109057466, 'insult': 0.00018429189, 'identity_attack': 0.00013785088}


OUR PERFORMANCE
[0.]


Actual label
0


# DONE!