***Run this cell only when using Google Colab***

In [None]:
# Mount Google Drive at /content/drive; the dataset is later read from
# /content/drive/MyDrive/.  Per the heading above, this cell is meant for
# Google Colab sessions only.
from google.colab import drive
drive.mount('/content/drive')

***Import Libraries and Download NLTK Data***

In [None]:
import pickle
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import nltk
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Fetch the NLTK stop-word corpus; `stopwords.words('english')` is read from it
# further down in the notebook.
nltk.download('stopwords')

***Define Preprocessing Function***

In [None]:
def preprocess_socialmediacomments(text):
    """Normalise one raw comment into lower-cased, punctuation-free text.

    Steps:
      1. Strip HTML-like tags (``<...>``).
      2. Collect emoticons such as ``:)``, ``;-(``, ``=D``, ``:P`` before
         punctuation is removed.
      3. Lower-case the text and collapse every run of non-word characters
         into a single space.
      4. Append the collected emoticons (with the ``-`` nose removed) as
         separate, space-delimited tokens.

    Parameters
    ----------
    text : object
        The cell value from the comment column. Anything that is not a
        ``str`` (e.g. NaN for an empty spreadsheet cell) is treated as an
        empty comment.

    Returns
    -------
    str
        The cleaned comment, or ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''

    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)

    # Python's ``\w`` does not match Unicode combining marks, so a plain
    # ``\W`` would cut Bangla words apart at every vowel sign / virama.
    # Keep the whole Bengali block plus ZWNJ/ZWJ (used inside conjuncts).
    text = re.sub(r'[^\w\u0980-\u09FF\u200c\u200d]+', ' ', text.lower())

    if emoticons:
        # Guarantee a separator so an emoticon is never glued to the last
        # word (':) hello' used to become ' hello:)').
        if not text.endswith(' '):
            text += ' '
        text += ' '.join(emoticons).replace('-', '')
    return text

***Load Dataset and Apply Preprocessing***

In [None]:
# Register `progress_apply` on pandas objects so the cleaning pass shows a bar.
tqdm.pandas()

dataset_path = '/content/drive/MyDrive/Test Bangla & English Harmful Comment Dataset.xlsx'
raw_comments = pd.read_excel(dataset_path)

# Replace the raw 'Comment' column with its cleaned form.
df = raw_comments.assign(
    Comment=raw_comments['Comment'].progress_apply(preprocess_socialmediacomments)
)

***Tokenization and Vectorization***

In [None]:
porter = PorterStemmer()


def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of every piece.

    Parameters
    ----------
    text : str
        Whitespace-delimited text.

    Returns
    -------
    list of str
        One stem per whitespace-separated word, in the original order.
    """
    return list(map(porter.stem, text.split()))

stop = stopwords.words('english')
# Frozen copy for O(1) membership tests; `stop` itself stays a plain list.
_stop_lookup = frozenset(stop)

# Same emoticon shapes as preprocess_socialmediacomments (including ':)').
# Case-insensitive because the text may already be lower-cased by the time it
# reaches the tokenizer, which would otherwise make the D / P alternatives
# unmatchable.
_EMOTICON_RE = re.compile(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', re.IGNORECASE)

# Python's ``\w`` does not match Unicode combining marks, so a plain ``\W``
# would cut Bangla words apart at every vowel sign / virama.  Keep the whole
# Bengali block plus ZWNJ/ZWJ (used inside conjuncts).
_NON_WORD_RE = re.compile(r'[^\w\u0980-\u09FF\u200c\u200d]+')


def tokenizer(text):
    """Turn one comment into a list of tokens for the vectorizer.

    Steps:
      1. Strip HTML-like tags.
      2. Collect emoticons (nose ``-`` removed, lower-cased) before
         punctuation is discarded.
      3. Lower-case, replace runs of non-word characters with a space and
         split on whitespace.
      4. Drop English stop words, then Porter-stem the remaining words.
         Filtering happens before stemming because the stop list holds
         unstemmed words ('this', 'was', ...) whose stems ('thi', 'wa', ...)
         would not be recognised.
      5. Return the stems followed by the emoticons as separate tokens.

    Parameters
    ----------
    text : str
        A single comment.

    Returns
    -------
    list of str
        Stemmed, stop-word-free word tokens plus any emoticon tokens.
    """
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = [
        match.replace('-', '').lower() for match in _EMOTICON_RE.findall(text)
    ]
    words = _NON_WORD_RE.sub(' ', text.lower()).split()
    stems = [porter.stem(word) for word in words if word not in _stop_lookup]
    # Emoticons are appended as their own list items so they can never be
    # fused onto the last word.
    return stems + emoticons

# Stateless hashing vectorizer: tokens from `tokenizer` are hashed straight
# into a fixed 2**21-column sparse matrix, so no vocabulary has to be fitted.
vect = HashingVectorizer(
    decode_error='ignore',
    n_features=2**21,
    preprocessor=None,
    tokenizer=tokenizer,
    # A custom tokenizer replaces the regex tokenization; passing None here
    # avoids scikit-learn's "token_pattern will not be used" warning.
    token_pattern=None,
)

***Train the Model and Save***

In [None]:
# Features are the cleaned comments; labels come from the 'Harmful' column.
X = df["Comment"].to_list()
y = df['Harmful']

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# HashingVectorizer is stateless, so `transform` can be called without `fit`.
X_train = vect.transform(X_train)
X_test = vect.transform(X_test)

# Logistic-regression objective trained with SGD.  The loss name 'log' was
# deprecated in scikit-learn 1.1 and removed in 1.3; 'log_loss' is the
# current spelling of the same loss.
clf = SGDClassifier(loss='log_loss', random_state=1)
# partial_fit needs the full label set up front.
# NOTE(review): assumes 'Harmful' is encoded as 0/1 — confirm against the sheet.
classes = np.array([0, 1])

clf.partial_fit(X_train, y_train, classes=classes)

# Report held-out accuracy before touching the filesystem, so a bad save path
# cannot hide the evaluation result.
print('Accuracy: %.3f' % clf.score(X_test, y_test))

# Save the trained model using pickle in your specified path
# (replace the placeholder below with a real, existing directory).
save_path = '/your/specified/path/trained_model.pkl'
with open(save_path, 'wb') as model_file:
    pickle.dump(clf, model_file)