In [None]:
# Run this cell only when working in Google Colab: it mounts Google Drive
# at /content/drive so files stored there can be read by the notebook.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pickle
import re
import unicodedata

import numpy as np
import pandas as pd
from tqdm import tqdm
import nltk
# Fetch the NLTK 'stopwords' corpus; it is read later via stopwords.words('english').
nltk.download('stopwords')

In [None]:
def _replace_non_word_chars(text):
    """Replace every run of non-word characters in ``text`` with one space.

    Python's ``\\W`` treats Unicode combining marks (categories Mn/Mc/Me) as
    non-word characters.  Bengali vowel signs and the virama are such marks,
    so a plain ``re.sub(r'\\W+', ' ', text)`` would break Bengali words apart.
    Combining marks and the zero-width (non-)joiners used inside Indic
    conjuncts are therefore kept; everything else that ``\\W`` matches becomes
    a space, and runs of spaces are collapsed to a single one.
    """
    def _keep_or_blank(match):
        ch = match.group()
        if ch in '\u200c\u200d' or unicodedata.category(ch).startswith('M'):
            return ch
        return ' '

    blanked = re.sub(r'\W', _keep_or_blank, text)
    return re.sub(r' +', ' ', blanked)


def preprocess_socialmediacomments(text):
    """Normalize one social-media comment for feature extraction.

    Steps: strip HTML-like tags, collect emoticons such as ``:)``, ``;-(``,
    ``=D`` or ``:P`` from the original-case text, lowercase the text, replace
    punctuation/whitespace runs with single spaces, then append the emoticons
    (with any ``-`` nose removed) at the end, separated by spaces.

    Parameters
    ----------
    text : str or None or float NaN
        The raw comment.  ``None`` and NaN (what pandas yields for an empty
        CSV cell) are treated as an empty comment.

    Returns
    -------
    str
        The cleaned comment; ``''`` for missing input.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = _replace_non_word_chars(text.lower())
    smileys = ' '.join(emoticons).replace('-', '')
    if smileys:
        # Keep the first emoticon from being glued onto the last word when the
        # cleaned text does not already end with a space.
        if not text.endswith(' '):
            text += ' '
        text += smileys
    return text

In [None]:
# Register tqdm with pandas so Series.progress_apply (used below) is available.
tqdm.pandas()
# Set the path of your CSV file here (the empty string is only a placeholder).
# The file must provide a 'Comment' text column and a 'Harmful' label column.
df = pd.read_csv('')
df['Comment'] = df['Comment'].progress_apply(preprocess_socialmediacomments)

In [None]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()


def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each piece."""
    return list(map(porter.stem, text.split()))

In [None]:
from nltk.corpus import stopwords
# English stop words, held in a set: the tokenizers test ``w not in stop`` once
# per token, which is a constant-time lookup on a set but a linear scan on the
# list that stopwords.words() returns.
stop = set(stopwords.words('english'))

In [None]:
# Sanity check on a Bengali sentence: whitespace split + Porter stemming,
# followed by removal of English stop words.
[w for w in tokenizer_porter('সত্যি গানটা অসাধারণ লেগেছে') if w not in stop]

In [None]:
# Same check on an English sentence: stems are produced and English stop words dropped.
[w for w in tokenizer_porter('a runner likes running and runs a lot') if w not in stop]

In [None]:
def tokenizer(text):
    """Turn one comment into a list of stemmed, stop-word-free tokens.

    The text clean-up (tag stripping, emoticon extraction, lowercasing,
    punctuation removal) is delegated to ``preprocess_socialmediacomments``
    so that the vectorizer tokenizes comments with exactly the same rules
    that are applied to the training data.  A separate copy of that logic
    lived here before; its emoticon pattern lacked ``\\)`` and was matched
    against lowercased text, so ``:)``, ``:D`` and ``:P`` were never captured.

    Parameters
    ----------
    text : str
        Raw or already-cleaned comment text.

    Returns
    -------
    list of str
        Porter-stemmed tokens with entries found in ``stop`` removed.
    """
    cleaned = preprocess_socialmediacomments(text)
    return [w for w in tokenizer_porter(cleaned) if w not in stop]

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer

# lowercase=False: by default the vectorizer lowercases each document *before*
# handing it to the tokenizer.  The tokenizer lowercases the text itself, and its
# emoticon patterns are case-sensitive (':D', ':P'), so it must receive the
# original-case text.
vect = HashingVectorizer(
    decode_error='ignore',
    n_features=2**21,
    preprocessor=None,
    tokenizer=tokenizer,
    lowercase=False,
)

In [None]:
import sklearn
from sklearn.linear_model import SGDClassifier

# Logistic-regression loss, which the later predict_proba calls rely on.
# scikit-learn spelled it 'log' up to 1.0; that spelling was deprecated in 1.1
# and removed in 1.3 in favour of 'log_loss', so pick the name that the
# installed version accepts.
_sk_version = tuple(
    int(part) for part in re.match(r'(\d+)\.(\d+)', sklearn.__version__).groups()
)
_logistic_loss = 'log_loss' if _sk_version >= (1, 1) else 'log'
clf = SGDClassifier(loss=_logistic_loss, random_state=1)

In [None]:
# Inputs: the cleaned comment strings as a plain Python list.
X = df["Comment"].to_list()
# Targets: the 'Harmful' column (the classifier is later told the classes are 0 and 1).
y = df['Harmful']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the comments for evaluation; random_state=0 makes the split repeatable.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=0)

In [None]:
# Hash the comment strings into feature matrices (transform is called directly; no fit step is used).
# NOTE(review): this overwrites X_train/X_test in place, so re-running this cell
# without first re-running the split cell feeds already-vectorized data back
# into the vectorizer.
X_train = vect.transform(X_train)
X_test = vect.transform(X_test)

In [None]:
# partial_fit is given the complete set of class labels on this first call.
classes = np.array([0, 1])
# One incremental training pass over the training split.
clf.partial_fit(X_train, y_train,classes=classes)

In [None]:
# Score the model on the held-out 20% split.
print('Accuracy: %.3f' % clf.score(X_test, y_test))

In [None]:
# After evaluation, also train on the held-out split so the final model has
# seen all labelled data. The accuracy printed above was measured before this
# update and must not be recomputed on X_test afterwards.
clf = clf.partial_fit(X_test, y_test)

In [None]:
label = {0:'ক্ষতিকর না', 1:'ক্ষতিকর'}
example = ["একটু হিসাব করে দেখেন, আর্জেন্টিনার জনসংখ্যা ৪ কোটি... আর আমাদের ১৬ কোটির দেশে প্রায় অর্ধেকই  (৮ কোটি) হলো আর্জেন্টিনার সাপোর্টার"]
# Use a dedicated name for the example's features: assigning to `X` would
# overwrite the list of training comments that the train/test split cell reads,
# breaking that cell if it is re-run afterwards.
X_example = vect.transform(example)
print('Prediction: %s\nProbability: %.2f%%'
      % (label[clf.predict(X_example)[0]], np.max(clf.predict_proba(X_example)) * 100))

In [None]:
label = {0:'Good', 1:'Harmful'}
example = ["Go to Hell. Fuck you"]
# Use a dedicated name for the example's features: assigning to `X` would
# overwrite the list of training comments that the train/test split cell reads,
# breaking that cell if it is re-run afterwards.
X_example = vect.transform(example)
print('Prediction: %s\nProbability: %.2f%%'
      % (label[clf.predict(X_example)[0]], np.max(clf.predict_proba(X_example)) * 100))