<a href="https://colab.research.google.com/github/mfsadi/amerSentiment/blob/main/Amer_Sentiment_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!cp /content/bert-fa-base-uncased-sentiment-taaghceh/pytorch_model.bin /content/drive/MyDrive/   

In [None]:
!pip install -q hazm
!pip install -q clean-text[gpl]

In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.utils import shuffle

import hazm
from cleantext import clean

import plotly.express as px
import plotly.graph_objects as go

from tqdm.notebook import tqdm

import os
import re
import json
import copy
import collections

In [3]:
train = pd.read_csv('/content/train.csv', error_bad_lines=False, delimiter='\t')
dev = pd.read_csv('/content/dev.csv', error_bad_lines=False, delimiter='\t')
test = pd.read_csv('/content/test.csv', error_bad_lines=False, delimiter='\t')

train = train[['comment', 'label', 'label_id']]
dev = dev[['comment', 'label', 'label_id']]
test = test[['comment', 'label', 'label_id']]

In [4]:
train['comment_len_by_words'] = train['comment'].apply(lambda t: len(hazm.word_tokenize(t)))
dev['comment_len_by_words'] = dev['comment'].apply(lambda t: len(hazm.word_tokenize(t)))
test['comment_len_by_words'] = test['comment'].apply(lambda t: len(hazm.word_tokenize(t)))

In [None]:
min_max_len = test["comment_len_by_words"].min(), test["comment_len_by_words"].max()
print(f'Min: {min_max_len[0]} \tMax: {min_max_len[1]}')

In [None]:
def data_gl_than(data, less_than=100.0, greater_than=0.0, col='comment_len_by_words'):
    data_length = data[col].values
    data_glt = sum([1 for length in data_length if greater_than < length <= less_than])
    data_glt_rate = (data_glt / len(data_length)) * 100
    print(f'Texts with word length of greater than {greater_than} and less than {less_than} includes {data_glt_rate:.2f}% of the whole!')

In [None]:
data_gl_than(dev, 40, 3)

In [5]:
minlim, maxlim = 3, 40

In [6]:
# remove comments with the length of fewer than three words
train['comment_len_by_words'] = train['comment_len_by_words'].apply(lambda len_t: len_t if minlim < len_t <= maxlim else None)
train = train.dropna(subset=['comment_len_by_words'])
train = train.reset_index(drop=True)
dev['comment_len_by_words'] = dev['comment_len_by_words'].apply(lambda len_t: len_t if minlim < len_t <= maxlim else None)
dev = dev.dropna(subset=['comment_len_by_words'])
dev = dev.reset_index(drop=True)
test['comment_len_by_words'] = test['comment_len_by_words'].apply(lambda len_t: len_t if minlim < len_t <= maxlim else None)
test = test.dropna(subset=['comment_len_by_words'])
test = test.reset_index(drop=True)

In [None]:
fig = go.Figure()

fig.add_trace(go.Histogram(
    x=test['comment_len_by_words']
))

fig.update_layout(
    title_text='Distribution of word counts within comments',
    xaxis_title_text='Word Count',
    yaxis_title_text='Frequency',
    bargap=0.2,
    bargroupgap=0.2)

fig.show()

In [7]:
def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext

def cleaning(text):
    text = text.strip()
    # regular cleaning
    text = clean(text,
        fix_unicode=True,
        to_ascii=False,
        lower=True,
        no_line_breaks=True,
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=True,
        no_punct=False,
        replace_with_url="",
        replace_with_email="",
        replace_with_phone_number="",
        replace_with_number="",
        replace_with_digit="0",
        replace_with_currency_symbol="",
    )

    # cleaning htmls
    text = cleanhtml(text)
    
    # normalizing
    normalizer = hazm.Normalizer()
    text = normalizer.normalize(text)
    
    # removing wierd patterns
    wierd_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u'\U00010000-\U0010ffff'
        u"\u200d"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\u3030"
        u"\ufe0f"
        u"\u2069"
        u"\u2066"
        # u"\u200c"
        u"\u2068"
        u"\u2067"
        "]+", flags=re.UNICODE)
    
    text = wierd_pattern.sub(r'', text)
    
    # removing extra spaces, hashtags
    text = re.sub("#", "", text)
    text = re.sub("\s+", " ", text)
    
    return text

In [None]:
# cleaning comments
train['cleaned_comment'] = train['comment'].apply(cleaning)
dev['cleaned_comment'] = dev['comment'].apply(cleaning)
test['cleaned_comment'] = test['comment'].apply(cleaning)


# calculate the length of comments based on their words
train['cleaned_comment_len_by_words'] = train['cleaned_comment'].apply(lambda t: len(hazm.word_tokenize(t)))
dev['cleaned_comment_len_by_words'] = dev['cleaned_comment'].apply(lambda t: len(hazm.word_tokenize(t)))
test['cleaned_comment_len_by_words'] = test['cleaned_comment'].apply(lambda t: len(hazm.word_tokenize(t)))

# remove comments with the length of fewer than three words
train['cleaned_comment_len_by_words'] = train['cleaned_comment_len_by_words'].apply(lambda len_t: len_t if minlim < len_t <= maxlim else len_t)
train = train.dropna(subset=['cleaned_comment_len_by_words'])
train = train.reset_index(drop=True)
dev['cleaned_comment_len_by_words'] = dev['cleaned_comment_len_by_words'].apply(lambda len_t: len_t if minlim < len_t <= maxlim else len_t)
dev = dev.dropna(subset=['cleaned_comment_len_by_words'])
dev = dev.reset_index(drop=True)
test['cleaned_comment_len_by_words'] = test['cleaned_comment_len_by_words'].apply(lambda len_t: len_t if minlim < len_t <= maxlim else len_t)
test = test.dropna(subset=['cleaned_comment_len_by_words'])
test = test.reset_index(drop=True)

train.head()
dev.head()
test.head()

In [None]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train= train['comment']
X_test= test['comment']
y_train= train['label']
y_test= test['label']

vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5)
X_train = vectorizer.fit_transform(X_train)
print("n_samples: %d, n_features: %d" % X_train.shape)
X_test = vectorizer.transform(X_test)
with open('vectorizer.pk', 'wb') as fin:
    pickle.dump(vectorizer, fin)
print("n_samples: %d, n_features: %d" % X_test.shape)
#text_classifier.fit(train['comment'], train['label_id'])

In [11]:
import logging
import numpy as np
from optparse import OptionParser
import sys
from time import time
import matplotlib.pyplot as plt
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn import metrics
def benchmark(clf):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
    print("classification report:")
    print(metrics.classification_report(y_test, pred,
                                            target_names=['HAPPY','SAD']))

    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    pickle.dump(clf, open(name, 'wb'))
    return clf_descr, score, train_time, test_time


results = []
for clf, name in (
        (RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"),
        (Perceptron(max_iter=50), "Perceptron"),
        (PassiveAggressiveClassifier(max_iter=50),
         "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=10), "kNN"),
        (RandomForestClassifier(), "Random forest")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    results.append(benchmark(LinearSVC(penalty=penalty, dual=False,
                                       tol=1e-3)))

    # Train SGD model
    results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=50,
                                           penalty=penalty)))


# Train sparse Naive Bayes classifiers
print('=' * 80)
print("Naive Bayes")
results.append(benchmark(MultinomialNB(alpha=.01)))
results.append(benchmark(BernoulliNB(alpha=.01)))
results.append(benchmark(ComplementNB(alpha=.1)))

print('=' * 80)
print("LinearSVC with L1-based feature selection")
# The smaller C, the stronger the regularization.
# The more regularization, the more sparsity.
results.append(benchmark(Pipeline([
  ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False,
                                                  tol=1e-3))),
  ('classification', LinearSVC(penalty="l2"))])))

Ridge Classifier
________________________________________________________________________________
Training: 
RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None, solver='sag',
                tol=0.01)



"sag" solver requires many iterations to fit an intercept with sparse inputs. Either set the solver to "auto" or "sparse_cg", or set a low "tol" and a high "max_iter" (especially if inputs are not standardized).



train time: 0.651s
test time:  0.001s
accuracy:   0.838
dimensionality: 21662
density: 1.000000
classification report:
              precision    recall  f1-score   support

       HAPPY       0.87      0.82      0.84      3295
         SAD       0.81      0.86      0.84      3037

    accuracy                           0.84      6332
   macro avg       0.84      0.84      0.84      6332
weighted avg       0.84      0.84      0.84      6332

confusion matrix:
[[2687  608]
 [ 417 2620]]

Perceptron
________________________________________________________________________________
Training: 
Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
           fit_intercept=True, max_iter=50, n_iter_no_change=5, n_jobs=None,
           penalty=None, random_state=0, shuffle=True, tol=0.001,
           validation_fraction=0.1, verbose=0, warm_start=False)
train time: 0.223s
test time:  0.001s
accuracy:   0.775
dimensionality: 21662
density: 0.768489
classification report:
  

In [None]:
model = pickle.load(open('kNN', 'rb'))
sample=['شیرینی کیفیت خوبی نداشت']
vt=pickle.load(open("vectorizer.pk", "rb"))
unseen_tfidf = vt.transform(sample)
result= model.predict(unseen_tfidf)
print(result)

['SAD']
