## [phase 1] Search for a good model
In this phase we use only the train.csv file to search for a good model.

In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import f1_score

### 1-1 Prepare data

#### read data

In [2]:
# Load the labelled training comments. The bare name on the last line makes
# the notebook render the (truncated) frame as the cell output.
train_csv_path = "sentiment_data/train.csv"
data = pd.read_csv(train_csv_path)
data

Unnamed: 0,comment,sentiment
0,کس میدونه چه جوری از این ها میشه شکایت کرد لطف...,Negative
1,اف بر شهرداری که درخت را وسط میدان انداخته. طر...,Negative
2,خیلی جای بکری هس حتما یه سر برید👌,Positive
3,آب بسیار کثیف است، متراژ هم کم,Negative
4,افتضاح چون یه شماره تماس نداره خیرسرش,Negative
...,...,...
2538,اصلا کیفیت نداره از سر مجبوری اومدیم اتاق کثیف...,Negative
2539,بسیار عالی بدون سردرد,Positive
2540,برای زیارت و استراحتی کوتاه خوبه در ضمن کتاب ف...,Positive
2541,جای خوبی نیست .یه دونه کافی شاپ امیر شکلات بود...,Negative


#### encode sentiment categories

In [3]:
# First column -> raw comment texts, second column -> sentiment names.
X = data.iloc[:, 0].to_numpy(copy=True)
y = data.iloc[:, 1].to_numpy(copy=True)

print(y)

# Map every sentiment name to an integer id; LE is kept so the ids can be
# mapped back to names later via LE.inverse_transform.
LE = LabelEncoder()
LE.fit(y)
y = LE.transform(y)

print(y)

['Negative' 'Negative' 'Positive' ... 'Positive' 'Negative' 'Positive']
[0 0 2 ... 2 0 2]


#### split data into train and test set

In [4]:
# Hold out 33% of the samples for evaluation; the fixed random_state keeps
# the partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# Report the size of every partition (labels are left-aligned to 7 chars).
for label, part in (
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
):
    print(f'{label:<7} : {part.shape}')

X_train : (1703,)
X_test  : (840,)
y_train : (1703,)
y_test  : (840,)


### 1-2 Text cleaning

#### remove emojis
Emojis usually reveal the sentiment of the writer, so should we clean them or not?
Our purpose is to extract the sentiment of a text, but relying on emojis can be a shortcut (a cheat) for our model. I decided to remove all emojis because the model must rely only on the textual content of a sample, not on a few personal emojis. However, this decision (whether to clean emojis or not) should ultimately be made by the senior data scientist of the team.

In [5]:
# Matches runs of emoji and emoji-related code points so they can be removed
# with emoji_pattern.sub('', text).
#
# The previous character class contained the range U+24C2..U+1F251, which
# spans almost the whole Basic Multilingual Plane above U+24C2 and therefore
# also deleted real letters (Arabic presentation forms U+FB50..U+FEFF, CJK,
# Hangul, ...). The class below is a strict subset of the old one: it keeps
# every symbol/emoji block the old class covered and no longer touches
# letters in U+2C00..U+FFFF.
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"     # emoticons
    "\U0001F300-\U0001F5FF"     # symbols & pictographs
    "\U0001F680-\U0001F6FF"     # transport & map symbols
    "\U0001F1E0-\U0001F1FF"     # regional indicators (flags)
    "\U00010000-\U0010FFFF"     # every other supplementary-plane code point
    "\u231a\u23cf\u23e9"        # watch, eject, fast-forward
    "\u24C2-\u2BFF"             # enclosed alphanumerics, shapes, misc symbols,
                                # dingbats, arrows (symbol blocks only)
    "\u3030\u303d\u3297\u3299"  # CJK symbols that have emoji presentation
    "\u200d"                    # zero-width joiner used in emoji sequences
    "\uFE00-\uFE0F"             # variation selectors (incl. emoji selector U+FE0F)
    "]+",
    flags=re.UNICODE,
)

# Delete every emoji match from each comment; np.vectorize applies the
# substitution element-wise over the arrays of strings.
vfunc = np.vectorize(lambda text: emoji_pattern.sub(r'', text))
X_train, X_test = vfunc(X_train), vfunc(X_test)

#### remove unwanted words, punctuations and characters

In [6]:
# Each resource file holds one entry per line. Blank lines (including the
# empty entry produced by a trailing newline) are dropped: an empty string in
# stop_puncs would make str.replace('', ' ') insert a space between every
# character of the text.

# Stop words are compared against whitespace-trimmed tokens, so trim them too.
with open('nlp_files/stop_words-fa.txt', mode='r', encoding='utf-8') as stop_words_file:
    stop_words = [word.strip() for word in stop_words_file.read().split('\n') if word.strip()]

# Punctuation marks and characters are kept verbatim (no trimming), because an
# entry may itself be a whitespace-like character.
with open('nlp_files/stop_puncs-fa.txt', mode='r', encoding='utf-8') as stop_puncs_file:
    stop_puncs = [punc for punc in stop_puncs_file.read().split('\n') if punc]

with open('nlp_files/stop_chars-fa.txt', mode='r', encoding='utf-8') as stop_chars_file:
    stop_chars = [char for char in stop_chars_file.read().split('\n') if char]

def remove_stops(string):
    """Remove stop characters, stop punctuation and stop words from a text.

    Reads the module-level lists ``stop_chars``, ``stop_puncs`` and
    ``stop_words``.

    Args:
        string: the text to clean.

    Returns:
        The remaining words joined by single spaces ('' if nothing is left).
    """
    for char in stop_chars:
        if char:  # skip empty entries (e.g. from a trailing newline in the file)
            string = string.replace(char, '')  # remove stop-chars

    for punc in stop_puncs:
        # An empty entry must be skipped: str.replace('', ' ') would insert a
        # space between every character of the text.
        if punc:
            string = string.replace(punc, ' ')  # replace stop-punctuations with space

    # Split on any whitespace run (spaces, tabs, newlines) so that stop words
    # adjacent to a line break are recognised too; this also drops empty tokens.
    blocked = set(stop_words)  # set membership instead of a list scan per word
    words = [word for word in string.split() if word not in blocked]

    return ' '.join(words)

# Clean every comment in both partitions with remove_stops (element-wise).
vfunc = np.vectorize(remove_stops)
X_train, X_test = vfunc(X_train), vfunc(X_test)

### 1-3 Vectorization
There are several methods for extracting features from text, such as **BAG-OF-WORDS**, **TF-IDF**, **GloVe** and **Word2Vec**.
**BAG-OF-WORDS** and **TF-IDF** are based on counting the occurrences of words. In contrast, **GloVe** and **Word2Vec** learn dense word embeddings (Word2Vec does so by training a shallow neural network). I'm going to apply **TF-IDF** and **Word2Vec** in this phase (finding a good model).

#### TF-IDF vectorization

In [7]:
# Fit the TF-IDF vectorizer on the training comments only; the test comments
# are transformed later with this already-fitted vectorizer.
vectorizer_TFIDF = TfidfVectorizer()
vectorizer_TFIDF.fit(X_train)

In [8]:
# Convert both partitions to dense TF-IDF matrices with the fitted vectorizer.
X_train_tfidf, X_test_tfidf = (
    vectorizer_TFIDF.transform(docs).toarray() for docs in (X_train, X_test)
)

# Report the matrix shapes (labels are left-aligned to 13 chars).
for label, matrix in (
    ('X_train_tfidf', X_train_tfidf),
    ('X_test_tfidf', X_test_tfidf),
):
    print(f'{label:<13} : {matrix.shape}')

X_train_tfidf : (1703, 5744)
X_test_tfidf  : (840, 5744)


#### Word2Vec vectorization

In [9]:
vector_size = 1000

# Word2Vec expects every document as a list of tokens.
train_sentences = [sentence.split() for sentence in X_train]

# The model is trained on the training comments only.
# workers=1: gensim documents that `seed` alone does not make a run
# reproducible; with several worker threads the OS scheduling order changes
# the result. (Across interpreter launches PYTHONHASHSEED must be fixed too.)
vectorizer_W2V = Word2Vec(train_sentences,
                          vector_size=vector_size,
                          window=5,
                          min_count=3,  # words seen fewer than 3 times are ignored
                          workers=1,
                          seed=42)
vectorizer_W2V

<gensim.models.word2vec.Word2Vec at 0x2394db97220>

In [10]:
def vectorize(sentence):
    """Embed a sentence as the mean of the Word2Vec vectors of its known words.

    Uses the module-level ``vectorizer_W2V`` model and ``vector_size``.

    Args:
        sentence: whitespace-separated text.

    Returns:
        A 1-D array: the element-wise mean of the vectors of all words found
        in the model, or ``np.zeros(vector_size)`` when no word is known.
    """
    keyed_vectors = vectorizer_W2V.wv
    known = [keyed_vectors[token] for token in sentence.split() if token in keyed_vectors]
    if not known:
        # No word of the sentence is in the vocabulary: fall back to zeros.
        return np.zeros(vector_size)
    return np.array(known).mean(axis=0)

# One averaged Word2Vec vector per comment, stacked into a 2-D matrix.
X_train_w2v = np.array(list(map(vectorize, X_train)))
X_test_w2v = np.array(list(map(vectorize, X_test)))

# Report the matrix shapes (labels are left-aligned to 11 chars).
for label, matrix in (('X_train_w2v', X_train_w2v), ('X_test_w2v', X_test_w2v)):
    print(f'{label:<11} : {matrix.shape}')

X_train_w2v : (1703, 1000)
X_test_w2v  : (840, 1000)


### 1-4 Classifiers
There are many different classifiers in machine learning, built on a variety of techniques. Here we employ only **KNN**, **SVM** and **gradient boosting** (scikit-learn's `GradientBoostingClassifier`, abbreviated as XGB below), which come from three different families of classifiers. Each classifier has several parameters, so we create each classifier object multiple times with different parameters in order to find the best setting for each.

#### KNN classifiers

In [11]:
# Three KNN variants that differ only in the number of neighbours.
clf_knn_1, clf_knn_2, clf_knn_3 = (
    KNeighborsClassifier(n_neighbors=k) for k in (4, 8, 16)
)

#### SVM classifiers

In [12]:
# Three SVM variants that differ only in the kernel.
clf_svm_1, clf_svm_2, clf_svm_3 = (
    SVC(kernel=kernel) for kernel in ('linear', 'poly', 'rbf')
)

#### Gradient boosting (XGB) classifiers

In [13]:
# Three gradient-boosting variants that differ only in the number of
# boosting stages; learning rate and random seed are shared.
clf_xgb_1, clf_xgb_2, clf_xgb_3 = (
    GradientBoostingClassifier(n_estimators=n, learning_rate=1.0, random_state=42)
    for n in (50, 100, 150)
)

### 1-5 Classification Test
Now we have two vectorized versions of the samples (TF-IDF and Word2Vec) and 9 classifiers. The last step is to train the classifiers and evaluate them on the test set to find the best classifier among them. The micro-averaged F1-score is used as the evaluation metric.

#### fit and predict on each classifier

In [15]:
# Display names, in the same order as the classifier objects below.
classifier_names = [
    'KNN(n=4)', 'KNN(n=8)', 'KNN(n=16)',
    'SVM(linear)', 'SVM(poly)', 'SVM(rbf)',
    'XGB(n=50)', 'XGB(n=100)', 'XGB(n=150)',
]

classifiers = [
    clf_knn_1, clf_knn_2, clf_knn_3,
    clf_svm_1, clf_svm_2, clf_svm_3,
    clf_xgb_1, clf_xgb_2, clf_xgb_3,
]

# One row per classifier: [name, F1 on TF-IDF features, F1 on Word2Vec features].
# NOTE(review): for single-label multiclass targets, micro-averaged F1
# coincides with accuracy; confirm this is the intended metric.
res = []
for name, clf in zip(classifier_names, classifiers):
    print(f'Classification using {clf} ', end='... ')

    # Train and evaluate on the TF-IDF vectors.
    y_pred = clf.fit(X_train_tfidf, y_train).predict(X_test_tfidf)
    score_tfidf = f1_score(y_test, y_pred, average='micro')

    # Re-train the same estimator object and evaluate on the Word2Vec vectors.
    y_pred = clf.fit(X_train_w2v, y_train).predict(X_test_w2v)
    score_w2v = f1_score(y_test, y_pred, average='micro')

    print('[DONE]')
    res.append([name, score_tfidf, score_w2v])

Classification using KNeighborsClassifier(n_neighbors=4) ... DONE
Classification using KNeighborsClassifier(n_neighbors=8) ... DONE
Classification using KNeighborsClassifier(n_neighbors=16) ... DONE
Classification using SVC(kernel='linear') ... DONE
Classification using SVC(kernel='poly') ... DONE
Classification using SVC() ... DONE
Classification using GradientBoostingClassifier(learning_rate=1.0, n_estimators=50, random_state=42) ... DONE
Classification using GradientBoostingClassifier(learning_rate=1.0, random_state=42) ... DONE
Classification using GradientBoostingClassifier(learning_rate=1.0, n_estimators=150, random_state=42) ... DONE


#### display the result

In [21]:
from IPython.display import HTML, display

# Render the scores as an HTML table: a header row followed by one row per
# classifier, with both scores formatted to 4 decimals.
header_row = "<tr> <td><h3>Classifiers</h3></td> <td><h3>TF-IDF</h3></td> <td><h3>Word2Vec</h3></td> </tr>"
body_rows = [
    f"<tr> <td><h4>{name}</h4></td> <td>{tfidf:.4f}</td> <td>{w2v:.4f}</td> </tr>"
    for name, tfidf, w2v in res
]
html = "<table>" + header_row + "".join(body_rows) + "</table>"

display(HTML(html))

0,1,2
Classifiers,TF-IDF,Word2Vec
KNN(n=4),0.4619,0.6131
KNN(n=8),0.6940,0.6298
KNN(n=16),0.7524,0.6238
SVM(linear),0.7905,0.4774
SVM(poly),0.6417,0.6452
SVM(rbf),0.7905,0.6810
XGB(n=50),0.7214,0.6571
XGB(n=100),0.7298,0.6786
XGB(n=150),0.7381,0.6714
