#  <div style="color: black">تمرین دوم: تشخیص هرزنامه ها</div>
## <div style="color: blue">Mohammad Hossein Malekpour | 9613425</div>
_______________________________


## Import Necessary Libraries

In [1]:
import os
import re
import string
import hazm
import numpy as np
import pandas as pd

In [2]:
from scipy import stats, sparse
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report, confusion_matrix, balanced_accuracy_score, f1_score

## Load Data

In [3]:
def read_data(path):
    files = [file for file in os.listdir(path)]
    emails = []
    for file in files:
        with open(path + file, 'r', encoding='utf-8') as txt_file:
            emails.append(txt_file.read())
    return emails
    
ham_test = read_data('./emails/hamtesting/')
ham_train = read_data('./emails/hamtraining/')
spam_test = read_data('./emails/spamtesting/')
spam_train = read_data('./emails/spamtraining/')

In [4]:
ham_test_lable = {'text': ham_test, 'lable': ['ham' for i in range(len(ham_test))]}
ham_train_lable = {'text': ham_train, 'lable': ['ham' for i in range(len(ham_train))]}
spam_test_lable = {'text': spam_test, 'lable': ['spam' for i in range(len(spam_test))]}
spam_train_lable = {'text': spam_train, 'lable': ['spam' for i in range(len(spam_train))]}

train_data = pd.concat([pd.DataFrame(ham_train_lable), pd.DataFrame(spam_train_lable)])
test_data = pd.concat([pd.DataFrame(ham_test_lable), pd.DataFrame(spam_test_lable)])

train_data['spam'] = train_data['lable'].replace(['spam','ham'],[1, 0])
test_data['spam'] = test_data['lable'].replace(['spam','ham'],[1, 0])

train_data.tail(10)

Unnamed: 0,text,lable,spam
290,﻿\n-------------------------------------------...,spam,1
291,﻿\nCompletely free tracking for websites <http...,spam,1
292,﻿\n \n \n\nاتاقک : بهتـرین جامعـه مجـازی فا...,spam,1
293,﻿\n-------------------------------------------...,spam,1
294,﻿\n\nazk2ylc0sxx2mk6qvp5y.jpg\n<http://www.8pi...,spam,1
295,﻿\nبسمه تعالی\n\nسازمان زیباسازی شهرداری استان...,spam,1
296,﻿\n\nبه مناسبت فرا رسیدن میلاد دخت پیامبر گرام...,spam,1
297,﻿\nدرود هموطن من\n\n \n\nتست رایگان \n\n ...,spam,1
298,﻿\n\n *درج **لینک در 8700 وبلاگ\n *\n\n...,spam,1
299,﻿\nسلام به دوستان عزیز\nشما هم میتوانید از این...,spam,1


## Pre-Proceess Data

### 1. Normalize

In [5]:
persian_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
english_punctuations = string.punctuation
punctuations_list = persian_punctuations + english_punctuations

arabic_diacritics = re.compile("""
                             ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)

def remove_diacritics(text):
    text = re.sub(arabic_diacritics, '', text)
    return text

def remove_punctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)

def remove_repeating_char(text):
    return re.sub(r'(.)\1+', r'\1', text)

def normalize_persian(text):
    text = re.sub("[إأآا]", "ا", text)
    text = re.sub("ي", "ی", text)
    text = re.sub("ؤ", "و", text)
    text = re.sub("ئ", "ی", text)
    text = re.sub("ة", "ه", text)
    text = re.sub("ك" ,"ک" , text)
    text = re.sub("[^ابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی]", " ", text)
    text = re.sub("[^\S\n\t]+", ' ', text)
    return text

In [6]:
train_data['text'] = train_data['text'].apply(remove_diacritics)
train_data['text'] = train_data['text'].apply(remove_punctuations)
train_data['text'] = train_data['text'].apply(remove_repeating_char)
train_data['text'] = train_data['text'].apply(normalize_persian)
test_data['text'] = test_data['text'].apply(remove_diacritics)
test_data['text'] = test_data['text'].apply(remove_punctuations)
test_data['text'] = test_data['text'].apply(remove_repeating_char)
test_data['text'] = test_data['text'].apply(normalize_persian)

train_data.tail(10)

Unnamed: 0,text,lable,spam
290,اموزش زبان بهترین اموزش زبان با استفاده از صو...,spam,1
291,در مشاهده تصاویر مشکل دارید لطفا اینجا کلیک ک...,spam,1
292,اتاقک بهترین جامعه مجازی فارسی برای ایرانیان ...,spam,1
293,فروشگاه اینترنتی با بهترین محصولات برای تمامی...,spam,1
294,ایا می خواهید از وقت خود به بهترین نحو استفاد...,spam,1
295,بسمه تعالی سازمان زیباسازی شهرداری استان تهرا...,spam,1
296,به مناسبت فرا رسیدن میلاد دخت پیامبر گرامی اس...,spam,1
297,درود هموطن من تست رایگان تحویل اکانت پرداخت و...,spam,1
298,درج لینک در وبلاگ درج لینک و تبلیغات متنی شما...,spam,1
299,سلام به دوستان عزیز شما هم میتوانید از اینترن...,spam,1


### 2. Word Tokenize

In [7]:
train_data['text'] = train_data['text'].apply(lambda txt: txt.split())
test_data['text'] = test_data['text'].apply(lambda txt: txt.split())

train_data.tail()

Unnamed: 0,text,lable,spam
295,"[بسمه, تعالی, سازمان, زیباسازی, شهرداری, استان...",spam,1
296,"[به, مناسبت, فرا, رسیدن, میلاد, دخت, پیامبر, گ...",spam,1
297,"[درود, هموطن, من, تست, رایگان, تحویل, اکانت, پ...",spam,1
298,"[درج, لینک, در, وبلاگ, درج, لینک, و, تبلیغات, ...",spam,1
299,"[سلام, به, دوستان, عزیز, شما, هم, میتوانید, از...",spam,1


### 3. Remove Stopwords 

In [8]:
stopwords = hazm.stopwords_list()

def remove_stopwords(tokens):
    for token in tokens:
        if token in stopwords:
            tokens.remove(token)
    return tokens
            
train_data['text'] = train_data['text'].apply(remove_stopwords)
test_data['text'] = test_data['text'].apply(remove_stopwords)

train_data.tail()

Unnamed: 0,text,lable,spam
295,"[بسمه, تعالی, سازمان, زیباسازی, شهرداری, استان...",spam,1
296,"[مناسبت, فرا, میلاد, دخت, پیامبر, گرامی, اسلام...",spam,1
297,"[درود, هموطن, تست, رایگان, تحویل, اکانت, پرداخ...",spam,1
298,"[درج, لینک, وبلاگ, درج, لینک, تبلیغات, متنی, ب...",spam,1
299,"[سلام, دوستان, عزیز, هم, میتوانید, اینترنت, ری...",spam,1


### 4. Stemming & lemmatization 

In [9]:
def stemming(tokens):
    stemmer = hazm.Stemmer()
    s_tokens = set()
    for token in tokens:
        s_tokens.add(stemmer.stem(token))
    return list(s_tokens)
    
def lemmatization(tokens):
    lemmatizer = hazm.Lemmatizer()
    l_tokens = set()
    for token in tokens:
        l_tokens.add(lemmatizer.lemmatize(token))
    return list(l_tokens)

train_data['text'] = train_data['text'].apply(stemming)
test_data['text'] = test_data['text'].apply(lemmatization)

train_data.tail()

Unnamed: 0,text,lable,spam
295,"[, تماس, داخل, حقیق, قرینه, مید, جه, ارتباط, ت...",spam,1
296,"[, هست, هدیه, گرد, کلیک, امید, شما, اسلا, انشا...",spam,1
297,"[, فقط, سرع, تس, افتاح, شما, رایگ, اکان, ایجاد...",spam,1
298,"[, درصد, به, اصل, یکساله, تماس, بلاگساز, وبلاگ...",spam,1
299,"[, طرح, به, گوگل, سرمایه, نمیخواهید, کرو, گف, ...",spam,1


## TF-IDF Vectorizer

In [10]:
train_words = set()
for text in train_data['text']:
    for word in text:
        train_words.add(word)
        
test_words = set()
for text in test_data['text']:
    for word in text:
        test_words.add(word)

In [11]:
x_train = train_data.text.to_list()
y_train = train_data.lable.to_list()
x_test = test_data.text.to_list()
y_test = test_data.lable.to_list()

In [12]:
def tfidf_vectorize(words, data):
    word_index = dict()
    for index, word in enumerate(words):
        word_index[word] = index
    DF = {}
    for tokens in data:
        distinct_tokens = set(tokens)
        for token in distinct_tokens:
            if token not in DF:
                DF[token] = 0
            DF[token] += 1
    row = list()
    col = list()
    vector = list()
    for index, text in enumerate(data):
        WF = {}
        for word in text:
            if word in words:
                if word not in WF:
                    WF[word] = 0
                WF[word] += 1
        for word in WF.keys():
            row.append(index)
            col.append(word_index[word])
            tfidf = (int(((np.log10(WF[word]) + 1) * (np.log10(len(data) / DF[word]))) * 100)) / 100
            vector.append(tfidf)    
    result = sparse.csr_matrix((vector, (row, col)), shape=(len(data), len(words))).toarray()
    return result

In [13]:
x_train_vector = tfidf_vectorize(train_words, x_train)
x_test_vector = tfidf_vectorize(train_words, x_test)

In [14]:
x_train_vector

array([[0.09, 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.09, 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       ...,
       [0.09, 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.09, 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.09, 0.  , 0.  , ..., 0.  , 0.  , 0.  ]])

## Feature Extraction with Chi-Squared

In [15]:
transformer = SelectKBest(chi2, k = 500).fit(x_train_vector, y_train)
x_train_vector_best = transformer.transform(x_train_vector)
x_test_vector_best = transformer.transform(x_test_vector)

## KNN Implementation with Cosine Similarities

In [16]:
def cos_sim_knn(k, x_train, y_train, x_test):
    sim = cosine_similarity(x_test, x_train)
    result = list()
    for i in range(len(sim)):
        nearests = np.argsort(sim[i])[::-1][:k]
        nearest_lables = list()
        for n in nearests:
            nearest_lables.append(y_train[n])
        tmp = stats.mode(nearest_lables)[0]
        result.append(tmp)
    return result

## KNN Implementation with TFIDF Score

In [17]:
def tfidf_knn(k, x_train, y_train, x_test):
    train = list()
    for text in x_train:
        tmp = list()
        for tfidf in text:
            tmp.append(tfidf)
        train.append(sum(tmp))
        
    test = list()
    for text in x_test:
        tmp = list()
        for tfidf in text:
            tmp.append(tfidf)
        test.append(sum(tmp))
        
    scores = list()
    for x in test:
        tmp = list()
        for y in train:
            tmp.append(abs(x - y))
        scores.append(tmp)

    result = list()
    for idx in range(len(scores)):
        nearests = np.argsort(scores[idx])[:k]
        nearest_lables = list()
        for n in nearests:
            nearest_lables.append(y_train[n])
        tmp = stats.mode(nearest_lables)[0]
        result.append(tmp)
    return result

## Classifier Train & Comparison & Evaluation

#### KNN with CosineSimilarities (main words):

In [18]:
best_acc = 0.0
for k in range(2, 50):
    y_pred = cos_sim_knn(k, x_train_vector, y_train, x_test_vector)
    acc = balanced_accuracy_score(y_test, y_pred)
    if acc > best_acc:
        best_acc = acc
        best_k = k
print(f'KNN with Cosine Similarities\nbest k:{best_k}\naccuracy:{best_acc}\n')
y_pred = cos_sim_knn(best_k, x_train_vector, y_train, x_test_vector)
print(classification_report(y_test, y_pred))

KNN with Cosine Similarities
best k:3
accuracy:0.955

              precision    recall  f1-score   support

         ham       0.95      0.95      0.95       200
        spam       0.95      0.95      0.95       200

    accuracy                           0.95       400
   macro avg       0.95      0.95      0.95       400
weighted avg       0.95      0.95      0.95       400



#### KNN with TFIDF Score (main words):

In [19]:
best_acc = 0.0
for k in range(2, 50):
    y_pred = tfidf_knn(k, x_train_vector, y_train, x_test_vector)
    acc = balanced_accuracy_score(y_test, y_pred)
    if acc > best_acc:
        best_acc = acc
        best_k = k
print(f'KNN with TFIDF Score\nbest k:{best_k}\naccuracy:{best_acc}\n')
y_pred = tfidf_knn(best_k, x_train_vector, y_train, x_test_vector)
print(classification_report(y_test, y_pred))

KNN with TFIDF Score
best k:5
accuracy:0.6325000000000001

              precision    recall  f1-score   support

         ham       0.60      0.78      0.68       200
        spam       0.69      0.49      0.57       200

    accuracy                           0.63       400
   macro avg       0.64      0.63      0.62       400
weighted avg       0.64      0.63      0.62       400



#### KNN with CosineSimilarities (chi2 - best words):

In [20]:
best_acc = 0.0
for k in range(2, 50):
    y_pred = cos_sim_knn(k, x_train_vector_best, y_train, x_test_vector_best)
    acc = balanced_accuracy_score(y_test, y_pred)
    if acc > best_acc:
        best_acc = acc
        best_k = k
print(f'KNN with Cosine Similarities\nbest k:{best_k}\naccuracy:{best_acc}\n')
y_pred = cos_sim_knn(best_k, x_train_vector_best, y_train, x_test_vector_best)
print(classification_report(y_test, y_pred))

KNN with Cosine Similarities
best k:4
accuracy:0.94

              precision    recall  f1-score   support

         ham       0.92      0.96      0.94       200
        spam       0.96      0.92      0.94       200

    accuracy                           0.94       400
   macro avg       0.94      0.94      0.94       400
weighted avg       0.94      0.94      0.94       400



#### KNN with TFIDF Score (chi2 - best words):

In [21]:
best_acc = 0.0
for k in range(2, 50):
    y_pred = tfidf_knn(k, x_train_vector_best, y_train, x_test_vector_best)
    acc = balanced_accuracy_score(y_test, y_pred)
    if acc > best_acc:
        best_acc = acc
        best_k = k
print(f'KNN with TFIDF Score\nbest k:{best_k}\naccuracy:{best_acc}\n')
y_pred = tfidf_knn(best_k, x_train_vector_best, y_train, x_test_vector_best)
print(classification_report(y_test, y_pred))

KNN with TFIDF Score
best k:43
accuracy:0.7425

              precision    recall  f1-score   support

         ham       0.68      0.93      0.78       200
        spam       0.88      0.56      0.69       200

    accuracy                           0.74       400
   macro avg       0.78      0.74      0.73       400
weighted avg       0.78      0.74      0.73       400

