# Creare un sistema di Spam Detection

L'azienda ProfessionAI vuole realizzare una libreria capace di fare analisi delle email ricevute.

Nello specifico, il CEO ha richiesto di identificare le email di tipo SPAM sulle quali fare analisi contenutistiche.

Il CTO ti fornisce un dataset e ti chiede di:
- Addestrare un classificatore per identificare SPAM
- Individuare i Topic principali tra le email SPAM presenti nel dataset
- Calcolare la distanza semantica tra i topics ottenuti, per dedurne l'eterogeneità.
- Estrarre dalle mail NON SPAM le Organizzazioni presenti.

# 1) Addestrare un classificatore per identificare SPAM

## Caricamento del dataset

In [None]:
import pandas as pd

In [None]:
# Base URL of the folder hosting the dataset; file names are appended to it.
URL = "https://raw.githubusercontent.com/ProfAI/natural-language-processing/main/datasets/Verifica%20Finale%20-%20Spam%20Detection/"

In [None]:
# Load the spam dataset CSV directly from the remote URL.
dataframe = pd.read_csv(URL+"spam_dataset.csv")

In [None]:
dataframe.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\nth...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\n( see a...",0
2,3624,ham,"Subject: neon retreat\nho ho ho , we ' re arou...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\nthis deal is to ...,0


In [None]:
dataframe.shape

(5171, 4)

In [None]:
# Number of missing values in the "label_num" column.
dataframe["label_num"].isnull().sum(axis = 0)

0

In [None]:
# Count the number of distinct values in the "Unnamed: 0" column.
# `.size` only returns the number of rows; `.nunique()` is what actually
# shows whether every value in the column is unique.
dataframe["Unnamed: 0"].nunique()

5171

In [None]:
# Drop the "Unnamed: 0" column: it only holds unique values, which carry no
# useful information for the model.
dataframe = dataframe.drop(columns=["Unnamed: 0"])

In [None]:
dataframe.head()

Unnamed: 0,label,text,label_num
0,ham,Subject: enron methanol ; meter # : 988291\nth...,0
1,ham,"Subject: hpl nom for january 9 , 2001\n( see a...",0
2,ham,"Subject: neon retreat\nho ho ho , we ' re arou...",0
3,spam,"Subject: photoshop , windows , office . cheap ...",1
4,ham,Subject: re : indian springs\nthis deal is to ...,0


In [None]:
# Is the dataset imbalanced?
from collections import Counter

label_counts = Counter(dataframe["label_num"])
print(label_counts)

# Derive each class share from the actual counts instead of hard-coded
# numbers, so the percentages stay correct if the dataset changes.
# most_common() lists the classes from the most to the least frequent.
total_rows = sum(label_counts.values())
for _label, _count in label_counts.most_common():
    print(str(round(_count / total_rows * 100)) + "%")

Counter({0: 3672, 1: 1499})
71%
29%


Il dataset è leggermente sbilanciato (71% ham, 29% spam).

In seguito proverò `class_weight="balanced"` nel modello di Regressione Logistica, prima di ricorrere a un eventuale under/oversampling.

## Data cleaning

In [None]:
import spacy

# string.punctuation provides the punctuation characters to remove
import string

import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

# used to handle digits and repeated whitespace
import re

from sklearn.feature_extraction.text import TfidfVectorizer

# load the English stop-word list
english_stopwords = stopwords.words("english")

# also treat the word "subject" as a stop word
english_stopwords.extend(['subject'])

# load the spaCy English pipeline (used for lemmatization and entity tagging)
nlp = spacy.load("en_core_web_sm")

# set of punctuation characters
punctuation = set(string.punctuation)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# data cleaning function

def data_cleaner(sentence):
    """Normalize a raw text for bag-of-words vectorization.

    Steps: lowercase, replace punctuation with spaces, lemmatize with the
    module-level spaCy pipeline ``nlp``, drop the words listed in
    ``english_stopwords``, remove digit characters, and collapse the
    whitespace left behind.

    Args:
        sentence: Raw text of one document.

    Returns:
        The cleaned text as a single string of space-separated words, with
        no leading, trailing or repeated spaces.
    """
    # One pass over the text instead of one ``str.replace`` per punctuation
    # character.
    punct_to_space = str.maketrans(
        string.punctuation, " " * len(string.punctuation)
    )
    sentence = sentence.lower().translate(punct_to_space)

    document = nlp(sentence)
    lemmas = ' '.join(token.lemma_ for token in document).split()

    # Set membership is O(1); the module-level stop-word list would be
    # scanned once per word.
    stopword_set = set(english_stopwords)
    kept = (word for word in lemmas if word not in stopword_set)

    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    # Words made only of digits become empty and are dropped, so no double
    # spaces remain in the output.
    without_digits = (re.sub(r'\d', '', word) for word in kept)

    return ' '.join(word for word in without_digits if word)

In [None]:
# Model input: the email texts of the "text" column.
X = dataframe["text"]
X.shape

(5171,)

In [None]:
# Target: the string class labels of the "label" column.
y = dataframe["label"]
y.shape

(5171,)

In [None]:
# Clean every email text with data_cleaner, preserving the original order.
X_cleaned = [data_cleaner(raw_text) for raw_text in X]

In [None]:
X_cleaned[0]

## Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the emails for testing.
# `stratify=y` keeps the ham/spam ratio the same in both splits (the classes
# are imbalanced), and a fixed `random_state` makes the split, and therefore
# the reported metrics, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_cleaned, y, test_size=.3, stratify=y, random_state=42
)

## Tf-idf Vectorizer

In [None]:
# funzione

def bow_tfidf(dataset, tfidf_vectorizer=None):
    """Turn text documents into a dense TF-IDF feature matrix.

    Args:
        dataset: Iterable of text documents.
        tfidf_vectorizer: A vectorizer to reuse (only ``transform`` is
            called on it), or ``None`` to create a new ``TfidfVectorizer``
            and fit it on ``dataset``.

    Returns:
        A ``(features, vectorizer)`` tuple: the result of ``.toarray()`` on
        the transformed data, and the vectorizer that produced it.
    """
    # Identity check: `== None` would invoke a custom `__eq__` if the
    # vectorizer object defined one.
    if tfidf_vectorizer is None:
        tfidf_vectorizer = TfidfVectorizer()
        features = tfidf_vectorizer.fit_transform(dataset)
    else:
        features = tfidf_vectorizer.transform(dataset)

    return features.toarray(), tfidf_vectorizer

In [None]:
# Passing None makes bow_tfidf create a new vectorizer and fit it on the
# training texts.
X_train, vectorizer = bow_tfidf(X_train, None)

In [None]:
# Transform the test texts with the vectorizer fitted on the training split.
X_test, vectorizer = bow_tfidf(X_test, vectorizer)

In [None]:
# Print the shapes of the feature matrices and of the label vectors.
for _split in (X_train, X_test, y_train, y_test):
    print(_split.shape)

## Modelli

In [None]:
from sklearn.metrics import classification_report

### Regressione Logistica

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline model: logistic regression with scikit-learn's default settings.
lr = LogisticRegression()

# Train on the TF-IDF features of the training split.
lr.fit(X_train, y_train)

In [None]:
# Predict on both splits and print one classification report for the training
# data, then one for the test data.
y_pred_train = lr.predict(X_train)
y_pred_test = lr.predict(X_test)
print(classification_report(y_train, y_pred_train))
print(classification_report(y_test, y_pred_test))

### Regressione Logistica con class_weight="balanced"

In [None]:
# Same model, but with class weights set to "balanced" to account for the
# ham/spam imbalance.
lr_bal = LogisticRegression(class_weight= "balanced")

lr_bal.fit(X_train, y_train)

In [None]:
# Classification reports of the class-weighted model: training split first,
# then test split.
y_pred_train = lr_bal.predict(X_train)
y_pred_test = lr_bal.predict(X_test)
print(classification_report(y_train, y_pred_train))
print(classification_report(y_test, y_pred_test))

### MLP Classifier

In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with the logistic (sigmoid) activation.
# NOTE(review): the original note called logistic the most widely used
# activation; scikit-learn's own default for MLPClassifier is "relu", so it is
# worth comparing the two.
clf = MLPClassifier(activation= "logistic",
                    solver = "adam",
                    max_iter = 100, # maximum number of training iterations
                    hidden_layer_sizes = (100,), # a single hidden layer
                    tol = 0.005,
                    verbose = True
                   )

clf.fit(X_train,y_train)

In [None]:
# Classification reports of the MLP: training split first, then test split.
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
print(classification_report(y_train, y_pred_train))
print(classification_report(y_test, y_pred_test))

# 2) Individuare i Topic principali tra le email SPAM presenti nel dataset

In [None]:
# Reload the dataset from the CSV, so `dataframe` again holds every original
# column.
dataframe = pd.read_csv(URL+"spam_dataset.csv")

In [None]:
# Keep only the rows labelled as spam.
dataframe_spam = dataframe.loc[dataframe["label"] == "spam"]

In [None]:
dataframe_spam.head()

In [None]:
import gensim

In [None]:
from gensim.utils import simple_preprocess

In [None]:
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

In [None]:
# English stop words from NLTK, used by remove_stopwords.
stop_words = stopwords.words("english")

In [None]:
# Texts of the spam emails: the documents used for topic modelling.
documents = dataframe_spam["text"]

In [None]:
documents

In [None]:
def sent_to_words(items):
    """Lazily tokenize each text in ``items``.

    This is a generator: it yields one token list per input text, produced
    by gensim's ``simple_preprocess`` with ``deacc=True`` (which, per the
    gensim documentation, strips accent marks from the tokens). Wrap the
    call in ``list(...)`` to materialize all results.
    """
    tokenized = (simple_preprocess(text, deacc=True) for text in items)
    yield from tokenized

def remove_stopwords(texts):
    """Filter uninformative tokens out of tokenized documents.

    A word is kept only if it has at least 5 characters, is not in the
    module-level ``stop_words`` collection and is not the word "subject".

    Args:
        texts: Iterable of token lists, one per document.

    Returns:
        A list with one filtered token list per input document.
    """
    # Build the exclusion set once: set membership is O(1), whereas the
    # stop-word list was scanned again for every single token.
    excluded = set(stop_words)
    excluded.add("subject")
    return [
        [word for word in words if len(word) >= 5 and word not in excluded]
        for words in texts
    ]

In [None]:
# Tokenize every document; list() materializes the generator's results.
data_words = list(sent_to_words(documents))

In [None]:
data_words

In [None]:
# Drop stop words, words shorter than 5 characters and the word "subject".
data_words = remove_stopwords(data_words)

In [None]:
data_words

In [None]:
import gensim.corpora as corpora

In [None]:
# build the dictionary from the token lists produced by the previous step
id2word = corpora.Dictionary(data_words)

In [None]:
# vectorize each document with the dictionary's doc2bow method
corpus = [id2word.doc2bow(text) for text in data_words]

In [None]:
corpus

## LDA con Gensim

In [None]:
# per print più formattati importo:
from pprint import pprint

In [None]:
# Number of topics the LDA model is asked to extract.
num_topics = 3

In [None]:
# Train the LDA topic model on the bag-of-words corpus, making 3 passes over
# the corpus.
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       passes=3
                                      )

In [None]:
pprint(lda_model.print_topics())

In [None]:
# Apply the trained model to the corpus.
# NOTE(review): doc_lda is not used anywhere else in the visible code.
doc_lda = lda_model[corpus]

# 3) Calcolare la distanza semantica tra i topic ottenuti, per dedurne l'eterogeneità

In [None]:
from gensim.models import Word2Vec

In [None]:
import gensim.downloader
# download the pre-trained "glove-wiki-gigaword-300" embeddings through
# gensim's downloader (these are GloVe vectors, not a word2vec model)
glove_vectors = gensim.downloader.load("glove-wiki-gigaword-300")

In [None]:
# importo spatial per il calcolo della similarità semantica:
from scipy import spatial

In [None]:
import numpy as np

In [None]:
# funzione di average

def avg_vector(sentence):
    """Average the embedding vectors of the words in ``sentence``.

    Words missing from the module-level ``glove_vectors`` vocabulary are
    ignored and do not count towards the average.

    Args:
        sentence: Iterable of word strings.

    Returns:
        A 1-D numpy array with the mean vector of the known words, or an
        all-zero vector when ``sentence`` is empty or holds no known word.
    """
    # Take the dimensionality from the loaded embeddings instead of
    # hard-coding 300, so the function also works with other vector sizes.
    dim = glove_vectors.vector_size
    total = np.zeros(dim)
    found = 0
    for word in sentence:
        # Membership test on the mapping itself; `.keys()` is redundant.
        if word in glove_vectors.key_to_index:
            total += glove_vectors.get_vector(word)
            found += 1

    if found == 0:
        return np.zeros(dim)

    return total / found

In [None]:
# Save the LDA topics to topic_file.txt, one topic per line.
# The file is opened once, in write mode: appending ("a+") piled up the topics
# of every previous run, so the positional reads of this file (first, second
# and last line) could pick up stale entries.
with open("topic_file.txt", "w", encoding="utf8") as topic_file:
    for topic in lda_model.print_topics():
        topic_file.write(str(topic) + "\n")

In [None]:
# Read the saved topics back: one string per line of the file.
# Read-only mode is enough here; "r+" also requested write access, which
# fails needlessly on a read-only file.
with open("topic_file.txt", "r", encoding="utf8") as topic_file:
    topic_rec = topic_file.readlines()

emp_list = topic_rec

# Bind each topic's raw line (its word cloud in string form) to a name
t_1 = emp_list[0]
t_2 = emp_list[1]
t_3 = emp_list[-1]

In [None]:
# Topic cleaning: tokenize the raw topic lines, then filter the tokens with
# remove_stopwords.
topic_words_cleaned = remove_stopwords(list(sent_to_words(emp_list)))

print(topic_words_cleaned)
# Print the first three topics one by one.
for _topic_idx in range(3):
    print(topic_words_cleaned[_topic_idx])

In [None]:
# One averaged embedding vector per cleaned topic.
topic_list_vect = [avg_vector(topic_words) for topic_words in topic_words_cleaned]

topic_list_vect

In [None]:
# Pairwise cosine similarity between the topic vectors
# (similarity = 1 - cosine distance).
_vec_1 = topic_list_vect[0]
_vec_2 = topic_list_vect[1]
_vec_3 = topic_list_vect[2]

topic_1_2 = 1 - spatial.distance.cosine(_vec_1, _vec_2)
topic_1_3 = 1 - spatial.distance.cosine(_vec_1, _vec_3)
topic_2_3 = 1 - spatial.distance.cosine(_vec_2, _vec_3)

print(f"Cosine similarity topic_1_2 : {topic_1_2}")
print(f"Cosine similarity topic_1_3 : {topic_1_3}")
print(f"Cosine similarity topic_2_3 : {topic_2_3}")

# 4) Estrarre dalle mail NON SPAM le Organizzazioni presenti

In [None]:
l = ["ham",]

# Select the non-spam rows and drop the columns that are not needed.
# drop() returns a new DataFrame here; the original `inplace=True` on a
# filtered slice of `dataframe` triggers pandas' SettingWithCopyWarning.
dataset_ham = dataframe[dataframe["label"].isin(l)].drop(
    ["Unnamed: 0", "label_num"], axis=1
)

dataset_ham.head()

In [None]:
# Texts of the non-spam emails.
X_ham = dataset_ham["text"]
X_ham.head()

In [None]:
# Clean every ham email text with data_cleaner, preserving the order.
X_ham_cleaned = [data_cleaner(raw_text) for raw_text in X_ham]

In [None]:
X_ham_cleaned

In [None]:
def org_entitie(sentence):
    """Extract the organizations mentioned in ``sentence``.

    The text is processed with the module-level spaCy pipeline ``nlp`` and
    every named entity labelled "ORG" is collected.

    Args:
        sentence: Text to analyse.

    Returns:
        A dict ``{"ORG": [...]}`` with one string per organization found,
        in order of appearance; the list is empty when none is found.
    """
    doc = nlp(sentence)

    # Iterate over entity spans rather than single tokens, so a multi-word
    # organization is returned as one string instead of one entry per word.
    return {"ORG": [ent.text for ent in doc.ents if ent.label_ == "ORG"]}

In [None]:
# Extract the organizations from every cleaned ham email, then print each
# non-empty result and save it to org_file.txt (one dict per line).
# The file is opened once for the whole loop instead of once per email.
# Append mode is kept, so lines written by earlier runs are preserved.
with open("org_file.txt", "a+", encoding="utf8") as org:
    for sentence in X_ham_cleaned:
        org_dict = org_entitie(sentence)

        # Skip emails in which no organization was found.
        if org_dict != {"ORG": []}:
            org_dict = str(org_dict)
            org.write(org_dict + "\n")
            print(org_dict)