# Lectura de data

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the training workbook into `df` and preview its first rows.
data = '/kaggle/input/finals/data categorizada.xlsx'
df = pd.read_excel(data)
print(df.head())

In [None]:
# Drop rows that contain NaN values
df = df.dropna()
filas = df.shape[0]  # number of rows remaining after the drop
# Row count per 'APT Verdict' value; shown as the cell output
conteo = df.groupby('APT Verdict').size()
conteo

# Limpieza de texto

In [None]:
from bs4 import BeautifulSoup
import pandas as pd
import re


def limpiar(texto):
    """Normalize a subject line to lowercase ASCII words separated by single spaces.

    Steps, in order:
      1. Parse the value as HTML and keep only its text content.
      2. Remove every character that is not an ASCII letter or whitespace.
      3. Lowercase and collapse runs of whitespace.

    The HTML step must run first: the character filter deletes '<', '>' and
    '/', after which the parser can no longer recognise tags and their names
    would leak into the text (e.g. '<b>Hola</b>' -> 'bholab').

    Parameters
    ----------
    texto : any
        Value to clean; it is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned text (possibly empty).
    """
    # separator=' ' keeps words in adjacent tags apart ('<p>a</p><p>b</p>' -> 'a b').
    # BeautifulSoup may warn when the raw input looks like a file name or URL;
    # that warning is informational only.
    texto_sinhtml = BeautifulSoup(str(texto), 'html.parser').get_text(separator=' ')
    # NOTE(review): this pattern also removes accented letters and 'ñ'
    # ('información' -> 'informacin'); confirm that is acceptable for Spanish text.
    texto_sincaracter = re.sub(r'[^a-zA-Z\s]', '', texto_sinhtml)
    texto_min = texto_sincaracter.lower()
    return ' '.join(texto_min.split())

# Store the cleaned subject in a new column next to the original one.
df['Subject_limpio'] = df['Subject'].apply(limpiar)

print(df['Subject_limpio'])

In [None]:
# Replace the raw 'Subject' column by its cleaned version and fix the column order.
# NOTE(review): re-running this cell fails, because 'Subject' has already been dropped.
df = df.drop('Subject', axis=1)
ordenar = ['Sender IP', 'From (SMTP)', 'From (Header)', 'Subject_limpio', 'Has Attachment', 'Size', 'APT Verdict']
df = df[ordenar]
print(df.head())

# Tokenización

In [None]:
from nltk.tokenize import word_tokenize
import spacy

# Tokenize the 'Subject_limpio' column with NLTK's word_tokenize
df['Tokens'] = df['Subject_limpio'].apply(word_tokenize)
print(df.head())

In [None]:
# Keep only the token column in place of the cleaned text and fix the column order.
df = df.drop('Subject_limpio', axis=1)
ordenar = ['Sender IP', 'From (SMTP)', 'From (Header)', 'Tokens', 'Has Attachment', 'Size', 'APT Verdict']
df = df[ordenar]
print(df.head())

# Eliminar stop words

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords

# Download the NLTK stop-word corpus and build the Spanish stop-word set
# that eliminar_stopwords reads as a notebook global.
nltk.download('stopwords')
stop_words = set(stopwords.words('spanish'))

def eliminar_stopwords(tokens):
    """Return the tokens whose lowercase form is not in the global ``stop_words``.

    Order and original casing of the surviving tokens are preserved.
    """
    conservados = []
    for palabra in tokens:
        if palabra.lower() in stop_words:
            continue
        conservados.append(palabra)
    return conservados

# Remove stop words from every token list and preview the result.
df['Tokens_limpios'] = df['Tokens'].apply(eliminar_stopwords)
print(df.head())

In [None]:
# Keep only the stop-word-free tokens and fix the column order.
df = df.drop('Tokens', axis=1)
ordenar = ['Sender IP', 'From (SMTP)', 'From (Header)', 'Tokens_limpios', 'Has Attachment', 'Size', 'APT Verdict']
df = df[ordenar]
print(df.head())

# Lematización

In [None]:
# Shell commands: install spaCy and download the small Spanish pipeline
# that is loaded below with spacy.load("es_core_news_sm").
!pip install spacy
!python -m spacy download es_core_news_sm

In [None]:
import spacy

# Load spaCy's Spanish model; `nlp` is read as a notebook global by lematizar
nlp = spacy.load("es_core_news_sm")

def lematizar(tokens):
    """Lemmatize each token on its own with the global spaCy pipeline ``nlp``.

    Every token is passed to ``nlp`` separately and the lemma of the first
    spaCy token of the resulting doc is kept, so the returned list has
    exactly one lemma per input token, in the same order.
    """
    return [nlp(palabra)[0].lemma_ for palabra in tokens]

# Lemmatize every token list and preview the result.
df['Tokens_lematizados'] = df['Tokens_limpios'].apply(lematizar)
print(df.head())

In [None]:
# Keep only the lemmatized tokens and fix the column order.
df = df.drop('Tokens_limpios', axis=1)
ordenar = ['Sender IP', 'From (SMTP)', 'From (Header)', 'Tokens_lematizados', 'Has Attachment', 'Size', 'APT Verdict']
df = df[ordenar]
print(df.head())

In [None]:
# Drop the rows whose 'Tokens_lematizados' list is empty (== [])
df = df[df['Tokens_lematizados'].apply(lambda x: len(x) > 0)]
# Number of rows left; shown as the cell output
df.shape[0]

# Word2Vec

## SKIP-GRAM

In [None]:
# Work on a copy so the skip-gram experiment does not modify `df`.
df_w2v_skip = df.copy()
print(df_w2v_skip.head())

In [None]:
import pandas as pd
from gensim.models import Word2Vec

# Create the skip-gram Word2Vec model WITHOUT a corpus. Passing `sentences` to the
# constructor already trains the model (with the default number of epochs), so the
# explicit train() call below would train it a second time on top of that.
oraciones = df_w2v_skip['Tokens_lematizados'].tolist()
model = Word2Vec(vector_size=50, sg=1, window=5, min_count=1, workers=4)

# Build the vocabulary from the lemmatized token lists
model.build_vocab(oraciones)

# Train the Word2Vec model exactly once, for 10 epochs
model.train(oraciones, total_examples=model.corpus_count, epochs=10)

# Word vectors for the in-vocabulary tokens of a token list
def get_vector(tokens):
    """Return one Word2Vec row per distinct in-vocabulary token.

    Tokens are de-duplicated and kept in order of first appearance, so the row
    order is deterministic (indexing with a ``set`` gives an arbitrary order).

    Parameters
    ----------
    tokens : iterable of str

    Returns
    -------
    2-D array-like with one row per distinct known token; an empty
    ``(0, vector_size)`` array when no token is in the vocabulary.
    """
    presentes = [token for token in dict.fromkeys(tokens) if token in model.wv]
    if not presentes:
        # Indexing the keyed vectors with an empty collection raises, so return
        # an explicit empty matrix instead.
        return np.empty((0, model.wv.vector_size), dtype=np.float32)
    return model.wv[presentes]

# Average word vector for a list of tokens
def get_average_vector(tokens):
    """Return the mean Word2Vec vector of the tokens found in ``model.wv``.

    Tokens that are not in the vocabulary are ignored. When none of the tokens
    is known (or ``tokens`` is empty) a zero vector is returned.

    Parameters
    ----------
    tokens : iterable of str

    Returns
    -------
    numpy.ndarray of shape ``(model.wv.vector_size,)``
    """
    # Take the dimension from the trained model rather than hard-coding 50, so
    # this keeps working if the Word2Vec `vector_size` is changed.
    vector_sum = np.zeros(model.wv.vector_size)
    num_vectors = 0
    for token in tokens:
        if token in model.wv:
            vector_sum += model.wv[token]
            num_vectors += 1
    if num_vectors == 0:
        return vector_sum
    return vector_sum / num_vectors

# Apply the function to every row of the DataFrame
df_w2v_skip['w2v_skip'] = df_w2v_skip['Tokens_lematizados'].apply(get_average_vector)

# Print the resulting DataFrame
print(df_w2v_skip)

In [None]:
# Keep only the averaged embedding in place of the tokens and fix the column order.
df_w2v_skip = df_w2v_skip.drop('Tokens_lematizados', axis=1)
ordenar = ['Sender IP', 'From (SMTP)', 'From (Header)', 'w2v_skip', 'Has Attachment', 'Size', 'APT Verdict']
df_w2v_skip = df_w2v_skip[ordenar]
print(df_w2v_skip.head())

In [None]:
import pandas as pd
import numpy as np

# Convert a NumPy vector into a plain Python list
def numpy_vector_to_list(vector):
    """Return the result of ``vector.tolist()``."""
    como_lista = vector.tolist()
    return como_lista

# Apply the function to the column, replacing each array by a list
# NOTE(review): re-running this cell fails, because the column then already holds lists.
df_w2v_skip['w2v_skip'] = df_w2v_skip['w2v_skip'].apply(numpy_vector_to_list)

# Show the resulting DataFrame
print(df_w2v_skip)

In [None]:
# Append the numeric parts of three dot-separated strings to a vector
def agregar_ip_a_vector(ip1, ip2, ip3, vector):
    """Return ``vector`` extended with the dot-separated parts of the three strings.

    Each of ``ip1``, ``ip2`` and ``ip3`` is split on '.', every part is
    converted with ``float()``, and the parts are appended in that order.
    ``vector`` itself is not modified; a new object is returned.

    Raises ``ValueError`` if any part is not a valid float literal.
    NOTE(review): presumably all three inputs are IPv4-like strings - confirm
    against the dataset, since the 'From' columns are passed here as well.
    """
    partes = [
        float(parte)
        for direccion in (ip1, ip2, ip3)
        for parte in direccion.split('.')
    ]
    return vector + partes

# Apply the function to every row of the DataFrame
df_w2v_skip['Concatenado_ips'] = df_w2v_skip.apply(lambda row: agregar_ip_a_vector(row['Sender IP'], row['From (SMTP)'], row['From (Header)'], row['w2v_skip']), axis=1)

# Append an int or float as a single extra element of a vector
def agregar_numero_a_vector(vector, numero):
    """Return a new vector equal to ``vector`` plus ``float(numero)`` at the end.

    ``vector`` itself is not modified.
    """
    return vector + [float(numero)]

# Apply the function to every row of the DataFrame for the integer column
df_w2v_skip['Concatenado_total'] = df_w2v_skip.apply(lambda row: agregar_numero_a_vector(row['Concatenado_ips'], row['Has Attachment']), axis=1)

# Apply the function to every row of the DataFrame for the float column
df_w2v_skip['Concatenado_total'] = df_w2v_skip.apply(lambda row: agregar_numero_a_vector(row['Concatenado_total'], row['Size']), axis=1)

# Show the resulting DataFrame
print(df_w2v_skip.head())

In [None]:
# Show the full feature vector of the first row. Positional indexing is required:
# rows were filtered earlier, so the index label 1 is not the first row and may
# not exist at all (which makes `.loc[1]` raise KeyError).
first_row = df_w2v_skip["Concatenado_total"].iloc[0]
print(first_row)

In [None]:
# Replace "no phishing" by 0 and "phishing" by 1; any other value is left unchanged
df_w2v_skip['APT Verdict'] = df_w2v_skip['APT Verdict'].replace({'no phishing': 0, 'phishing': 1})
print(df_w2v_skip.head())

### RANDOM FOREST

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

# Split the features (vectors) from the labels
X = df_w2v_skip['Concatenado_total'].tolist()
y = df_w2v_skip['APT Verdict']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2023)

# Hyper-parameters to tune
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 5, 10],  # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4]  # Minimum number of samples required at a leaf node
}

# Create a Random Forest model
rf_classifier = RandomForestClassifier(random_state=2023)

# Run the grid search. n_jobs=-1 spreads the 81 x 5 fits over all cores; the
# selected model is unchanged because every estimator uses a fixed random_state.
grid_search = GridSearchCV(rf_classifier, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model found with the tuned hyper-parameters
best_model = grid_search.best_estimator_

# Predict on the test set with the best model
y_pred = best_model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Confusion matrix. The labels are pinned so the matrix is always 2x2 (rows and
# columns ordered 0, 1) and the four-way unpacking below cannot fail when a
# class is missing from both y_test and y_pred.
confusion = confusion_matrix(y_test, y_pred, labels=[0, 1])

# Precision
precision = precision_score(y_test, y_pred, average='weighted')
print(f'Precision: {precision}')

# Recall
recall = recall_score(y_test, y_pred, average='weighted')
print(f'Recall: {recall}')

# F-score
f1 = f1_score(y_test, y_pred, average='weighted')
print(f'F-Score: {f1}')

# Specificity (true-negative rate); undefined when there are no real negatives
tn, fp, fn, tp = confusion.ravel()
specificity = tn / (tn + fp) if (tn + fp) > 0 else float('nan')
print(f'Specificity: {specificity}')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Draw the confusion matrix as an annotated heat map on an explicit Axes
fig, ax = plt.subplots(figsize=(6, 4))
nombres_clases = ['Clase Negativa', 'Clase Positiva']
sns.heatmap(confusion, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=nombres_clases, yticklabels=nombres_clases, ax=ax)

ax.set_xlabel('Predicción')
ax.set_ylabel('Valor Real')
ax.set_title('Matriz de Confusión')

# Render the figure
plt.show()

# FIN DEL CÓDIGO

In [None]:
# List the input files again to locate the simulation workbook.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Load the simulation workbook. This rebinds `data` and `df`, which until now
# referred to the training workbook and its processed frame.
data = '/kaggle/input/data-simulando/data original simulacion.xlsx'
df = pd.read_excel(data)
df

In [None]:
import numpy as np
from bs4 import BeautifulSoup
import pandas as pd
import re
from nltk.tokenize import word_tokenize
import spacy
import os
import nltk
from nltk.corpus import stopwords
import pandas as pd
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

def predecir(df):
    """Score a labelled e-mail DataFrame with the already-trained pipeline and print metrics.

    The frame goes through the same steps as the training data (clean ->
    tokenize -> remove stop words -> lemmatize -> average Word2Vec vector ->
    append numeric columns) and the resulting vectors are classified with the
    trained Random Forest.

    The notebook-level helpers are reused instead of being redefined here, so
    inference cannot drift from training. In particular the subjects are
    embedded with the Word2Vec ``model`` fitted on the training corpus:
    fitting a new Word2Vec on the data being scored would produce vectors in
    a different space from the one ``best_model`` was trained on.

    Requires the training section of the notebook to have been run. It reads
    these notebook globals: ``limpiar``, ``eliminar_stopwords``, ``lematizar``,
    ``get_average_vector`` (and through it ``model``), ``numpy_vector_to_list``,
    ``agregar_ip_a_vector``, ``agregar_numero_a_vector`` and ``best_model``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Subject', 'Sender IP', 'From (SMTP)',
        'From (Header)', 'Has Attachment', 'Size' and 'APT Verdict'.
        The caller's frame is not modified.

    Returns
    -------
    None
        Class counts, predictions and metrics are printed.

    Raises
    ------
    ValueError
        If no row is left after dropping NaN rows and empty subjects.
    """
    # Drop rows with NaN values
    df = df.dropna().copy()
    conteo = df.groupby('APT Verdict').size()
    print(conteo)

    # Text cleaning, tokenization, stop-word removal and lemmatization,
    # using the same helpers as the training section
    df['Subject'] = (
        df['Subject']
        .apply(limpiar)
        .apply(word_tokenize)
        .apply(eliminar_stopwords)
        .apply(lematizar)
    )

    # Drop the rows whose subject ended up with no tokens. The explicit copy
    # makes the column assignments below act on an independent frame rather
    # than on a filtered slice.
    df = df[df['Subject'].apply(lambda x: len(x) > 0)].copy()
    conteo2 = df.groupby('APT Verdict').size()
    print(conteo2)

    if df.empty:
        raise ValueError('predecir: no rows left to score after cleaning')

    # Average Word2Vec vector per subject, as a plain list
    df['Subject'] = df['Subject'].apply(get_average_vector).apply(numpy_vector_to_list)

    def construir_vector(row):
        # Same feature order as in training: embedding, the three
        # dot-separated columns, 'Has Attachment', then 'Size'.
        vector = agregar_ip_a_vector(row['Sender IP'], row['From (SMTP)'], row['From (Header)'], row['Subject'])
        vector = agregar_numero_a_vector(vector, row['Has Attachment'])
        return agregar_numero_a_vector(vector, row['Size'])

    df['Concatenado_total'] = df.apply(construir_vector, axis=1)

    df['APT Verdict'] = df['APT Verdict'].replace({'no phishing': 0, 'phishing': 1})

    # Predict with the trained Random Forest model
    X_input = df['Concatenado_total'].tolist()
    y_real = df['APT Verdict']

    y_pred = best_model.predict(X_input)

    # Print the prediction
    print("Predicción:", y_pred)

    # Accuracy
    accuracy = accuracy_score(y_real, y_pred)
    print(f'Accuracy: {accuracy}')

    # Confusion matrix. The labels are pinned so the matrix is always 2x2,
    # even when the scored data contains a single class; otherwise the
    # four-way unpacking below fails.
    confusion = confusion_matrix(y_real, y_pred, labels=[0, 1])

    # Precision
    precision = precision_score(y_real, y_pred, average='weighted')
    print(f'Precision: {precision}')

    # Recall
    recall = recall_score(y_real, y_pred, average='weighted')
    print(f'Recall: {recall}')

    # F-score
    f1 = f1_score(y_real, y_pred, average='weighted')
    print(f'F-Score: {f1}')

    # Specificity (true-negative rate); undefined when there are no real negatives
    tn, fp, fn, tp = confusion.ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else float('nan')
    print(f'Specificity: {specificity}')

In [None]:
# Score the simulation data loaded above; relies on `best_model` from the training section.
predecir(df)