In [1]:
! pip install -r ../requirements.txt

Collecting sqlalchemy==1.4.37
  Using cached SQLAlchemy-1.4.37-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
Collecting pymysql==1.0.2
  Using cached PyMySQL-1.0.2-py3-none-any.whl (43 kB)
Collecting pandas
  Downloading pandas-1.4.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.7/11.7 MB[0m [31m85.6 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hCollecting python-dotenv==0.20.0
  Using cached python_dotenv-0.20.0-py3-none-any.whl (17 kB)
Collecting psycopg2-binary==2.9.3
  Using cached psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
Collecting requests==2.27.1
  Using cached requests-2.27.1-py2.py3-none-any.whl (63 kB)
Collecting numpy>=1.18.5
  Using cached numpy-1.23.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
Collecting opencv-python>=4.1.2
  Using cached opencv_pyt

In [11]:
import pandas as pd
import pickle
import numpy as np
import re
import unicodedata
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV


In [12]:
df_raw = pd.read_csv('https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv')

In [13]:
df_raw.head()

Unnamed: 0,url,is_spam
0,https://briefingday.us8.list-manage.com/unsubs...,True
1,https://www.hvper.com/,True
2,https://briefingday.com/m/v4n3i4f3,True
3,https://briefingday.com/n/20200618/m#commentform,False
4,https://briefingday.com/fan,True


In [14]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2999 entries, 0 to 2998
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   url      2999 non-null   object
 1   is_spam  2999 non-null   bool  
dtypes: bool(1), object(1)
memory usage: 26.5+ KB


In [15]:
df_raw['is_spam'].value_counts()

False    2303
True      696
Name: is_spam, dtype: int64

In [16]:
df = df_raw.copy()

In [17]:
df = df.drop_duplicates().reset_index(drop = True)

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2369 entries, 0 to 2368
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   url      2369 non-null   object
 1   is_spam  2369 non-null   bool  
dtypes: bool(1), object(1)
memory usage: 20.9+ KB


In [19]:
def comas(text):
    """
    Elimina comas del texto
    """
    return re.sub(',', ' ', text)

def espacios(text):
    """
    Elimina enters dobles por un solo enter
    """
    return re.sub(r'(\n{2,})','\n', text)

def minuscula(text):
    """
    Cambia mayusculas a minusculas
    """
    return text.lower()

def numeros(text):
    """
    Sustituye los numeros
    """
    return re.sub('([\d]+)', ' ', text)

def caracteres_no_alfanumericos(text):
    """
    Sustituye caracteres raros, no digitos y letras
    Ej. hola 'pepito' como le va? -> hola pepito como le va
    """
    return re.sub("(\\W)+"," ",text)

def comillas(text):
    """
    Sustituye comillas por un espacio
    Ej. hola 'pepito' como le va? -> hola pepito como le va?
    """
    return re.sub("'"," ", text)

def palabras_repetidas(text):
    """
    Sustituye palabras repetidas

    Ej. hola hola, como les va? a a ustedes -> hola, como les va? a ustedes
    """
    return re.sub(r'\b(\w+)( \1\b)+', r'\1', text)

def esp_multiple(text):
    """
    Sustituye los espacios dobles entre palabras
    """
    return re.sub(' +', ' ',text)


In [20]:
# funcón para eliminar https
def url(text):
    return re.sub(r'(https://www|https://)', '', text)

In [21]:
# se limpia url
df['url_limpia'] = df['url'].apply(url).apply(caracteres_no_alfanumericos).apply(esp_multiple)

In [22]:
df.head()

Unnamed: 0,url,is_spam,url_limpia
0,https://briefingday.us8.list-manage.com/unsubs...,True,briefingday us8 list manage com unsubscribe
1,https://www.hvper.com/,True,hvper com
2,https://briefingday.com/m/v4n3i4f3,True,briefingday com m v4n3i4f3
3,https://briefingday.com/n/20200618/m#commentform,False,briefingday com n 20200618 m commentform
4,https://briefingday.com/fan,True,briefingday com fan


In [23]:
df['is_spam'] = df['is_spam'].apply(lambda x: 1 if x == True else 0)

In [24]:
df.head()

Unnamed: 0,url,is_spam,url_limpia
0,https://briefingday.us8.list-manage.com/unsubs...,1,briefingday us8 list manage com unsubscribe
1,https://www.hvper.com/,1,hvper com
2,https://briefingday.com/m/v4n3i4f3,1,briefingday com m v4n3i4f3
3,https://briefingday.com/n/20200618/m#commentform,0,briefingday com n 20200618 m commentform
4,https://briefingday.com/fan,1,briefingday com fan


In [None]:
# NLP techniques for data preprocessing

In [25]:
vec = CountVectorizer().fit_transform(df['url_limpia'])

In [26]:
X_train, X_test, y_train, y_test = train_test_split(vec, df['is_spam'], stratify = df['is_spam'], random_state = 2207)

In [None]:
# SVM for creating an url spam classifier

In [27]:
classifier = SVC(C = 1.0, kernel = 'linear', gamma = 'auto')

In [28]:
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.96      0.97      0.96       532
           1       0.71      0.64      0.67        61

    accuracy                           0.94       593
   macro avg       0.83      0.80      0.82       593
weighted avg       0.93      0.94      0.93       593



In [29]:
# optimizing hyperparameters
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}

grid = GridSearchCV(SVC(random_state=1234),param_grid,verbose=2)
grid.fit(X_train,y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.2s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.2s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.2s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.2s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.2s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.2s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.2s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.2s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.2s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.2s
[CV] END .....................C=0.1, gamma=1, kernel=sigmoid; total time=   0.1s
[CV] END .....................C=0.1, gamma=1, k

In [30]:
grid.best_params_

{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}

In [31]:
grid.best_estimator_

In [32]:
predictions = grid.best_estimator_.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97       532
           1       0.82      0.61      0.70        61

    accuracy                           0.95       593
   macro avg       0.89      0.80      0.83       593
weighted avg       0.94      0.95      0.94       593



In [35]:
filename = '../models/best_model.sav'
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
print(result)

0.9460370994940978
