#### **NLP Project Tutorial**
##### **URL Spam detector**

**Instalar e importar librerías y cargar dataset**

In [2]:
%%capture
!pip install -r ../requirements.txt

In [3]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import model_selection, svm
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

In [16]:
url = 'https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv'

df_raw = pd.read_csv(url)

**Primeras observaciones**

In [17]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2999 entries, 0 to 2998
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   url      2999 non-null   object
 1   is_spam  2999 non-null   bool  
dtypes: bool(1), object(1)
memory usage: 26.5+ KB


In [18]:
df_raw.sample(5)

Unnamed: 0,url,is_spam
1617,https://www.morningbrew.com/daily/r/,True
2222,https://www.businessinsider.com/air-traffic-du...,False
969,https://www.nytimes.com/interactive/2019/08/14...,False
2390,https://www.npr.org/2020/04/11/830390452/the-s...,False
1481,https://www.perininetworks.com/terms-conditions/,True


In [19]:
df_raw['is_spam'].value_counts()

False    2303
True      696
Name: is_spam, dtype: int64

**Limpieza y procesamiento**

In [20]:
df = df_raw.copy()

In [21]:
df.drop_duplicates()

Unnamed: 0,url,is_spam
0,https://briefingday.us8.list-manage.com/unsubs...,True
1,https://www.hvper.com/,True
2,https://briefingday.com/m/v4n3i4f3,True
3,https://briefingday.com/n/20200618/m#commentform,False
4,https://briefingday.com/fan,True
...,...,...
2993,https://www.theverge.com/2020/6/29/21306889/di...,False
2994,https://www.smartcitiesworld.net/news/news/dee...,False
2996,https://techcrunch.com/2019/07/04/an-optimisti...,False
2997,https://www.technologyreview.com/2019/12/20/13...,False


In [22]:
df['url'] = df['url'].str.lower()

In [23]:
def eliminar_https(texto):
    return re.sub(r'(https://www|https://)', '', texto)

In [24]:
# Sustituye caracteres no alfanuméricos
def caracteres_no_alfanumericos(texto):
    return re.sub("(\\W)+"," ", texto)

In [25]:
#Sustituye los espacios dobles entre palabras
def esp_multiple(texto):
    return re.sub(' +', ' ',texto)

In [26]:
df['url_limpia'] = df['url'].apply(eliminar_https).apply(caracteres_no_alfanumericos).apply(esp_multiple)

In [27]:
df.head()

Unnamed: 0,url,is_spam,url_limpia
0,https://briefingday.us8.list-manage.com/unsubs...,True,briefingday us8 list manage com unsubscribe
1,https://www.hvper.com/,True,hvper com
2,https://briefingday.com/m/v4n3i4f3,True,briefingday com m v4n3i4f3
3,https://briefingday.com/n/20200618/m#commentform,False,briefingday com n 20200618 m commentform
4,https://briefingday.com/fan,True,briefingday com fan


In [28]:
df['is_spam'] = df['is_spam'].apply(lambda x: 1 if x==True else 0)

In [29]:
df.head()

Unnamed: 0,url,is_spam,url_limpia
0,https://briefingday.us8.list-manage.com/unsubs...,1,briefingday us8 list manage com unsubscribe
1,https://www.hvper.com/,1,hvper com
2,https://briefingday.com/m/v4n3i4f3,1,briefingday com m v4n3i4f3
3,https://briefingday.com/n/20200618/m#commentform,0,briefingday com n 20200618 m commentform
4,https://briefingday.com/fan,1,briefingday com fan


**Construcción y evaluación del modelo**

In [30]:
X = df['url_limpia']
y = df['is_spam']

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=12)

In [32]:
vec = CountVectorizer()

X_train = vec.fit_transform(X_train).toarray()
X_test = vec.transform(X_test).toarray()

In [33]:
classifier = svm.SVC(C=1.0, kernel='linear', gamma='auto')

In [34]:
classifier.fit(X_train, y_train)

In [35]:
predictions_train = classifier.predict(X_train)

print('Train:')
print(classification_report(y_train, predictions_train))

Train:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1727
           1       1.00      0.99      0.99       522

    accuracy                           1.00      2249
   macro avg       1.00      0.99      1.00      2249
weighted avg       1.00      1.00      1.00      2249



In [36]:
predictions = classifier.predict(X_test)

print('Test:')
print(classification_report(y_test, predictions))

Test:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       576
           1       0.92      0.93      0.92       174

    accuracy                           0.96       750
   macro avg       0.95      0.95      0.95       750
weighted avg       0.96      0.96      0.96       750



**Búsqueda de hiperparámetros**

In [41]:
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1 , 0.01 , 0.001], 'kernel': ['linear', 'rbf', 'poly', 'sigmoid']}

grid = GridSearchCV(SVC(random_state=1234), param_grid, verbose=2)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 64 candidates, totalling 320 fits
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   4.1s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   3.9s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   4.4s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   3.3s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   3.6s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   8.7s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   9.8s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   9.9s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   9.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   9.1s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   5.6s
[CV] END ........................C=0.1, gamma=1

In [42]:
grid.best_estimator_

**Construcción del modelo con los hiperparámetros encontrados**

In [43]:
classifier_hp = SVC(C=10, gamma=0.1, random_state=1234)

In [44]:
classifier_hp.fit(X_train, y_train)

In [45]:
predictions_hp = classifier_hp.predict(X_test)

print(classification_report(y_test, predictions_hp))

              precision    recall  f1-score   support

           0       0.98      0.97      0.98       576
           1       0.91      0.94      0.92       174

    accuracy                           0.96       750
   macro avg       0.94      0.95      0.95       750
weighted avg       0.96      0.96      0.96       750

