# Challenge 2 : phishing websites detection

Common imports :

In [2]:
import pandas as pd

## Data Analysis

In [3]:
df = pd.read_csv('PhiUSIIL_Phishing_URL_Dataset.csv')
df = df.drop(columns=['URLSimilarityIndex', 'CharContinuationRate', 'URLTitleMatchScore', 'URLCharProb',
'TLDLegitimateProb'])
df

Unnamed: 0,FILENAME,URL,URLLength,Domain,DomainLength,IsDomainIP,TLD,TLDLength,NoOfSubDomain,HasObfuscation,...,Pay,Crypto,HasCopyrightInfo,NoOfImage,NoOfCSS,NoOfJS,NoOfSelfRef,NoOfEmptyRef,NoOfExternalRef,label
0,521848.txt,https://www.southbankmosaics.com,31,www.southbankmosaics.com,24,0,com,3,1,0,...,0,0,1,34,20,28,119,0,124,1
1,31372.txt,https://www.uni-mainz.de,23,www.uni-mainz.de,16,0,de,2,1,0,...,0,0,1,50,9,8,39,0,217,1
2,597387.txt,https://www.voicefmradio.co.uk,29,www.voicefmradio.co.uk,22,0,uk,2,2,0,...,0,0,1,10,2,7,42,2,5,1
3,554095.txt,https://www.sfnmjournal.com,26,www.sfnmjournal.com,19,0,com,3,1,0,...,1,1,1,3,27,15,22,1,31,1
4,151578.txt,https://www.rewildingargentina.org,33,www.rewildingargentina.org,26,0,org,3,1,0,...,1,0,1,244,15,34,72,1,85,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235790,660997.txt,https://www.skincareliving.com,29,www.skincareliving.com,22,0,com,3,1,0,...,1,0,1,51,7,21,187,2,191,1
235791,77185.txt,https://www.winchester.gov.uk,28,www.winchester.gov.uk,21,0,uk,2,2,0,...,1,0,0,50,1,7,88,0,31,1
235792,622132.txt,https://www.nononsensedesign.be,30,www.nononsensedesign.be,23,0,be,2,1,0,...,0,0,1,27,10,30,58,2,67,1
235793,7503962.txt,https://patient-cell-40f5.updatedlogmylogin.wo...,55,patient-cell-40f5.updatedlogmylogin.workers.dev,47,0,dev,3,2,0,...,0,0,0,0,0,3,0,0,0,0


Let's check if there are some missing values:

In [7]:
missing_values = df.isnull().mean()
print(missing_values[missing_values>0])

Series([], dtype: float64)


No missing values ! Now let's see the types of data we are facing to:

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235795 entries, 0 to 235794
Data columns (total 51 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   FILENAME                    235795 non-null  object 
 1   URL                         235795 non-null  object 
 2   URLLength                   235795 non-null  int64  
 3   Domain                      235795 non-null  object 
 4   DomainLength                235795 non-null  int64  
 5   IsDomainIP                  235795 non-null  int64  
 6   TLD                         235795 non-null  object 
 7   TLDLength                   235795 non-null  int64  
 8   NoOfSubDomain               235795 non-null  int64  
 9   HasObfuscation              235795 non-null  int64  
 10  NoOfObfuscatedChar          235795 non-null  int64  
 11  ObfuscationRatio            235795 non-null  float64
 12  NoOfLettersInURL            235795 non-null  int64  
 13  LetterRatioInU

In [11]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
URLLength,235795.0,34.573095,41.314153,13.0,23.0,27.0,34.0,6097.0
DomainLength,235795.0,21.470396,9.150793,4.0,16.0,20.0,24.0,110.0
IsDomainIP,235795.0,0.002706,0.051946,0.0,0.0,0.0,0.0,1.0
TLDLength,235795.0,2.764456,0.599739,2.0,2.0,3.0,3.0,13.0
NoOfSubDomain,235795.0,1.164758,0.600969,0.0,1.0,1.0,1.0,10.0
HasObfuscation,235795.0,0.002057,0.045306,0.0,0.0,0.0,0.0,1.0
NoOfObfuscatedChar,235795.0,0.024861,1.876249,0.0,0.0,0.0,0.0,447.0
ObfuscationRatio,235795.0,0.000138,0.003817,0.0,0.0,0.0,0.0,0.348
NoOfLettersInURL,235795.0,19.428919,29.09033,0.0,10.0,14.0,20.0,5191.0
LetterRatioInURL,235795.0,0.515946,0.123315,0.0,0.435,0.519,0.594,0.926


In [7]:
categories = {}
var = {}
data = pd.DataFrame()

for col in df.select_dtypes(include='object').columns:
    categories[col] = df[col].unique().tolist()
    data[col + '_encoded'] = df[col].astype('category').cat.codes
    variance = data[col + '_encoded'].var()
    var[col] = variance

for col, cat in categories.items():
    print(f"Column '{col}' : {len(cat)} categories and var = {var[col]}")

Column 'FILENAME' : 235795 categories and var = 4633293151.666667
Column 'URL' : 235370 categories and var = 4616831238.329681
Column 'Domain' : 220086 categories and var = 4232909214.796515
Column 'TLD' : 695 categories and var = 20926.599272719683
Column 'Title' : 197874 categories and var = 3983227671.848595


## Test de chi2

In [5]:
from scipy.stats import chi2_contingency

# Sélection des colonnes catégorielles (colonnes de type entier, sauf la variable cible 'label')
categorical_columns = [col for col in df.select_dtypes(include=['int']).columns if col != 'label']

# Effectuer le test de chi-deux pour chaque variable catégorielle par rapport à 'label'
chi2_results = {}

for col in categorical_columns:
    # Création d'une table de contingence entre la variable et la cible
    contingency_table = pd.crosstab(df[col], df['label'])
    
    # Test de chi-deux
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    
    # Stockage des résultats
    chi2_results[col] = {'chi2': chi2, 'p_value': p, 'dof': dof, 'expected_freq': expected}

# Affichage des résultats
for variable, result in chi2_results.items():
    print(f"Variable: {variable}")
    print(f"  Chi2: {result['chi2']}")
    print(f"  p-value: {result['p_value']}")
    print(f"  Degrees of Freedom: {result['dof']}")
    print(f"  Expected Frequencies:\n{result['expected_freq']}\n")


Variable: URLLength
  Chi2: 67407.06168244261
  p-value: 0.0
  Degrees of Freedom: 481
  Expected Frequencies:
[[2.14052461e+00 2.85947539e+00]
 [2.18333510e+01 2.91666490e+01]
 [1.01032762e+02 1.34967238e+02]
 [2.08058992e+02 2.77941008e+02]
 [8.27526814e+02 1.10547319e+03]
 [1.60539346e+03 2.14460654e+03]
 [2.78268199e+03 3.71731801e+03]
 [3.68255854e+03 4.91944146e+03]
 [4.83501699e+03 6.45898301e+03]
 [5.49387046e+03 7.33912954e+03]
 [5.92026296e+03 7.90873704e+03]
 [6.21394294e+03 8.30105706e+03]
 [6.28029920e+03 8.38970080e+03]
 [6.66174069e+03 8.89925931e+03]
 [5.99817806e+03 8.01282194e+03]
 [5.49387046e+03 7.33912954e+03]
 [4.77208556e+03 6.37491444e+03]
 [4.31572572e+03 5.76527428e+03]
 [3.58666303e+03 4.79133697e+03]
 [3.01813970e+03 4.03186030e+03]
 [2.69577669e+03 3.60122331e+03]
 [2.56392038e+03 3.42507962e+03]
 [1.85369431e+03 2.47630569e+03]
 [1.66361573e+03 2.22238427e+03]
 [1.50949795e+03 2.01650205e+03]
 [1.15888002e+03 1.54811998e+03]
 [9.91490999e+02 1.32450900e+03

## Support Vector Machine

In [12]:
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report


# Préparation des données (conversion des colonnes non numériques)

# Identification des colonnes non numériques et application du Label Encoding
non_numeric_columns = [col for col in df.columns if df[col].dtype == 'object']
for col in non_numeric_columns:
    df[col] = LabelEncoder().fit_transform(df[col])

# Séparation des caractéristiques et de la variable cible
X = df.drop('label', axis=1)
y = df['label']

# Normalisation des données
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Validation croisée avec KFold
kf = KFold(n_splits=10, shuffle=True, random_state=42)
clf = svm.SVC(kernel='linear', C=1)  # Choix du noyau et du paramètre C

# Variables pour stocker les résultats de chaque itération
accuracies = []
reports = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Entraînement du modèle
    clf.fit(X_train, y_train)
    
    # Prédiction et évaluation
    y_pred = clf.predict(X_test)
    accuracies.append(clf.score(X_test, y_test))
    reports.append(classification_report(y_test, y_pred, output_dict=True))

# Calcul des scores moyens de la validation croisée
average_accuracy = np.mean(accuracies)
print(f"Accuracy moyenne sur les 10-folds: {average_accuracy}")


Accuracy moyenne sur les 10-folds: 0.9998261204164471
