---
title: Exploratory Data Analysis
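description: Loads the multilingual ham/spam text dataset, keeps the French text, and cross-validates baseline spam classifiers.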
---

### Imports

In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scapy.all as scapy
from pathlib import Path

### Loading data

In [28]:
# Location of the multilingual CSV, relative to the notebook's working directory.
data = Path('..') / 'data' / 'data-en-hi-de-fr.csv'

# Read the whole file into a DataFrame and preview the first rows.
df = pd.read_csv(data)
df.head()

Unnamed: 0,labels,text,text_hi,text_de,text_fr
0,ham,"Go until jurong point, crazy.. Available only ...","Dakag बिंदु तक जाओ, पागल. केवल Bag Non महान वि...","Gehen Sie bis jurong Punkt, verrückt.. Verfügb...","Allez jusqu'à Jurong point, fou.. Disponible s..."
1,ham,Ok lar... Joking wif u oni...,ओके लामर.... if if uue पर.,Ok Lar... joking wif u oni...,J'ai fait une blague sur le wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Fktatatat 21 मई को प्राप्त करने के लिए मुफ्त प...,Freier Eintritt in 2 a wkly comp zum Gewinn FA...,Entrée libre dans 2 a wkly comp pour gagner FA...
3,ham,U dun say so early hor... U c already then say...,Uden इतना जल्दी कहते हैं... तो पहले से ही यूसी...,U dun sagen so früh... U c schon dann sagen...,U dun dit si tôt hor... U c déjà dire alors...
4,ham,"Nah I don't think he goes to usf, he lives aro...","नहीं, मुझे नहीं लगता कि वह हमारे लिए चला जाता ...","Nein, ich glaube nicht, dass er zu unsf geht, ...","Non, je ne pense pas qu'il va à usf, il vit da..."


In [29]:
# Class balance: number of rows per value of the `labels` column.
df['labels'].value_counts()

labels
ham     4825
spam     747
Name: count, dtype: int64

In [30]:
# Keep only the label and the French text, rename them to `is_spam` / `text`,
# and encode the label as 0 (ham) / 1 (spam).
# Written as one chain so a new frame is built rather than assigning into a
# column of a freshly sliced frame, which can raise SettingWithCopyWarning.
df = (
    df[['labels', 'text_fr']]
    .rename(columns={'text_fr': 'text', 'labels': 'is_spam'})
    .assign(is_spam=lambda frame: frame['is_spam'].map({'ham': 0, 'spam': 1}))
)

# `.map` yields NaN for any label missing from the mapping; fail loudly here
# instead of passing NaN targets on to the models.
assert df['is_spam'].notna().all(), "unexpected label value (expected 'ham' or 'spam')"

### Preprocessing

In [31]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# Shared feature-extraction stage: CountVectorizer followed by a
# StandardScaler configured with with_mean=False (scaling without centering).
preprocessor = Pipeline(
    steps=[
        ('vectorizer', CountVectorizer()),
        ('scaler', StandardScaler(with_mean=False)),
    ]
)

### Model definition

In [32]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Registry of candidate classifiers. Each entry records a display name, a
# pickle file path, the unfitted estimator, and two placeholders
# (`best_model`, `best_params`) that start out as None.
models = [
    {
        'model_name': display_name,
        'model_filepath': Path(pickle_name),
        'model_instance': estimator,
        'best_model': None,
        'best_params': None,
    }
    for display_name, pickle_name, estimator in (
        ('Naive Bayes', 'naive_bayes_model.pkl', MultinomialNB()),
        ('Logistic Regression', 'logistic_regression_model.pkl', LogisticRegression(max_iter=1000)),
        ('SVM', 'svm_model.pkl', SVC()),
    )
]

### Model training

In [34]:
# Cross-validate every candidate: wrap its estimator together with the shared
# preprocessor, score it with 5-fold cross-validation on the text column, and
# report the mean and standard deviation of the fold scores.
for model_infos in models:
    pipeline = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('model', model_infos['model_instance']),
        ]
    )

    scores = cross_val_score(pipeline, X=df['text'], y=df['is_spam'], cv=5)
    mean_score, score_std = scores.mean(), scores.std()
    print(f"{model_infos['model_name']} Accuracy: {mean_score:.4f} (+/- {score_std:.4f})")

Naive Bayes Accuracy: 0.9639 (+/- 0.0051)
Logistic Regression Accuracy: 0.9711 (+/- 0.0045)
SVM Accuracy: 0.9374 (+/- 0.0015)


### Model evaluation