In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv("data.csv", on_bad_lines='skip')

In [4]:
df

Unnamed: 0,password,strength
0,kzde5577,1
1,kino3434,1
2,visi7k1yr,1
3,megzy123,1
4,lamborghin1,1
...,...,...
669635,10redtux10,1
669636,infrared1,1
669637,184520socram,1
669638,marken22a,1


In [5]:
df.describe()

Unnamed: 0,strength
count,669640.0
mean,0.990196
std,0.507948
min,0.0
25%,1.0
50%,1.0
75%,1.0
max,2.0


In [6]:
df.isnull().sum()

Unnamed: 0,0
password,1
strength,0


In [7]:
df.dropna(inplace= True)

In [8]:
df.isnull().sum()

Unnamed: 0,0
password,0
strength,0


In [9]:
def extract_features(password: str):
    features = {
        'length': len(password),
        'uppercase': sum(1 for char in password if char.isupper()),
        'lowercase': sum(1 for char in password if char.islower()),
        'digits': sum(1 for char in password if char.isdigit()),
        'special_chars': sum(1 for char in password if not char.isalnum())
    }
    return features

In [10]:
features_df = pd.DataFrame([{'password': df['password'].iloc[i], 'strength': df['strength'].iloc[i], **extract_features(df['password'].iloc[i])} for i in range(len(df))])
features_df.head(10)

Unnamed: 0,password,strength,length,uppercase,lowercase,digits,special_chars
0,kzde5577,1,8,0,4,4,0
1,kino3434,1,8,0,4,4,0
2,visi7k1yr,1,9,0,7,2,0
3,megzy123,1,8,0,5,3,0
4,lamborghin1,1,11,0,10,1,0
5,AVYq1lDE4MgAZfNt,2,16,9,5,2,0
6,u6c8vhow,1,8,0,6,2,0
7,v1118714,1,8,0,1,7,0
8,universe2908,1,12,0,8,4,0
9,as326159,1,8,0,2,6,0


In [12]:
x = np.array(df["password"])#convert to tuple array
y = np.array(df["strength"])

In [13]:
x

array(['kzde5577', 'kino3434', 'visi7k1yr', ..., '184520socram',
       'marken22a', 'fxx4pw4g'], dtype=object)

Using TF-IDF vectorizer to convert String data into numerical data

In [20]:
tdif = TfidfVectorizer()
X = tdif.fit_transform(x)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2 , random_state=42)
X_train.shape

(535711, 672542)

XGBoost

In [18]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
xgb = XGBClassifier(n_estimators=100, random_state=42)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))

XGBoost Accuracy: 0.7458933158114808


Naive Bayes

In [22]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)

print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))

Naive Bayes Accuracy: 0.7447359775401708


SVM


In [23]:
from sklearn.svm import LinearSVC

linear_svc = LinearSVC(C=1.0, random_state=42)
linear_svc.fit(X_train, y_train)
y_pred_Lsvc = nb.predict(X_test)

print("linearSVS Accuracy:", accuracy_score(y_test, y_pred_Lsvc))




linearSVS Accuracy: 0.7447359775401708


In [26]:
from sklearn.metrics import precision_recall_fscore_support, average_precision_score

In [27]:
from sklearn.preprocessing import label_binarize

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, average_precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from sklearn.preprocessing import label_binarize
y_bin = label_binarize(y, classes=np.unique(y))

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize models
models = {
    "XGBoost": XGBClassifier(eval_metric='mlogloss'),
    "SVM (Linear)": LinearSVC(dual=False),  # dual=False for n_samples > n_features
    "Random Forest": RandomForestClassifier()
}

# Evaluate each model
for name, model in models.items():
    # Train
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None

    # Metrics
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')

    # Weighted Average Precision (WAP)
    if y_proba is not None:
        wap = average_precision_score(
            label_binarize(y_test, classes=np.unique(y)),
            y_proba,
            average='weighted'
        )
    else:
        wap = "Not available (SVM outputs decision function)"

    # Print results
    print(f"\n{name}:")
    print(f"  Precision (weighted): {precision:.3f}")
    print(f"  Recall (weighted): {recall:.3f}")
    print(f"  F1-score (weighted): {f1:.3f}")
    print(f"  Weighted Avg Precision (WAP): {wap}")



XGBoost:
  Precision (weighted): 0.742
  Recall (weighted): 0.746
  F1-score (weighted): 0.641
  Weighted Avg Precision (WAP): 0.5915881193327621

SVM (Linear):
  Precision (weighted): 0.748
  Recall (weighted): 0.747
  F1-score (weighted): 0.645
  Weighted Avg Precision (WAP): Not available (SVM outputs decision function)
