## Basic names prediction ##

import neccessary libraries

In [156]:
import pandas as pd  # Library for data manipulation and analysis
import numpy as np  # Library for efficient numerical computation

np.random.seed(0)  # Set the random seed for reproducibility

from sklearn.dummy import DummyClassifier #  Library for dummy classifier, the simpliest model ;-)
from sklearn.model_selection import train_test_split  # Function to split data into training and testing sets
from sklearn.linear_model import LogisticRegression  # A linear model for classification tasks
from sklearn.tree import DecisionTreeClassifier  # A tree-based model for classification tasks

from sklearn.model_selection import train_test_split  # Function to split data into training and testing sets

from sklearn.metrics import accuracy_score  # Metric to evaluate the accuracy of a model
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score, roc_curve, RocCurveDisplay

import matplotlib.pyplot as plt  # Library for creating static, animated, and interactive visualizations

# Display plots directly in the notebook
%matplotlib inline  

import os
import requests
import json
import zipfile
import io
import urllib.request

In [155]:
base_url = 'https://api.dane.gov.pl/datasets/1667,lista-imion-wystepujacych-w-rejestrze-pesel-osoby-zyjace/resources/'

datasource_url = 'files/downloads/'
data_target_directory = '/Users/marcintubielewicz/Documents/programming/github_portfolio/python/machine-learning/basic_prediction/data'

if not os.path.exists(data_target_directory):
    os.makedirs(data_target_directory)
    
response = requests.get(base_url)

def download_url(url, data_target_directory):
    with urllib.request.urlopen(url) as dl_file:
        with open(data_target_directory, 'wb') as out_file:
            out_file.write(dl_file.read())

if response.status_code == 200:
    data = response.json()
    
    download_url = base_url + datasource_url
    
    with requests.get(download_url, stream=True) as download:
        if download.status_code == 200:

            zip_file = zipfile.ZipFile(io.BytesIO(download.content))
            # zip_file = 'Lista_imion_wystepujacych_w_rejestrze_PESEL'
            
            zip_file.extractall(data_target_directory)
            
            print("Data was downloaded and extracted successfully")
            print(f"Files extracted to: {data_target_directory}")
            print(f"Tytuł: {data['data']['attributes']['title']}")
            print(f"Liczba pobrań: {data['data']['attributes']['downloads_count']}")
            print(f"Liczba wyświetleń: {data['data']['attributes']['views_count']}")
        else:
            print(f"Failed to download the file: {download.status_code}")
else:
    print(f"File not found: {response.status_code}")

Failed to download the file: 404


data source: https://dane.gov.pl/pl/dataset/1667,lista-imion-wystepujacych-w-rejestrze-pesel-osoby-zyjace

In [100]:
url_male_names = '../data/Lista_imion_wystepujacych_w_rejestrze_PESEL/lista imion meskich w rejestrze PESEL stan na 19.01.2024  imie pierwsze_54109/8_-_Wykaz_imion_męskich_osób_żyjących_wg_pola_imię_pierwsze_występujących_w_rejestrze_PESEL_bez_zgonów.csv'
url_female_names = '../data/Lista_imion_wystepujacych_w_rejestrze_PESEL/lista imion zenskich w rejestrze PESEL stan na 19.01.2024  imie pierwsze_54110/8_-_Wykaz_imion_żeńskich_osób_żyjących_wg_pola_imię_pierwsze_występujących_w_rejestrze_PESEL_bez_zgonów.csv'

In [101]:
male_names = pd.read_csv(url_male_names)
female_names = pd.read_csv(url_female_names)

df = pd.concat([male_names, female_names])
df.info(), df.sample(10)

<class 'pandas.core.frame.DataFrame'>
Index: 61527 entries, 0 to 24473
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   IMIĘ_PIERWSZE     61526 non-null  object
 1   PŁEĆ              61527 non-null  object
 2   LICZBA_WYSTĄPIEŃ  61527 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.9+ MB


(None,
       IMIĘ_PIERWSZE       PŁEĆ  LICZBA_WYSTĄPIEŃ
 588           AHMAD  MĘŻCZYZNA               598
 17015     BULGANMAA    KOBIETA                 2
 210           LESIA    KOBIETA              7660
 29126      OKTAVIUS  MĘŻCZYZNA                 2
 2534          MOHIT  MĘŻCZYZNA                59
 14246        TSYREN  MĘŻCZYZNA                 5
 12559       LALAINE    KOBIETA                 4
 1490           MIŁA    KOBIETA                93
 3442        NICOLO'  MĘŻCZYZNA                38
 13489       CARLENE    KOBIETA                 3)

In [102]:
df.rename(columns={
    'IMIĘ_PIERWSZE':'name',
    'PŁEĆ':'sex',
    'LICZBA_WYSTĄPIEŃ':'occurence'
}, inplace=True)

df['name'] = df['name'].astype('string')

df['sex'] = df['sex'].map(lambda x: "man" if x == 'MĘŻCZYZNA' else 'woman').astype('string')

df['target_value'] = df['sex'].map(lambda x:  1 if x == 'man' else 0).astype('int')

df['sex'].value_counts()

sex
man      37053
woman    24474
Name: count, dtype: Int64

In [103]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 61527 entries, 0 to 24473
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   name          61526 non-null  string
 1   sex           61527 non-null  string
 2   occurence     61527 non-null  int64 
 3   target_value  61527 non-null  int64 
dtypes: int64(2), string(2)
memory usage: 2.3 MB


In [104]:
df = df.dropna(axis=0)
df['name_length'] = df['name'].map(lambda x: len(x))
df.sample(5)


Unnamed: 0,name,sex,occurence,target_value,name_length
8355,KAMILL,man,11,1,6
12735,ROBBE,man,6,1,5
31295,VON,man,2,1,3
18788,LOTI,man,3,1,4
36521,WANDILE,man,2,1,7


In [105]:
X = df[['name_length']].values
y = df['target_value'].values

In [106]:
def train_and_predict(X, y, model=None, metric=accuracy_score):
    if model is None:
        model = DummyClassifier()
    model.fit(X, y)
    y_pred = model.predict(X)
    
    print(model)
    print('Distribution:', dict(zip(*np.unique(y_pred, return_counts=True))))
    print('Accuracy: {:.2f}%'.format(metric(y, y_pred) * 100))
    return y_pred

In [107]:
df['DummyClassifier_pred'] = train_and_predict(X, y)

DummyClassifier()
Distribution: {np.int64(1): np.int64(61526)}
Accuracy: 60.22%


In [108]:
model = LogisticRegression()
# X = X.reshape(-1,1)
df['LogReg_pred'] = train_and_predict(X, y, model)

LogisticRegression()
Distribution: {np.int64(1): np.int64(61526)}
Accuracy: 60.22%


In [109]:
model = DecisionTreeClassifier()
df['DTreeClass_pred'] =train_and_predict(X, y, model)

DecisionTreeClassifier()
Distribution: {np.int64(0): np.int64(31), np.int64(1): np.int64(61495)}
Accuracy: 60.23%


Basic feature engineering

In [110]:
vowels = set('aeiouyAEIOUY')

df['starts_with_vowel'] = df['name'].map(lambda x: x[0] in vowels).astype('int')
df['ends_with_vowel'] = df['name'].map(lambda x: x[-1] in vowels).astype('int')

df['vowels_qty'] = df['name'].apply(lambda name: sum(1 for x in name if x in vowels))

df.sample(10)

Unnamed: 0,name,sex,occurence,target_value,name_length,DummyClassifier_pred,LogReg_pred,DTreeClass_pred,starts_with_vowel,ends_with_vowel,vowels_qty
14718,ZIHUI,woman,3,0,5,1,1,1,0,1,3
2303,TRACY,woman,42,0,5,1,1,1,0,1,2
15961,EUNSUK,woman,3,0,6,1,1,1,1,0,3
4437,MIKOLAI,man,26,1,7,1,1,1,0,1,4
13493,QUE,man,5,1,3,1,1,1,0,1,2
16893,THỊ THANH HOA,woman,2,0,13,1,1,1,0,1,3
25218,VAN NGO,man,2,1,7,1,1,1,0,1,2
1206,DANNY,man,183,1,5,1,1,1,0,1,2
30988,NEMATKHON,man,2,1,9,1,1,1,0,0,3
9583,NGOC HA,woman,5,0,7,1,1,1,0,1,2


In [115]:
model = LogisticRegression()
features = ['name_length','starts_with_vowel', 'ends_with_vowel', 'vowels_qty']

X = df[features]
train_and_predict(X, y, model)


LogisticRegression()
Distribution: {np.int64(0): np.int64(28312), np.int64(1): np.int64(33214)}
Accuracy: 70.96%


array([1, 1, 1, ..., 0, 0, 1])

In [116]:
model = DecisionTreeClassifier()
train_and_predict(X, y, model)

DecisionTreeClassifier()
Distribution: {np.int64(0): np.int64(27145), np.int64(1): np.int64(34381)}
Accuracy: 71.64%


array([1, 1, 1, ..., 0, 0, 1])

In [117]:
from xgboost import XGBClassifier

model = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='binary:logistic')
train_and_predict(X, y, model)


XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=2, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=2, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)
Distribution: {np.int64(0): np.int64(28382), np.int64(1): np.int64(33144)}
Accuracy: 71.00%


array([1, 1, 1, ..., 0, 0, 1])