In [1]:
import json
import pandas as pd

from google.oauth2 import service_account

### Dataset import

In [2]:
with open("ensai-2024-81c8c40b7933.json") as source:
    info = json.load(source)
credentials = service_account.Credentials.from_service_account_info(info)

prenoms = pd.read_gbq("ml.prenoms", project_id="ensai-2024", credentials=credentials)

In [3]:
prenoms.head()

Unnamed: 0,sexe,preusuel,annais,dpt,nombre
0,1,_PRENOMS_RARES,1900,13,37
1,1,_PRENOMS_RARES,1900,57,37
2,1,_PRENOMS_RARES,1900,59,44
3,1,_PRENOMS_RARES,1900,62,34
4,1,_PRENOMS_RARES,1900,75,45


In [7]:
prenoms.shape

(3784673, 5)

### Data pre-processing

In [4]:
prenoms.isna().sum()

sexe        0
preusuel    0
annais      0
dpt         0
nombre      0
dtype: int64

In [40]:
preprocessing = prenoms[prenoms['preusuel'] != "_PRENOMS_RARES"]
preprocessing = prenoms[prenoms["preusuel"].str.len() > 2]
preprocessing.shape

(3784163, 5)

In [42]:
preprocessing.loc[:, 'nombre'] = preprocessing['nombre'].astype(int)

In [43]:
preprocessing.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3784163 entries, 0 to 3784672
Data columns (total 5 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   sexe      object
 1   preusuel  object
 2   annais    object
 3   dpt       object
 4   nombre    int64 
dtypes: int64(1), object(4)
memory usage: 173.2+ MB


In [44]:
# Count occurences of annais and dpt by preusuel
preprocessing = preprocessing.groupby(['preusuel', 'sexe']).agg({
    'nombre' : sum
}).reset_index()

In [56]:
df = (preprocessing
      .iloc[preprocessing.groupby("preusuel")["nombre"].idxmax()]
      .drop(["nombre"], axis=1)
      )

### Feature engineering

In [52]:
import numpy as np

def encode_prenom(prenom):
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    vector_size = len(alphabet)
    
    # Initialiser un vecteur de zéros de la taille de l'alphabet
    vector = np.zeros(vector_size, dtype=int)
    
    # Convertir le prénom en minuscules pour éviter la sensibilité à la casse
    prenom = prenom.lower()
    
    # Remplir le vecteur en fonction des lettres présentes dans le prénom
    for letter in prenom:
        if letter in alphabet:
            index = alphabet.index(letter)
            vector[index] += 1
    
    return pd.Series(vector, index=list(alphabet))

In [59]:
X = df['preusuel'].apply(encode_prenom)

In [64]:
y = df["sexe"].astype(int) - 1

In [65]:
X.shape, y.shape

((36113, 26), (36113,))

### Model building

In [68]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


# Model Selection
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X, y)

In [70]:
from sklearn.linear_model import LogisticRegression

regL = LogisticRegression(random_state=42)

# Train the model
regL.fit(X, y)

In [81]:
regL.predict([encode_prenom("Cyril")])



array([0])