In [1]:
pip install statsmodels




### Implémentation de la régression binomiale négative

In [4]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

In [6]:
# Génération de données simulées
np.random.seed(42)
n = 1000  # Nombre d'observations
X1 = np.random.randn(n)
X2 = np.random.randn(n)
X3 = np.random.randn(n)

In [8]:
# Génération des taux moyens en utilisant une combinaison linéaire
mu = np.exp(1 + 0.5 * X1 - 0.3 * X2 + 0.7 * X3)

In [10]:
# Génération de la variable réponse selon une loi binomiale négative
alpha = 2  # Paramètre de dispersion
Y = np.random.negative_binomial(n=1/alpha, p=1/(1 + alpha * mu), size=n)

In [12]:
# Création d'un DataFrame
data = pd.DataFrame({"X1": X1, "X2": X2, "X3": X3, "Y": Y})

In [14]:
data

Unnamed: 0,X1,X2,X3,Y
0,0.496714,1.399355,-0.675178,3
1,-0.138264,0.924634,-0.144519,0
2,0.647689,0.059630,-0.792420,1
3,1.523030,-0.646937,-0.307962,6
4,-0.234153,0.698223,-1.893615,0
...,...,...,...,...
995,-0.281100,1.070150,0.077481,2
996,1.797687,-0.026521,0.257753,0
997,0.640843,-0.881875,-1.241761,1
998,-0.571179,-0.163067,0.334176,1


In [16]:
# Séparation des données
X = data[["X1", "X2", "X3"]]
y = data["Y"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
X

Unnamed: 0,X1,X2,X3
0,0.496714,1.399355,-0.675178
1,-0.138264,0.924634,-0.144519
2,0.647689,0.059630,-0.792420
3,1.523030,-0.646937,-0.307962
4,-0.234153,0.698223,-1.893615
...,...,...,...
995,-0.281100,1.070150,0.077481
996,1.797687,-0.026521,0.257753
997,0.640843,-0.881875,-1.241761
998,-0.571179,-0.163067,0.334176


In [20]:
y

0      3
1      0
2      1
3      6
4      0
      ..
995    2
996    0
997    1
998    1
999    1
Name: Y, Length: 1000, dtype: int32

In [22]:
# Ajout d'une constante pour le modèle
X_train_const = sm.add_constant(X_train)
X_test_const = sm.add_constant(X_test)

In [24]:
X_train_const

Unnamed: 0,const,X1,X2,X3
29,1.0,-0.291694,-1.022793,-0.491636
535,1.0,0.047399,-1.594703,-0.295090
695,1.0,-0.309546,1.938929,1.431367
557,1.0,-0.432558,-0.803179,0.003376
836,1.0,1.550500,-1.143726,1.904137
...,...,...,...,...
106,1.0,1.886186,0.612774,-0.900621
270,1.0,1.441273,1.091310,0.347676
860,1.0,0.202923,-0.134309,-0.531214
435,1.0,0.074095,0.282580,-0.369527


In [26]:
X_test_const

Unnamed: 0,const,X1,X2,X3
521,1.0,0.543360,0.582098,2.985259
737,1.0,0.982691,-1.067803,0.342338
740,1.0,-1.840874,-0.104449,0.414866
660,1.0,-0.573662,1.317115,-1.175595
411,1.0,-1.124642,0.556230,1.537932
...,...,...,...,...
408,1.0,0.120296,-0.322680,0.811397
332,1.0,0.075805,-0.213443,-1.517874
208,1.0,0.515048,-1.402605,-0.313058
613,1.0,-1.125489,-0.077837,-0.006521


In [28]:
# Ajustement du modèle de régression binomiale négative
model = sm.GLM(y_train, X_train_const, family=sm.families.NegativeBinomial())
result = model.fit()



In [30]:
# Résumé des résultats
print(result.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                      Y   No. Observations:                  800
Model:                            GLM   Df Residuals:                      796
Model Family:        NegativeBinomial   Df Model:                            3
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -1748.4
Date:                Tue, 25 Mar 2025   Deviance:                       1247.1
Time:                        09:21:48   Pearson chi2:                 1.42e+03
No. Iterations:                     7   Pseudo R-squ. (CS):             0.4355
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.9611      0.044     21.875      0.0

In [32]:
# Prédictions sur l'ensemble de test
y_pred = result.predict(X_test_const)

In [34]:
y_pred

521    23.719413
737     7.213904
740     1.465082
660     0.601289
411     3.819521
         ...    
408     5.362360
332     0.989238
208     3.969250
613     1.533918
78      1.640233
Length: 200, dtype: float64

In [36]:
# Affichage des premières prédictions
print("Prédictions :", y_pred[:5])

Prédictions : 521    23.719413
737     7.213904
740     1.465082
660     0.601289
411     3.819521
dtype: float64


Explication du code
Génération des données :

Trois variables explicatives (X1, X2, X3) sont générées aléatoirement.

La moyenne mu suit une fonction exponentielle pour garantir des valeurs positives.

La variable réponse Y suit une loi binomiale négative.

Ajustement du modèle :

Utilisation de la classe GLM de statsmodels avec la famille NegativeBinomial().

Affichage des coefficients et des statistiques du modèle.

Prédiction :

Le modèle est testé sur un jeu de données séparé (X_test).

On obtient des prédictions des valeurs de Y.