## Bibliothèques

In [None]:
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from google.colab import files
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import numpy as np

## Import fichiers

In [None]:
def _load_retention_frames(csv_path, target, dropped_columns):
    """Load one retention export and build its filtered and standardised views.

    Returns a tuple ``(raw, kept, standardised, fitted_scaler)`` where

    * ``raw`` is the CSV indexed by ``distinct_id``;
    * ``kept`` is ``raw`` without the rows whose ``target`` equals 0 (removed
      by index label) and without ``dropped_columns``;
    * ``standardised`` holds the StandardScaler-transformed feature columns of
      ``kept`` followed by the untouched ``target`` column;
    * ``fitted_scaler`` is the StandardScaler fitted on those features.
    """
    raw = pd.read_csv(csv_path)
    raw.set_index("distinct_id", inplace=True)

    # Rows are removed by index label, exactly like DataFrame.drop(<index>).
    zero_target_ids = raw.index[(raw[target] == 0).to_numpy()]
    kept = raw.drop(index=zero_target_ids).drop(columns=dropped_columns)

    features = kept.drop(columns=[target])
    fitted_scaler = StandardScaler()
    standardised = pd.DataFrame(
        fitted_scaler.fit_transform(features),
        index=kept.index,
        columns=features.columns,
    )
    # The target is appended last and left unscaled.
    standardised[target] = kept[target]
    return raw, kept, standardised, fitted_scaler


df1, df1bis, df1_standard, scaler = _load_retention_frames(
    "/content/Data to analyse - 1 month retention.csv",
    'retention 1 month',
    ['email', 'created', 'created_day'],
)

df3, df3bis, df3_standard, scaler = _load_retention_frames(
    "/content/Data to analyse - 3 months retention.csv",
    'Retention 3 months',
    ['email', 'created'],
)

## RegLog 1 month - En pause, peut-être utile à l'avenir

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the target is the last column of df1_standard.
train_data1, test_data1, train_target1, test_target1 = train_test_split(
    df1_standard.iloc[:, :-1],
    df1_standard.iloc[:, -1],
    test_size=0.2,
    random_state=0,
)

# Fit a logistic regression on the training split, then predict the held-out rows.
model = LogisticRegression().fit(train_data1, train_target1)
predictions1 = model.predict(test_data1)

# Report, in order: confusion matrix, per-class report, overall accuracy.
for report in (confusion_matrix, classification_report, accuracy_score):
    print(report(test_target1, predictions1))


## RegLog 3 months - En pause, peut-être utile à l'avenir

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the target is the last column of df3_standard.
train_data3, test_data3, train_target3, test_target3 = train_test_split(
    df3_standard.iloc[:, :-1],
    df3_standard.iloc[:, -1],
    test_size=0.2,
    random_state=0,
)

# Fit a logistic regression on the training split, then predict the held-out rows.
model = LogisticRegression().fit(train_data3, train_target3)
predictions3 = model.predict(test_data3)

# Report, in order: confusion matrix, per-class report, overall accuracy.
for report in (confusion_matrix, classification_report, accuracy_score):
    print(report(test_target3, predictions3))

## KBest 1 month

**The KBest method is used in machine learning to select the best variables to predict a target variable.**

In our case, we want to predict retention, which is our target variable. This method uses several indicators to measure the correlation between the variables and the target.

Since all our variables are numeric, the KBest method will apply an analysis of variance method (**f_classif**). It measures the variance between groups of values and calculates an F score that measures the difference between groups. The higher the score, the more important the variable is considered for predicting the target variable.

Once this score is calculated for each variable, we can determine which ones have the most influence on the prediction.

In [None]:
# Score every standardised feature against the 1-month target with the ANOVA F-test.
y = df1_standard['retention 1 month']
X = df1_standard.drop(columns=['retention 1 month'])
k_best = SelectKBest(f_classif, k='all').fit(X, y)

# One row per feature, highest F-score first.
scores1 = pd.DataFrame(
    {'Variable': X.columns, 'Score': k_best.scores_}
).sort_values('Score', ascending=False)
print(scores1)

Thanks to this analysis using the KBest method, we can consider "Leaderboard related pageview", "challenge entered (7 days)", "Challenge completed (7 days)" and "Activity != Walking (7 days)" (in this order) as the most important variables in predicting retention.

## KBest 3 months

In [None]:
# Score every standardised feature against the 3-month target with the ANOVA F-test.
y = df3_standard['Retention 3 months']
X = df3_standard.drop(columns=['Retention 3 months'])
k_best = SelectKBest(f_classif, k='all').fit(X, y)

# One row per feature, highest F-score first.
scores3 = pd.DataFrame(
    {'Variable': X.columns, 'Score': k_best.scores_}
).sort_values('Score', ascending=False)
print(scores3)

## PCA

The PCA algorithm works by finding the directions of maximum variance in the original dataset and projecting the data onto these new axes. The first principal component is the direction with the highest variance, followed by the second principal component, which is orthogonal to the first and has the second highest variance, and so on. Each principal component is a linear combination of the original features, and together they account for the total variance in the data.

Once the principal components have been computed, the number of components to retain can be selected based on the amount of variance that they explain. This can be done by looking at the eigenvalues of the covariance matrix, which represent the amount of variance in each principal component. Components with high eigenvalues are considered to be more important and are retained, while components with low eigenvalues can be discarded.

In [None]:
# Two-component PCA on the standardised 1-month features (target excluded).
y = df1_standard['retention 1 month']
X = df1_standard.drop(columns=['retention 1 month'])
pca = PCA(n_components=2).fit(X)
X_pca = pca.transform(X)

# Cell output: loading of each original feature on PC1 and PC2.
pd.DataFrame(pca.components_, index=['PC1', 'PC2'], columns=X.columns)

Thanks to this PCA, we can identify the variables that contribute the most to the total variance, which we therefore treat as the most significant ones. In other words, they are the ones whose variation will have the most impact on the variation of the others. Indeed, the coefficients above correspond to the respective contribution of each variable to the two principal components (aX1 + bX2 + etc...). The largest coefficients in absolute value are therefore those of the most important variables, regardless of whether we consider PC1 or PC2. So, for example, if a variable has a large coefficient for both PC1 and PC2, it is very important. If another has a strong coefficient for PC1 and a weak one for PC2, it is important, just as it would be if PC1 and PC2 were reversed.

We can therefore extract the same variables as before as being predominant: "Leaderboard related pageview", "challenge entered (7 days)", "Challenge completed (7 days)" and "Activity != Walking (7 days)".

However, these results are not perfect, because although these variables are the most significant, the differences with the other variables remain quite small.

## Corrélations 1 month

In [None]:
# Pearson correlation of every column of df1bis with the 1-month target.
correlation_matrix1 = df1bis.corr(method='pearson')
correlations1 = correlation_matrix1["retention 1 month"]

# Show the results
print(correlations1)

Using the .corr() function, we use a basic function from the pandas library to measure the correlation between our variables and the target. This function is similar to the KBest method, but uses the Pearson correlation coefficient. The latter is suitable for continuous variables, but is also relevant in our case, with numeric variables. Once again, the results are similar to the previous ones: "Leaderboard related pageview", "challenge entered (7 days)", "Challenge completed (7 days)", and "Activity != Walking (7 days)" are the variables most correlated to the target according to this method.

In [None]:
# Rank the 1-month features with a random forest.
# Split the data into X (features) and y (target variable)
X1bis = df1bis.drop(columns="retention 1 month")
y1bis = df1bis["retention 1 month"]

# Train a Random Forest regressor model.
# The seed is fixed (as for the other seeded estimators in this notebook) so
# that the importance ranking is the same on every run; an unseeded forest
# produces slightly different importances each time the cell is executed.
rf1 = RandomForestRegressor(random_state=0)
rf1.fit(X1bis, y1bis)

# Impurity-based importances, one row per feature, most important first
feature_importances1 = pd.DataFrame(
    rf1.feature_importances_,
    index=X1bis.columns,
    columns=['importance'],
).sort_values('importance', ascending=False)

# Print the feature importances
print(feature_importances1)

The "**feature_importances_**" attribute gives the relative importance of each variable in the fitted model, that is, the contribution of each variable to the model's predictions. It estimates the importance of each column by computing the reduction in average impurity obtained when using that variable to split the decision trees in the ensemble (impurity is measured by the squared error for a regressor such as the RandomForestRegressor used here, and by the Gini index or entropy for a classifier). More specifically, it is **the average reduction in impurity, weighted by the number of observations reaching each node, for each feature**.

## Corrélations 3 months

In [None]:
# Correlation (pandas default method) of every column of df3bis with the 3-month target.
correlation_matrix3 = df3bis.corr()
correlations3 = correlation_matrix3["Retention 3 months"]

# Show the results
print(correlations3)

In [None]:
# Rank the 3-month features with a random forest.
# Split the data into X (features) and y (target variable)
X3bis = df3bis.drop(columns="Retention 3 months")
y3bis = df3bis["Retention 3 months"]

# Train a Random Forest regressor model.
# The seed is fixed (as for the other seeded estimators in this notebook) so
# that the importance ranking is the same on every run; an unseeded forest
# produces slightly different importances each time the cell is executed.
rf3 = RandomForestRegressor(random_state=0)
rf3.fit(X3bis, y3bis)

# Impurity-based importances, one row per feature, most important first
feature_importances3 = pd.DataFrame(
    rf3.feature_importances_,
    index=X3bis.columns,
    columns=['importance'],
).sort_values('importance', ascending=False)

# Print the feature importances
print(feature_importances3)

## Multivariate Regression 1 month

In [None]:
# Ordinary least squares of the 1-month target on the standardised features.
ymv1 = df1_standard['retention 1 month']
Xmv1 = df1_standard.drop(columns=['retention 1 month'])

# Create and fit the model in one step (fit returns the estimator itself).
model = LinearRegression().fit(Xmv1, ymv1)

# One row per feature with its regression coefficient, largest first.
coefficients = pd.DataFrame({'Variable': Xmv1.columns, 'Coefficients': model.coef_})
print(coefficients.sort_values(by='Coefficients', ascending=False))

Here we have created a 'multivariate regression' model, in order to highlight the importance of each variable in the prediction of the target. The regression model used returns a function of the form: y = aX1 + bX2 + ... + zXn. Each Xi corresponds to our different variables, and the coefficient preceding them to their importance in the prediction model. Thus, we obtain for each variable its importance. As in the case of the other models, we find a strong predominance of "Leaderboard related pageview", "Challenge entered (7 days)", "Challenge completed (7 days)" and "Activity != Walking (7 days)".

## k-means clustering 1 month

In [None]:
# Retained users only (rows whose 1-month retention is 0 are removed by index
# label), without the identifying / date columns.
df1cluster = df1.copy()
df1cluster.drop(['email', 'created', 'created_day'], axis=1, inplace=True)
df1cluster = df1cluster.drop(df1cluster[df1cluster['retention 1 month'] == 0].index)
df1cluster_test = df1cluster.copy()

# Per-feature medians (the *_avg names are historical: these are medians).
challE_avg = df1cluster['challenge entered (7 days)'].median()
challC_avg = df1cluster['challenge completed (7 days)'].median()
article_avg = df1cluster['article read (7 days)'].median()
interaction_avg = df1cluster['interaction created (7 days)'].median()
leaderboard_avg = df1cluster['leaderboard related pageview'].median()
activity_avg = df1cluster['activity != Walking (7 days)'].median()
track_avg = df1cluster['tracking app (7 days)'].median()
club_avg = df1cluster['club joined (7 days)'].median()

# Threshold used to binarise each feature: 1 when value >= threshold, else 0.
binarisation_thresholds = {
    'challenge entered (7 days)': challE_avg,
    'challenge completed (7 days)': challC_avg,
    'article read (7 days)': article_avg,
    'interaction created (7 days)': interaction_avg,
    'leaderboard related pageview': leaderboard_avg,
    'activity != Walking (7 days)': activity_avg,
    # The original code compares this feature against the literal 1 and never
    # uses track_avg. NOTE(review): confirm that this is intentional.
    'tracking app (7 days)': 1,
    'club joined (7 days)': club_avg,
}

for column, threshold in binarisation_thresholds.items():
    # Both masks are computed from the untouched df1cluster values. Computing
    # the "< threshold" mask on the column after it has already been partly
    # overwritten with 1 (as the previous version did) resets every row to 0
    # as soon as the threshold is greater than 1.
    raw_values = df1cluster[column]
    df1cluster_test.loc[raw_values >= threshold, column] = 1
    df1cluster_test.loc[raw_values < threshold, column] = 0

# Standardise the binarised features; the target is appended last, unscaled.
scaler = StandardScaler()
df1_standard_cluster = scaler.fit_transform(df1cluster_test.drop(columns=['retention 1 month']))
df1_standard_cluster = pd.DataFrame(df1_standard_cluster, index=df1cluster_test.index, columns=df1cluster_test.columns.drop('retention 1 month'))
df1_standard_cluster['retention 1 month'] = df1cluster_test['retention 1 month']
print(df1cluster_test)

In [None]:
## Elbow method
# Take every column of df1_standard as a plain array.
# NOTE(review): this includes the 'retention 1 month' column that the import
# cell appends to df1_standard -- confirm that clustering on the target is intended.
X = df1_standard.values

# Standardise all columns
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Values of K to try
K = range(1, 10)

# Within-cluster sum of squares (inertia) obtained for each K
inertias = []

for k in K:
    # One KMeans model per K. The seed is fixed, like the other KMeans calls
    # in this notebook, so that the elbow curve is identical on every run.
    model = KMeans(n_clusters=k, random_state=0)

    # Fit the model to the data
    model.fit(X)

    # Record the inertia for this K
    inertias.append(model.inertia_)

# Plot inertia against K
plt.plot(K, inertias, 'bx-')
plt.xlabel('Nombre de clusters (K)')
plt.ylabel('Variance intra-cluster')
plt.title("Méthode du coude pour déterminer le nombre optimal de clusters")
plt.show()

In [None]:
## Clustering
# Number of clusters
k = 3

# Cluster on a single column: the one at position 8 of df1_standard
cols = [df1_standard.columns[8]]
data = df1_standard[cols]

# Build and fit the model in one step (fit returns the estimator itself)
kmeans = KMeans(n_clusters=k, random_state=0).fit(data)

# Store the label of every row in df1bis
df1bis["retention1_cluster"] = kmeans.labels_

# Per-cluster mean of every column
stats = df1bis.groupby('retention1_cluster').agg(['mean'])
print(stats)

# Cell output: number of rows in each cluster
comptage = df1bis['retention1_cluster'].value_counts()
comptage

In [None]:
# Start again from the raw 1-month data without the identifying / date columns.
df1cluster = df1.drop(columns=['email', 'created', 'created_day'])

# Successively remove (by index label) the users matching each exclusion rule.
exclusion_rules = (
    ('retention 1 month', lambda values: values == 0),
    ('challenge entered (7 days)', lambda values: values > 2),
    ('leaderboard related pageview', lambda values: values > 2),
)
for column, is_excluded in exclusion_rules:
    excluded_ids = df1cluster[is_excluded(df1cluster[column])].index
    df1cluster = df1cluster.drop(excluded_ids)

# Standardise the features; the target is appended last, unscaled.
cluster_features = df1cluster.drop(columns=['retention 1 month'])
scaler = StandardScaler()
df1_standard_cluster = pd.DataFrame(
    scaler.fit_transform(cluster_features),
    index=df1cluster.index,
    columns=cluster_features.columns,
)
df1_standard_cluster['retention 1 month'] = df1cluster['retention 1 month']

## Clustering
# Number of clusters
k = 4

# Cluster on the columns at positions 2, 3, 5 and 7 of df1_standard_cluster
cols = [df1_standard_cluster.columns[position] for position in (2, 3, 5, 7)]
data = df1_standard_cluster[cols]

# Build and fit the model in one step (fit returns the estimator itself)
kmeans = KMeans(n_clusters=k, random_state=0).fit(data)

# Store the label of every row in df1cluster
df1cluster["retention1_cluster"] = kmeans.labels_

# Per-cluster mean of every column, printed and exported to CSV
stats = df1cluster.groupby('retention1_cluster').agg(['mean'])
print(stats)
stats.to_csv('stats_retention_cluster.csv', index=True)

# Cell output: number of rows in each cluster
comptage = df1cluster['retention1_cluster'].value_counts()
comptage

In [None]:
from sklearn.cluster import SpectralClustering

# Number of clusters
k = 3

# Spectral clustering on a nearest-neighbours affinity graph.
# The seed is fixed, like the KMeans calls in this notebook, so that the
# labels (and the statistics printed below) are the same on every run.
model = SpectralClustering(n_clusters=k, affinity='nearest_neighbors', random_state=0)

# Cluster every column of df1_standard.
# NOTE(review): this includes the 'retention 1 month' column that the import
# cell appends to df1_standard -- confirm that clustering on the target is intended.
clusters = model.fit_predict(df1_standard)

# Store the label of every row in df1bis
df1bis['cluster'] = clusters

# Per-cluster mean of every column, then the size of each cluster
stats = df1bis.groupby('cluster').agg(['mean'])
print(stats)
comptage = df1bis['cluster'].value_counts()
print(comptage)

In [None]:
from scipy.cluster.hierarchy import linkage, fcluster

# Cluster on every column of df1_standard except the last one
data = df1_standard.iloc[:, :-1].values

# Ward hierarchical clustering. Despite its name, dist_matrix holds the
# linkage matrix returned by scipy, not a pairwise distance matrix.
dist_matrix = linkage(data, method='ward')

# Maximum number of flat clusters to cut the hierarchy into
k = 6
clusters = fcluster(dist_matrix, t=k, criterion='maxclust')

# Store the label of every row in df1bis
df1bis['cluster'] = clusters

# Per-cluster mean of every column, then the size of each cluster
stats = df1bis.groupby('cluster').agg(['mean'])
print(stats)
comptage = df1bis['cluster'].value_counts()
print(comptage)

## WoE & IV

The following code allows us to measure several indicators: First the weight of evidence (WoE) intra-variable, the WoE on whole variables, and the information value (IV).

**Let's start with the within-variable WoE**. The algorithm splits each variable into quantile-based segments, i.e. segments that contain approximately the same number of users. Then, for each of these segments, we compute the logarithm of the ratio between the users of this segment who are active after one month and those who are not. We shift this logarithm by log(7/3) so that, in the end, it is positive only when more than 70% of the users in the segment are active after one month. This allows us to set threshold values for each variable, i.e. to identify for which values at least 70% of the users remain active after one month.

**The total WoE** is the sum of the WoE of each segment of a variable. It is not very valuable because it does not give any information about the threshold values we are looking for. Nevertheless, it allows us to have an idea of the importance of a variable, but the measure is less relevant here than the ones performed previously.

**It is the same case for the IV**, which we will not exploit either.

In [None]:
# Target variable of the WoE / IV analysis
target = 'retention 1 month'

# Work on a copy of df1bis. The clustering cells append their label columns
# ('retention1_cluster', 'cluster') to df1bis; they are removed here (when
# present) so that they are neither mistaken for the target nor used as
# predictors.
df100 = df1bis.drop(columns=['retention1_cluster', 'cluster'], errors='ignore').copy()

# Binarise the target, selected by name rather than by position (the target
# is not guaranteed to be the last column): strictly positive values become 1,
# other values are left unchanged.
df100[target] = df100[target].apply(lambda x: 1 if x > 0 else x)
df100bis = df100.copy()

# Every remaining column except the target is a predictor
predictors = [column for column in df100.columns if column != target]

# Compute the WoE and the IV of one predictor
def woe_iv(df, var, target, bins=30):
    """Compute a shifted weight of evidence (WoE) and the IV of one variable.

    The variable is split into at most ``bins`` quantile-based intervals
    (duplicated quantile edges are merged). For each interval, the rows with
    ``target == 1`` (positives) and ``target == 0`` (negatives) are counted and
    0.5 is added to both counts so that no ratio is zero or infinite.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding ``var`` and ``target``. Side effect: a categorical column
        named ``var + '_bin'`` with the interval label of each row is added to
        (or overwritten in) ``df``.
    var : str
        Name of the numeric column to analyse.
    target : str
        Name of the column holding the 0/1 outcome.
    bins : int, default 30
        Requested number of quantile intervals.

    Returns
    -------
    woe : float
        Sum over the intervals of ``log(pos / neg) - log(7 / 3)``.
    iv : float
        Sum over the intervals of ``woe_i * (pos_i / sum(pos) - neg_i / sum(neg))``.
    woe_dict : dict
        Maps ``"<lower edge>-<upper edge>"`` to the WoE of that interval.
    """
    bin_col = var + '_bin'

    # Quantile edges of the variable
    cut_points = pd.qcut(df[var], q=bins, duplicates='drop', retbins=True)[1]
    if len(cut_points) < 2:
        cut_points = [-np.inf, np.inf]
    labels = list(range(len(cut_points) - 1))
    n_bins = len(labels)

    # Interval label of every row (labels are 0 .. n_bins - 1, in edge order)
    df[bin_col] = pd.cut(df[var], bins=cut_points, labels=labels, include_lowest=True)

    # Count positives and negatives per interval directly from the category
    # codes. Every interval gets an entry even when it holds no positive or no
    # negative row, so the result does not depend on the default value of
    # groupby(observed=...) and never contains NaN or a missing interval.
    codes = df[bin_col].cat.codes.to_numpy().astype(np.intp)
    in_a_bin = codes >= 0  # code -1 marks rows that fall in no interval (NaN)
    outcome = df[target].to_numpy()
    # The 0.5 offset avoids zero or infinite ratios
    pos = np.bincount(codes[in_a_bin & (outcome == 1)], minlength=n_bins) + 0.5
    neg = np.bincount(codes[in_a_bin & (outcome == 0)], minlength=n_bins) + 0.5

    # log(7/3) shifts the WoE so that it is positive only when more than 70%
    # of the (smoothed) rows of the interval are positives
    woe_per_bin = np.log(pos / neg) - np.log(7 / 3)
    iv_per_bin = woe_per_bin * (pos / pos.sum() - neg / neg.sum())

    # Totals for the variable
    woe = woe_per_bin.sum()
    iv = iv_per_bin.sum()

    # WoE of each interval, keyed by its edges
    woe_dict = {
        str(lower) + '-' + str(upper): bin_woe
        for lower, upper, bin_woe in zip(cut_points[:-1], cut_points[1:], woe_per_bin)
    }

    return woe, iv, woe_dict

# Per-variable results: WoE of each interval, total WoE, and IV
woe_dict, woe_dictbis, iv_dict = {}, {}, {}

for var in predictors:
    woe, iv, woe_dicti = woe_iv(df100, var, target)
    woe_dict[var], woe_dictbis[var], iv_dict[var] = woe_dicti, woe, iv

# Show the results: per-interval WoE (with the number of intervals) first
print('WoE chaque feature:')
for k, v in woe_dict.items():
    print(f"{k} {v}\n{len(v)}")

# ... then the total WoE and the IV of each variable
for section_title, section_values in (('WoE:', woe_dictbis), ('IV:', iv_dict)):
    print(section_title)
    for k, v in section_values.items():
        print(k, v)

Here we find an algorithm that for each variable, calculates the minimum value from which 60% or more of the users with a value (for this variable) greater than or equal to the minimum value are still active after one month.

In [None]:
# Target column of df100bis (all lower case, as everywhere else in the
# notebook) and the share of active users to reach, in percent. The threshold
# is defined once so that the test and the printed message cannot disagree.
colonne_cible = 'retention 1 month'
seuil_pourcentage = 60

resultats = {}

# For every column except the target
for nom_colonne in df100bis.columns:
    if nom_colonne == colonne_cible:
        continue

    # Go through the distinct values of the column in increasing order
    for valeur in sorted(df100bis[nom_colonne].unique()):
        # Rows whose value is greater than or equal to the candidate value
        lignes_selectionnees = df100bis[df100bis[nom_colonne] >= valeur]

        # Mean of the target over those rows, as a percentage
        pourcentage = lignes_selectionnees[colonne_cible].mean() * 100

        # Keep the first (smallest) value that reaches the threshold
        if pourcentage >= seuil_pourcentage:
            resultats[nom_colonne] = valeur
            break

# Show the results
print(f"To reach {seuil_pourcentage}% of active users after 1 month :")
for key, value in resultats.items():
    print("minimum amount of " + key + ": " + str(value))