In [92]:
%pip install pandas numpy matplotlib seaborn scipy scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [93]:
import matplotlib.pyplot as plt
from matplotlib.axes import Axes
from matplotlib.figure import Figure
from scipy.io.arff import MetaData, loadarff
from pandas import DataFrame, Series, crosstab
from numpy import ndarray
from numpy.random import shuffle
import numpy as np
import pandas as pd
from IPython.display import display, Markdown
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer, KNNImputer
import seaborn as sns
import sys

EPSILON: float = sys.float_info.epsilon
NL='\n'
TAB='\t'

# Récupération et traitement des données (cf. EDA)

In [94]:
# Chargement du jeu de données
data: ndarray
meta: MetaData
data, meta = loadarff('speeddating.arff')

original_df: DataFrame = DataFrame(data)
# Conversion des types des colonnes
for col in original_df.select_dtypes([object]):
    try:
        original_df[col] = original_df[col].astype(int)
    except:
        try:
            original_df[col] = original_df[col].astype(float)
        except:
            original_df[col] = original_df[col].astype(str)

# Correction des noms de colonnes
original_df.rename(
    columns={
        'sinsere_o': 'sincere_o',
        'intellicence_important': 'intelligence_important',
        'ambition': 'ambitious',
        'ambtition_important': 'ambitious_important',
        'ambition_partner': 'ambitious_partner',
        'ambitous_o': 'ambitious_o'
    },
    inplace=True
)

# Récupération des colonnes utiles
columns: list[str] = ['age', 'age_o', 'ambitious', 'ambitious_important', 'ambitious_o', 'ambitious_partner', 'art', 'attractive', 'attractive_important', 'attractive_o', 'attractive_partner', 'clubbing', 'concerts', 'd_age', 'decision', 'decision_o', 'dining', 'exercise', 'expected_happy_with_sd_people', 'expected_num_matches', 'funny', 'funny_important', 'funny_o', 'funny_partner', 'gaming', 'gender', 'guess_prob_liked', 'hiking', 'importance_same_race', 'importance_same_religion', 'intelligence', 'intelligence_important', 'intelligence_o', 'intelligence_partner', 'interests_correlate', 'like', 'match', 'met', 'movies', 'music', 'pref_o_ambitious', 'pref_o_attractive', 'pref_o_funny', 'pref_o_intelligence', 'pref_o_shared_interests', 'pref_o_sincere', 'race', 'race_o', 'reading', 'samerace', 'shared_interests_important', 'shared_interests_o', 'shared_interests_partner', 'shopping', 'sincere', 'sincere_important', 'sincere_o', 'sincere_partner', 'sports', 'theater', 'tv', 'tvsports', 'yoga' ]
df: DataFrame = original_df[columns].copy()

# Remplacement des catégories '?' par 'Other'
df.loc[df['race'] == '?', 'race'] = 'Other'
df.loc[df['race_o'] == '?', 'race_o'] = 'Other'

# Bornage des valeurs a 10 dans les activités 'gaming' et 'reading'
df.loc[df['gaming'] > 10, 'gaming'] = 10
df.loc[df['reading'] > 10, 'reading'] = 10

# Correction des valeurs d'ages
df.loc[df['age_o'].isna() & df['age'].notna(), 'd_age'] = float('nan')
df.loc[df['age_o'].notna() & df['age'].isna(), 'd_age'] = float('nan')
df.loc[df['age_o'].isna() & df['age'].isna(), 'd_age'] = float('nan')

mean_d_age: float = df['d_age'].describe()['mean']
df.loc[df['d_age'].isna(), 'd_age'] = mean_d_age

df.loc[df['age_o'].isna() & df['age'].notna(), 'age_o'] = df['age'] + df['d_age']
df.loc[df['age_o'].notna() & df['age'].isna(), 'age'] = df['age_o'] + df['d_age']
df.loc[df['age_o'].isna() & df['age'].isna(), ['age', 'age_o']] = [df['age'].describe()['mean'], df['age'].describe()['mean'] + mean_d_age]

df.drop(columns=['match'], inplace=True)

# Imputation des données (cf. ml_impute)

In [95]:
transformer = ColumnTransformer(
    transformers=[
		('race', SimpleImputer(strategy='constant', fill_value='Other'), ['race', 'race_o']),
		('hobbies', SimpleImputer(strategy='constant', fill_value=0), ['art', 'clubbing', 'concerts', 'dining', 'exercise', 'gaming', 'hiking', 'movies', 'music', 'reading', 'shopping', 'sports', 'theater', 'tv', 'tvsports', 'yoga']),
		('met', SimpleImputer(strategy='most_frequent'), ['met']),
		('criterion', SimpleImputer(strategy='mean'), ['sincere', 'sincere_partner', 'sincere_o', 'pref_o_sincere', 'sincere_important', 'funny', 'funny_partner', 'funny_o', 'pref_o_funny', 'funny_important', 'attractive', 'attractive_partner', 'attractive_o', 'pref_o_attractive', 'attractive_important', 'ambitious', 'ambitious_partner', 'ambitious_o', 'pref_o_ambitious', 'ambitious_important', 'intelligence', 'intelligence_partner', 'intelligence_o', 'pref_o_intelligence', 'intelligence_important', 'shared_interests_partner', 'shared_interests_o', 'pref_o_shared_interests', 'shared_interests_important']),
		('importance_race_religion', SimpleImputer(strategy='mean'), ['importance_same_race', 'importance_same_religion']),
		('remains', SimpleImputer(strategy='mean'), ['expected_happy_with_sd_people', 'expected_num_matches', 'guess_prob_liked', 'like', 'interests_correlate']),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
).set_output(transform="pandas")

In [96]:
encoders = ColumnTransformer(
    transformers=[
        ('categorical', OneHotEncoder(sparse_output=False), ['gender', 'race', 'race_o'])
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
).set_output(transform="pandas")

# Entrainement des modèles

In [99]:
ohe: OneHotEncoder = OneHotEncoder(sparse_output=False).fit(df[['gender', 'race', 'race_o']])
preprocess_pipeline: Pipeline = Pipeline([
    ('impute', transformer),
    ('encoders', encoders),
    ('scaler', StandardScaler().set_output(transform="pandas"))
])

In [100]:
from sklearn.model_selection import train_test_split

preprocessed_df: DataFrame = preprocess_pipeline.fit_transform(df) # type: ignore

preprocessed_df_decision_X: DataFrame = preprocessed_df[preprocessed_df.columns.difference(['decision'])]
preprocessed_df_decision_y: Series = preprocessed_df['decision']

X_train, X_test, y_train, y_test = train_test_split(
    preprocessed_df_decision_X,
    preprocessed_df_decision_y,
    test_size=0.2,
    stratify=preprocessed_df_decision_y,
    random_state=42
)


knn: KNeighborsClassifier = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

print(knn.score(X_train, y_train))
print(knn.score(X_test, y_test))


ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [None]:
preprocessed_df_decision_o_X: DataFrame = preprocessed_df[preprocessed_df.columns.difference(['decision_o'])]
preprocessed_df_decision_o_y: Series = preprocessed_df['decision_o']

X_train_o, X_test_o, y_train_o, y_test_o = train_test_split(
    preprocessed_df_decision_o_X,
    preprocessed_df_decision_o_y,
    test_size=0.2,
    stratify=preprocessed_df_decision_o_y,
    random_state=42
)


knn: KNeighborsClassifier = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_o, y_train_o)

print(knn.score(X_train_o, y_train_o))
print(knn.score(X_test_o, y_test_o))


0.779319606087735
0.6569212410501193


In [90]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

weak_learner = DecisionTreeClassifier(max_depth=1)
model_ada_tree = AdaBoostClassifier(estimator=weak_learner, n_estimators=50)

model_ada_tree.fit(X_train, y_train)

print(model_ada_tree.score(X_train, y_train))
print(model_ada_tree.score(X_test, y_test))
print(model_ada_tree.feature_importances_.shape)
print(preprocessed_df_decision_X.columns.shape)
print(DataFrame([model_ada_tree.feature_importances_.tolist()], columns=preprocessed_df_decision_X.columns.tolist()))

0.7973739182333631
0.8025059665871122
(70,)
(70,)
   age  age_o  ambitious  ambitious_important  ambitious_o  ambitious_partner  \
0  0.0    0.0        0.0             0.015398          0.0           0.012328   

   art  attractive  attractive_important  attractive_o  ...  shopping  \
0  0.0    0.037833              0.009121           0.0  ...  0.011414   

    sincere  sincere_important  sincere_o  sincere_partner    sports  theater  \
0  0.014122                0.0        0.0         0.021671  0.017348      0.0   

    tv  tvsports      yoga  
0  0.0       0.0  0.038834  

[1 rows x 70 columns]
