In [None]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import ast

from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler



In [None]:
# Load the dating-app dataset from the CSV file and preview the first rows.
df = pd.read_csv('Dating App Dataset Update.csv')
df.head()

Unnamed: 0,User ID,Age,Gender,Height,Interests,Looking For,Children,Education Level,Occupation,Swiping History,Frequency of Usage
0,1,24,Female,5.95,"['Reading', 'Sports', 'Travel']",Long-term Relationship,No,High School,Business Owner,82,Daily
1,2,28,Female,5.33,"['Reading', 'Hiking', 'Cooking']",Friendship,No,Master's Degree,Entrepreneur,29,Daily
2,3,18,Female,5.61,['Cooking'],Long-term Relationship,No,High School,Social Media Influencer,58,Daily
3,4,33,Male,5.46,"['Cooking', 'Reading', 'Sports']",Friendship,Maybe,Ph.D.,Business Owner,63,Weekly
4,5,26,Female,5.1,"['Reading', 'Movies', 'Sports']",Long-term Relationship,Maybe,Ph.D.,Engineer,7,Daily


In [None]:
# =========================
# 3. Feature engineering for visualization
# =========================
def feature_engineering(df_in):
    """Return a copy of ``df_in`` enriched with engineered columns.

    The input frame is not modified. It must provide the columns ``Age``,
    ``Occupation``, ``Looking For`` and ``Interests``.

    Added columns:
        Age_Group        categorical: 'Young' (0, 21], 'Adult' (21, 28], 'Mature' (28, 100]
        Is_Busy          1 if the occupation is in the "busy jobs" list, else 0
        Is_Serious       1 if the goal is Marriage / Long-term Relationship, else 0
        Interests_List   the ``Interests`` cell parsed into a Python list
        Is_Introvert     1 if 'Reading' is among the interests, else 0
        Is_Extrovert     1 if any of Sports / Travel / Hiking is among the interests, else 0
        Interests_Count  number of interests listed
    """
    df_out = df_in.copy()

    # Age groups; ages outside (0, 100] fall in no bin and become NaN.
    df_out['Age_Group'] = pd.cut(df_out['Age'], bins=[0, 21, 28, 100], labels=['Young', 'Adult', 'Mature'])

    # Busy-job flag (vectorised membership test instead of a per-row lambda).
    busy_jobs = ['Doctor', 'Engineer', 'Teacher', 'Business Owner', 'Entrepreneur']
    df_out['Is_Busy'] = df_out['Occupation'].isin(busy_jobs).astype('int64')

    # Relationship goal: serious vs. casual.
    serious_goals = ['Marriage', 'Long-term Relationship']
    df_out['Is_Serious'] = df_out['Looking For'].isin(serious_goals).astype('int64')

    def _parse_interests(raw):
        # Already a container (e.g. the column was parsed earlier): use it as a list.
        if isinstance(raw, (list, tuple, set)):
            return list(raw)
        # Missing value or blank cell: treat as "no interests" instead of crashing.
        if raw is None or (not isinstance(raw, str) and pd.isna(raw)):
            return []
        if isinstance(raw, str) and not raw.strip():
            return []
        # Regular case: the CSV stores the list as its string representation.
        return ast.literal_eval(raw)

    # Personality flags derived from the parsed interests.
    df_out['Interests_List'] = df_out['Interests'].map(_parse_interests)
    df_out['Is_Introvert'] = df_out['Interests_List'].map(lambda items: int('Reading' in items))

    extrovert_interests = ['Sports', 'Travel', 'Hiking']
    df_out['Is_Extrovert'] = df_out['Interests_List'].map(
        lambda items: int(any(i in items for i in extrovert_interests))
    )

    df_out['Interests_Count'] = df_out['Interests_List'].map(len)

    return df_out


In [None]:
# Rebind df to the copy returned by feature_engineering, which carries the engineered columns.
df = feature_engineering(df)

In [None]:
# Share of each 'Frequency of Usage' value, as proportions rather than raw counts.
df['Frequency of Usage'].value_counts(normalize=True)

Frequency of Usage
Weekly     0.433579
Daily      0.358180
Monthly    0.208241
Name: proportion, dtype: float64

In [None]:
# =========================
# 3) Assemble the target y and the feature matrix X
# =========================
# Target: the usage-frequency label to predict, plus an integer-encoded copy for the models.
y = df["Frequency of Usage"]
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Multi-hot encode the parsed interest lists: one "Interest_<name>" column per distinct interest.
mlb = MultiLabelBinarizer()
interests_encoded = mlb.fit_transform(df["Interests_List"])
interests_df = pd.DataFrame(
    interests_encoded,
    columns=[f"Interest_{c}" for c in mlb.classes_],
)

# Features: everything except the ID, the target and the raw / parsed interest columns,
# with the multi-hot interest columns appended positionally (both indexes are reset first).
X = df.drop(columns=["User ID", "Frequency of Usage", "Interests", "Interests_List"])
X = pd.concat(
    [X.reset_index(drop=True), interests_df.reset_index(drop=True)],
    axis=1,
)

# One-hot encode the remaining categorical columns.
categorical_cols = ["Gender", "Looking For", "Children", "Education Level", "Occupation", "Age_Group"]
X = pd.get_dummies(X, columns=categorical_cols)

Giờ đến Machine Learning

In [None]:
# =========================
# Hold-out split: 80% train / 20% test, stratified on the encoded target
# =========================
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

In [None]:
# MACHINE LEARNING MODELS
# Random Forest classifier.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=12,
    random_state=42
)
# Fit on the training split only: fitting on the full X would let the model see
# the held-out test rows and make any later evaluation on X_test meaningless.
rf.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,12
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:

# Standardize the features for Logistic Regression; the scaler is fitted on the
# training split only and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Logistic Regression.
# `multi_class` is deliberately not passed: 'auto' is already the default and the
# argument is deprecated in recent scikit-learn releases.
log_clf = LogisticRegression(
    max_iter=1000,
    random_state=42
)

log_clf.fit(X_train_scaled, y_train)



0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,1000


In [None]:
from sklearn.neighbors import KNeighborsClassifier

# KNN also needs standardized features; fit the scaler on the training split only.
scaler_knn = StandardScaler().fit(X_train)
X_train_scaled_knn = scaler_knn.transform(X_train)
X_test_scaled_knn = scaler_knn.transform(X_test)

# 5-nearest-neighbours classifier trained on the scaled training split.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled_knn, y_train)



0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [None]:
from sklearn.model_selection import RepeatedKFold, cross_validate
from sklearn.pipeline import make_pipeline


# =========================
# Cross-validation setup
# =========================
# 5 folds repeated 3 times -> 15 scores per metric and model.
rkf = RepeatedKFold(
    n_splits=5,
    n_repeats=3,
    random_state=42
)

scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision_weighted',
    'recall': 'recall_weighted',
    'f1': 'f1_weighted'
}

# Logistic Regression and KNN were trained on standardized features, so they must be
# cross-validated the same way. Wrapping each in a pipeline makes cross_validate fit
# the scaler on the training part of every fold only (no leakage into the validation
# part). cross_validate clones the estimators, so log_clf / knn themselves are not refitted.
log_cv_model = make_pipeline(StandardScaler(), log_clf)
knn_cv_model = make_pipeline(StandardScaler(), knn)

# =========================
# Run CV
# =========================
cv_rf  = cross_validate(rf, X, y_encoded, cv=rkf, scoring=scoring)
cv_log = cross_validate(log_cv_model, X, y_encoded, cv=rkf, scoring=scoring)
cv_knn = cross_validate(knn_cv_model, X, y_encoded, cv=rkf, scoring=scoring)

# =========================
# Print results
# =========================
def print_cv(name, cv_result):
    """Print mean and standard deviation of every metric in ``scoring``.

    name:      label printed above the metrics.
    cv_result: dict returned by cross_validate; must hold a 'test_<metric>'
               array for each key of the module-level ``scoring`` dict.
    """
    print(f"\n{name}")
    for m in scoring:
        scores = cv_result[f'test_{m}']
        print(f"{m:<9}: Mean={scores.mean():.4f}, Std={scores.std():.4f}")

print("CROSS-VALIDATION RESULTS")
print_cv("Random Forest", cv_rf)
print_cv("Logistic Regression", cv_log)
print_cv("KNN", cv_knn)


CROSS-VALIDATION RESULTS

Random Forest
accuracy : Mean=0.6409, Std=0.0078
precision: Mean=0.6670, Std=0.0075
recall   : Mean=0.6409, Std=0.0078
f1       : Mean=0.6244, Std=0.0078

Logistic Regression
accuracy : Mean=0.5081, Std=0.0088
precision: Mean=0.4599, Std=0.0298
recall   : Mean=0.5081, Std=0.0088
f1       : Mean=0.4472, Std=0.0085

KNN
accuracy : Mean=0.4793, Std=0.0084
precision: Mean=0.4781, Std=0.0100
recall   : Mean=0.4793, Std=0.0084
f1       : Mean=0.4751, Std=0.0090


In [None]:


# =========================
# Random Forest: cross-validated train vs. test accuracy
# =========================
cv_result = cross_validate(rf, X, y_encoded, cv=rkf, scoring='accuracy', return_train_score=True)

# =========================
# Average over all folds / repeats and take the train-minus-test gap
# =========================
train_mean, test_mean = (cv_result[key].mean() for key in ('train_score', 'test_score'))
gap = train_mean - test_mean

print("CROSS-VALIDATION TRAIN vs TEST")
print(f"Train accuracy (CV): {train_mean:.4f}")
print(f"Test  accuracy (CV): {test_mean:.4f}")
print(f"Gap (Train - Test): {gap:.4f}")


CROSS-VALIDATION TRAIN vs TEST
Train accuracy (CV): 0.8538
Test  accuracy (CV): 0.6409
Gap (Train - Test): 0.2129


Khám phá một số hành vi

In [None]:
!pip install mlxtend




You should consider upgrading via the 'C:\Users\Docs\CodeProject\DM\venv\Scripts\python.exe -m pip install --upgrade pip' command.


In [None]:
import pandas as pd
import ast
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# 1. Import Dataset
# NOTE: this rebinds `df` to the raw CSV contents, replacing the frame that
# feature_engineering() produced earlier in the notebook.
df = pd.read_csv('Dating App Dataset Update.csv')

# 2. Feature Engineering
def feature_engineering_for_rules(df_in):
    """Return a copy of ``df_in`` with string-labelled columns for rule mining.

    The input frame is left untouched. Added columns:
        Age_Group       'Age_Young' (0, 21], 'Age_Adult' (21, 28], 'Age_Mature' (28, 100]
        Busy_Status     'Status_Busy' for occupations in the busy list, else 'Status_Normal'
        Goal_Type       'Goal_Serious' for Marriage / Long-term Relationship, else 'Goal_Casual'
        Interests_List  the ``Interests`` string ("['A', 'B']") parsed into a real list
        Gender_Label    'Gender_' + Gender
        Usage_Label     'Usage_' + Frequency of Usage
    """
    busy_jobs = ['Doctor', 'Engineer', 'Teacher', 'Business Owner', 'Entrepreneur']
    serious_goals = ['Marriage', 'Long-term Relationship']

    def tag(column, members, inside, outside):
        # Label every value of `column` by whether it belongs to `members`.
        return df_in[column].apply(lambda value: inside if value in members else outside)

    # assign() works on a copy, so the caller's frame is never modified.
    return df_in.assign(
        Age_Group=pd.cut(
            df_in['Age'],
            bins=[0, 21, 28, 100],
            labels=['Age_Young', 'Age_Adult', 'Age_Mature'],
        ),
        Busy_Status=tag('Occupation', busy_jobs, 'Status_Busy', 'Status_Normal'),
        Goal_Type=tag('Looking For', serious_goals, 'Goal_Serious', 'Goal_Casual'),
        Interests_List=df_in['Interests'].apply(ast.literal_eval),
        # Prefixes keep gender and usage items distinguishable inside the rules.
        Gender_Label='Gender_' + df_in['Gender'],
        Usage_Label='Usage_' + df_in['Frequency of Usage'],
    )

df_fe = feature_engineering_for_rules(df)

# 3. Build the transaction list
# Each row (user) becomes one "basket" holding its categorical items.
# Iterating the columns in lockstep avoids one .loc lookup per cell and does not
# depend on the frame having a 0..n-1 index.
transactions = []
for age_group, gender, busy, goal, usage, interest_list in zip(
    df_fe['Age_Group'],
    df_fe['Gender_Label'],
    df_fe['Busy_Status'],
    df_fe['Goal_Type'],
    df_fe['Usage_Label'],
    df_fe['Interests_List'],
):
    # Labelled base features (the age group is categorical, hence str()).
    features = [str(age_group), gender, busy, goal, usage]

    # One prefixed item per interest of this user.
    interests = [f"Interest_{item}" for item in interest_list]

    # Combine everything into a single transaction.
    transactions.append(features + interests)

# 4. Mine association rules
# One-hot encode the transactions.
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)

# Frequent itemsets: min_support=0.05 keeps itemsets present in at least 5% of the rows.
frequent_itemsets = apriori(df_encoded, min_support=0.05, use_colnames=True)

# Build the rules, keeping those whose lift is at least 1.2
# (i.e. associations stronger than chance).
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)

# Strongest associations first.
rules = rules.sort_values(by="lift", ascending=False)

# Show the results; the heading and the number of displayed rows share one value
# so they can no longer disagree.
top_n = 30
print(f"Top {top_n} Association Rules:")
display(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head(top_n))

Top 10 Association Rules:


Unnamed: 0,antecedents,consequents,support,confidence,lift
51,(Usage_Daily),"(Status_Normal, Age_Young)",0.050934,0.142201,1.684599
46,"(Status_Normal, Age_Young)",(Usage_Daily),0.050934,0.60339,1.684599
90,"(Usage_Weekly, Status_Normal)",(Interest_Reading),0.053938,0.446154,1.50429
91,(Interest_Reading),"(Usage_Weekly, Status_Normal)",0.053938,0.181862,1.50429
35,"(Usage_Daily, Goal_Casual)",(Age_Young),0.053867,0.316919,1.47281
36,(Age_Young),"(Usage_Daily, Goal_Casual)",0.053867,0.250332,1.47281
41,(Usage_Daily),"(Goal_Serious, Age_Young)",0.050647,0.141402,1.390056
38,"(Goal_Serious, Age_Young)",(Usage_Daily),0.050647,0.49789,1.390056
155,(Usage_Weekly),"(Status_Busy, Interest_Reading, Goal_Casual)",0.056156,0.129517,1.38843
152,"(Status_Busy, Interest_Reading, Goal_Casual)",(Usage_Weekly),0.056156,0.601994,1.38843


In [None]:
# 1. Lấy tất cả các luật (đã tạo ở bước trước)
# Giả sử biến 'rules' đã chứa toàn bộ kết quả từ association_rules

# 2. Helper that tells whether a rule's right-hand side contains a Usage_ item
def filter_usage_consequents(consequent_set):
    """Return True if any item in ``consequent_set`` starts with 'Usage_'.

    consequent_set: iterable of items (e.g. the frozenset stored in the
                    'consequents' column of the rules frame).

    A prefix test is used rather than a substring test, so an item that merely
    contains 'Usage_' somewhere in its name is not mistaken for a usage label.
    """
    return any(str(item).startswith('Usage_') for item in consequent_set)

# 3. Keep only the rules whose consequent passes filter_usage_consequents
has_usage_consequent = rules['consequents'].apply(filter_usage_consequents)

# 4. Rank the surviving rules by lift, strongest first
usage_rules = rules.loc[has_usage_consequent].copy().sort_values(by="lift", ascending=False)

# 5. Report how many rules matched and show the first 20
report_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
print(f"Tìm thấy {len(usage_rules)} luật có vế phải là tần suất sử dụng (Usage):")
display(usage_rules[report_columns].head(20))

# 6. (Optional) Save to a CSV file for the report
# usage_rules.to_csv('usage_behavior_rules.csv', index=False)

Tìm thấy 81 luật có vế phải là tần suất sử dụng (Usage):


Unnamed: 0,antecedents,consequents,support,confidence,lift
46,"(Status_Normal, Age_Young)",(Usage_Daily),0.050934,0.60339,1.684599
91,(Interest_Reading),"(Usage_Weekly, Status_Normal)",0.053938,0.181862,1.50429
36,(Age_Young),"(Usage_Daily, Goal_Casual)",0.053867,0.250332,1.47281
38,"(Goal_Serious, Age_Young)",(Usage_Daily),0.050647,0.49789,1.390056
152,"(Status_Busy, Interest_Reading, Goal_Casual)",(Usage_Weekly),0.056156,0.601994,1.38843
26,"(Gender_Female, Age_Young)",(Usage_Daily),0.052078,0.492224,1.374237
10,"(Status_Normal, Age_Adult)",(Usage_Daily),0.073467,0.491623,1.372557
159,"(Status_Busy, Goal_Casual)","(Usage_Weekly, Interest_Travel)",0.05029,0.15812,1.363575
50,(Age_Young),"(Status_Normal, Usage_Daily)",0.050934,0.236702,1.359433
32,(Age_Young),"(Gender_Male, Usage_Daily)",0.052436,0.243684,1.356612
