In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif, RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
def plot_correlation_matrix(df, threshold=0.1):
    """
    Plot a heatmap of the pairwise correlations between numeric columns of ``df``.

    Only cells whose absolute correlation is strictly greater than
    ``threshold`` are drawn; every other cell is masked out.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns are correlated.
    threshold : float, default 0.1
        Minimum absolute correlation a cell needs in order to be shown.

    Returns
    -------
    None
        The figure is displayed via ``plt.show()``.
    """
    # pandas >= 2.0 no longer drops non-numeric columns silently in corr();
    # it raises instead, so restrict the computation to numeric columns.
    corr_matrix = df.corr(numeric_only=True)
    # True where the correlation is strong enough to be displayed.
    strong = corr_matrix.abs() > threshold

    plt.figure(figsize=(24, 18))
    sns.heatmap(
        corr_matrix,
        annot=True,
        cmap="coolwarm",
        mask=~strong,  # seaborn hides cells where the mask is True
        cbar_kws={"label": "Correlation Coefficient"},
    )
    plt.title(f"Filtered Feature Correlation Matrix (|Corr| > {threshold})")
    plt.show()

# Render the filtered correlation heatmap for the behaviors frame.
plot_correlation_matrix(behaviors_df)

In [None]:
def drop_columns(df, columns_to_drop):
    """
    Return a copy of ``df`` without the columns listed in ``columns_to_drop``.

    Raises ``KeyError`` if any requested column is absent from ``df``.
    """
    return df.drop(columns=columns_to_drop, errors="raise")

# Columns removed from behaviors_df before features and target are defined.
columns_to_drop = [
    "impression_id",
    "impression_time",
    "article_ids_clicked",
    "impression_day_of_week",
    "impression_hour",
]

# NOTE: behaviors_df is overwritten with the reduced frame. drop_columns uses
# errors="raise", so re-running this cell without re-creating behaviors_df
# raises KeyError because the columns are already gone.
behaviors_df = drop_columns(behaviors_df, columns_to_drop)

In [None]:
def define_features_and_target(df):
    """
    Split ``df`` into a feature frame and the ``target`` column.

    Returns a ``(features, target)`` pair where ``features`` is ``df`` without
    the ``target`` column and ``target`` is that column as a Series.
    """
    target_column = 'target'
    labels = df[target_column]
    features = df.drop(columns=[target_column])
    return features, labels

# X holds every remaining column except 'target'; y is the 'target' column.
X, y = define_features_and_target(behaviors_df)

In [None]:
def compute_mutual_information(X, y):
    """
    Score every column of ``X`` by its mutual information with ``y``.

    Returns a DataFrame with columns ``Feature`` and ``Importance``, sorted
    from the highest score to the lowest. The estimator is seeded with
    ``random_state=42``.
    """
    scores = mutual_info_classif(X, y, random_state=42)
    ranking = pd.DataFrame({"Feature": X.columns, "Importance": scores})
    return ranking.sort_values(by="Importance", ascending=False)

# Mutual-information ranking of the (unscaled) feature columns against y.
mutual_information = compute_mutual_information(X, y)

In [None]:
def plot_mutual_information(feature_importance):
    """
    Draw a horizontal bar chart of mutual-information scores.

    ``feature_importance`` must provide ``Feature`` and ``Importance``
    columns; each bar is annotated with its score to four decimals.
    """
    plt.figure(figsize=(14, 8))
    axes = sns.barplot(
        data=feature_importance, x="Importance", y="Feature", color="blue"
    )
    axes.set_title("Feature Importance Based on Mutual Information")

    # Write each score just past the end of its bar, nudged slightly downwards.
    scores = feature_importance["Importance"]
    for row, score in enumerate(scores):
        axes.text(score, row + 0.1, f"  {score:.4f}", va="center", ha="left")

    plt.show()

# Visualise the mutual-information ranking computed above.
plot_mutual_information(mutual_information)

In [None]:
def scale_features(X):
    """
    Standardise the columns of ``X`` with a freshly fitted ``StandardScaler``.

    Returns the result of ``fit_transform``; the fitted scaler itself is not
    returned.
    """
    return StandardScaler().fit_transform(X)

# NOTE(review): the scaler is fit on all of X here, and the train/test split
# is taken from X_scaled afterwards, so test rows contribute to the scaling
# statistics. Consider fitting the scaler on the training split only.
X_scaled = scale_features(X)

In [None]:
def split_data(X_scaled, y, test_size=0.2, random_state=42):
    """
    Split features and labels into train and test partitions.

    Thin wrapper around ``train_test_split`` that forwards ``test_size`` and
    ``random_state`` and returns
    ``(X_train, X_test, y_train, y_test)``.
    """
    split_options = {"test_size": test_size, "random_state": random_state}
    train_features, test_features, train_labels, test_labels = train_test_split(
        X_scaled, y, **split_options
    )
    return train_features, test_features, train_labels, test_labels

# Split with the function defaults: test_size=0.2, random_state=42.
X_train, X_test, y_train, y_test = split_data(X_scaled, y)

In [None]:
def compute_random_forest_importance(X_train, y_train, feature_names=None):
    """
    Fit a random forest and rank the features by impurity-based importance.

    Parameters
    ----------
    X_train : array-like or pandas.DataFrame
        Training features.
    y_train : array-like
        Training labels.
    feature_names : sequence of str, optional
        Labels for the columns of ``X_train``. When omitted, the column labels
        of ``X_train`` are used if it is a DataFrame; otherwise the function
        falls back to the notebook-level ``X.columns`` (the original
        behaviour, kept for existing two-argument calls).

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature`` and ``Importance``, sorted from most to least
        important.

    Raises
    ------
    ValueError
        If the number of feature names does not match the number of fitted
        features.
    """
    rf = RandomForestClassifier(random_state=42)
    rf.fit(X_train, y_train)

    if feature_names is None:
        # Prefer labels carried by the data that was actually used for fitting.
        feature_names = getattr(X_train, "columns", None)
    if feature_names is None:
        # Legacy fallback: plain arrays carry no labels, use the global frame.
        feature_names = X.columns
    feature_names = list(feature_names)

    importances = rf.feature_importances_
    if len(feature_names) != len(importances):
        raise ValueError(
            f"Got {len(feature_names)} feature names for "
            f"{len(importances)} fitted features."
        )

    random_forest_importance = pd.DataFrame(
        {"Feature": feature_names, "Importance": importances}
    ).sort_values(by="Importance", ascending=False)

    return random_forest_importance

# Rank features using a random forest fitted on the training split.
random_forest_importance = compute_random_forest_importance(X_train, y_train)

In [None]:
def plot_random_forest_importance(random_forest_importance):
    """
    Draw a horizontal bar chart of random-forest feature importances.

    ``random_forest_importance`` must provide ``Feature`` and ``Importance``
    columns; each bar is annotated with its value to four decimals.
    """
    plt.figure(figsize=(14, 8))
    axes = sns.barplot(
        data=random_forest_importance, x="Importance", y="Feature", color="green"
    )
    axes.set_title("Random Forest Feature Importances")

    # Write each value just past the end of its bar, nudged slightly downwards.
    importances = random_forest_importance["Importance"]
    for row, importance in enumerate(importances):
        axes.text(
            importance, row + 0.1, f"  {importance:.4f}", va="center", ha="left"
        )

    plt.show()

# Visualise the random-forest importance ranking computed above.
plot_random_forest_importance(random_forest_importance)

In [None]:
def perform_rfe(X_train, y_train, n_features=5, feature_names=None):
    """
    Rank features with recursive feature elimination on logistic regression.

    Parameters
    ----------
    X_train : array-like or pandas.DataFrame
        Training features.
    y_train : array-like
        Training labels.
    n_features : int, default 5
        Number of features RFE keeps (these receive ranking 1).
    feature_names : sequence of str, optional
        Labels for the columns of ``X_train``. When omitted, the column labels
        of ``X_train`` are used if it is a DataFrame; otherwise the function
        falls back to the notebook-level ``X.columns`` (the original
        behaviour, kept for existing calls).

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature`` and ``Ranking``, sorted by ascending ranking. The
        table is also printed.

    Raises
    ------
    ValueError
        If the number of feature names does not match the number of ranked
        features.
    """
    log_reg = LogisticRegression(max_iter=1000)
    rfe = RFE(estimator=log_reg, n_features_to_select=n_features)
    rfe.fit(X_train, y_train)

    if feature_names is None:
        # Prefer labels carried by the data that was actually used for fitting.
        feature_names = getattr(X_train, "columns", None)
    if feature_names is None:
        # Legacy fallback: plain arrays carry no labels, use the global frame.
        feature_names = X.columns
    feature_names = list(feature_names)

    if len(feature_names) != len(rfe.ranking_):
        raise ValueError(
            f"Got {len(feature_names)} feature names for "
            f"{len(rfe.ranking_)} ranked features."
        )

    rfe_ranking = pd.DataFrame({"Feature": feature_names, "Ranking": rfe.ranking_})
    rfe_ranking = rfe_ranking.sort_values(by="Ranking")

    print("RFE Feature Ranking:")
    print(rfe_ranking)

    return rfe_ranking

# Run RFE with the default of 5 selected features; the ranking is also printed.
rfe_ranking = perform_rfe(X_train, y_train)

## LOGISTIC REGRESSION

In [None]:
def encode_device_type(df):
    """
    Replace the 'device_type' column with one indicator column per value.

    Returns a new frame produced by ``pd.get_dummies``; all other columns are
    passed through.
    """
    columns_to_encode = ['device_type']
    encoded = pd.get_dummies(df, columns=columns_to_encode)
    return encoded

def categorize_time_features(df):
    """
    Convert 'impression_hour' and 'impression_day' into labelled categories.

    Hours are bucketed as night (0-5), morning (6-11), afternoon (12-17) and
    evening (18-23). Days are bucketed as beginning (1-9), middle (10-19) and
    end (20-31). Values outside the bins become NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric 'impression_hour' and 'impression_day' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with both columns replaced by categoricals; ``df`` itself
        is left untouched, so the call can be repeated on the same input.
    """
    # pd.cut builds right-closed intervals, so without include_lowest the
    # first bin is (0, 5] and hour 0 (midnight) would turn into NaN.
    hour_category = pd.cut(
        df['impression_hour'],
        bins=[0, 5, 11, 17, 23],
        labels=['night', 'morning', 'afternoon', 'evening'],
        include_lowest=True,
    )

    # Days start at 1, so the open lower edge at 0 excludes nothing valid.
    day_category = pd.cut(
        df['impression_day'],
        bins=[0, 9, 19, 31],
        labels=['beginning', 'middle', 'end'],
    )

    # assign() returns a copy, keeping the caller's frame unmodified.
    return df.assign(impression_hour=hour_category, impression_day=day_category)

def one_hot_encode_time_features(df):
    """
    One-hot encode the year, month, day, day-of-week and hour columns.

    Returns a new frame produced by ``pd.get_dummies``; all other columns are
    passed through.
    """
    time_columns = [
        'impression_year',
        'impression_month',
        'impression_day',
        'impression_day_of_week',
        'impression_hour',
    ]
    encoded = pd.get_dummies(df, columns=time_columns)
    return encoded

# Workflow: one-hot encode the device type, bucket hour/day into categories,
# then one-hot encode the time features.
# NOTE(review): the original lines placed the suffix outside the subscript
# (datasets["train/behaviors"]_exploded_encoded), which is a SyntaxError. The
# suffix is folded into the dictionary key here; this assumes the exploded
# frame is stored as datasets["train/behaviors_exploded"] - confirm against
# the cell that creates it.
datasets["train/behaviors_exploded_encoded"] = (
    datasets["train/behaviors_exploded"]
    .pipe(encode_device_type)
    .pipe(categorize_time_features)
    .pipe(one_hot_encode_time_features)
)

# Display Results
print(list(datasets["train/behaviors_exploded_encoded"].columns), '')
print(datasets["train/behaviors_exploded_encoded"].dtypes)

In [None]:
def merge_is_sso_and_subscriber(df):
    """
    Merge 'is_sso_user' and 'is_subscriber' into a single feature.

    The new 'is_sso_or_subscriber' column is the element-wise OR of the two
    source columns, i.e. it flags users who are an SSO user, a subscriber, or
    both.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'is_sso_user' and 'is_subscriber' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra column appended; ``df`` itself is not
        modified.
    """
    # assign() returns a copy, so the caller's frame is left untouched.
    return df.assign(is_sso_or_subscriber=df['is_sso_user'] | df['is_subscriber'])

# Apply the function
# NOTE(review): the original wrote datasets["train/behaviors"]_exploded, which
# is a SyntaxError; the suffix is folded into the dictionary key. This assumes
# the exploded frame is stored under that key - confirm against the cell that
# creates it.
datasets["train/behaviors_exploded"] = merge_is_sso_and_subscriber(
    datasets["train/behaviors_exploded"]
)

# Verify the new feature
print(
    datasets["train/behaviors_exploded"][
        ['is_sso_user', 'is_subscriber', 'is_sso_or_subscriber']
    ].head()
)


In [None]:
# Inspect the distinct values of the two source flags.
# NOTE(review): the original wrote datasets["train/behaviors"]_exploded, which
# is a SyntaxError; the suffix is folded into the dictionary key (assumed
# storage location of the exploded frame - confirm).
print(datasets["train/behaviors_exploded"]['is_sso_user'].unique())
print(datasets["train/behaviors_exploded"]['is_subscriber'].unique())


In [None]:
# Correlation of every numeric column with the merged flag.
# NOTE(review): the original wrote datasets["train/behaviors"]_exploded, which
# is a SyntaxError; the suffix is folded into the dictionary key (assumed
# storage location of the exploded frame - confirm).
# numeric_only=True: pandas >= 2.0 raises on non-numeric columns otherwise.
print(
    datasets["train/behaviors_exploded"].corr(numeric_only=True)['is_sso_or_subscriber']
)


In [None]:
# Register the merged feature once. The membership check keeps this cell
# idempotent, so re-running it does not append a duplicate entry.
if 'is_sso_or_subscriber' not in features:
    features.append('is_sso_or_subscriber')


In [None]:
# Drop the two source flags now that the merged feature exists.
# NOTE(review): the original wrote datasets["train/behaviors"]_exploded, which
# is a SyntaxError; the suffix is folded into the dictionary key (assumed
# storage location of the exploded frame - confirm).
datasets["train/behaviors_exploded"] = datasets["train/behaviors_exploded"].drop(
    columns=['is_sso_user', 'is_subscriber']
)


In [None]:
# Step 1: Aggregate user history metrics
def _most_common(values):
    """
    Return the most frequent non-null value of ``values``, or None if there is none.
    """
    modes = values.mode()
    # mode() ignores NaN, so an all-NaN group yields an empty result even
    # though the group itself is not empty.
    return modes.iloc[0] if not modes.empty else None


def compute_user_metrics(behaviors_df):
    """
    Compute per-user history metrics.

    Parameters
    ----------
    behaviors_df : pandas.DataFrame
        Frame with the columns 'user_id', 'impression_id', 'read_time',
        'scroll_percentage', 'impression_day_of_week', 'device_type' and
        'article_ids_clicked'.

    Returns
    -------
    pandas.DataFrame
        One row per 'user_id' with the columns 'total_interactions' (count of
        'impression_id'), 'avg_read_time', 'avg_scroll_percentage',
        'most_active_day', 'most_used_device' and 'toporite_term' (the most
        frequent value of 'article_ids_clicked'). The three "most common"
        columns are None for users whose values are all missing.
    """
    # NOTE(review): mode() needs hashable values; this assumes
    # 'article_ids_clicked' holds scalars rather than lists - TODO confirm.
    user_metrics = behaviors_df.groupby('user_id').agg(
        total_interactions=('impression_id', 'count'),  # Total number of interactions
        avg_read_time=('read_time', 'mean'),           # Average read time
        avg_scroll_percentage=('scroll_percentage', 'mean'),  # Avg scroll percentage
        most_active_day=('impression_day_of_week', _most_common),  # Most common day
        most_used_device=('device_type', _most_common),            # Most common device
        toporite_term=('article_ids_clicked', _most_common),  # Most frequently clicked article/term
    ).reset_index()

    return user_metrics

# Step 2: Merge computed user metrics back to the main behaviors dataframe
def merge_user_metrics(behaviors_df, user_metrics):
    """
    Merge computed user-level metrics back into the main dataframe.

    Performs a left join on 'user_id', so every row of ``behaviors_df`` is
    kept and rows without a matching user receive missing values in the
    metric columns.

    Parameters
    ----------
    behaviors_df : pandas.DataFrame
        Main frame; must contain 'user_id'.
    user_metrics : pandas.DataFrame
        Per-user metrics; must contain 'user_id'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the metric columns appended.
    """
    merged_df = behaviors_df.merge(user_metrics, on='user_id', how='left')
    return merged_df

# Step 3: Apply the functions to compute and merge metrics
# NOTE(review): the original wrote datasets["train/behaviors"]_exploded, which
# is a SyntaxError; the suffix is folded into the dictionary key (assumed
# storage location of the exploded frame - confirm).
user_metrics_df = compute_user_metrics(datasets["train/behaviors"])  # Compute metrics
datasets["train/behaviors_exploded"] = merge_user_metrics(
    datasets["train/behaviors_exploded"], user_metrics_df
)  # Merge metrics

# Step 4: Verify the merged dataframe
print(
    datasets["train/behaviors_exploded"][
        ['user_id', 'total_interactions', 'avg_read_time',
         'most_active_day', 'most_used_device', 'toporite_term']
    ].head()
)


In [None]:
# Count missing values per column of the per-user metrics table.
print(user_metrics_df.isnull().sum())
