In [6]:
import pandas as pd
import numpy as np

In [5]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
df=pd.read_csv('/content/drive/MyDrive/PVL/labelencodedjb.csv')

In [19]:
%%writefile f_test_anova.py
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_regression

def f_test_anova(X, Y):
    bestfeatures = SelectKBest(score_func=f_regression, k=10)
    fit = bestfeatures.fit(X, Y)
    featureScores = pd.DataFrame({'Feature': X.columns, 'Score': fit.scores_})
    ranked_features = featureScores.sort_values(by='Score', ascending=False).reset_index(drop=True)
    return ranked_features


Overwriting f_test_anova.py


In [9]:
%%writefile mutual_info.py
import pandas as pd
from sklearn.feature_selection import SelectKBest, mutual_info_regression

def mutual_info(X, Y):
    bestfeatures = SelectKBest(score_func=mutual_info_regression, k=10)
    fit = bestfeatures.fit(X, Y)
    featureScores = pd.DataFrame({'Feature': X.columns, 'Score': fit.scores_})
    return featureScores.sort_values(by='Score', ascending=False)


Writing mutual_info.py


In [10]:
%%writefile extra_trees.py
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor

def extra_trees(X, Y):
    model = ExtraTreesRegressor()
    model.fit(X, Y)
    feature_importances = model.feature_importances_
    featureScores = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
    return featureScores.sort_values(by='Importance', ascending=False)


Writing extra_trees.py


In [11]:
%%writefile permutation_importance_svr.py
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.inspection import permutation_importance

def permutation_importance_svr(X, Y, feature_names):
    model = SVR(kernel='linear')
    model.fit(X, Y)
    perm_importance = permutation_importance(model, X, Y)
    sorted_idx = perm_importance.importances_mean.argsort()

    plt.barh(feature_names[sorted_idx][-10:], perm_importance.importances_mean[sorted_idx][-10:])
    plt.xlabel("Permutation Importance")
    plt.title("Permutation Importance")
    plt.show()


Writing permutation_importance_svr.py


In [12]:
%%writefile seq_feature_selector.py
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import pandas as pd

def perform_feature_selection(X, Y, k_features=10):

    scaler = StandardScaler().fit(X)
    X_scaled = scaler.transform(X)

    regressor = LinearRegression()

    selector = SFS(regressor,
                    k_features=k_features,
                    scoring='neg_mean_squared_error',
                    cv=5)


    selector.fit(X_scaled, Y)


    selected_feature_indices = selector.k_feature_idx_
    selected_feature_names = X.columns[list(selected_feature_indices)]

    return selected_feature_names, X_scaled, selector


Writing seq_feature_selector.py


In [13]:
%%writefile random_forest.py
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
import pandas as pd

def random_forest(X, Y, k_features=10):

    scaler = StandardScaler().fit(X)
    X_scaled = scaler.transform(X)


    rf_regressor = RandomForestRegressor()


    rf_regressor.fit(X_scaled, Y)


    importances = rf_regressor.feature_importances_


    indices = importances.argsort()[-k_features:]
    selected_feature_names = X.columns[indices]

    return selected_feature_names, X_scaled, rf_regressor

Writing random_forest.py


In [14]:
%%writefile main.py
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from seq_feature_selector import perform_feature_selection
from random_forest import random_forest
from f_test_anova import f_test_anova
from mutual_info import mutual_info
from extra_trees import extra_trees
from permutation_importance_svr import permutation_importance_svr


df = pd.read_csv("/content/drive/MyDrive/PVL/labelencodedjb.csv")


target_vars = ['reading_fee_paid', 'Number_of_Months', 'Coupon_Discount',
               'num_books', 'magazine_fee_paid', 'Renewal_Amount']
feature_vars = ['# created_date', 'id', 'created_by', 'transaction_branch_id',
                'transaction_type_id', 'amount_paid', 'Coupon_Discount',
                'magazine_fee_paid', 'payable_amount', 'reading_fee_paid',
                'reversed', 'over_due_adjustment_amount', 'last_card_number',
                'primus_amount', 'adjustment_amount', 'security_deposit',
                'Percentage_Share', 'subscription_id', 'member_branch_id',
                'transaction', 'Transaction_Month', 'Transaction_Year',
                'taxable_amount', 'TAX_AMOUNT', 'created_name', 'branch_name',
                'branch_type', 'member_card', 'Member_Name', 'email',
                'Number_of_Months', 'display_name', 'TAX_TYPE', 'Renewal_Amount',
                'GST_AMOUNT', 'SGST_AMOUNT', 'CGST_AMOUNT', 'IGST_AMOUNT',
                'Message_Status', 'Membership_expiry_date', 'TAX_STATE',
                'User_Name', 'applied_reward_points', 'over_due_adjustment_amount.1',
                'reading_fee_adjustment_amount', 'share_percentage',
                'membership_start_date', 'Member_status', 'no_of_deliveries',
                'num_books', 'num_magazine', 'referred_by', 'dailysales']

def run_algorithm(algorithm, X, Y):
    if algorithm == 'f_test_anova':
        return f_test_anova(X, Y)
    elif algorithm == 'mutual_info':
        return mutual_info(X, Y)
    elif algorithm == 'extra_trees':
        return extra_trees(X, Y)
    elif algorithm == 'permutation_importance_svr':
        feature_names = X.columns
        permutation_importance_svr(X, Y, feature_names)
        return None
    elif algorithm == 'seq_feature_selector':
        k_features = 10
        selected_features, X_scaled, selector = perform_feature_selection(X, Y, k_features)


        regressor = LinearRegression()
        X_selected = X_scaled[:, list(selector.k_feature_idx_)]
        regressor.fit(X_selected, Y)
        feature_importances = regressor.coef_


        feature_importances_filtered = [feature_importances[i] for i in range(len(feature_importances)) if i in selector.k_feature_idx_]
        selected_features_filtered = [selected_features[i] for i in range(len(selected_features)) if i in selector.k_feature_idx_]

        importance_df = pd.DataFrame({
            'Feature': selected_features_filtered,
            'Importance': feature_importances_filtered
        }).sort_values(by='Importance', ascending=False)

        return importance_df
    elif algorithm == 'random_forest':
        k_features = 10
        selected_features, X_scaled, rf_regressor = random_forest(X, Y, k_features)

        importances = rf_regressor.feature_importances_
        selected_feature_indices = [i for i in range(len(feature_vars)) if feature_vars[i] in selected_features]
        filtered_importances = [importances[i] for i in selected_feature_indices]


        importance_df = pd.DataFrame({
            'Feature': selected_features,
            'Importance': filtered_importances
        }).sort_values(by='Importance', ascending=False)

        return importance_df
    else:
        print("Algorithm not recognized.")
        return None

if __name__ == "__main__":
    results = {}

    for target in target_vars:
        print(f"Target variable: {target}")
        X = df[feature_vars]
        Y = df[target]


        algorithm = 'random_forest'
        result = run_algorithm(algorithm, X, Y)

        if result is not None:
            print(result.head(10))
            results[target] = result
            print("\n")


    for target, result in results.items():
        print(f"Scores for target variable '{target}':")
        print(result)
        print("\n")


Overwriting main.py


In [28]:
%%writefile main.py
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from seq_feature_selector import perform_feature_selection
from random_forest import random_forest
from f_test_anova import f_test_anova
from mutual_info import mutual_info
from extra_trees import extra_trees
from permutation_importance_svr import permutation_importance_svr

df = pd.read_csv("/content/drive/MyDrive/PVL/labelencodedjb.csv")

target_vars = ['reading_fee_paid', 'Number_of_Months', 'Coupon_Discount',
               'num_books', 'magazine_fee_paid', 'Renewal_Amount']
feature_vars = ['# created_date', 'id', 'created_by', 'transaction_branch_id',
                'transaction_type_id', 'amount_paid', 'Coupon_Discount',
                'magazine_fee_paid', 'payable_amount', 'reading_fee_paid',
                'reversed', 'over_due_adjustment_amount', 'last_card_number',
                'primus_amount', 'adjustment_amount', 'security_deposit',
                'Percentage_Share', 'subscription_id', 'member_branch_id',
                'transaction', 'Transaction_Month', 'Transaction_Year',
                'created_name', 'branch_name','branch_type', 'member_card',
                'Member_Name', 'email','Number_of_Months', 'display_name',
                'Renewal_Amount','Message_Status', 'Membership_expiry_date',
                'User_Name', 'applied_reward_points', 'over_due_adjustment_amount.1',
                'reading_fee_adjustment_amount', 'share_percentage',
                'membership_start_date', 'Member_status', 'no_of_deliveries',
                'num_books', 'num_magazine', 'referred_by', 'dailysales']

def run_algorithm(algorithm, X, Y):
    if algorithm == 'f_test_anova':
        return f_test_anova(X, Y)
    elif algorithm == 'mutual_info':
        return mutual_info(X, Y)
    elif algorithm == 'extra_trees':
        return extra_trees(X, Y)
    elif algorithm == 'permutation_importance_svr':
        feature_names = X.columns
        permutation_importance_svr(X, Y, feature_names)
        return None
    elif algorithm == 'seq_feature_selector':
        k_features = 10
        selected_features, X_scaled, selector = perform_feature_selection(X, Y, k_features)

        regressor = LinearRegression()
        X_selected = X_scaled[:, list(selector.k_feature_idx_)]
        regressor.fit(X_selected, Y)
        feature_importances = regressor.coef_

        feature_importances_filtered = [feature_importances[i] for i in range(len(feature_importances)) if i in selector.k_feature_idx_]
        selected_features_filtered = [selected_features[i] for i in range(len(selected_features)) if i in selector.k_feature_idx_]

        importance_df = pd.DataFrame({
            'Feature': selected_features_filtered,
            'Importance': feature_importances_filtered
        }).sort_values(by='Importance', ascending=False)

        return importance_df
    elif algorithm == 'random_forest':
        k_features = 10
        selected_features, X_scaled, rf_regressor = random_forest(X, Y, k_features)

        importances = rf_regressor.feature_importances_
        selected_feature_indices = [i for i in range(len(feature_vars)) if feature_vars[i] in selected_features]
        filtered_importances = [importances[i] for i in selected_feature_indices]

        importance_df = pd.DataFrame({
            'Feature': selected_features,
            'Importance': filtered_importances
        }).sort_values(by='Importance', ascending=False)

        return importance_df
    else:
        print("Algorithm not recognized.")
        return None

if __name__ == "__main__":
    target = input(f"Please enter a target variable from the list {target_vars}: ")

    if target not in target_vars:
        print(f"Invalid target variable. Please choose from {target_vars}.")
    else:
        X = df[feature_vars]
        Y = df[target]

        algorithms = ['f_test_anova', 'mutual_info', 'extra_trees', 'seq_feature_selector', 'random_forest']

        results = {}

        with open(f"results_{target}.txt", 'w') as file:
            for algorithm in algorithms:
                print(f"Running {algorithm} on target variable '{target}'")
                file.write(f"Running {algorithm} on target variable '{target}'\n")
                result = run_algorithm(algorithm, X, Y)

                if result is not None:
                    print(result.head(10))
                    file.write(result.head(10).to_string())
                    file.write("\n\n")
                    results[algorithm] = result
                    print("\n")




Overwriting main.py


In [44]:
%%writefile main.py
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from seq_feature_selector import perform_feature_selection
from random_forest import random_forest
from f_test_anova import f_test_anova
from mutual_info import mutual_info
from extra_trees import extra_trees
from permutation_importance_svr import permutation_importance_svr

df = pd.read_csv("/content/drive/MyDrive/PVL/labelencodedjb.csv")

target_vars = ['reading_fee_paid', 'Number_of_Months', 'Coupon_Discount',
               'num_books', 'magazine_fee_paid', 'Renewal_Amount']
feature_vars = ['# created_date', 'id', 'created_by', 'transaction_branch_id',
                'transaction_type_id', 'amount_paid', 'Coupon_Discount',
                'magazine_fee_paid', 'payable_amount', 'reading_fee_paid',
                'reversed', 'over_due_adjustment_amount', 'last_card_number',
                'primus_amount', 'adjustment_amount', 'security_deposit',
                'Percentage_Share', 'subscription_id', 'member_branch_id',
                'transaction', 'Transaction_Month', 'Transaction_Year',
                'created_name', 'branch_name', 'branch_type', 'member_card',
                'Member_Name', 'email', 'Number_of_Months', 'display_name',
                'Renewal_Amount', 'Message_Status', 'Membership_expiry_date',
                'User_Name', 'applied_reward_points', 'over_due_adjustment_amount.1',
                'reading_fee_adjustment_amount', 'share_percentage',
                'membership_start_date', 'Member_status', 'no_of_deliveries',
                'num_books', 'num_magazine', 'referred_by', 'dailysales']

def run_algorithm(algorithm, X, Y):
    if algorithm == 'f_test_anova':
        result = f_test_anova(X, Y)
        result = result.rename(columns={'Score': 'Importance'})
        return result
    elif algorithm == 'mutual_info':
        return mutual_info(X, Y).rename(columns={'Score': 'Importance'})
    elif algorithm == 'extra_trees':
        return extra_trees(X, Y).rename(columns={'Score': 'Importance'})
    elif algorithm == 'permutation_importance_svr':
        feature_names = X.columns
        return permutation_importance_svr(X, Y, feature_names).rename(columns={'Score': 'Importance'})
    elif algorithm == 'seq_feature_selector':
        k_features = 10
        selected_features, X_scaled, selector = perform_feature_selection(X, Y, k_features)

        regressor = LinearRegression()
        X_selected = X_scaled[:, list(selector.k_feature_idx_)]
        regressor.fit(X_selected, Y)
        feature_importances = regressor.coef_

        feature_importances_filtered = [feature_importances[i] for i in range(len(feature_importances)) if i in selector.k_feature_idx_]
        selected_features_filtered = [selected_features[i] for i in range(len(selected_features)) if i in selector.k_feature_idx_]

        importance_df = pd.DataFrame({
            'Feature': selected_features_filtered,
            'Importance': feature_importances_filtered
        }).sort_values(by='Importance', ascending=False)

        return importance_df
    elif algorithm == 'random_forest':
        k_features = 10
        selected_features, X_scaled, rf_regressor = random_forest(X, Y, k_features)

        importances = rf_regressor.feature_importances_
        selected_feature_indices = [i for i in range(len(feature_vars)) if feature_vars[i] in selected_features]
        filtered_importances = [importances[i] for i in selected_feature_indices]

        importance_df = pd.DataFrame({
            'Feature': selected_features,
            'Importance': filtered_importances
        }).sort_values(by='Importance', ascending=False)

        return importance_df
    else:
        print("Algorithm not recognized.")
        return None

if __name__ == "__main__":
    target = input(f"Please enter a target variable from the list {target_vars}: ")

    if target not in target_vars:
        print(f"Invalid target variable. Please choose from {target_vars}.")
    else:
        X = df[feature_vars]
        Y = df[target]

        algorithms = ['f_test_anova', 'mutual_info', 'extra_trees', 'seq_feature_selector', 'random_forest']

        results = {}
        rank_data = pd.DataFrame(columns=['Feature'])

        with open(f"results_{target}.txt", 'w') as file:
            for algorithm in algorithms:
                print(f"Running {algorithm} on target variable '{target}'")
                file.write(f"Running {algorithm} on target variable '{target}'\n")
                result = run_algorithm(algorithm, X, Y)

                if result is not None:
                    print(result.head(10))
                    file.write(result.head(10).to_string())
                    file.write("\n\n")
                    results[algorithm] = result


                    result['Rank'] = result['Importance'].rank(ascending=False, method='dense')
                    rank_data = pd.merge(rank_data, result[['Feature', 'Rank']], on='Feature', how='outer', suffixes=('', f'_{algorithm}'))

            # Final rank based on the average rank from all algorithms
            rank_data['Final_Rank'] = rank_data.filter(like='Rank').mean(axis=1)
            rank_data = rank_data.sort_values('Final_Rank').reset_index(drop=True)

            print("\nFinal Ranking:\n")
            print(rank_data[['Feature', 'Final_Rank']])
            file.write("\nFinal Ranking:\n")
            file.write(rank_data[['Feature', 'Final_Rank']].to_string())
            file.write("\n")


Overwriting main.py


In [50]:
!python main.py


Please enter a target variable from the list ['reading_fee_paid', 'Number_of_Months', 'Coupon_Discount', 'num_books', 'magazine_fee_paid', 'Renewal_Amount']: Renewal_Amount
Running f_test_anova on target variable 'Renewal_Amount'
                         Feature    Importance
0                 # created_date  7.286637e+06
1         Membership_expiry_date  1.851281e+04
2          membership_start_date  6.743743e+03
3               Transaction_Year  4.700202e+01
4                             id  4.224208e+01
5                     created_by  1.528927e+01
6  reading_fee_adjustment_amount  1.434489e+01
7               last_card_number  1.154039e+01
8                   created_name  9.834789e+00
9                subscription_id  8.679315e+00
Running mutual_info on target variable 'Renewal_Amount'
                   Feature  Importance
30          Renewal_Amount    3.429734
0           # created_date    3.428264
44              dailysales    3.421237
1                       id    3.352121
32