In [20]:
import pandas as pd
import numpy as np
# Load the cleaned dataset from the current working directory.
data = pd.read_csv(filepath_or_buffer="./cleaned_dataset.csv")

In [21]:
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif

In [22]:
# Column to predict; every other column of `data` is used as a feature.
target = 'worldwide'
y = data[target]
X = data.drop(columns=target)

In [32]:
from sklearn.ensemble import RandomForestRegressor

def selection_top_K_rf(k):
    """Rank the columns of ``X`` by random-forest importance and keep the top ``k``.

    Fits a 100-tree ``RandomForestRegressor`` (``random_state=42``) on the
    module-level ``X`` and ``y``, prints the complete importance ranking and
    returns the ``k`` highest-scoring columns.

    Parameters
    ----------
    k : int
        Number of features to keep.

    Returns
    -------
    pandas.DataFrame
        The columns of ``X`` with the ``k`` largest importances.
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=42).fit(X, y)

    # One importance value per column of X, highest first.
    ranking = pd.Series(forest.feature_importances_, index=X.columns).sort_values(ascending=False)
    print("Ranked features:\n", ranking)

    top_columns = list(ranking.nlargest(k).index)
    return X[top_columns]
    

In [43]:
# Keep the 20 highest-ranked random-forest features and write them to CSV.
df_filtered_top_k = selection_top_K_rf(k=20)
df_filtered_top_k.to_csv(path_or_buf='movie_top_20_features_random_forest.csv')

Ranked features:
 budget_x                     0.476234
duration                     0.071018
movie_facebook_likes         0.070424
actor_3_name                 0.042093
actor_3_facebook_likes       0.036529
producer                     0.030277
cinematographer              0.028529
actor_1_name                 0.028055
cast_total_facebook_likes    0.025967
actor_2_facebook_likes       0.025862
main_actor_4                 0.021417
writer                       0.019563
director                     0.018828
composer                     0.018310
director_facebook_likes      0.017571
actor_1_facebook_likes       0.016109
actor_2_name                 0.013525
Adventure                    0.006817
Animation                    0.005679
mpaa                         0.005630
Drama                        0.003083
Fantasy                      0.003001
Sci-Fi                       0.002302
Musical                      0.002086
Action                       0.001997
Comedy                       0.0

In [36]:
## FORWARD SELECTION
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

def forward_selection(k):
    """Greedily pick up to ``k`` columns of ``X`` by forward selection.

    In each round every remaining column is tried together with the columns
    chosen so far. A candidate set is scored by the mean of a 5-fold
    ``cross_val_score`` of a ``LinearRegression`` (the estimator's default
    score, i.e. R^2). The best candidate is kept only if it strictly improves
    on the best score seen so far; otherwise the search stops early, so fewer
    than ``k`` columns may be returned.

    Parameters
    ----------
    k : int
        Maximum number of features to select.

    Returns
    -------
    pandas.DataFrame
        The selected columns of the module-level ``X``, in selection order.
    """
    selected_features = []
    remaining_features = list(X.columns)
    best_score = -np.inf
    # One instance is enough: cross_val_score fits clones of the estimator.
    model = LinearRegression()

    for _ in range(k):
        # k may exceed the number of columns; stop once everything has been
        # selected instead of calling max() on an empty list (ValueError).
        if not remaining_features:
            break

        scores = [
            (np.mean(cross_val_score(model, X[selected_features + [feature]], y, cv=5)), feature)
            for feature in remaining_features
        ]

        max_score, best_feature = max(scores, key=lambda pair: pair[0])
        if max_score > best_score:
            best_score = max_score
            selected_features.append(best_feature)
            remaining_features.remove(best_feature)
        else:
            # No candidate improves the cross-validated score any further.
            break

    print(selected_features)
    return X[list(selected_features)]



In [45]:
# Ask for up to 20 features. The search can stop early, so the CSV may hold
# fewer than 20 columns (the printed list shows what was actually kept).
forward_selection_top_k = forward_selection(k=20)
forward_selection_top_k.to_csv(path_or_buf='movie_top_20_features_forward_selection.csv')

['budget_x', 'movie_facebook_likes', 'War', 'actor_3_facebook_likes', 'Drama', 'Action', 'cast_total_facebook_likes', 'cinematographer', 'mpaa', 'composer']


In [69]:
## PCA feature extraction
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

def PCA_topk(k):
    """Project the standardized columns of ``X`` onto ``k`` principal components.

    The module-level ``X`` is scaled with ``StandardScaler`` and reduced with
    ``PCA(n_components=k)``. The per-component and total explained-variance
    ratios, followed by the resulting frame, are printed.

    Parameters
    ----------
    k : int
        Number of principal components to keep.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``X`` (same index) and one column per component,
        named ``PC1``, ``PC2``, ...
    """
    X_scaled = StandardScaler().fit_transform(X)

    pca = PCA(n_components=k)
    X_pca = pca.fit_transform(X_scaled)

    print("Explained variance ratio:", pca.explained_variance_ratio_)
    print("Total pca explained variance ratio: " + str(np.sum(pca.explained_variance_ratio_)))

    # Reuse X's row index so the components stay aligned with X and y even
    # when X does not carry a default 0..n-1 index. Column names follow the
    # number of components PCA actually produced.
    X_pca_df = pd.DataFrame(
        X_pca,
        index=X.index,
        columns=[f'PC{i+1}' for i in range(X_pca.shape[1])],
    )
    print("\nPCA features:\n", X_pca_df)
    return X_pca_df


def select_n_components_pca(X, min_explained_variance=0.95, max_n_components=None):
    """Return the smallest number of principal components that reaches a variance target.

    ``X`` is standardized with ``StandardScaler`` and decomposed once; the
    cumulative explained-variance ratio is then scanned for the first
    component count that meets ``min_explained_variance``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to analyse.
    min_explained_variance : float, default 0.95
        Cumulative explained-variance ratio that has to be reached.
    max_n_components : int or None, default None
        Largest component count to consider; ``None`` means the number of
        columns of ``X``.

    Returns
    -------
    int
        The smallest component count whose cumulative ratio is at least
        ``min_explained_variance``, or ``max_n_components`` if the target is
        not reached within that limit.
    """
    X_scaled = StandardScaler().fit_transform(X)

    if max_n_components is None:
        max_n_components = X_scaled.shape[1]

    # PCA cannot produce more components than min(n_samples, n_features).
    n_fit = min(max_n_components, *X_scaled.shape)
    if n_fit < 1:
        return max_n_components

    # A single exact decomposition is enough: the leading explained-variance
    # ratios do not depend on how many components are requested, so there is
    # no need to refit PCA for every candidate count.
    pca = PCA(n_components=n_fit, svd_solver="full")
    pca.fit(X_scaled)
    cumulative = np.cumsum(pca.explained_variance_ratio_)

    reached = np.flatnonzero(cumulative >= min_explained_variance)
    if reached.size:
        # Position 0 corresponds to one component.
        return int(reached[0]) + 1
    return max_n_components

In [72]:
# Report the smallest component count that meets the default variance target.
print(f"Best number of n_components: {select_n_components_pca(X)}\n\n")

# Build the 33-component projection and write it to CSV.
X_pca_df = PCA_topk(k=33)
X_pca_df.to_csv(path_or_buf="movie_33_features_PCA.csv")

Best number of n_components: 33


Explained variance ratio: [0.09358793 0.08381232 0.06170242 0.05281869 0.03921347 0.0353663
 0.03314483 0.032474   0.03113107 0.02925178 0.027917   0.0277049
 0.02696581 0.02682506 0.02573544 0.0252641  0.0246973  0.02424298
 0.02264085 0.02109007 0.02057138 0.02009106 0.01994568 0.01772651
 0.01712843 0.01686927 0.01619037 0.01581269 0.01464657 0.01298353
 0.01273594 0.01191182 0.01155672]
Total pca explained variance ratio: 0.9537563073611698

PCA features:
           PC1       PC2       PC3       PC4       PC5       PC6       PC7  \
0    1.814842  1.776478 -2.891774 -2.799604 -1.493521  1.412760  0.232862   
1    3.724712  2.340243 -0.790350 -1.615667 -0.526416 -0.176052  1.678845   
2    9.408334  2.102616  2.533374 -0.863711  0.556569  1.349779 -1.737072   
3    1.186898  1.670752 -3.033047 -2.242103 -1.663696  0.977912 -1.182994   
4    3.528630  1.281801 -1.179892 -1.015578 -0.856827 -0.205374  0.927110   
..        ...       ...       ...      