In [43]:
import pandas as pd
import numpy as np
data = pd.read_csv("./clean_data_movieYear.csv")

In [44]:
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif

In [45]:
target = 'worldwide'
# Separate features and target
X = data.drop(target, axis=1)
y = data[target]

In [41]:
from sklearn.ensemble import RandomForestRegressor

def selection_top_K_rf(k):
    # Train a Random Forest model
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X, y)

    # Get feature importances and sort them in descending order
    feature_importances = pd.Series(rf.feature_importances_, index=X.columns)
    feature_importances = feature_importances.sort_values(ascending=False)
    print("Ranked features:\n", feature_importances)

    selected_features = feature_importances.nlargest(k).index

    # Print the selected features
#     print("\nSelected features:", list(selected_features))
    return X[list(selected_features)]
    

In [42]:
df_filtered_top_k = selection_top_K_rf(10)
df_filtered_top_k[target] = y
df_filtered_top_k.to_csv('movie_top_10_features_random_forest.csv')

Ranked features:
 budget_x                     5.014775e-01
movie_facebook_likes         8.334132e-02
producer                     4.552092e-02
director                     3.607235e-02
duration                     2.873918e-02
composer                     2.750718e-02
movie_year                   2.577489e-02
cinematographer              2.526525e-02
actor_1_name                 2.426645e-02
director_facebook_likes      2.303906e-02
cast_total_facebook_likes    2.178750e-02
actor_2_facebook_likes       1.974810e-02
writer                       1.798451e-02
actor_3_facebook_likes       1.647799e-02
actor_2_name                 1.445740e-02
actor_1_facebook_likes       1.252528e-02
actor_3_name                 1.199768e-02
main_actor_4                 1.150857e-02
Mystery                      4.265040e-03
Horror                       4.176167e-03
Drama                        3.975196e-03
Biography                    3.860317e-03
Thriller                     3.611810e-03
Comedy          

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered_top_k[target] = y


In [29]:
## FORWARD SELECTION
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

def forward_selection(k):
    # Forward selection
#     k = 3  # Number of features to select
    selected_features = []
    remaining_features = list(X.columns)
    best_score = -np.inf

    for i in range(k):
        scores = []
        for feature in remaining_features:
            candidate_features = selected_features + [feature]
            model = LinearRegression()
            score = np.mean(cross_val_score(model, X[candidate_features], y, cv=5))
            scores.append((score, feature))

        max_score, best_feature = max(scores, key=lambda x: x[0])
        if max_score > best_score:
            best_score = max_score
            selected_features.append(best_feature)
            remaining_features.remove(best_feature)
        else:
            break
    print(selected_features)
    return X[list(selected_features)]



In [31]:
forward_selection_top_k = forward_selection(20)
forward_selection_top_k[target] = y
forward_selection_top_k.to_csv('movie_top_20_features_forward_selection.csv')

['budget_x', 'movie_facebook_likes', 'director', 'producer', 'Drama', 'actor_3_name', 'Animation', 'actor_2_name', 'composer', 'actor_1_name', 'cinematographer', 'Crime', 'main_actor_4', 'Western', 'actor_2_facebook_likes', 'actor_1_facebook_likes']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  forward_selection_top_k[target] = y


In [8]:
## PCA feature extraction
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

def PCA_topk(k):

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    # Apply PCA
    pca = PCA(n_components=k)
    X_pca = pca.fit_transform(X_scaled)

    # Print the explained variance ratio
    print("Explained variance ratio:", pca.explained_variance_ratio_)
    print("Total pca explained variance ratio: " + str(np.sum(pca.explained_variance_ratio_)))

    # Convert the PCA features to a DataFrame
    X_pca_df = pd.DataFrame(X_pca, columns=[f'PC{i+1}' for i in range(k)])
    print("\nPCA features:\n", X_pca_df)
    return X_pca_df


def select_n_components_pca(X, min_explained_variance=0.95, max_n_components=None):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    if max_n_components is None:
        max_n_components = X_scaled.shape[1]
    
    for n_components in range(1, max_n_components + 1):
        pca = PCA(n_components=n_components)
        pca.fit(X_scaled)
        explained_variance = np.sum(pca.explained_variance_ratio_)
#         print("Total pca explained variance ratio: " + str(explained_variance))
        
        if explained_variance >= min_explained_variance:
            
            return n_components
    return max_n_components

In [9]:
print("Best number of n_components: " + str(select_n_components_pca(X)) + "\n\n")

X_pca_df = PCA_topk(33)

X_pca_df.to_csv("movie_33_features_PCA.csv")

Best number of n_components: 33


Explained variance ratio: [0.09981045 0.08518346 0.06238873 0.05325513 0.03826354 0.03450454
 0.03242393 0.03169554 0.03037209 0.02861556 0.02734949 0.02708487
 0.026479   0.02617257 0.02516799 0.02468903 0.02430663 0.02365174
 0.0224019  0.02078311 0.02007168 0.01981134 0.01947987 0.01837068
 0.01697019 0.01645921 0.01637367 0.0154827  0.01467574 0.01276863
 0.01266677 0.01162149 0.01129535]
Total pca explained variance ratio: 0.950646617294324

PCA features:
           PC1       PC2       PC3       PC4       PC5       PC6       PC7  \
0    3.418088 -1.270470 -3.042267 -1.937114 -1.526693  1.405934  0.180697   
1    5.001858 -0.676549 -0.582073 -1.198566 -0.534209 -0.170420  1.684263   
2    9.157315  2.573989  3.438634 -0.778412  0.533007  1.338758 -1.694820   
3    2.791529 -1.449460 -3.141223 -1.399874 -1.696261  0.967209 -1.224623   
4    4.356266  0.170318 -1.006869 -0.604168 -0.862799 -0.201518  0.966824   
..        ...       ...       ...     

In [21]:
len(df_filtered_top_k)

1648

In [23]:
df_filtered_top_k.keys()

Index(['budget_x', 'movie_facebook_likes', 'director', 'producer', 'duration',
       'composer', 'cinematographer', 'actor_1_name',
       'director_facebook_likes', 'actor_2_facebook_likes',
       'cast_total_facebook_likes', 'writer', 'actor_3_facebook_likes',
       'actor_2_name', 'actor_1_facebook_likes', 'main_actor_4',
       'actor_3_name', 'Horror', 'Mystery', 'Biography', 'worldwide'],
      dtype='object')