# Table of Contents

- [Import Libraries](#section1)
- [load Datasets](#section2)
- [standardize the data](#section3)
- [Methods](#section4)
    -    [Decision Tree](#section41)
    -    [Random Forest](#section42)
    -    [AdaBoost](#section43)
    -    [XGBoost](#section44)



## 1. Import necessary libraries <a id="section1"></a>

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import ast
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from xgboost import XGBRegressor



## 2. Load Datasets <a id="section2"></a>

In [2]:
# Read the Parquet files into DataFrames
train = pd.read_parquet("./Data/train.parquet")
test = pd.read_parquet("./Data/test.parquet")
val = pd.read_parquet("./Data/val.parquet")
train_val = pd.read_parquet("./Data/train_val.parquet")

# Display the first few rows of each DataFrame to confirm
train.head()


Unnamed: 0,user_id,film,date,target,rating,year,watch_count,fan_count,like_count,review_count,...,'sentiment'_Editing_negative,'sentiment'_Special_Effects_negative,'sentiment'_Other_neutral,'sentiment'_Other_negative,watchlist_length,films_watched,films_this_year,lists_created,following,followers
0,dustymoth,salems-lot-2024,2024-10-06,1.0,2.39,2024.0,104715.0,16.0,14213.0,32580.0,...,0.0,0.0,0.0,0.0,72.0,2402.0,48.0,84.0,32.0,26.0
1,dustymoth,alien-3,2024-09-07,4.0,2.82,1992.0,487939.0,269.0,68239.0,61964.0,...,1.0,0.0,0.0,0.0,72.0,2402.0,48.0,84.0,32.0,26.0
2,dustymoth,lock-stock-and-two-smoking-barrels,2024-09-06,0.5,3.99,1998.0,363721.0,3624.0,96479.0,21158.0,...,0.0,0.0,0.0,0.0,72.0,2402.0,48.0,84.0,32.0,26.0
3,dustymoth,morocco,2024-08-21,3.0,3.58,1930.0,19531.0,63.0,4734.0,3366.0,...,0.0,0.0,0.0,0.0,72.0,2402.0,48.0,84.0,32.0,26.0
4,dustymoth,maxxxine,2024-08-06,1.5,3.09,2024.0,775925.0,2112.0,193118.0,254823.0,...,0.0,0.0,0.0,0.0,72.0,2402.0,48.0,84.0,32.0,26.0


## 3. standardize the data <a id="section3"></a>

In [3]:
target_column = 'target'

# Separate the target column from the features
train_features = train.drop(columns=[target_column])
val_features = val.drop(columns=[target_column])

# Separate numeric columns
numeric_cols = train_features.select_dtypes(include=['number']).columns

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the numeric columns of the train data and transform both train and validation sets
train_features_scaled = train_features.copy()
val_features_scaled = val_features.copy()

train_features_scaled[numeric_cols] = scaler.fit_transform(train_features[numeric_cols])
val_features_scaled[numeric_cols] = scaler.transform(val_features[numeric_cols])

# Add the target column back to the scaled features
train_scaled = pd.concat([train_features_scaled, train[target_column]], axis=1)
val_scaled = pd.concat([val_features_scaled, val[target_column]], axis=1)

# ⭐ CORRECT PART HERE
X_train = train_scaled[numeric_cols]
y_train = train_scaled[target_column]

X_val = val_scaled[numeric_cols]
y_val = val_scaled[target_column]


## Result DataFrame

In [6]:
# Create empty dataframe
results_df = pd.DataFrame(columns=['model_name', 'param', 'rmse', 'mse'])

results_df.head()


Unnamed: 0,model_name,param,rmse,mse


## 4 Methods <a id="section4"></a>

## 4.1 Decision Tree <a id="section41"></a>

In [4]:
param_grid = [ {'max_depth': [2,5,10,20]},]
DT_reg = DecisionTreeRegressor()
grid_search = GridSearchCV(DT_reg, param_grid, cv=5,   scoring='neg_mean_squared_error', return_train_score=True ,n_jobs=2) 
grid_search.fit(X_train, y_train)
grid_search.best_params_

{'max_depth': 10}

In [9]:
# 1. Train Decision Tree with best param
best_DT_reg = DecisionTreeRegressor(max_depth=10)
best_DT_reg.fit(X_train, y_train)

# 2. Predict on validation set
y_pred_val = best_DT_reg.predict(X_val)

# 3. Calculate MSE and RMSE
mse = mean_squared_error(y_val, y_pred_val)
rmse = np.sqrt(mse)

# 4. Save result
results_df = pd.concat([results_df, pd.DataFrame({
    'model_name': ['DecisionTreeRegressor'],
    'param': [{'max_depth':'10'}],
    'rmse': [rmse],
    'mse': [mse]
})], ignore_index=True)

# Display the results
results_df.head()


  results_df = pd.concat([results_df, pd.DataFrame({


Unnamed: 0,model_name,param,rmse,mse
0,DecisionTreeRegressor,{'max_depth': '10'},0.906551,0.821834


## 4.2 Random Forest <a id="section42"></a>

In [11]:
param_grid = [{'n_estimators': [100, 500], 'max_features': [10, 20, 50],'max_depth': [5, 10, 20]},]
RF_reg = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(RF_reg, param_grid, cv=5,scoring='neg_mean_squared_error', return_train_score=True ,n_jobs=2)
grid_search.fit(X_train, y_train)
grid_search.best_params_

{'max_depth': 20, 'max_features': 50, 'n_estimators': 500}

In [None]:
# 1. Train Decision Tree with best param
best_RF_reg = RandomForestRegressor(max_depth=20, max_features=50, n_estimators=500 ,n_jobs=2)
best_RF_reg.fit(X_train, y_train)

# 2. Predict on validation set
y_pred_val = best_RF_reg.predict(X_val)

# 3. Calculate MSE and RMSE
mse = mean_squared_error(y_val, y_pred_val)
rmse = np.sqrt(mse)

# 4. Save result
results_df = pd.concat([results_df, pd.DataFrame({
    'model_name': ['RandomForestRegressor'],
    'param': [{'max_depth': 20, 'max_features': 50, 'n_estimators': 500}],
    'rmse': [rmse],
    'mse': [mse]
})], ignore_index=True)

# Display the results
results_df.head()


In [13]:
results_df.to_csv("./Data/results.csv")

In [None]:
# Get feature importances
importances = best_RF_reg.feature_importances_

# If you have feature names (for example, from a DataFrame)
feature_names = X_train.columns  # assuming X_train is a pandas DataFrame

# Create a DataFrame for easier viewing
feature_importances_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
})

# Sort by importance descending
feature_importances_df = feature_importances_df.sort_values(by='importance', ascending=False)

# Display top features
feature_importances_df.head(20)  # Top 10 important features


Unnamed: 0,feature,importance
0,rating,0.378142
3,fan_count,0.082716
87,films_watched,0.077993
88,films_this_year,0.061854
86,watchlist_length,0.0614
90,following,0.057185
91,followers,0.055166
89,lists_created,0.037756
4,like_count,0.026994
6,list_count,0.019628


## 4.3 Ada Boost <a id="section43"></a>

In [24]:
# 1. Drop NaNs from X_train and y_train
X_train = pd.DataFrame(X_train)
y_train = pd.Series(y_train)

train_data = pd.concat([X_train, y_train.rename('target')], axis=1)
train_data = train_data.dropna()

X_train = train_data.drop('target', axis=1)
y_train = train_data['target']

# 2. Drop NaNs from X_val and y_val
X_val = pd.DataFrame(X_val)
y_val = pd.Series(y_val)

val_data = pd.concat([X_val, y_val.rename('target')], axis=1)
val_data = val_data.dropna()

X_val = val_data.drop('target', axis=1)
y_val = val_data['target']

# 3. Set up AdaBoost and parameter grid
ada_reg = AdaBoostRegressor(random_state=42)

param_grid_ada = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1],
    'loss': ['linear', 'square', 'exponential']
}

# 4. Set up GridSearchCV
grid_search_ada = GridSearchCV(ada_reg, param_grid_ada, 
                               cv=5,
                               scoring='neg_mean_squared_error',
                               return_train_score=True,
                               n_jobs=2)

# 5. Fit the model
grid_search_ada.fit(X_train, y_train)

# 6. Best parameters
grid_search_ada.best_params_


{'learning_rate': 0.01, 'loss': 'exponential', 'n_estimators': 200}

In [26]:
# 1. Best model
best_model = grid_search_ada.best_estimator_

# 2. Predict on validation set
y_pred = best_model.predict(X_val)

# 3. Calculate MSE and RMSE
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)

# 4. Save result
results_df = pd.concat([results_df, pd.DataFrame({
    'model_name': ['AdaBoostRegressor'],
    'param': [{'learning_rate': 0.01, 'loss': 'exponential', 'n_estimators': 200}],
    'rmse': [rmse],
    'mse': [mse]
})], ignore_index=True)

# Display the results
results_df.head()



Unnamed: 0,model_name,param,rmse,mse
0,DecisionTreeRegressor,{'max_depth': '10'},0.906551,0.821834
1,RandomForestRegressor,"{'max_depth': 20, 'max_features': 50, 'n_estim...",0.881634,0.777278
2,AdaBoostRegressor,"{'learning_rate': 0.01, 'loss': 'exponential',...",0.915916,0.838903


## 4.4 XGBoost <a id="section44"></a>

In [29]:
# 1. Set up XGBoost regressor
xgb_reg = XGBRegressor(random_state=42)

# 2. Define parameter grid
param_grid_xgb = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.7, 1.0]
}

# 3. Set up GridSearchCV
grid_search_xgb = GridSearchCV(
    xgb_reg,
    param_grid_xgb,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
    n_jobs=2
)

# 4. Fit the model
grid_search_xgb.fit(X_train, y_train)

# 5. Best parameters
grid_search_xgb.best_params_


{'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.7}

In [30]:
# 1. Best model
best_model = grid_search_xgb.best_estimator_

# 2. Predict on validation set
y_pred = best_model.predict(X_val)

# 3. Calculate MSE and RMSE
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)

# 4. Save result
results_df = pd.concat([results_df, pd.DataFrame({
    'model_name': ['XGBoostRegressor'],
    'param': [{'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.7}],
    'rmse': [rmse],
    'mse': [mse]
})], ignore_index=True)

# Display the results
results_df.head()



Unnamed: 0,model_name,param,rmse,mse
0,DecisionTreeRegressor,{'max_depth': '10'},0.906551,0.821834
1,RandomForestRegressor,"{'max_depth': 20, 'max_features': 50, 'n_estim...",0.881634,0.777278
2,AdaBoostRegressor,"{'learning_rate': 0.01, 'loss': 'exponential',...",0.915916,0.838903
3,XGBoostRegressor,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.858019,0.736197
