In [1]:
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split # Importing train_test_split for splitting data into training and testing sets
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [4]:
from xgboost import XGBRegressor

In [5]:
from sklearn.linear_model import LinearRegression,Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [6]:
df=pd.read_csv(r"C:/Users/KIRUBA/OneDrive/Desktop/youtube/youtube_ad_revenue_dataset.csv")


In [7]:
df.head(5)

Unnamed: 0,views,likes,comments,watch_time_minutes,subscribers,category,device,country,engagement_rate,ad_revenue_usd
0,9936,1221.0,320.0,26497.214184,228086,Entertainment,TV,IN,0.155093,203.178237
1,10017,642.0,346.0,15209.747445,736015,Gaming,Tablet,CA,0.098632,140.880508
2,10097,1979.0,187.0,57332.658498,240534,Education,TV,CA,0.214519,360.134008
3,10034,1191.0,242.0,31334.517771,434482,Entertainment,Mobile,UK,0.142814,224.638261
4,9889,1858.0,477.0,15665.666434,42030,Education,Mobile,CA,0.236121,165.514388


In [8]:
df.describe()

Unnamed: 0,views,likes,comments,watch_time_minutes,subscribers,engagement_rate,ad_revenue_usd
count,120000.0,120000.0,120000.0,120000.0,120000.0,120000.0,120000.0
mean,9999.832333,1044.605792,260.63235,37539.823485,502291.97005,0.13053,252.711361
std,99.918405,560.218383,139.884826,12658.95771,288364.967705,0.057661,61.954125
min,9521.0,0.0,0.0,14659.105562,1005.0,0.0,126.590603
25%,9933.0,578.0,145.0,26949.914101,252641.5,0.084071,199.892158
50%,10000.0,1054.0,262.0,37522.221205,503633.5,0.131243,252.678607
75%,10067.0,1524.0,381.0,48209.880123,752386.25,0.178496,305.613497
max,10468.0,2061.0,515.0,61557.670089,999997.0,0.249554,382.768254


#####  Encode Categorical Variables

In [9]:
# 1. Encode categorical features
encoder_onehot = OneHotEncoder(drop='first', sparse_output=False) # drop='first' to avoid dummy variable trap / # or sparse=True for large data
encoder_onehot = encoder_onehot.fit_transform(df[['category', 'device', 'country']]) # fit and transform categorical columns
encoder_onehot

array([[1., 0., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(120000, 13))

In [10]:
# 2. Define features and target
X=df.drop('ad_revenue_usd',axis=1) # independent variable / Features / X
Y=df['ad_revenue_usd'] # dependent variable / Target/Label / y

In [11]:
# 3. Split data
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

### Feature Scaling

In [12]:
# Define columns based on scaling strategy
minmax_cols = ["watch_time_minutes"]        # wide range → MinMaxScaler
robust_cols = ["subscribers"]               # heavy outliers → RobustScaler
standard_cols = ["likes", "comments"]       # normal-like → StandardScaler
# engagement_rate already small (0–0.25), no scaling needed

# Initialize scalers
scaler_minmax = MinMaxScaler()
scaler_robust = RobustScaler()
scaler_standard = StandardScaler()

# Work on copies (avoid modifying original data)
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Apply RobustScaler
X_train_scaled[robust_cols] = scaler_robust.fit_transform(X_train[robust_cols])
X_test_scaled[robust_cols] = scaler_robust.transform(X_test[robust_cols])

# Apply MinMaxScaler
X_train_scaled[minmax_cols] = scaler_minmax.fit_transform(X_train[minmax_cols])
X_test_scaled[minmax_cols] = scaler_minmax.transform(X_test[minmax_cols])

# Apply StandardScaler
X_train_scaled[standard_cols] = scaler_standard.fit_transform(X_train[standard_cols])
X_test_scaled[standard_cols] = scaler_standard.transform(X_test[standard_cols])

# Identify remaining columns
numeric_cols = X.select_dtypes(include=['float64', 'int64']).columns.difference(
    robust_cols + minmax_cols + standard_cols
)
categorical_cols = X.select_dtypes(include=['object']).columns

#### Preprocessing with ColumnTransformer

In [13]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num_standard', StandardScaler(), numeric_cols),
        ('num_robust', RobustScaler(), robust_cols),
        ('num_minmax', MinMaxScaler(), minmax_cols),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_cols)
    ]
)

### LinearRegression

In [14]:
pipe_lr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

pipe_lr.fit(X_train, y_train)

y_train_pred_lr = pipe_lr.predict(X_train)
y_test_pred_lr = pipe_lr.predict(X_test)  

print('LinearRegression Evaluation')
print('Train r2_score:',r2_score(y_train,y_train_pred_lr))
print('Test r2_score:',r2_score(y_test,y_test_pred_lr))
print('---'*10)
print('Train mean_squared_error:',mean_squared_error(y_train,y_train_pred_lr))
print('Test mean_squared_error:',mean_squared_error(y_test,y_test_pred_lr))
print('---'*10)
print('Train mean_absolute_error:',mean_absolute_error(y_train,y_train_pred_lr))
print('Test mean_absolute_error:',mean_absolute_error(y_test,y_test_pred_lr))
print('---'*10)

LinearRegression Evaluation
Train r2_score: 0.9457120317378575
Test r2_score: 0.9481905947210313
------------------------------
Train mean_squared_error: 208.1174516744817
Test mean_squared_error: 199.42345864002453
------------------------------
Train mean_absolute_error: 4.967561983000418
Test mean_absolute_error: 4.811127662194396
------------------------------


###polynomial

In [15]:
from sklearn.feature_selection import SelectKBest, f_regression

pipe_lr_poly = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('select', SelectKBest(score_func=f_regression, k=100)),  # keep best 100 features
    ('model', LinearRegression())
])

pipe_lr_poly.fit(X_train, y_train)

y_train_pred_lr = pipe_lr_poly.predict(X_train)
y_test_pred_lr = pipe_lr_poly.predict(X_test)  

print('PolynomialFeatures Evaluation')
print('Train r2_score:',r2_score(y_train,y_train_pred_lr))
print('Test r2_score:',r2_score(y_test,y_test_pred_lr))
print('---'*10)
print('Train mean_squared_error:',mean_squared_error(y_train,y_train_pred_lr))
print('Test mean_squared_error:',mean_squared_error(y_test,y_test_pred_lr))
print('---'*10)
print('Train mean_absolute_error:',mean_absolute_error(y_train,y_train_pred_lr))
print('Test mean_absolute_error:',mean_absolute_error(y_test,y_test_pred_lr))
print('---'*10)
#3s

PolynomialFeatures Evaluation
Train r2_score: 0.9468890561222526
Test r2_score: 0.9492270921938113
------------------------------
Train mean_squared_error: 203.60523058239343
Test mean_squared_error: 195.43379865878288
------------------------------
Train mean_absolute_error: 4.767458900923363
Test mean_absolute_error: 4.629087664099878
------------------------------



##### Lasso Regression (L1)

In [16]:
pipe_lasso = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', Lasso(alpha=0.1, random_state=42))
])

pipe_lasso.fit(X_train, y_train)

y_train_pred_lasso = pipe_lasso.predict(X_train)
y_test_pred_lasso = pipe_lasso.predict(X_test)

print("Lasso Regression Evaluation")
print('Train r2_score:', r2_score(y_train, y_train_pred_lasso))
print('Test r2_score:', r2_score(y_test, y_test_pred_lasso))
print('---'*10)
print('Train MSE:', mean_squared_error(y_train, y_train_pred_lasso))
print('Test MSE:', mean_squared_error(y_test, y_test_pred_lasso))
print('---'*10)
print('Train MAE:', mean_absolute_error(y_train, y_train_pred_lasso))
print('Test MAE:', mean_absolute_error(y_test, y_test_pred_lasso))
print('---'*10)
#4s

Lasso Regression Evaluation
Train r2_score: 0.9456676211249159
Test r2_score: 0.9481540635962711
------------------------------
Train MSE: 208.2877034611773
Test MSE: 199.5640733258806
------------------------------
Train MAE: 5.036362934325149
Test MAE: 4.880260739291318
------------------------------


##### Ridge Regression (L2)

In [17]:
pipe_ridge = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', Ridge(alpha=1.0, random_state=42))
])

pipe_ridge.fit(X_train, y_train)

y_train_pred_ridge = pipe_ridge.predict(X_train)
y_test_pred_ridge = pipe_ridge.predict(X_test)

print("Ridge Regression Evaluation")
print('Train r2_score:', r2_score(y_train, y_train_pred_ridge))
print('Test r2_score:', r2_score(y_test, y_test_pred_ridge))
print('---'*10)
print('Train MSE:', mean_squared_error(y_train, y_train_pred_ridge))
print('Test MSE:', mean_squared_error(y_test, y_test_pred_ridge))
print('---'*10)
print('Train MAE:', mean_absolute_error(y_train, y_train_pred_ridge))
print('Test MAE:', mean_absolute_error(y_test, y_test_pred_ridge))
print('---'*10)
#4s

Ridge Regression Evaluation
Train r2_score: 0.9457120069337913
Test r2_score: 0.9481903885285918
------------------------------
Train MSE: 208.1175467629368
Test MSE: 199.42425231077206
------------------------------
Train MAE: 4.967631840440615
Test MAE: 4.811246329309589
------------------------------


###### Elastic Net (L1 + L2)

In [18]:
pipe_elastic = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42))
])

pipe_elastic.fit(X_train, y_train)

y_train_pred_elastic = pipe_elastic.predict(X_train)
y_test_pred_elastic = pipe_elastic.predict(X_test)

print("Elastic Net Regression Evaluation")
print('Train r2_score:', r2_score(y_train, y_train_pred_elastic))
print('Test r2_score:', r2_score(y_test, y_test_pred_elastic))
print('---'*10)
print('Train MSE:', mean_squared_error(y_train, y_train_pred_elastic))
print('Test MSE:', mean_squared_error(y_test, y_test_pred_elastic))
print('---'*10)
print('Train MAE:', mean_absolute_error(y_train, y_train_pred_elastic))
print('Test MAE:', mean_absolute_error(y_test, y_test_pred_elastic))
print('---'*10)
#4s

Elastic Net Regression Evaluation
Train r2_score: 0.790574548812681
Test r2_score: 0.7921442520376601
------------------------------
Train MSE: 802.8499244330206
Test MSE: 800.0731128578626
------------------------------
Train MAE: 23.464689506351117
Test MAE: 23.422626567002936
------------------------------


## RandomForestRegressor

In [19]:
pipe_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=200,max_depth=10,max_features='sqrt', random_state=42))
])

pipe_rf.fit(X_train, y_train)

y_train_pred_rf = pipe_rf.predict(X_train)
y_test_pred_rf = pipe_rf.predict(X_test)

print('RandomForestRegressor Evaluation')
print('Train r2_score:',r2_score(y_train,y_train_pred_lr))
print('Test r2_score:',r2_score(y_test,y_test_pred_lr))
print('---'*10)
print('Train mean_squared_error:',mean_squared_error(y_train,y_train_pred_lr))
print('Test mean_squared_error:',mean_squared_error(y_test,y_test_pred_lr))
print('---'*10)
print('Train mean_absolute_error:',mean_absolute_error(y_train,y_train_pred_lr))
print('Test mean_absolute_error:',mean_absolute_error(y_test,y_test_pred_lr))
print('---'*10)
#2m

RandomForestRegressor Evaluation
Train r2_score: 0.9468890561222526
Test r2_score: 0.9492270921938113
------------------------------
Train mean_squared_error: 203.60523058239343
Test mean_squared_error: 195.43379865878288
------------------------------
Train mean_absolute_error: 4.767458900923363
Test mean_absolute_error: 4.629087664099878
------------------------------


## GradientBoostingRegressor

In [20]:
pipe_gb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', GradientBoostingRegressor(n_estimators=100, random_state=42))
])

pipe_gb.fit(X_train, y_train)

y_train_pred_gb = pipe_gb.predict(X_train)
y_test_pred_gb = pipe_gb.predict(X_test)

print('GradientBoostingRegressor Evaluation')
print('Train r2_score:',r2_score(y_train,y_train_pred_lr))
print('Test r2_score:',r2_score(y_test,y_test_pred_lr))
print('---'*10)
print('Train mean_squared_error:',mean_squared_error(y_train,y_train_pred_lr))
print('Test mean_squared_error:',mean_squared_error(y_test,y_test_pred_lr))
print('---'*10)
print('Train mean_absolute_error:',mean_absolute_error(y_train,y_train_pred_lr))
print('Test mean_absolute_error:',mean_absolute_error(y_test,y_test_pred_lr))
print('---'*10)
#49s

GradientBoostingRegressor Evaluation
Train r2_score: 0.9468890561222526
Test r2_score: 0.9492270921938113
------------------------------
Train mean_squared_error: 203.60523058239343
Test mean_squared_error: 195.43379865878288
------------------------------
Train mean_absolute_error: 4.767458900923363
Test mean_absolute_error: 4.629087664099878
------------------------------


## XGBRegressor

In [21]:
pipe_xgb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(n_estimators=100, random_state=42, eval_metric='rmse'))
])

pipe_xgb.fit(X_train, y_train)

y_train_pred_xgb = pipe_xgb.predict(X_train)
y_test_pred_xgb = pipe_xgb.predict(X_test)

print('XGBRegressor Evaluation')
print('Train r2_score:',r2_score(y_train,y_train_pred_lr))
print('Test r2_score:',r2_score(y_test,y_test_pred_lr))
print('---'*10)
print('Train mean_squared_error:',mean_squared_error(y_train,y_train_pred_lr))
print('Test mean_squared_error:',mean_squared_error(y_test,y_test_pred_lr))
print('---'*10)
print('Train mean_absolute_error:',mean_absolute_error(y_train,y_train_pred_lr))
print('Test mean_absolute_error:',mean_absolute_error(y_test,y_test_pred_lr))
print('---'*10)
# 2.2s

XGBRegressor Evaluation
Train r2_score: 0.9468890561222526
Test r2_score: 0.9492270921938113
------------------------------
Train mean_squared_error: 203.60523058239343
Test mean_squared_error: 195.43379865878288
------------------------------
Train mean_absolute_error: 4.767458900923363
Test mean_absolute_error: 4.629087664099878
------------------------------


## SVR

In [22]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

pipe_svr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', SVR(kernel='rbf'))
])

# Train
pipe_svr.fit(X_train, y_train)

# Predictions
y_train_pred_svr = pipe_svr.predict(X_train)
y_test_pred_svr = pipe_svr.predict(X_test)

# Evaluation
print('SVR Evaluation')
print('Train r2_score:', r2_score(y_train, y_train_pred_svr))
print('Test r2_score:', r2_score(y_test, y_test_pred_svr))
print('---'*10)
print('Train mean_squared_error:', mean_squared_error(y_train, y_train_pred_svr))
print('Test mean_squared_error:', mean_squared_error(y_test, y_test_pred_svr))
print('---'*10)
print('Train mean_absolute_error:', mean_absolute_error(y_train, y_train_pred_svr))
print('Test mean_absolute_error:', mean_absolute_error(y_test, y_test_pred_svr))
print('---'*10)


SVR Evaluation
Train r2_score: 0.9406527641303316
Test r2_score: 0.9426389661201056
------------------------------
Train mean_squared_error: 227.51257577883655
Test mean_squared_error: 220.7926477036734
------------------------------
Train mean_absolute_error: 5.950567762821795
Test mean_absolute_error: 5.880245543456783
------------------------------


In [23]:
import pickle
pickle.dump(pipe_lr, open("pipe_lr.pkl", "wb"))
pickle.dump(pipe_lr_poly, open("pipe_lr_poly.pkl", "wb"))
pickle.dump(pipe_lasso, open("pipe_lasso.pkl", "wb"))
pickle.dump(pipe_ridge, open("pipe_ridge.pkl", "wb"))
pickle.dump(pipe_elastic, open("pipe_elastic.pkl", "wb"))
pickle.dump(pipe_rf, open("pipe_rf.pkl", "wb"))
pickle.dump(pipe_gb, open("pipe_gb.pkl", "wb"))
pickle.dump(pipe_xgb , open("pipe_xgb .pkl", "wb"))
pickle.dump(preprocessor , open("preprocessor .pkl", "wb"))

In [24]:
pickle.dump(pipe_svr , open("pipe_svr .pkl", "wb"))