In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.model_selection import cross_val_score
import plotly.express as px
import numpy as np

In [2]:
import dagshub
import mlflow
# Connect this session to the DagsHub repository; the runs logged further down
# are reported against this repository's MLflow endpoint.
dagshub.init(
    repo_owner='pankaj-2708',
    repo_name='You-tube-Comment-analysis',
    mlflow=True,
)

In [12]:
# Select the MLflow experiment that the runs started in the cells below are
# recorded under.
mlflow.set_experiment("jupyter-lab")

2025/09/30 18:02:19 INFO mlflow.tracking.fluent: Experiment with name 'jupyter-lab' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/c4c3e8dd03694821a998d8d0b10c59d4', creation_time=1759235536530, experiment_id='1', last_update_time=1759235536530, lifecycle_stage='active', name='jupyter-lab', tags={}>

In [2]:
# Read the transformed train/test splits, then separate the label column
# ('lb__Sentiment') from the feature columns of each split.
train = pd.read_csv("../data/transformed/train.csv")
test = pd.read_csv("../data/transformed/test.csv")

X, y = train.drop(columns=['lb__Sentiment']), train['lb__Sentiment']
test_X, test_y = test.drop(columns=['lb__Sentiment']), test['lb__Sentiment']

In [3]:
# Imbalanced data handling

In [15]:
# Baseline: a depth-6 decision tree fitted on the full feature set and scored
# on the held-out test split.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split, so an unseeded tree can differ between re-runs.
des = DecisionTreeClassifier(max_depth=6, random_state=42)
des.fit(X, y)
y_pred = des.predict(test_X)
# accuracy_score is documented as (y_true, y_pred); pass the labels first.
accuracy_score(test_y, y_pred)

0.6643356643356644

In [5]:
# from dtreeviz.trees import *
# from IPython.display import Image, display_svg, SVG
# import graphviz.backend as be

# viz = dtreeviz(clas, 
#                X,
#                y,
#                feature_names=X.feature_names, 
#                class_names=['negative', 'neutral', 'positive'])
# viz

In [6]:
import plotly.io as pio
# Use the "notebook" renderer as Plotly's default for the figures drawn below.
pio.renderers.default = "notebook" 

In [11]:
# Top-10 feature importances of the estimator currently bound to `des`,
# drawn as a horizontal bar chart (importance expressed in percent).
#
# The frame is built column by column so `importance` stays a float column.
# Stacking the importances next to the names with np.hstack forces both into
# one array with a single shared dtype, which loses the numeric dtype.
feature_imp = pd.DataFrame(
    {
        "importance": des.feature_importances_,
        "name": des.feature_names_in_,
    }
)
feature_imp_sorted = feature_imp.sort_values(by="importance", ascending=False)
feature_imp_sorted = feature_imp_sorted.head(10)
px.bar(
    x=feature_imp_sorted["importance"] * 100,
    y=feature_imp_sorted["name"],
    orientation='h',
)

In [46]:
# Show the current `feature_imp_sorted` frame as a table.
feature_imp_sorted

Unnamed: 0,importance,name
5,0.547121,std__PositiveWordCount
0,0.131541,std__comment_len
4,0.097684,std__stopword_count
255,0.074302,remainder__like
6,0.050424,std__NegativeWordCount
187,0.041458,remainder__good
33,0.019768,remainder__appreci
441,0.012115,remainder__thank
305,0.004796,remainder__never
129,0.004272,remainder__enjoy


In [17]:
# Record the depth-6 decision-tree baseline as one MLflow run.
with mlflow.start_run():
    # NOTE(review): "descison" is misspelled; the label is kept unchanged so
    # this run still groups with runs already logged under it.
    mlflow.log_param("model", "descison tree")
    # Seeded so the logged accuracy can be reproduced on a re-run.
    model_ = DecisionTreeClassifier(max_depth=6, random_state=42)
    # Take the hyper-parameters from the estimator itself (max_depth included)
    # rather than typing them a second time as strings, so the logged values
    # cannot drift from the model that was actually trained.
    mlflow.log_params(model_.get_params())

    model_.fit(X, y)
    pred_y = model_.predict(test_X)

    mlflow.log_metric("accuracy", accuracy_score(test_y, pred_y))

🏃 View run angry-lamb-868 at: https://dagshub.com/pankaj-2708/You-tube-Comment-analysis.mlflow/#/experiments/1/runs/554dc7a53344407aadc60f19fd4a8610
🧪 View experiment at: https://dagshub.com/pankaj-2708/You-tube-Comment-analysis.mlflow/#/experiments/1


In [32]:
# PCA: project the train and test features onto 10 principal components.
from sklearn.decomposition import PCA

# random_state is pinned because the default svd_solver="auto" may select the
# randomized solver, whose result depends on the seed.
pc = PCA(n_components=10, random_state=42)
pc.fit(X)
X_pc = pc.transform(X)
X_pc_test = pc.transform(test_X)
# Fraction of the total variance kept by the 10 components. It is the last
# expression of the cell so the notebook displays it; placed mid-cell the
# value was computed and then dropped.
pc.explained_variance_ratio_.sum()

In [33]:
# Depth-6 decision tree on the PCA-projected features, scored on the
# projected test split.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split, so an unseeded tree can differ between re-runs.
des = DecisionTreeClassifier(max_depth=6, random_state=42)
des.fit(X_pc, y)
y_pred = des.predict(X_pc_test)
# accuracy_score is documented as (y_true, y_pred); pass the labels first.
accuracy_score(test_y, y_pred)

0.6763636363636364

In [34]:
# Record the depth-6 decision tree trained on the PCA features as one MLflow run.
with mlflow.start_run():
    # NOTE(review): "descison" is misspelled; the label is kept unchanged so
    # this run still groups with runs already logged under it.
    mlflow.log_param("model", "descison tree")
    mlflow.log_param("pca", "True")
    # Seeded so the logged accuracy can be reproduced on a re-run.
    model_ = DecisionTreeClassifier(max_depth=6, random_state=42)
    # Take the hyper-parameters from the estimator itself (max_depth included)
    # rather than typing them a second time as strings, so the logged values
    # cannot drift from the model that was actually trained.
    mlflow.log_params(model_.get_params())

    model_.fit(X_pc, y)
    pred_y = model_.predict(X_pc_test)

    mlflow.log_metric("accuracy", accuracy_score(test_y, pred_y))

🏃 View run abundant-stork-340 at: https://dagshub.com/pankaj-2708/You-tube-Comment-analysis.mlflow/#/experiments/1/runs/8e6f8b3afdb84c359f437f497fe76002
🧪 View experiment at: https://dagshub.com/pankaj-2708/You-tube-Comment-analysis.mlflow/#/experiments/1


In [45]:
# Random forest (50 depth-6 trees, each seeing 70% of the rows and considering
# 70% of the features per split) on the full feature set.
des = RandomForestClassifier(
    max_depth=6,
    n_estimators=50,
    random_state=42,
    max_features=0.7,
    max_samples=0.7,
)
des.fit(X, y)
y_pred = des.predict(test_X)
# accuracy_score is documented as (y_true, y_pred); pass the labels first,
# matching the MLflow cells.
accuracy_score(test_y, y_pred)

0.6732867132867133

In [46]:
# Top-10 feature importances of the estimator currently bound to `des`,
# drawn as a horizontal bar chart (importance expressed in percent).
#
# The frame is built column by column so `importance` stays a float column.
# Stacking the importances next to the names with np.hstack forces both into
# one array with a single shared dtype, which loses the numeric dtype.
feature_imp = pd.DataFrame(
    {
        "importance": des.feature_importances_,
        "name": des.feature_names_in_,
    }
)
feature_imp_sorted = feature_imp.sort_values(by="importance", ascending=False)
feature_imp_sorted = feature_imp_sorted.head(10)
px.bar(
    x=feature_imp_sorted["importance"] * 100,
    y=feature_imp_sorted["name"],
    orientation='h',
)

In [52]:
# Record the random-forest configuration and its test accuracy as one MLflow run.
with mlflow.start_run():
    mlflow.log_param("model", "random forest")
    model_ = RandomForestClassifier(
        max_depth=6,
        n_estimators=50,
        random_state=42,
        max_features=0.7,
        max_samples=0.7,
    )
    param = model_.get_params()
    model_.fit(X, y)
    # Log every hyper-parameter in one batched call instead of indexing into
    # freshly rebuilt key/value lists and issuing one request per parameter.
    mlflow.log_params(param)

    pred_y = model_.predict(test_X)

    mlflow.log_metric("accuracy", accuracy_score(test_y, pred_y))

🏃 View run righteous-lamb-44 at: https://dagshub.com/pankaj-2708/You-tube-Comment-analysis.mlflow/#/experiments/1/runs/867f270eea0741729bd7127c307f054b
🧪 View experiment at: https://dagshub.com/pankaj-2708/You-tube-Comment-analysis.mlflow/#/experiments/1


In [54]:
# Same random-forest configuration, trained on the PCA-projected features.
des = RandomForestClassifier(
    max_depth=6,
    n_estimators=50,
    random_state=42,
    max_features=0.7,
    max_samples=0.7,
)
des.fit(X_pc, y)
y_pred = des.predict(X_pc_test)
y_pred_train = des.predict(X_pc)
# (test accuracy, train accuracy); accuracy_score is documented as
# (y_true, y_pred), so the labels go first.
accuracy_score(test_y, y_pred), accuracy_score(y, y_pred_train)

(0.6833566433566434, 0.69499230661631)

In [64]:
from imblearn.ensemble import BalancedRandomForestClassifier

# Balanced random forest (40 depth-8 trees) on the full feature set.
# random_state is pinned: the forest draws random samples for every tree, so
# without a seed the scores below change from run to run.
des = BalancedRandomForestClassifier(max_depth=8, n_estimators=40, random_state=42)
des.fit(X, y)
y_pred = des.predict(test_X)
y_pred_train = des.predict(X)
# (test accuracy, train accuracy); accuracy_score is documented as
# (y_true, y_pred), so the labels go first.
accuracy_score(test_y, y_pred), accuracy_score(y, y_pred_train)

(0.6173426573426574, 0.6349839138341027)

In [74]:
# Record the balanced random forest and its test accuracy as one MLflow run.
with mlflow.start_run():
    mlflow.log_param("model", "balanced random forest")
    # Seeded so the logged accuracy can be reproduced; the seed is part of
    # get_params() and is therefore logged with the other hyper-parameters.
    model_ = BalancedRandomForestClassifier(max_depth=8, n_estimators=40, random_state=42)
    param = model_.get_params()
    model_.fit(X, y)
    # Log every hyper-parameter in one batched call instead of indexing into
    # freshly rebuilt key/value lists and issuing one request per parameter.
    mlflow.log_params(param)

    pred_y = model_.predict(test_X)

    mlflow.log_metric("accuracy", accuracy_score(test_y, pred_y))

🏃 View run vaunted-elk-244 at: https://dagshub.com/pankaj-2708/You-tube-Comment-analysis.mlflow/#/experiments/1/runs/609cfedf496945cf990c28c4ca921250
🧪 View experiment at: https://dagshub.com/pankaj-2708/You-tube-Comment-analysis.mlflow/#/experiments/1


In [67]:
# Gradient boosting (50 stages of depth-5 trees) on the full feature set.
# random_state is pinned: it seeds each stage's tree and the per-split feature
# permutation, so an unseeded model can differ between re-runs.
des = GradientBoostingClassifier(max_depth=5, n_estimators=50, random_state=42)
des.fit(X, y)
y_pred = des.predict(test_X)
y_pred_train = des.predict(X)
# (test accuracy, train accuracy); accuracy_score is documented as
# (y_true, y_pred), so the labels go first.
accuracy_score(test_y, y_pred), accuracy_score(y, y_pred_train)

(0.7026573426573427, 0.7485662330395859)

In [68]:
# Top-10 feature importances of the estimator currently bound to `des`,
# drawn as a horizontal bar chart (importance expressed in percent).
#
# The frame is built column by column so `importance` stays a float column.
# Stacking the importances next to the names with np.hstack forces both into
# one array with a single shared dtype, which loses the numeric dtype.
feature_imp = pd.DataFrame(
    {
        "importance": des.feature_importances_,
        "name": des.feature_names_in_,
    }
)
feature_imp_sorted = feature_imp.sort_values(by="importance", ascending=False)
feature_imp_sorted = feature_imp_sorted.head(10)
px.bar(
    x=feature_imp_sorted["importance"] * 100,
    y=feature_imp_sorted["name"],
    orientation='h',
)

In [70]:
# Record the gradient-boosting model and its test accuracy as one MLflow run.
with mlflow.start_run():
    mlflow.log_param("model", "gradient boosting")
    # Seeded so the logged accuracy can be reproduced; the seed is part of
    # get_params() and is therefore logged with the other hyper-parameters.
    model_ = GradientBoostingClassifier(max_depth=5, n_estimators=50, random_state=42)
    param = model_.get_params()
    model_.fit(X, y)
    # Log every hyper-parameter in one batched call instead of indexing into
    # freshly rebuilt key/value lists and issuing one request per parameter.
    mlflow.log_params(param)

    pred_y = model_.predict(test_X)

    mlflow.log_metric("accuracy", accuracy_score(test_y, pred_y))

🏃 View run welcoming-mink-569 at: https://dagshub.com/pankaj-2708/You-tube-Comment-analysis.mlflow/#/experiments/1/runs/be089d1503644232b7d72a9170efed6d
🧪 View experiment at: https://dagshub.com/pankaj-2708/You-tube-Comment-analysis.mlflow/#/experiments/1


In [69]:
# Gradient boosting (50 stages of depth-5 trees) on the PCA-projected features.
# random_state is pinned: it seeds each stage's tree and the per-split feature
# permutation, so an unseeded model can differ between re-runs.
des = GradientBoostingClassifier(max_depth=5, n_estimators=50, random_state=42)
des.fit(X_pc, y)
y_pred = des.predict(X_pc_test)
y_pred_train = des.predict(X_pc)
# (test accuracy, train accuracy); accuracy_score is documented as
# (y_true, y_pred), so the labels go first.
accuracy_score(test_y, y_pred), accuracy_score(y, y_pred_train)

(0.6847552447552447, 0.7393341726115541)

In [71]:
# Record the gradient-boosting model trained on the PCA features as one MLflow run.
with mlflow.start_run():
    mlflow.log_param("model", "gradient boosting")
    mlflow.log_param("pca", "True")
    # Seeded so the logged accuracy can be reproduced; the seed is part of
    # get_params() and is therefore logged with the other hyper-parameters.
    model_ = GradientBoostingClassifier(max_depth=5, n_estimators=50, random_state=42)
    param = model_.get_params()
    model_.fit(X_pc, y)
    # Log every hyper-parameter in one batched call instead of indexing into
    # freshly rebuilt key/value lists and issuing one request per parameter.
    mlflow.log_params(param)

    pred_y = model_.predict(X_pc_test)

    mlflow.log_metric("accuracy", accuracy_score(test_y, pred_y))

🏃 View run legendary-lark-276 at: https://dagshub.com/pankaj-2708/You-tube-Comment-analysis.mlflow/#/experiments/1/runs/f2692d9f938c415eb14be7a855e694f5
🧪 View experiment at: https://dagshub.com/pankaj-2708/You-tube-Comment-analysis.mlflow/#/experiments/1


In [84]:
from xgboost import XGBClassifier

# XGBoost (40 depth-5 trees, 70% row and 70% column subsampling per tree) on
# the full feature set.
des = XGBClassifier(
    max_depth=5,
    n_estimators=40,
    subsample=0.7,
    colsample_bytree=0.7,
)
des.fit(X, y)
y_pred = des.predict(test_X)
y_pred_train = des.predict(X)
# (test accuracy, train accuracy); accuracy_score is documented as
# (y_true, y_pred), so the labels go first.
accuracy_score(test_y, y_pred), accuracy_score(y, y_pred_train)

(0.7197202797202797, 0.7652818576024619)

In [97]:
# Top-10 feature importances (rounded to 4 decimals) of the estimator currently
# bound to `des`, drawn as a horizontal bar chart in percent.
#
# The frame is built column by column so `importance` stays numeric.
# np.hstack has to put the importances and the names into one array of a
# single dtype; if the names array has a string dtype the importances become
# strings, which sort lexicographically and which `* 100` repeats instead of
# scaling.
feature_imp = pd.DataFrame(
    {
        "importance": np.round(des.feature_importances_, 4),
        "name": des.feature_names_in_,
    }
)
feature_imp_sorted = feature_imp.sort_values(by="importance", ascending=False)
feature_imp_sorted = feature_imp_sorted.head(10)
px.bar(
    x=feature_imp_sorted["importance"] * 100,
    y=feature_imp_sorted["name"],
    orientation='h',
)

In [85]:
# Record the XGBoost model with its train and test accuracy as one MLflow run.
with mlflow.start_run():
    mlflow.log_param("model", "xgboost")
    model_ = XGBClassifier(
        max_depth=5,
        n_estimators=40,
        subsample=0.7,
        colsample_bytree=0.7,
    )
    param = model_.get_params()
    model_.fit(X, y)
    # Log every hyper-parameter in one batched call instead of indexing into
    # freshly rebuilt key/value lists and issuing one request per parameter.
    mlflow.log_params(param)

    pred_y = model_.predict(test_X)
    pred_y_train = model_.predict(X)

    mlflow.log_metric("accuracy", accuracy_score(test_y, pred_y))
    mlflow.log_metric("accuracy_train", accuracy_score(y, pred_y_train))

🏃 View run respected-mouse-261 at: https://dagshub.com/pankaj-2708/You-tube-Comment-analysis.mlflow/#/experiments/1/runs/cd91286c6f574f33a0bcacdf31da8ddf
🧪 View experiment at: https://dagshub.com/pankaj-2708/You-tube-Comment-analysis.mlflow/#/experiments/1


In [100]:
# Record the XGBoost model trained on the PCA features, with its train and
# test accuracy, as one MLflow run.
with mlflow.start_run():
    mlflow.log_param("model", "xgboost")
    mlflow.log_param("pca", "True")
    model_ = XGBClassifier(
        max_depth=5,
        n_estimators=40,
        subsample=0.7,
        colsample_bytree=0.7,
    )
    param = model_.get_params()
    model_.fit(X_pc, y)
    # Log every hyper-parameter in one batched call instead of indexing into
    # freshly rebuilt key/value lists and issuing one request per parameter.
    mlflow.log_params(param)

    pred_y = model_.predict(X_pc_test)
    pred_y_train = model_.predict(X_pc)

    mlflow.log_metric("accuracy", accuracy_score(test_y, pred_y))
    mlflow.log_metric("accuracy_train", accuracy_score(y, pred_y_train))

🏃 View run agreeable-koi-547 at: https://dagshub.com/pankaj-2708/You-tube-Comment-analysis.mlflow/#/experiments/1/runs/0d424797d4b34005ade74e1eccbce56f
🧪 View experiment at: https://dagshub.com/pankaj-2708/You-tube-Comment-analysis.mlflow/#/experiments/1


In [101]:
# Support-vector classifier with default settings on the full feature set.
des = SVC()
des.fit(X, y)
y_pred = des.predict(test_X)
y_pred_train = des.predict(X)
# (test accuracy, train accuracy); accuracy_score is documented as
# (y_true, y_pred), so the labels go first.
accuracy_score(test_y, y_pred), accuracy_score(y, y_pred_train)

(0.7227972027972028, 0.8151489718841796)

In [None]:
# Record the default SVC with its train and test accuracy as one MLflow run.
with mlflow.start_run():
    mlflow.log_param("model", "SVC")
    model_ = SVC()
    param = model_.get_params()
    model_.fit(X, y)
    # Log every hyper-parameter in one batched call instead of indexing into
    # freshly rebuilt key/value lists and issuing one request per parameter.
    mlflow.log_params(param)

    pred_y = model_.predict(test_X)
    pred_y_train = model_.predict(X)

    mlflow.log_metric("accuracy", accuracy_score(test_y, pred_y))
    mlflow.log_metric("accuracy_train", accuracy_score(y, pred_y_train))

In [None]:
# NOTE(review): this cell called `Li()`, a name that is defined nowhere in the
# notebook, so it raised NameError. LGBMClassifier is the one classifier that
# the import cell brings in but no cell uses, so it is assumed to be the
# intended model — confirm.
des = LGBMClassifier()
des.fit(X, y)
y_pred = des.predict(test_X)
y_pred_train = des.predict(X)
# (test accuracy, train accuracy); accuracy_score is documented as
# (y_true, y_pred), so the labels go first.
accuracy_score(test_y, y_pred), accuracy_score(y, y_pred_train)