In [None]:
%%time
import sys
!cp -f ../input/rapids/rapids.21.06 /opt/conda/envs/rapids.tar.gz
!cd -f /opt/conda/envs/ && tar -xzvf rapids.tar.gz
sys.path = ["/opt/conda/envs/rapids/lib"] + ["/opt/conda/envs/rapids/lib/python3.7"] + ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
!cp -f /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

In [None]:
# Importing core libraries
import numpy as np
import pandas as pd
from time import time
import os
import pprint
import joblib
from functools import partial

# Suppressing warnings because of skopt verbosity
import warnings
warnings.filterwarnings("ignore")

# Regressors
import lightgbm as lgb

# Model selection
from sklearn.model_selection import KFold, StratifiedKFold

# Metrics
from sklearn.metrics import mean_squared_error

# Data processing
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

# GPU libraries (cuDF, cuML, CuPy) and plotting
import cudf, cuml
import cupy as cp
from cuml.manifold import TSNE, UMAP
import matplotlib.pyplot as plt
from matplotlib.pyplot import ylim, xlim
%matplotlib inline

In [None]:
# Read the training and test tables from the working directory
X, X_test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

In [None]:
# Extract the first run of digits from "Episode_Title" in both train and test.
# The pattern is a raw string so `\d` is not treated as a (invalid) string escape,
# and expand=False makes `extract` return a Series instead of a one-column DataFrame.
X['Episode_Number'] = X["Episode_Title"].str.extract(r'(\d+)', expand=False).astype(float)
X_test['Episode_Number'] = X_test["Episode_Title"].str.extract(r'(\d+)', expand=False).astype(float)

# Drop the original text column now that its numeric part lives in "Episode_Number"
X = X.drop(columns=["Episode_Title"])
X_test = X_test.drop(columns=["Episode_Title"])

In [None]:

from sklearn.base import BaseEstimator, TransformerMixin

class TargetEncode(BaseEstimator, TransformerMixin):
    """Smoothed target (mean) encoder for categorical DataFrame columns.

    Each category is replaced by a blend of its own target mean and the global
    target mean (the prior). The blend weight is a sigmoid of the category
    count, ``1 / (1 + exp(-(count - k) / f))``: frequent categories keep their
    own mean, rare ones are pulled towards the prior.

    Parameters
    ----------
    categories : 'auto', str or list
        Columns to encode. A single column name is wrapped in a list. With
        'auto', the object-dtype columns of the frame given to ``fit`` are used.
    k : number
        Category count at which the category mean and the prior weigh equally.
    f : number
        Scale of the sigmoid; larger values make the transition more gradual.
    noise_level : float
        When > 0, encoded values are multiplied by ``1 + noise_level * N(0, 1)``
        on every ``transform`` call.
    random_state : int or None
        Seed for the noise. When set, each encoded column draws its noise from
        a fresh generator created with this seed.
    """

    def __init__(self, categories='auto', k=1, f=1, noise_level=0, random_state=None):
        if isinstance(categories, str) and categories != 'auto':
            self.categories = [categories]
        else:
            self.categories = categories
        self.k = k
        self.f = f
        self.noise_level = noise_level
        self.encodings = dict()
        self.prior = None
        self.random_state = random_state

    def add_noise(self, series, noise_level, rng=None):
        """Return ``series`` scaled by ``1 + noise_level * N(0, 1)`` per element.

        ``rng`` is an optional ``numpy.random.RandomState``; when omitted the
        global NumPy generator is used.
        """
        draw = np.random.randn if rng is None else rng.randn
        return series * (1 + noise_level * draw(len(series)))

    def fit(self, X, y=None):
        """Learn the prior and the smoothed encoding of every category.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing the columns to encode.
        y : array-like
            Target values, one per row of ``X``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``y`` is not provided.
        """
        if y is None:
            raise ValueError("TargetEncode.fit requires the target values in `y`")
        if isinstance(self.categories, str) and self.categories == 'auto':
            # Resolve to column labels (not integer positions): they are passed
            # to `.loc` below and used as column keys in `transform`.
            self.categories = X.columns[X.dtypes == object].tolist()
        temp = X.loc[:, self.categories].copy()
        temp['target'] = y
        self.prior = np.mean(y)
        # Reset so that a refit does not keep encodings from an earlier fit.
        self.encodings = dict()

        for variable in self.categories:
            avg = temp.groupby(by=variable)['target'].agg(['mean', 'count'])
            # Sigmoid weight in (0, 1) that grows with the category count
            smoothing = 1 / (1 + np.exp(-(avg['count'] - self.k) / self.f))
            # The bigger the count, the less the prior is accounted for
            self.encodings[variable] = dict(self.prior * (1 - smoothing) + avg['mean'] * smoothing)

        return self

    def transform(self, X):
        """Return a copy of ``X`` with the fitted columns replaced by floats.

        Categories that were not seen during ``fit`` (including missing values)
        are encoded with the prior. Noise, if enabled, is applied on every call.
        """
        Xt = X.copy()
        for variable in self.categories:
            # `map` sends unseen categories to NaN, which are then filled with
            # the prior. The object cast keeps categorical-dtype columns from
            # rejecting the fill value.
            encoded = Xt[variable].astype(object).map(self.encodings[variable])
            Xt[variable] = encoded.fillna(self.prior).astype(float)

            # Optionally add noise
            if self.noise_level > 0:
                rng = None
                if self.random_state is not None:
                    # A local generator yields the same draws as seeding the
                    # global one, without disturbing the global RNG state.
                    rng = np.random.RandomState(self.random_state)
                Xt[variable] = self.add_noise(Xt[variable], self.noise_level, rng)

        return Xt

    def fit_transform(self, X, y=None):
        """Fit on ``X``/``y`` and return the encoded copy of ``X``."""
        self.fit(X, y)
        return self.transform(X)

In [None]:
# Regression target, kept in its own variable for reuse.
# This cell previously tried to target-encode "Podcast_Name" with a `y` that was
# never defined. The encoding itself is performed by the next cell; doing it here
# as well would encode the already-encoded column a second time.
y = X['Listening_Time_minutes']

In [None]:
# Smoothed target encoding of the podcast name, with multiplicative noise
te = TargetEncode(
    categories=['Podcast_Name'],
    k=5,
    f=10,
    noise_level=0.01,
    random_state=42,
)

# Learn the encoding on the training rows, then apply it to both splits
te.fit(X[['Podcast_Name']], X['Listening_Time_minutes'])
X_encoded = te.transform(X[['Podcast_Name']])
X_test_encoded = te.transform(X_test[['Podcast_Name']])

# Overwrite the text column with its numeric encoding
X['Podcast_Name'] = X_encoded['Podcast_Name']
X_test['Podcast_Name'] = X_test_encoded['Podcast_Name']

print("Target encoding applied successfully!")

In [None]:
# Column groups: features that get integer codes vs. features used as numbers
categoricals = ['Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment']
numeric = [
    'Podcast_Name',
    'Episode_Number',
    'Episode_Length_minutes',
    'Host_Popularity_percentage',
    'Guest_Popularity_percentage',
    'Number_of_Ads',
]
# Columns of the raw file, for reference:
# Podcast_Name, Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes

# Fit the category -> integer mapping on train, then apply it to both splits
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(X[categoricals])
X[categoricals] = ordinal_encoder.transform(X[categoricals]).astype(int)
X_test[categoricals] = ordinal_encoder.transform(X_test[categoricals]).astype(int)

In [None]:
# Dense one-hot encoding of the categorical codes, stacked with the numeric columns.
# The keyword for dense output was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and `sparse` was removed in 1.4; an unknown keyword raises
# TypeError, so fall back to the old name on older installations.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)
H = np.hstack((ohe.fit_transform(X[categoricals]), X[numeric].values))

In [None]:
# Build the feature matrix: dense one-hot columns followed by the numeric columns
ohe = OneHotEncoder(sparse_output=False)
categorical_block = ohe.fit_transform(X[categoricals])
numeric_block = X[numeric].values
H = np.hstack((categorical_block, numeric_block))

In [None]:
# Embed the feature matrix H into two dimensions with t-SNE
tsne_params = dict(n_components=2, perplexity=10, n_neighbors=100)
tsne = TSNE(**tsne_params)
projection_2D = tsne.fit_transform(H)

In [None]:
# Colour the embedding by the binned target. The bins are computed here because
# `y_stratified` is not defined by any cell that runs before this one.
target_bins = pd.cut(
    X['Listening_Time_minutes'],
    bins=int(np.floor(1 + np.log2(len(X)))),  # Sturges' rule
    labels=False,
)

plt.figure(figsize=(15, 15))
plt.scatter(projection_2D[:,0], projection_2D[:,1],
            c=target_bins.values,
            edgecolor='none',
            alpha=0.80,
            s=10)
plt.axis('off')
plt.show();

In [None]:
# 🔹 Use Sturges' rule to decide number of bins
bins = int(np.floor(1 + np.log2(len(X))))  # e.g., ~10 for 1000 samples

# 🔹 Create discrete proxy for the target. It is read straight from the frame
# so the cell does not depend on a separately defined `y` variable.
y_stratified = pd.cut(X['Listening_Time_minutes'], bins=bins, labels=False)

In [None]:
# 🔹 Number of bins from Sturges' rule: floor(1 + log2(n))
n_rows = len(X)
bins = int(np.floor(np.log2(n_rows) + 1))

# 🔹 Equal-width bin index of the listening time, used as a discrete proxy for y
y_binned = pd.cut(X['Listening_Time_minutes'], bins=bins, labels=False)

In [None]:
# Colour the embedding by the binned target. `y_binned` (computed from
# X['Listening_Time_minutes'] in the previous cell) is used instead of
# `y_stratified`, whose defining cell depends on an undefined `y`.
plt.figure(figsize=(15, 15))
plt.scatter(projection_2D[:,0], projection_2D[:,1],
            c=y_binned.values,
            edgecolor='none',
            alpha=0.80,
            s=10)
plt.axis('off')
plt.show();

In [None]:
# Scatter of the 2-D embedding, coloured by the binned listening time
xs, ys = projection_2D[:,0], projection_2D[:,1]
plt.figure(figsize=(15, 15))
plt.scatter(xs, ys, c=y_binned.values, edgecolor='none', alpha=0.80, s=10)
plt.axis('off')
plt.show();

In [None]:
# Same scatter as above, additionally written to disk as a 300-dpi PNG
point_style = dict(edgecolor='none', alpha=0.80, s=10)
plt.figure(figsize=(15, 15))
plt.scatter(projection_2D[:,0], projection_2D[:,1], c=y_binned.values, **point_style)
plt.axis('off')
# Save before show(): the figure is still the current one at this point
plt.savefig("tsne_projection.png", dpi=300, bbox_inches='tight')
plt.show();

In [None]:

# Imported here because this cell runs before the notebook's LabelEncoder import cell
from sklearn.preprocessing import LabelEncoder

# Encode genre labels as integers for color mapping. The fitted encoder is kept:
# `classes_` (needed for the legend) only exists on an encoder that has been fitted.
genre_encoder = LabelEncoder()
genre_labels = genre_encoder.fit_transform(X["Genre"])

plt.figure(figsize=(15, 15))
scatter = plt.scatter(
    projection_2D[:, 0],
    projection_2D[:, 1],
    c=genre_labels,
    cmap="tab10",  # or "tab20" for more categories
    edgecolor="none",
    alpha=0.8,
    s=10
)

# Add legend. num=None requests one handle per distinct label; the handles come
# in ascending label order, which is the order of `classes_`.
handles, labels = scatter.legend_elements(prop="colors", num=None)
plt.legend(handles, [str(name) for name in genre_encoder.classes_], title="Genre", bbox_to_anchor=(1.05, 1), loc="upper left")

plt.axis("off")
plt.title("t-SNE Projection Colored by Genre", fontsize=14)
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encode genre labels as integers for color mapping. The fitted encoder is kept:
# `classes_` (needed for the legend) only exists on an encoder that has been fitted.
genre_encoder = LabelEncoder()
genre_labels = genre_encoder.fit_transform(X["Genre"])

plt.figure(figsize=(15, 15))
scatter = plt.scatter(
    projection_2D[:, 0],
    projection_2D[:, 1],
    c=genre_labels,
    cmap="tab10",  # or "tab20" for more categories
    edgecolor="none",
    alpha=0.8,
    s=10
)

# Add legend. num=None requests one handle per distinct label; the handles come
# in ascending label order, which is the order of `classes_`.
handles, labels = scatter.legend_elements(prop="colors", num=None)
plt.legend(handles, [str(name) for name in genre_encoder.classes_], title="Genre", bbox_to_anchor=(1.05, 1), loc="upper left")

plt.axis("off")
plt.title("t-SNE Projection Colored by Genre", fontsize=14)
plt.show()

In [None]:
# Sanity check of the embedding: its shape, then the per-axis minimum and maximum
print(projection_2D.shape)
for bound in (np.min, np.max):
    print(bound(projection_2D, axis=0))

In [None]:
# Replace missing numeric values with the median of their own column
numeric_medians = X[numeric].median()
X_numeric_filled = X[numeric].fillna(numeric_medians)


In [None]:
# Rebuild the feature matrix from the median-filled numeric columns
ohe = OneHotEncoder(sparse_output=False)  # keyword available from scikit-learn 1.2
one_hot_block = ohe.fit_transform(X[categoricals])
H = np.hstack((one_hot_block, X_numeric_filled.values))

In [None]:
# Recompute the 2-D t-SNE embedding on the rebuilt feature matrix H
tsne_params = dict(n_components=2, perplexity=10, n_neighbors=100)
tsne = TSNE(**tsne_params)
projection_2D = tsne.fit_transform(H)

In [None]:
# Embedding coloured by the binned listening time
point_style = dict(edgecolor='none', alpha=0.80, s=10)
plt.figure(figsize=(15, 15))
plt.scatter(projection_2D[:,0], projection_2D[:,1], c=y_binned.values, **point_style)
plt.axis('off')
plt.show();

In [None]:
# Encode genre labels as integers for color mapping. The fitted encoder is kept:
# `classes_` (needed for the legend) only exists on an encoder that has been fitted.
genre_encoder = LabelEncoder()
genre_labels = genre_encoder.fit_transform(X["Genre"])

plt.figure(figsize=(15, 15))
scatter = plt.scatter(
    projection_2D[:, 0],
    projection_2D[:, 1],
    c=genre_labels,
    cmap="tab10",  # or "tab20" for more categories
    edgecolor="none",
    alpha=0.8,
    s=10
)

# Add legend. num=None requests one handle per distinct label; the handles come
# in ascending label order, which is the order of `classes_`.
handles, labels = scatter.legend_elements(prop="colors", num=None)
plt.legend(handles, [str(name) for name in genre_encoder.classes_], title="Genre", bbox_to_anchor=(1.05, 1), loc="upper left")

plt.axis("off")
plt.title("t-SNE Projection Colored by Genre", fontsize=14)
plt.show()

In [None]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# Encode Publication_Day as consecutive integers; keep the fitted encoder so the
# legend can be labelled from its `classes_`.
day_encoder = LabelEncoder()
day_labels = day_encoder.fit_transform(X["Publication_Day"])

plt.figure(figsize=(15, 15))
scatter = plt.scatter(
    projection_2D[:, 0],
    projection_2D[:, 1],
    c=day_labels,
    cmap="tab10",  # good for up to 10 categories
    edgecolor="none",
    alpha=0.8,
    s=10
)

# Add legend manually. Handles come in ascending code order, so the labels must
# follow `classes_` (same order); `unique()` returns values in order of
# appearance and would attach the wrong label to each colour.
handles, _ = scatter.legend_elements(prop="colors", num=None)
days = day_encoder.classes_
plt.legend(handles, [str(day) for day in days], title="Publication Day", bbox_to_anchor=(1.05, 1), loc="upper left")

plt.axis("off")
plt.title("t-SNE Projection Colored by Publication Day", fontsize=14)
plt.show()


In [None]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# Encode Publication_Time as consecutive integers; keep the fitted encoder so
# the legend can be labelled from its `classes_`.
time_encoder = LabelEncoder()
time_labels = time_encoder.fit_transform(X["Publication_Time"])

plt.figure(figsize=(15, 15))
scatter = plt.scatter(
    projection_2D[:, 0],
    projection_2D[:, 1],
    c=time_labels,
    cmap="tab10",        # works well for up to 10 categories
    edgecolor="none",
    alpha=0.8,
    s=10
)

# Add legend. Handles come in ascending code order, so the labels must follow
# `classes_` (same order); `unique()` returns values in order of appearance and
# would attach the wrong label to each colour.
handles, _ = scatter.legend_elements(prop="colors", num=None)
times = time_encoder.classes_
plt.legend(handles, [str(slot) for slot in times], title="Publication Time", bbox_to_anchor=(1.05, 1), loc="upper left")

plt.axis("off")
plt.title("t-SNE Projection Colored by Publication Time", fontsize=14)
plt.show()


In [None]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# Encode Episode_Sentiment as consecutive integers; keep the fitted encoder so
# the legend can be labelled from its `classes_`.
sentiment_encoder = LabelEncoder()
sentiment_labels = sentiment_encoder.fit_transform(X["Episode_Sentiment"])

plt.figure(figsize=(15, 15))
scatter = plt.scatter(
    projection_2D[:, 0],
    projection_2D[:, 1],
    c=sentiment_labels,
    cmap="Set1",         # good for few discrete categories
    edgecolor="none",
    alpha=0.8,
    s=10
)

# Add legend. Handles come in ascending code order, so the labels must follow
# `classes_` (same order); `unique()` returns values in order of appearance and
# would attach the wrong label to each colour.
handles, _ = scatter.legend_elements(prop="colors", num=None)
sentiments = sentiment_encoder.classes_
plt.legend(handles, [str(mood) for mood in sentiments], title="Episode Sentiment", bbox_to_anchor=(1.05, 1), loc="upper left")

plt.axis("off")
plt.title("t-SNE Projection Colored by Episode Sentiment", fontsize=14)
plt.show()


In [None]:
# Embedding coloured by the (numeric) Podcast_Name column on a continuous scale
podcast_values = X["Podcast_Name"].values
point_style = dict(cmap="viridis", edgecolor="none", alpha=0.8, s=10)

plt.figure(figsize=(15, 15))
plt.scatter(projection_2D[:, 0], projection_2D[:, 1], c=podcast_values, **point_style)
plt.colorbar(label="Podcast Name (numeric)")
plt.axis("off")
plt.title("t-SNE Projection Colored by Podcast Name (continuous)", fontsize=14)
plt.show()


In [None]:
# Replace the embedding with a 2-D UMAP projection of the same feature matrix
umap_params = dict(n_components=2, n_neighbors=50)
umap = UMAP(**umap_params)
projection_2D = umap.fit_transform(H)

In [None]:
# Current embedding coloured by the binned listening time
xs, ys = projection_2D[:,0], projection_2D[:,1]
plt.figure(figsize=(15, 15))
plt.scatter(xs, ys, c=y_binned.values, edgecolor='none', alpha=0.80, s=10)
plt.axis('off')
plt.show();

In [None]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# Encode Genre as consecutive integers for coloring; keep the fitted encoder so
# the legend can be labelled from its `classes_`.
genre_encoder = LabelEncoder()
genre_labels = genre_encoder.fit_transform(X["Genre"])

plt.figure(figsize=(15, 15))
scatter = plt.scatter(
    projection_2D[:, 0],
    projection_2D[:, 1],
    c=genre_labels,         # color by genre
    cmap="tab20",            # good for multiple categories
    edgecolor="none",
    alpha=0.8,
    s=10
)

# Add legend. Handles come in ascending code order, so the labels must follow
# `classes_` (same order); `unique()` returns values in order of appearance and
# would attach the wrong label to each colour.
handles, _ = scatter.legend_elements(prop="colors", num=None)
genres = genre_encoder.classes_
plt.legend(handles, [str(genre) for genre in genres], title="Genre", bbox_to_anchor=(1.05, 1), loc="upper left")

plt.axis("off")
# `projection_2D` holds the UMAP embedding at this point in the notebook, not t-SNE
plt.title("UMAP Projection Colored by Genre", fontsize=14)
plt.show()


In [None]:
import os
import pandas as pd
from autoviz.AutoViz_Class import AutoViz_Class

# ✅ Column AutoViz treats as the dependent variable
target_variable = "Listening_Time_minutes"

# ✅ Folder that receives the generated charts (created if missing)
output_dir = "autoviz_results"
os.makedirs(output_dir, exist_ok=True)

# ✅ Options for the run; `filename` stays empty because the DataFrame X is passed via `dfte`
autoviz_options = dict(
    filename="",
    sep=",",
    depVar=target_variable,
    dfte=X,
    header=0,
    verbose=1,
    lowess=False,
    chart_format="html",
    max_rows_analyzed=800000,
    max_cols_analyzed=30,
    save_plot_dir=output_dir,
)

# ✅ Initialize AutoViz and run it
AV = AutoViz_Class()
dft = AV.AutoViz(**autoviz_options)

print(f"\n✅ All plots saved in: {os.path.abspath(output_dir)}")


In [None]:
# Install AutoViz for the `autoviz` imports used by the AutoViz cells.
# No version is pinned, so whichever release pip resolves is installed.
%pip install autoviz

In [None]:
import os
import pandas as pd
from autoviz.AutoViz_Class import AutoViz_Class

# ✅ Column AutoViz treats as the dependent variable
target_variable = "Listening_Time_minutes"

# ✅ Folder that receives the generated charts (created if missing)
output_dir = "autoviz_results"
os.makedirs(output_dir, exist_ok=True)

# ✅ Options for the run; `filename` stays empty because the DataFrame X is passed via `dfte`
autoviz_options = dict(
    filename="",
    sep=",",
    depVar=target_variable,
    dfte=X,
    header=0,
    verbose=1,
    lowess=False,
    chart_format="html",
    max_rows_analyzed=800000,
    max_cols_analyzed=30,
    save_plot_dir=output_dir,
)

# ✅ Initialize AutoViz and run it
AV = AutoViz_Class()
dft = AV.AutoViz(**autoviz_options)

print(f"\n✅ All plots saved in: {os.path.abspath(output_dir)}")

In [None]:
import os
import pandas as pd
from autoviz.AutoViz_Class import AutoViz_Class

# ✅ Column AutoViz treats as the dependent variable
target_variable = "Listening_Time_minutes"

# ✅ Folder that receives the generated charts (created if missing)
output_dir = "autoviz_results"
os.makedirs(output_dir, exist_ok=True)

# ✅ Options for the run; `filename` stays empty because the DataFrame X is passed via `dfte`
autoviz_options = dict(
    filename="",
    sep=",",
    depVar=target_variable,
    dfte=X,
    header=0,
    verbose=1,
    lowess=False,
    chart_format="html",
    max_rows_analyzed=800000,
    max_cols_analyzed=30,
    save_plot_dir=output_dir,
)

# ✅ Initialize AutoViz and run it
AV = AutoViz_Class()
dft = AV.AutoViz(**autoviz_options)

print(f"\n✅ All plots saved in: {os.path.abspath(output_dir)}")

In [None]:
%pip install autoviz

from autoviz.AutoViz_Class import AutoViz_Class
AV = AutoViz_Class()

filename = r"C:\Users\Home\PycharmProjects\DSC-project\train.csv"
target_variable = "Listening_Time_minutes"

dft = AV.AutoViz(
    filename=filename,
    sep=",",
    depVar=target_variable,
    dfte=None,
    header=0,
    verbose=1,
    lowess=False,
    chart_format="svg",
    max_rows_analyzed=150000,
    max_cols_analyzed=30,
    save_plot_dir=None  # use "autoviz_plots" if you want to save
)