# Setup

In [2]:
import numpy as np
import pandas as pd 
import sklearn

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

/kaggle/input/playground-series-s5e4/sample_submission.csv
/kaggle/input/playground-series-s5e4/train.csv
/kaggle/input/playground-series-s5e4/test.csv


# Get the data

In [21]:
train_set = pd.read_csv('/kaggle/input/playground-series-s5e4/train.csv')
test_set = pd.read_csv('/kaggle/input/playground-series-s5e4/test.csv')

# Take a quick look at the data structure

In [22]:
print(f'Train set len: {len(train_set)}')

Train set len: 750000


In [23]:
print(f'Test set len: {len(test_set)}')

Test set len: 250000


In [26]:
print(train_set.isnull().sum()) 
print(train_set.isnull().values.any())  

id                                  0
Podcast_Name                        0
Episode_Title                       0
Episode_Length_minutes          87093
Genre                               0
Host_Popularity_percentage          0
Publication_Day                     0
Publication_Time                    0
Guest_Popularity_percentage    146030
Number_of_Ads                       1
Episode_Sentiment                   0
Listening_Time_minutes              0
dtype: int64
True


# Prepare the data

In [27]:
def fill_missing_values(df):
    return df.fillna({
        "Episode_Length_minutes": df["Episode_Length_minutes"].median(),
        "Guest_Popularity_percentage": df["Guest_Popularity_percentage"].median()
    })

In [28]:
train_set = fill_missing_values(train_set)
test_set = fill_missing_values(test_set)

In [30]:
train_set.head()

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
0,0,Mystery Matters,Episode 98,63.84,True Crime,74.81,Thursday,Night,53.58,0.0,Positive,31.41998
1,1,Joke Junction,Episode 26,119.8,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative,88.01241
2,2,Study Sessions,Episode 16,73.9,Education,69.97,Tuesday,Evening,8.97,0.0,Negative,44.92531
3,3,Digital Digest,Episode 45,67.17,Technology,57.22,Monday,Morning,78.7,2.0,Positive,46.27824
4,4,Mind & Body,Episode 86,110.51,Health,80.07,Monday,Afternoon,58.68,3.0,Neutral,75.61031


# Handling text and categorical attributes

In [44]:
from sklearn.preprocessing import OrdinalEncoder

train_podcast_name, train_episode_title, train_genre, \
train_publication_day, train_publication_time, \
train_episode_sentiment = train_set.loc[:, [
    "Podcast_Name", "Episode_Title", "Genre", "Publication_Day",
    "Publication_Time", "Episode_Sentiment"
]].T.values

ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
train_podcast_name_encoded = ordinal_encoder.fit_transform(train_podcast_name.reshape(-1,1))
train_episode_title_encoded = ordinal_encoder.fit_transform(train_episode_title.reshape(-1,1))
train_genre_encoded = ordinal_encoder.fit_transform(train_genre.reshape(-1,1))
train_publication_day_encoded = ordinal_encoder.fit_transform(train_publication_day.reshape(-1,1))
train_publication_time_encoded = ordinal_encoder.fit_transform(train_publication_time.reshape(-1,1))
train_episode_sentiment_encoded = ordinal_encoder.fit_transform(train_episode_sentiment.reshape(-1,1))

In [45]:
test_podcast_name, test_episode_title, test_genre, \
test_publication_day, test_publication_time, \
test_episode_sentiment = test_set.loc[:, [
    "Podcast_Name", "Episode_Title", "Genre", "Publication_Day",
    "Publication_Time", "Episode_Sentiment"
]].T.values

test_podcast_name_encoded = ordinal_encoder.transform(test_podcast_name.reshape(-1,1))
test_episode_title_encoded = ordinal_encoder.transform(test_episode_title.reshape(-1,1))
test_genre_encoded = ordinal_encoder.transform(test_genre.reshape(-1,1))
test_publication_day_encoded = ordinal_encoder.transform(test_publication_day.reshape(-1,1))
test_publication_time_encoded = ordinal_encoder.transform(test_publication_time.reshape(-1,1))
test_episode_sentiment_encoded = ordinal_encoder.transform(test_episode_sentiment.reshape(-1,1))

# Transformer

In [54]:
from sklearn.base import BaseEstimator, TransformerMixin

class FillMissingValuesTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self;

    def transform(self, X):
        return X.fillna({
            "Episode_Length_minutes": X["Episode_Length_minutes"].median(),
            "Guest_Popularity_percentage": X["Guest_Popularity_percentage"].median()
        })

# Transformation Pipelines

In [74]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

train_num = train_set.drop("Listening_Time_minutes", axis=1).select_dtypes(include=[np.number])
test_num = test_set.select_dtypes(include=[np.number])

num_pipeline = Pipeline([
    ('fill_missing_values', FillMissingValuesTransformer()),
    ('std_scaler', StandardScaler()),
])
train_num_tr = num_pipeline.fit_transform(train_num)
test_num_tr = num_pipeline.transform(test_num)

In [75]:
from sklearn.preprocessing import OneHotEncoder

text_pipeline = Pipeline([
    ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore"))
])

In [76]:
from sklearn.compose import ColumnTransformer

train_num_attribs = list(train_num)
train_text_attribs = ["Podcast_Name", "Episode_Title", "Genre", "Publication_Day", "Publication_Time", "Episode_Sentiment"]

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, train_num_attribs),
    ("text", text_pipeline, train_text_attribs)
])

train_set_prepared = full_pipeline.fit_transform(train_set)
test_set_prepared = full_pipeline.transform(test_set)

In [77]:
train_set_prepared.shape

(750000, 177)

In [78]:
train_set_labels = train_set["Listening_Time_minutes"].copy()
#train_set_features = train_set.drop("Listening_Time_minutes", axis=1)

In [15]:
X_train = train_set.iloc[:, :-1].values
y_train = train_set.iloc[:, -1].values
X_test = test_set.iloc[:, :-1].values
y_test = test_set.iloc[:, -1].values

In [84]:
# pd.DataFrame(train_set_prepared).isnull().sum()
# pd.isnull(train_set_labels).sum()
# print(train_set_prepared.shape)
# print(train_set_labels.shape)
print(type(train_set_prepared))
print(train_set_prepared.dtype)

<class 'scipy.sparse._csr.csr_matrix'>
float64


In [92]:
from sklearn.impute import SimpleImputer

train_set_prepared_dense = train_set_prepared.toarray()
test_set_prepared_dense = test_set_prepared.toarray()

imputer = SimpleImputer(strategy='mean')

train_set_prepared_dense_imputed = imputer.fit_transform(train_set_prepared_dense)
test_set_prepared_dense_imputed = imputer.transform(test_set_prepared_dense)

#  Select and Train a Model

In [93]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(train_set_prepared_dense_imputed, train_set_labels)

predictions = model.predict(test_set_prepared_dense_imputed)

submission_df = pd.DataFrame({
    'id': test_set['id'],
    'Listening_Time_minutes': predictions
})

submission_df.to_csv('submission.csv', index=False)

In [94]:
submission_df.head()

Unnamed: 0,id,Listening_Time_minutes
0,750000,55.039062
1,750001,19.166016
2,750002,51.306641
3,750003,81.054688
4,750004,50.210938


In [101]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

test_labels = submission_df["Listening_Time_minutes"].copy()

mae = mean_absolute_error(test_labels, predictions)
mse = mean_squared_error(test_labels, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(test_labels, predictions)

print("MAE: ", mae)
print("MSE: ", mse)
print("RMSE: ", rmse)
print("R2: ", r2)

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.
MAE:  0.0
MSE:  0.0
RMSE:  0.0
R2:  1.0
