In [64]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn 

df = pd.read_csv('videos.csv')

In [65]:
X = df.drop('likeCount', axis=1)
y = df['likeCount']

from sklearn import model_selection
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=42)

In [66]:
from sklearn.base import BaseEstimator, TransformerMixin
from textblob import TextBlob

class FeatureEngineer(BaseEstimator, TransformerMixin):
    def __init__(self):
        #super().__init__()
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        date = pd.to_datetime(X['publishedAt'], format='%Y-%m-%dT%H:%M:%SZ') #modify date data type

        hours = pd.to_numeric(X['duration'].str.extract('(\d+H)').squeeze().str[:-1].fillna(0))
        mins = pd.to_numeric(X['duration'].str.extract('(\d+M)').squeeze().str[:-1].fillna(0))
        seconds = pd.to_numeric(X['duration'].str.extract('(\d+S)').squeeze().str[:-1].fillna(0)) + hours * 3600 + mins * 60

        title = X['title'].apply(lambda x: TextBlob(x).sentiment)
        tags = X['tags'].str.join(' ').apply(lambda x: TextBlob(x).sentiment)

        return np.c_[date.dt.year,
                     date.dt.month,
                     date.dt.day,
                     date.dt.hour,
                     date.dt.minute,
                     date.dt.dayofweek,
                     date.dt.dayofyear,
                     seconds,
                     title.str[0],
                     title.str[1],
                     tags.str[0],
                     tags.str[1],
                     X[['viewCount', 'commentCount']]]

In [67]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import QuantileTransformer, RobustScaler

full_pipeline = Pipeline([
    ('featureEngineer', FeatureEngineer()),
    #('quantileTransform', QuantileTransformer(output_distribution='normal', random_state=0))
    ('robustScaler', RobustScaler())
])

X_train_prepared = full_pipeline.fit_transform(X_train)

In [69]:
from sklearn.model_selection import cross_validate, cross_val_score

def rmsle_cv(model):
    return np.sqrt(-cross_val_score(model, X_train_prepared, y_train, scoring="neg_mean_squared_error", cv = 5))

def r_sqr(model):
    result = cross_validate(model, X_train_prepared, y_train)
    print('mean:', result['test_score'].mean(), '| std:', result['test_score'].std())

from sklearn.linear_model import LinearRegression, ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import xgboost as xgb

r_sqr(LinearRegression())
r_sqr(Lasso(alpha =0.0005, random_state=1))
r_sqr(ElasticNet())



mean: 0.9242912748256902 | std: 0.041219611694107275
mean: 0.9240930896225666 | std: 0.040917908896351524
mean: 0.9247444400926819 | std: 0.04674505859883021


In [70]:
lr = LinearRegression().fit(X_train_prepared, y_train)
lr.score(full_pipeline.transform(X_test), y_test)

0.8534868746569868