In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('videos.csv')

In [101]:
X = df.drop('likeCount', axis=1)
y = df['likeCount']

from sklearn import model_selection
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=42)

In [143]:
from sklearn.base import BaseEstimator, TransformerMixin
from textblob import TextBlob

class FeatureEngineer(BaseEstimator, TransformerMixin):
    def __init__(self):
        #super().__init__()
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        date = pd.to_datetime(X['publishedAt'], format='%Y-%m-%dT%H:%M:%SZ') #modify date data type

        hours = pd.to_numeric(X['duration'].str.extract('(\d+H)').squeeze().str[:-1].fillna(0))
        mins = pd.to_numeric(X['duration'].str.extract('(\d+M)').squeeze().str[:-1].fillna(0))
        seconds = pd.to_numeric(X['duration'].str.extract('(\d+S)').squeeze().str[:-1].fillna(0)) + hours * 3600 + mins * 60

        title = X['title'].apply(lambda x: TextBlob(x).sentiment)
        tags = X['tags'].str.join(' ').apply(lambda x: TextBlob(x).sentiment)

        return np.c_[date.dt.year,
                     date.dt.month,
                     date.dt.day,
                     date.dt.hour,
                     date.dt.minute,
                     date.dt.dayofweek,
                     date.dt.dayofyear,
                     seconds,
                     title.str[0],
                     title.str[1],
                     tags.str[0],
                     tags.str[1],
                     X[['viewCount', 'commentCount']]]

In [146]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import QuantileTransformer

full_pipeline = Pipeline([
    ('featureEngineer', FeatureEngineer()),
    ('quantileTransform', QuantileTransformer(output_distribution='normal', random_state=0))
])

X_train_prepared = full_pipeline.fit_transform(X_train)



In [147]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

from sklearn.model_selection import cross_validate
result = cross_validate(lr, X_train_prepared, y_train)
result['test_score']

array([ 0.45233486,  0.27848128, -0.08561997,  0.19310565,  0.45544184])

In [149]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb

#Validation function
n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(X_train_prepared)
    rmse= np.sqrt(-cross_val_score(model, X_train_prepared, y_train, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

lasso = Lasso(alpha =0.0005, random_state=1)

score = rmsle_cv(lasso)
print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


Lasso score: 114818.9672 (33511.3711)

